Example #1
0
    def data_prep(self):
        logger.debug(
            'Split the comments into sentences and obtain the vocabulary')
        all_words = []
        toxic_sentences = []
        attitude_list = []
        sentences_list = []

        for i in range(len(self.data_sentiment.index)):
            sentences = re.split(
                r'[.!]', self.data_sentiment['text'][i].replace('\n', ''))
            if self.data_sentiment['stars'][i] == 1:
                attitude = 0
            elif self.data_sentiment['stars'][i] == 5:
                attitude = 1
            else:
                # skip reviews that are neither clearly negative (1 star) nor
                # positive (5 stars); otherwise `attitude` would be undefined
                # or carry over from the previous review
                continue
            for sentence in sentences:
                words = list(preprocessing.tokenize(sentence))
                if len(words) >= 3:
                    all_words += words
                    confid_score = sonar.ping(
                        text=sentence)['classes'][1]['confidence']
                    if confid_score > 0.8:
                        toxic_sentences.append(sentence)
                    attitude_list.append(attitude)
                    sentences_list.append(sentence)

        self.all_words = all_words
        self.toxic_sentences = toxic_sentences
        self.attitude_list = attitude_list
        self.sentences_list = sentences_list
Example #2
0
	def train(self, messages: Iterable[Tuple[Hashable, str]]) -> None:
		"""
		Train the model on a series of messages
		:param messages: [(user, msg)]
		"""
		g = defaultdict(lambda: defaultdict(int))
		for user, message in messages:
			for token in tokenize(message):
				g[user][token] += 1
		# at first voc is <token, index>
		self.voc: dict = {}
		self.users = {}
		for user, tokens in g.items():
			self.users[user] = len(self.users)
			for token in tokens:
				if token not in self.voc:
					self.voc[token] = len(self.voc)
		print(f"mat size: {len(self.users)}x{len(self.voc)} = {len(self.users)*len(self.voc):,}")
		self.mat = np.full((len(self.users), len(self.voc)), fill_value=self.alpha)
		for user, tokens in g.items():
			for token, count in tokens.items():
				# add counts on top of the additive-smoothing prior (alpha) instead of overwriting it
				self.mat[self.users[user], self.voc[token]] += count
		# turn voc into the list of tokens, useful in word_cloud
		self.voc = list(self.voc.keys())
		# normalize user vocabulary
		self.mat = self.mat/self.mat.sum(axis=1)[:, None]
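A minimal usage sketch for the method above. It assumes the enclosing class defines self.alpha (the additive-smoothing constant) and that a compatible tokenize() is in scope; the class name UserLanguageModel and the toy messages are made up.

    model = UserLanguageModel(alpha=0.01)
    model.train([
        ("alice", "i love tokenizers"),
        ("bob", "tokenizers are fine i guess"),
    ])
    # model.mat[model.users["alice"]] is alice's smoothed, row-normalized token distribution
    # model.voc[j] is the token for column j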
Example #3
0
def Placeholder(sentence, model):
    # e.g. sentence = "do you use credit cards?"
    # tokenize the sentence and turn the model output into probabilities
    sentence = tokenize(sentence)
    X = bag_of_words(sentence, all_words)
    X = X.reshape(1, X.shape[0])
    X = torch.from_numpy(X).to(device)

    output = model(X)
    _, predicted = torch.max(output, dim=1)

    tag = tags[predicted.item()]

    probs = torch.softmax(output, dim=1)
    prob = probs[0][predicted.item()]
    if prob.item() > 0.75:
        for intent in intents['intents']:
            if tag == intent["tag"]:
                if tag == "BMI":
                    # "BMI của bạn là {}" means "Your BMI is {}"; bmi is expected
                    # to be computed elsewhere in the module
                    return "BMI của bạn là {}".format(bmi)
                else:
                    return random.choice(intent['responses'])

    # fallback: low confidence or no matching intent
    # "Tôi không hiểu bạn nói gì..." means "I don't understand what you're saying..."
    return "Tôi không hiểu bạn nói gì..."
Example #4
0
def main():
    x, y = load_dataset('data/amazon_reviews_multilingual_JP_v1_00.tsv',
                        n=5000)

    print('Tokenization')
    x = [clean_html(text, strip=True) for text in x]
    x = [' '.join(tokenize(text)) for text in x]
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                       y,
                                                       test_size=0.2,
                                                       random_state=42)

    print('Binary')
    vectorizer = CountVectorizer(binary=True)
    train_and_eval(x_train, y_train, x_test, y_test, vectorizer)

    print('Count')
    vectorizer = CountVectorizer(binary=False)
    train_and_eval(x_train, y_train, x_test, y_test, vectorizer)

    print('TF-IDF')
    vectorizer = TfidfVectorizer()
    train_and_eval(x_train, y_train, x_test, y_test, vectorizer)

    print('Bigram')
    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    train_and_eval(x_train, y_train, x_test, y_test, vectorizer)
Example #5
0
    def prepare_data(self):
        print("Prepare data...")

        # Convert the text to embeddings
        if self._type == "embeddings":
            self._data = []

            for sentence in self._text:
                self._data.append(preprocessing.tokenize(sentence))

            if self._concatenate:
                max_length = 0
                for sentence in self._data:
                    if len(sentence) > max_length:
                        max_length = len(sentence)

                self._data = preprocessing.padding_truncate(
                    self._data, max_length)

            print(self._data[1])
            print(self._data[2])

            self._data = preprocessing.delete_stopwords(self._data)
            self._vectors = np.array(
                preprocessing.word2embeddings(self._data, self._embedding,
                                              self._vocabulary,
                                              self._concatenate))
        else:
            self._text = preprocessing.apply_stemmer_stopword(self._text)
            self._vectors = preprocessing.word2tfidf(self._text)
Example #6
0
    def printBibleReferences(self, text):
        tokens = preprocessing.tokenize(text)
        numTokens = len(tokens)
        refStr = ""
        # for name in self.bibleBookNames:
        #     print("name: "+name)
        for tokenI in range(numTokens):
            token = tokens[tokenI]
            # print("token: "+token)
            if token in self.bibleBookNames and tokenI < numTokens - 1:
                nextToken = tokens[tokenI + 1]
                # a chapter reference may start with a digit or a roman numeral (i, x, v)
                if nextToken[0].isdigit() or nextToken[0] in ('i', 'x', 'v'):

                    if tokenI > 0:
                        refStr += tokens[tokenI - 1] + " "
                    refStr += token + " "
                    refStr += nextToken + " "
                    if tokenI < numTokens - 2:
                        refStr += tokens[tokenI + 2] + " "
                    refStr += "\n"
                else:
                    print("unlikely ref: " + token)
        print(refStr)
        return refStr
Example #7
0
    def test_tokenize(self):
        text = "This is an awesome text"
        tokens = tokenize(text)

        self.assertEqual(len(tokens), 5)
        self.assertIs(type(tokens), list)
        for token in tokens:
            self.assertIs(type(token), spacy.tokens.token.Token)
Example #8
0
def preprocess_doc(doc):
    doc = preprocessing.tokenize(doc)
    doc = preprocessing.remove_punctuation(doc)
    doc = preprocessing.remove_numbers(doc)
    doc = preprocessing.lower(doc)
    doc = preprocessing.remove_common_stopwords(doc)
    doc = preprocessing.clean_doc(doc)
    return doc
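A hedged example of calling the pipeline above; it assumes preprocessing.tokenize returns a token list and that each later preprocessing.* step accepts and returns such a list. The sample text and the sketched output are made up.

    raw = "In 2021, 3 reviewers praised the tokenizer!"
    tokens = preprocess_doc(raw)
    # punctuation, numbers, casing and stopwords are stripped,
    # leaving something like ['reviewers', 'praised', 'tokenizer']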
Example #9
0
    def test_remove_stop_words(self):
        text = "This is an awesome text"
        tokens = tokenize(text)
        clean_tokens = remove_stop_words(tokens)

        self.assertEqual(len(clean_tokens), 2)
        self.assertIs(type(clean_tokens), list)
        for token in clean_tokens:
            self.assertIs(type(token), spacy.tokens.token.Token)
Example #10
0
    def line_to_instance(self, line: str) -> LabeledTokenizedDocument:
        m = json.loads(line)

        result = LabeledTokenizedDocument()
        result.id = m["name"]
        result.label = 'Delta' if m['delta'] else 'AH-1'
        result.tokens = preprocessing.tokenize(m["body"])

        return result
Example #11
0
    def test_lemmatization(self):
        text = "seems like awesome text"
        tokens = tokenize(text)
        lemmas = lemmatization(tokens)

        self.assertEqual(len(lemmas), len(tokens))
        self.assertIs(type(lemmas), list)
        for token in lemmas:
            self.assertIs(type(token), str)
Example #12
0
    def line_to_instance(self, line: str) -> LabeledTokenizedDocument:
        m = json.loads(line)

        result = LabeledTokenizedDocument()
        result.id = m["name"]
        result.label = 'AH' if m["violated_rule"] == 2 else 'None'
        text_without_quotations = JSONPerLineDocumentReader.replace_quoted_text_with_special_token(m["body"])

        result.tokens = preprocessing.tokenize(text_without_quotations)

        return result
Example #13
0
    def advQueryProcessing(self, engine):
        """Processing based on self.advOptions values
            options is a dictionary of following terms
            "allterms"
            "songname"
            "songend"
            "artist",
            "artistend"
            "genre"
            "pos"
            "from"
            "to"
            Arguments:
                engine -- object to which the Query object belongs
        """
        print("******Advanced query processing******")
        starttime = datetime.datetime.now()
        print("Start time", str(starttime))
        print("Query---", self.queryText)

        self.queryEngine = engine
        q_tokens = tokenize(self.queryText)  #tokenize queryText
        qs_tokens, qs_dict = stem(q_tokens)
        # if len(qs_tokens) == 0:   ## uncomment to disallow blank query text
        #     engine.noResult = True
        #     return

        # all the documents satisfying the criteria
        querydocs = self.getAdvResults(qs_tokens)

        #Ranking based on score
        songHeap = []
        for doc in querydocs:
            if isinstance(querydocs, list):  # case where score is not available
                song = selSong(doc, {})
            else:
                song = selSong(doc, querydocs[doc])  # calculates score one document at a time
            songHeap.append(song)
        if len(songHeap) == 0:
            #engine.noResult = True
            self.noResult = True
            printDuration(starttime)
            return
        self.queryResult = songHeap
        songHeap.sort(reverse=True)

        #Fetching song details
        count = engine.displayLength
        self.nextSongListPrep(0, count)

        # retrieval time
        printDuration(starttime)
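A hedged illustration of the advOptions dictionary described in the docstring above; the key names come from that docstring, while the values and the query/engine objects are invented for illustration.

    query = Query()                      # hypothetical construction
    query.queryText = "yellow submarine"
    query.advOptions = {
        "allterms": False,
        "songname": "yellow",
        "songend": False,
        "artist": "beatles",
        "artistend": False,
        "genre": "rock",
        "pos": False,
        "from": 1960,
        "to": 1970,
    }
    query.advQueryProcessing(engine)     # engine supplies displayLength and receives the results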
Example #14
0
def cli(texts_file, model_file, output_file, batch_size):
    texts: List[str] = json.load(texts_file)
    _, tokens_ids = tokenize(texts)

    model = BertBinaryClassifier().cuda()
    model.load_state_dict(torch.load(model_file))
    predictions, logits = predict(model, tokens_ids, batch_size)

    json.dump({
        "predictions": predictions,
        "logits": logits
    }, output_file, **JSON_ARGS)
Example #15
0
def build_indexes():
    """Indexing the documents and updating into db"""

    print("Building all indexes ")
    conn = create_connection(dbname)
    cur = conn.cursor()
    cur.execute(
        "CREATE TABLE IF NOT EXISTS terms(term TEXT PRIMARY KEY,cfreq INTEGER,dfreq INTEGER)"
    )
    cur.execute(
        """CREATE TABLE IF NOT EXISTS termdoc(term INTEGER,docid INTEGER,
                    tfreq INTEGER,dscore REAL,posList TEXT,
                    FOREIGN KEY (term)REFERENCES terms(term),
                    FOREIGN KEY (docid)REFERENCES songs(id),
                    PRIMARY KEY (term, docid))""")
    cur.execute("""CREATE TABLE IF NOT EXISTS permArtist(key Text,docid Text,
                    FOREIGN KEY (docid)REFERENCES songs(id),
                    PRIMARY KEY (key))""")
    cur.execute("""CREATE TABLE IF NOT EXISTS permName(key Text,docid Text,
                    FOREIGN KEY (docid)REFERENCES songs(id),
                    PRIMARY KEY (key))""")
    cur.execute(
        """CREATE TABLE IF NOT EXISTS genreDoc(genre Text,docid INTEGER,
                    FOREIGN KEY (docid)REFERENCES songs(id),
                    PRIMARY KEY (genre, docid))""")
    cur.execute("""CREATE TABLE IF NOT EXISTS yearDoc(year Text,docid INTEGER,
                    FOREIGN KEY (docid)REFERENCES songs(id),
                    PRIMARY KEY (year, docid))""")
    cur.execute('SELECT * FROM songs')
    for row in cur:
        tokens = tokenize(row[5])
        if not tokens:
            continue
        stem_tokens, term_dict_local = stem(tokens)  #PorterStemmer
        updateTermTable(row[0], term_dict_local, conn)
        updateGenreYear(row[4], row[2], row[0], conn)
        permuterm(row[3], row[0], conn, "permArtist")  # perm artist
        permuterm(row[1], row[0], conn, "permName")  #Name
    print("calculating tfidf")
    calculate_Tf_Idf(cur, conn)
    now = datetime.datetime.now()
    print(str(now))
    conn.commit()
    conn.close()

    now = datetime.datetime.now()
    print(str(now))
Example #16
0
    def preprocess(self, document, info=[]):
        document = to_unicode(document, info)
        words = tokenize(document)

        if self.split:
            words = split(words)

        if self.lower:
            words = (word.lower() for word in words)

        if self.remove_stops:
            words = remove_stops(words, STOPS)

        def include(word):
            return len(word) >= self.min_len and len(word) <= self.max_len
        words = (word for word in words if include(word))
        return words
Example #17
0
    def encoder_transform(self):
        logger.debug('Encode the words with the fitted encoder')

        X_sentiment = []
        for sentence in self.sentences_list:
            words = list(preprocessing.tokenize(sentence))
            if len(words) >= 3:
                # slicing never raises here, so no try/except guard is needed
                words = words[:self.max_length]
                words_idx = np.array(self.encoder.transform(words))
                arr = np.full(self.max_length, 0)
                arr[:len(words)] = words_idx
                X_sentiment.append(arr)
        self.X_sentiment = np.array(X_sentiment)
        self.X_sentiment_label = np.array(self.attitude_list)
Example #18
0
    def queryProcessing(self, engine):
        """query processing for basic Search
            Arguments:
                engine -- object to which the Query object belongs
        """
        self.queryEngine = engine
        print("*********Query processing********")
        starttime = datetime.datetime.now()
        print("Start time", str(starttime))
        print("Query---", self.queryText)

        q_tokens = tokenize(self.queryText)  #tokenize queryText
        qs_tokens, qs_dict = stem(q_tokens)

        if len(qs_tokens) == 0:
            #engine.noResult = True
            self.noResult = True  #choose one
            return

        # all the documents that contain any of the terms
        querydocs = self.getIndexes(qs_tokens)

        #Ranking based on score
        songHeap = []
        for doc in querydocs:  #scoring document at a time
            song = selSong(doc, querydocs[doc])
            songHeap.append(song)

        if len(songHeap) == 0:
            #engine.noResult = True
            self.noResult = True
            return

        self.queryResult = songHeap
        songHeap.sort(reverse=True)
        #Fetching song details
        # a top search returns only the best hit; otherwise show one display page
        count = 1 if self.topSearch else engine.displayLength
        self.nextSongListPrep(0, count)

        # retrieval time
        printDuration(starttime)
Example #19
0
    def preprocess(self, document, info=[]):
        document = preprocessing.to_unicode(document, info)
        words = preprocessing.tokenize(document)

        if self.split:
            words = preprocessing.split(words)

        if self.lower:
            words = (word.lower() for word in words)

        if self.remove_stops:
            words = preprocessing.remove_stops(words, preprocessing.FOX_STOPS)
            words = preprocessing.remove_stops(words, preprocessing.JAVA_RESERVED)

        def include(word):
            return len(word) >= self.min_len and len(word) <= self.max_len
        words = (word for word in words if include(word))
        return words
Example #20
0
    def preprocess(self, document, info=[]):
        document = preprocessing.to_unicode(document, info)
        words = preprocessing.tokenize(document)

        if self.split:
            words = preprocessing.split(words)

        if self.lower:
            words = (word.lower() for word in words)

        if self.remove_stops:
            words = preprocessing.remove_stops(words, preprocessing.FOX_STOPS)
            words = preprocessing.remove_stops(words, preprocessing.JAVA_RESERVED)

        def include(word):
            return len(word) >= self.min_len and len(word) <= self.max_len
        words = (word for word in words if include(word))
        return words
Example #21
0
    def prepare_data(self):
        print("Prepare data...")

        # Convert the text to embeddings
        if self._type == "embeddings":
            self._data = []

            for sentence in self._text:
                self._data.append(preprocessing.tokenize(sentence))

            self._data = preprocessing.delete_stopwords(self._data)

            self._vectors = np.array(
                preprocessing.word2embeddings(self._data, self._embedding,
                                              self._vocabulary,
                                              self._concatenate))
        else:
            self._text = preprocessing.apply_stemmer_stopword(self._text)
            self._vectors = preprocessing.word2tfidf(self._text)
Example #22
0
def preprocess_doc(row, context=True):
    citation_sentence = str(row['context'])
    if lda_params['markers']:
        citation_sentence = preprocessing.remove_markers(citation_sentence)
    if lda_params['tokenize']:
        citation_sentence = preprocessing.tokenize(citation_sentence)
    if lda_params['pos_tags'] != ():
        tags = preprocessing.lower(
            preprocessing.filter_pos_tags(citation_sentence,
                                          tags=lda_params['pos_tags']))
    if lda_params['punctuation']:
        citation_sentence = preprocessing.remove_punctuation(citation_sentence)
    if lda_params['numbers']:
        citation_sentence = preprocessing.remove_numbers(citation_sentence)
    citation_sentence = preprocessing.lower(citation_sentence)
    if lda_params['bigrams']:
        bigrams = preprocessing.get_bigrams(citation_sentence)
    if lda_params['trigrams']:
        trigrams = preprocessing.get_trigrams(citation_sentence)
    if lda_params['common_stopwords']:
        citation_sentence = preprocessing.remove_common_stopwords(
            citation_sentence)
    if lda_params['custom_stopwords']:
        citation_sentence = preprocessing.remove_custom_stopwords(
            citation_sentence)
    if lda_params['pos_tags'] != ():
        citation_sentence = preprocessing.filter_pos(citation_sentence, tags)
    citation_sentence = preprocessing.clean_doc(citation_sentence)
    if lda_params['bigrams']:
        bigrams = preprocessing.filter_n_grams(bigrams, citation_sentence)
    if lda_params['trigrams']:
        trigrams = preprocessing.filter_n_grams(trigrams, citation_sentence)
    if lda_params['bigrams'] and not lda_params['trigrams']:
        citation_sentence = citation_sentence + bigrams
    if lda_params['trigrams'] and not lda_params['bigrams']:
        citation_sentence = citation_sentence + trigrams
    if lda_params['bigrams'] and lda_params['trigrams']:
        citation_sentence = citation_sentence + bigrams + trigrams
    if lda_params['lemmatize']:
        citation_sentence = preprocessing.lemmatize(citation_sentence)
    citation_sentence = preprocessing.clean_doc(citation_sentence)
    return citation_sentence
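A hedged sketch of the lda_params dictionary the function above reads; the key names are exactly the ones referenced in the branches, while the values are arbitrary example settings.

    lda_params = {
        'markers': True,
        'tokenize': True,
        'pos_tags': ('NN', 'NNS'),  # an empty tuple () disables POS filtering
        'punctuation': True,
        'numbers': True,
        'common_stopwords': True,
        'custom_stopwords': False,
        'bigrams': True,
        'trigrams': False,
        'lemmatize': True,
    }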
Example #23
0
def prepare_train_data() -> Tuple[List[List[str]], List[List[str]]]:
    import pandas as pd

    inputs = []
    labels = []

    if not os.path.isdir('ner-uk'):
        import subprocess

        subprocess.run(['git',
                        'clone',
                        'https://github.com/lang-uk/ner-uk'])

    for root, _, files in os.walk('ner-uk/data/'):
        for file in files:
            path = os.path.join(root, file)
            try:
                path_without_ext, ext = os.path.splitext(path)
                ann_path = path_without_ext + '.ann'

                if os.path.isfile(path) \
                        and ext == '.txt' and not path.endswith('.tok.txt') \
                        and os.path.isfile(ann_path):
                    with open(path, 'r') as f:
                        content = f.read()

                    ann = pd.read_csv(ann_path,
                                      sep='\t',
                                      header=None,
                                      names=['Index', 'Type', 'Snippet'])
                    ann[['Type', 'Start', 'End']] = ann['Type'].str.split(' ', expand=True)
                    ann[['Start', 'End']] = ann[['Start', 'End']].astype(int)

                    sub_inputs, sub_labels = tokenize(content, ann)

                    inputs.extend(sub_inputs)
                    labels.extend(sub_labels)
            except Exception as ex:
                print(f'{path} -> {ex}')

    return inputs, labels
Example #24
0
    def build_from_text_file(self,
                             text_file: str,
                             max_lines: int = -1,
                             top_k: int = -1) -> None:
        """
        Builds vocabulary from a (possibly large) text file. Each line is tokenized (see "tokenize()"
        in "preprocessing.py").
        :param text_file: Input plain text file
        :param max_lines: Maximum lines read (if negative, no limit)
        :param top_k: Final vocabulary size (if negative, no limit)
        """
        from preprocessing import tokenize

        line_counter = 0

        counter = collections.Counter()

        with open(text_file) as f:
            for line in f:
                counter.update(tokenize(line))
                line_counter += 1
                if line_counter % 1000 == 0:
                    print("%d lines processed" % line_counter)
                if 0 < max_lines <= line_counter:
                    print("Maximum line limit (%d) reached, reading stopped" %
                          max_lines)
                    break

        # truncate to the top K words if required
        if top_k > 0:
            self.__vocabulary = counter.most_common(top_k)
        else:
            self.__vocabulary = counter.most_common()

        # make sure it's a list of tuples
        assert isinstance(self.__vocabulary, list)
        assert isinstance(self.__vocabulary[0], tuple)
        assert len(self.__vocabulary[0]) == 2
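A minimal usage sketch for the method above, assuming it lives on a vocabulary-builder class; the class name Vocabulary and the corpus path are made up.

    vocab = Vocabulary()
    # read at most 100000 lines and keep the 20000 most frequent tokens
    vocab.build_from_text_file('corpus.txt', max_lines=100000, top_k=20000)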
Example #25
0
    def file_to_instance(self, file_name: str) -> LabeledTokenizedDocument:
        relative_name = file_name.split('/')[-1]
        label = relative_name.split('_', 2)[1]
        file_id = relative_name.split('_', 2)[2]

        assert label in ['ah', 'delta']

        result = LabeledTokenizedDocument()
        result.label = label
        result.id = relative_name

        # read all lines first
        lines = []
        with open(file_name) as f:
            for line in f:
                lines.append(line)

        # remove last comment if this is AH
        if 'ah' == label:
            lines = lines[:-1]

        # here we can adjust the total size of the context; now there are 3 last comments
        # lines = lines[-2:]  # would leave only last two ones
        # print(len(lines))

        for line in lines:
            m = json.loads(line)
            result.tokens.append('___' + m["name"] + '___start__')
            result.tokens.extend(
                preprocessing.tokenize(JSONPerLineDocumentReader.replace_quoted_text_with_special_token(m["body"])))

        # print(result.id)
        # print(result.label)
        # print(result.tokens)

        return result
Example #26
0
def main() -> NoReturn:
    with open(sys.argv[1], 'r') as f:
        text = f.read()

    inputs, _ = tokenize(text)
    encoded_inputs = encode_inputs(inputs, remove_too_long=False)

    model = create_model()

    checkpoint_file_path = os.environ.get('CHECKPOINT_FILE',
                                          os.path.join(os.environ.get('CHECKPOINTS_DIR', '.'),
                                                       CHECKPOINT_FILE_NAME))
    model.load_weights(checkpoint_file_path)
    print(f'Weights loaded from: {checkpoint_file_path}')

    classes = model.predict_classes(encoded_inputs,
                                    batch_size=BATCH_SIZE,
                                    verbose=1)

    for i in range(len(inputs)):
        for j in range(len(inputs[i])):
            token = inputs[i][j]
            if j < len(classes[i]) - 1:
                cl = classes[i][j + 1]
            else:
                cl = 0

            if token.startswith('##'):
                print(token[2:], end='')
            else:
                if j > 0:
                    print(' ', end='')
                if cl > 0:
                    print(f'[{CLASSES[cl]}] ', end='')
                print(token, end='')
        print('\n', end='')
Example #27
0
# Load file
with open('intents.json', 'r', encoding='utf-8') as f:
    intents = json.load(f)

# preprocessing word
all_words = []
tags = []
xy = []
# loop through each sentence in our intents patterns
for intent in intents['intents']:
    tag = intent['tag']
    # add to tag list
    tags.append(tag)
    for pattern in intent['patterns']:
        # tokenize each word in the sentence
        w = tokenize(pattern)
        # add to our words list
        all_words.extend(w)
        # add to xy pair
        xy.append((w, tag))

ignore_words = ['?', '.', '!']
all_words = [w for w in all_words if w not in ignore_words]
all_words = sorted(set(all_words))
tags = sorted(set(tags))

# Create training set
X_train = []
y_train = []
for (pattern_sentence, tag) in xy:
    # X: bag of words for each pattern_sentence
    # (bag_of_words is assumed to come from the same helper module as tokenize)
    bag = bag_of_words(pattern_sentence, all_words)
    X_train.append(bag)
    # y: the class label is the index of the tag
    y_train.append(tags.index(tag))

X_train = np.array(X_train)
y_train = np.array(y_train)
Example #28
0
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--path", help="address of file", type=str)

    parser.add_argument("--batch_size",
                        help="batch_size",
                        type=int,
                        default=12)
    parser.add_argument("--embedding_size",
                        help="dimension of vectors",
                        default=300,
                        type=int)
    parser.add_argument("--lr", type=float, help="learning rate", default=1e-5)
    parser.add_argument("--decay", help="L2 loss", type=float, default=1e-2)
    parser.add_argument("--iterator",
                        type=int,
                        help="number of iteration",
                        default=10)

    args = parser.parse_args()

    data = load_pickle(args.path)

    context = data["context"]
    question = data["question"]
    answer = data["answer"]

    cxt = []
    query = []
    ans = []

    for c, q, a in zip(context, question, answer):
        cxt.append(c.lower())
        query.append(q.lower())
        ans.append(a.lower())

    cxt = tokenize(cxt)
    query = tokenize(query)
    ans = tokenize(ans)

    word2idx, idx2word = make_dictionary(cxt, query, ans)

    query_ix = convert2idx(query, word2idx)
    context_ix = convert2idx(cxt, word2idx)
    answer_ix = convert2idx(ans, word2idx)

    ##preprocess data
    q_data, c_data, a_data, start_index, end_index = preprocess_data(
        query_ix, context_ix, answer_ix)

    train_data = makeBatch(q_data, c_data, start_index, end_index)

    train_loader = DataLoader(train_data,
                              collate_fn=pad_sequence,
                              batch_size=args.batch_size)
    ################################################################################################

    ## train
    model = BIDAF(
        embedder=WordEmbedder(args.embedding_size, len(word2idx)),
        encoder=Encoder(args.embedding_size, args.embedding_size),
        attention_flow=AttentionFlow(),
        modeling_layer=ModelingLayer(d_vector=args.embedding_size,
                                     bidirectional=True),
        output_layer=OutputLayer(d_vector=args.embedding_size)).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           weight_decay=args.decay)

    train(model, args.iterator, optimizer, criterion, train_loader)
Example #29
0
def buildGraph():
	
	database  = GraphDatabase()
	name = '20_class'
	filename = '/../vol/tensusers/nwidmann/processedDocuments/'+ name +'.pkl'
	minFrequency = 10 

	if not os.path.exists(filename):
		print 'Load Documents'
		data = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))
		#pdb.set_trace()
		#data = fetch_20newsgroups(categories=['rec.sport.baseball', 'rec.sport.hockey'], remove=('headers', 'footers', 'quotes'))
		#data = fetch_20newsgroups(categories=['rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey'], remove=('headers', 'footers', 'quotes'))
		#data = fetch_20newsgroups(categories=['talk.politics.guns', 'rec.motorcycles'], remove=('headers', 'footers', 'quotes'))
		categories = data.target_names
		data = pd.DataFrame({'text': data['data'], 'category': data['target']})

		for index, category in enumerate(categories):
			print 'Category: ' + category + '   N: ' + str(len(data[data.category==index]))

		print 'Preprocessing'
		docs = data.text.tolist()

		stopwords = getStopwords()
		vectorizer = CountVectorizer(min_df=minFrequency, stop_words=stopwords, tokenizer=tokenize, max_df=0.5, max_features=7000)
		wordCounts = vectorizer.fit_transform(docs)
		vocabulary = vectorizer.get_feature_names()

		#pdb.set_trace()
		print('Number of Unique words: %d' % len(vocabulary))
		print('Minimal Frequency: %d' % minFrequency)

		docsSplitInSentences = [sent_tokenize(doc) for doc in docs]
		tokenizedCollection = [[tokenize(sentence) for sentence in sentences] for sentences in docsSplitInSentences]

		cleanedTokens = [[[lemmatizeAll(word) for word in sentence if word in vocabulary] for sentence in doc] for doc in tokenizedCollection]
		cleanedTokens = [filter(None, doc) for doc in cleanedTokens]
		data['sentences'] = cleanedTokens
		vocabulary = generateVocabulary(data.sentences.tolist())
		
		fullCleanText = [' '.join(sum(post, [])) for post in data.sentences.tolist()]
		data['cleanText'] = fullCleanText

		tfIdf = TfidfVectorizer(vocabulary=vocabulary, tokenizer=tokenize)
		docs = data.cleanText.tolist()
		tfidf_vec = tfIdf.fit_transform(docs)
		data['tfIdf'] = [list(elem) for elem in tfidf_vec.toarray()]

		tf = CountVectorizer(vocabulary=vocabulary, tokenizer=tokenize)
		tf_vec = tf.fit_transform(docs)
		data['tf'] = [list(elem) for elem in tf_vec.toarray()]

		# Remove posts with no features
		for index in range(len(data)):
			tfIdfSum = np.sum(data.loc[index, 'tfIdf'])
			if tfIdfSum==0:
				print index
				data.drop(index, inplace=True)
		data.index = range(len(data))

		data.to_pickle(filename)

	
	data = pd.read_pickle(filename)
	vocabulary = generateVocabulary(data.sentences.tolist())
	#data.sentences = data.sentences[0:70]
	#pdb.set_trace()

	#toydata = [[0, [['This','is','it','.'],['it','.']]], [1,[['it','is','here','is','.']]]]
	#data = pd.DataFrame(toydata, columns=['category', 'sentences'])

	print 'Graph Construction'
	startNode = database.createFeatureNode(-1,'$Start$')
	endNode = database.createFeatureNode(len(vocabulary), '$End$')
	for index, text in enumerate(data.sentences):
		print 'Document' + str(index)
		label = data.category.loc[index]
		docNode = database.createDocumentNode(index, label)
		currNodes = []
		for sentence in text:
			preceedingWord = startNode
			database.createWeightedRelation(startNode,docNode, 'is_in')
			for ind, word in enumerate(sentence):
				exists = len(list(database.graph.find('Feature', property_key='word', property_value=word))) > 0
				if not exists:
					wordID = vocabulary[word]
					wordNode = database.createFeatureNode(wordID, word)
				else:
					wordNode = database.getFeatureNode(word)
				database.createWeightedRelation(wordNode, docNode, 'is_in')
				database.createWeightedRelation(preceedingWord, wordNode, 'followed_by')
				preceedingWord = wordNode
				if ind==len(sentence)-1:
					database.createWeightedRelation(wordNode, endNode, 'followed_by')
					database.createWeightedRelation(endNode, docNode, 'is_in')

	print 'Normalize relationships'
	docNodes = database.getNodes('Document')
	database.normalizeRelationships(docNodes, 'is_in')
	
	featureNodes = database.getNodes('Feature')
	database.normalizeRelationships(featureNodes, 'followed_by')

	print 'Create Matrix'
	docMatrix = identity(len(docNodes))
	featureMatrix = database.getMatrix(featureNodes)
	featureDocMatrix = database.getMatrix(featureNodes, docNodes, 'is_in')
	docAll = np.concatenate((docMatrix, np.transpose(featureDocMatrix)), axis=1)
	featureAll = np.concatenate((featureDocMatrix, featureMatrix), axis=1)
	combinedMatrix = np.concatenate((docAll, featureAll))
	print combinedMatrix.shape
	np.save('/../vol/tensusers/nwidmann/matrices/' + name, combinedMatrix)


	print 'Set Context Similarity'
	database.cypherContextSim()
	contextSim = database.getMatrix(featureNodes, relation='related_to', propertyType = 'contextSim')
	np.save('/../vol/tensusers/nwidmann/matrices/' + name + '_contextSim', contextSim)
Example #30
0
def unique_nouns(nouns):
  unique_nouns = set([])
  for noun in nouns:
    unique_nouns.update(tokenize(noun))
  return unique_nouns
Example #31
0
File: main.py Project: musicjae/NLP
import numpy as np
import preprocessing
import similarity
import matplotlib.pyplot as plt
from nltk.corpus import stopwords

f = open("plato.txt", 'r')
text = f.read()

## Preprocessing ##

pre_text, w2i, i2w = preprocessing.tokenize(text)

## Co-occurrence matrix ##

mat = preprocessing.create_co_matrix(pre_text, vocab_size=len(w2i))

## Check the similarity between word vectors ##

c1 = mat[w2i['plato']]
c2 = mat[w2i['socrates']]


print(f'Similarity: {similarity.cos_sim(c1,c2)}')
print('\nSimilarity ranking: ')
#print(similarity.most_similar('socrates',w2i,i2w,mat,top=7))

W = similarity.ppmi(mat)
np.set_printoptions(precision=3)  # 3 significant digits
#print('\nSimilarity ranking based on PPMI')
#print(similarity.most_similar('socrates',w2i,i2w,W,top=7))  # this takes a while, so be patient
Example #32
0
# In[4]:

words, word_to_id, id_to_word = ps.get_words(data_train + data_test + data_val,
                                             max_features)
class_set, cls_to_id, id_to_cls = ps.get_classes(label_val)

max_features = len(words)

# In[5]:

max_features

# In[6]:

X_train, y_train = ps.tokenize(data_train, label_train, word_to_id, cls_to_id,
                               len(class_set))
X_test, y_test = ps.tokenize(data_test, label_test, word_to_id, cls_to_id,
                             len(class_set))
X_val, y_val = ps.tokenize(data_val, label_val, word_to_id, cls_to_id,
                           len(class_set))

# In[7]:

print(max(map(len, X_train)))
print(max(map(len, X_test)))
print(max(map(len, X_val)))

# In[8]:

if ngram_range > 1:
    maxlen = 100
Example #33
0
words_df
# at this point words_df is just a column of review texts and their associated scores
# preprocessing.preprocessForSentimentAnalsis(words_df['reviewText'][4], preprocessing.stopwords,preprocessing.lemmatizer);
# words_df['documents']=words_df['reviewText'].map(preprocessing.preprocess
# words_df = words_df[words_df['documents'] != False]
#documents column now is just the preprocessed words stripped of fluff, ready to be turned into a sparse matrix
#First we just need a list of all of the words

#now we construct our CountVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# generate a sparse document-term matrix from the preprocessed documents
words_df['documents'] = [
    " ".join(preprocessing.tokenize(doc)).split(" ")
    for doc in words_df['documents']
]
all_words = vectorization.getAllWordsFromDF(words_df, 'documents')
docList = [" ".join(doc) for doc in words_df['documents']]

# docList = vectorization.ListToString(words_df,'documents')
v, sparceVector = vectorization.vectorize(CountVectorizer, all_words, docList)
sv_array = sparceVector.toarray()

#now we just need to form our labels in whatever way we want them to
words_df["pos_neg"] = words_df['overall'].map(vectorization.binarizeRating)
import sklearn
import numpy as np
xTrain, xTest, yTrain, yTest = sklearn.model_selection.train_test_split(
    sv_array, list(words_df['pos_neg']), test_size=.3)