def preprocess(self, docs):
    bigram = Phrases(docs, min_count=self._min_count)
    for idx in range(len(docs)):
        for token in bigram[docs[idx]]:
            if '_' in token:
                # Token is a bigram, add to document.
                docs[idx].append(token)

    # Create a dictionary representation of the documents.
    dictionary = Dictionary(docs)

    # Filter out words that are too rare or too frequent.
    dictionary.filter_extremes(no_below=self._no_below, no_above=self._no_above)

    # Bag-of-words representation of the documents.
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    self._corpus = corpus
    self._dictionary = dictionary

    if self._verbose:
        print('Number of unique tokens: %d' % len(dictionary))
        print('Number of documents: %d' % len(corpus))
def main(args):
    logger = logging.getLogger(__name__)
    logger.info('Preprocessing ' + args.input)

    results = []
    stopword_list = build_stop_words()
    df = pd.DataFrame([])
    docs = np.array([])

    with timer("Load & Clean"):
        with open(f"{args.dir}/{args.input}", 'r', newline='', encoding='utf-8') as f:
            reader = csv.reader(f, delimiter=',')
            for row in reader:
                abstract_col = clean_str(row[8], stopword_list)
                # title, abstract, publish_time, authors, url
                results.append([row[3], abstract_col, row[9], row[10], row[17]])
        df = pd.DataFrame(results[1:], columns=results[0])

    with timer("Drop NA & Duplicates"):
        df = df.drop_duplicates(subset='abstract', keep='first')
        df = df.dropna(subset=["abstract"])

    with timer("Drop Non-English Papers"):
        df = check_language(df)

    with timer("Vectorize abstract column"):
        docs = np.array(list(sent_to_words(df.abstract)))

    with timer("Add bigrams to docs"):
        # Add bigrams to docs (only ones that appear 20 times or more).
        bigram = Phrases(docs, min_count=20)
        for idx in range(len(docs)):
            for token in bigram[docs[idx]]:
                if '_' in token:
                    # Token is a bigram, add to document.
                    docs[idx].append(token)

    with timer("Export results"):
        df.to_csv(f"{args.dir}/df_cleaned.csv", encoding='utf-8', index=False)
        np.save(f"{args.dir}/docs.npy", docs)
def preprocess(sentences, spacy_model, stopwords,
               allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    dataset = []
    for sentence in sentences:
        words = [token.lemma_ for token in spacy_model(sentence)
                 if token.pos_ in allowed_postags]
        words = simple_preprocess(' '.join(words), deacc=True)
        words = [word for word in words if word not in stopwords]
        dataset.append(words)

    # Build the bigram and trigram models
    bigram = Phrases(dataset, min_count=5, threshold=100)  # higher threshold, fewer phrases.
    # trigram = Phrases(bigram[words], threshold=100)

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = Phraser(bigram)
    # trigram_mod = Phraser(trigram)

    bigrams = [bigram_mod[sentence] for sentence in dataset]
    return bigrams
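# Illustrative usage sketch for the spaCy-based preprocess() above, not part of
# the original code. It assumes the module-level imports the function relies on
# (simple_preprocess, Phrases, Phraser) are in place, that spaCy's
# "en_core_web_sm" model is installed, and that NLTK stopwords are downloaded.
if __name__ == "__main__":
    import spacy
    from nltk.corpus import stopwords as nltk_stopwords

    nlp = spacy.load("en_core_web_sm")
    sample = ["Machine learning models learn patterns from data.",
              "Deep learning models need very large data sets."]
    print(preprocess(sample, nlp, set(nltk_stopwords.words("english"))))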
def bigphrase_tfidf_feats(dataset):
    corpus = preprocessing_txt(dataset)
    lemmatized_sent = []
    for each_sent in nlp.pipe(corpus, batch_size=50, n_threads=-1):
        if each_sent.is_parsed:
            # Keep lemmas, dropping punctuation, whitespace, stop words and numbers.
            res = [
                tok.lemma_ for tok in each_sent
                if not (tok.is_punct or tok.is_space or tok.is_stop or tok.like_num)
            ]
            lemmatized_sent.append(res)
        else:
            lemmatized_sent.append(None)

    bigram = Phraser(Phrases(lemmatized_sent))
    bigram_lem = list(bigram[lemmatized_sent])

    parsed = []
    for k in range(len(bigram_lem)):
        joined = ' '.join(bigram_lem[k])
        parsed.append(joined)
    return parsed, bigram_lem
def build_gensim_model(features, num_features=100, min_word_count=100, context=5,
                       downsampling=1e-3, verbose=True):
    """Train a Word2Vec model on bigram-transformed token lists."""
    from gensim.models import Phrases
    from gensim.models import word2vec
    import time
    import logging
    import multiprocessing

    start = time.time()

    if verbose:
        # Let's make sure that we are logging; this will take a long time
        # and it's good to get updates.
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.INFO)

    # Transforming to bigram representation
    bigram_transformer = Phrases(features)

    if verbose:
        print("Training model...")

    # Initialize and train the model
    model = word2vec.Word2Vec(bigram_transformer[features],
                              workers=multiprocessing.cpu_count(),
                              size=num_features,
                              min_count=min_word_count,
                              window=context,
                              sample=downsampling)

    # We don't plan on training the model any further, so calling
    # init_sims will make the model more memory efficient by normalizing the
    # vectors in-place.
    model.init_sims(replace=True)

    return model
def bow_corpus(original_corpus):
    docs = list(original_corpus)

    # Tokenize the documents.
    # Split the documents into tokens.
    tokenizer = RegexpTokenizer(r'\w+')
    for idx in range(len(docs)):
        docs[idx] = docs[idx].lower()  # Convert to lowercase.
        docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

    # Remove numbers, but not words that contain numbers.
    docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

    # Remove words that are only one or two characters long.
    docs = [[token for token in doc if len(token) > 2] for doc in docs]

    # Lemmatize the documents.
    lemmatizer = WordNetLemmatizer()
    docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

    # Compute bigrams.
    from gensim.models import Phrases

    # Add bigrams and trigrams to docs (only ones that appear 20 times or more).
    bigram = Phrases(docs, min_count=20)
    for idx in range(len(docs)):
        for token in bigram[docs[idx]]:
            if '_' in token:
                # Token is a bigram, add to document.
                docs[idx].append(token)

    # Remove rare and common tokens.
    # Create a dictionary representation of the documents.
    dictionary = Dictionary(docs)

    # Filter out words that occur in fewer than 2 documents, or in more than 50% of the documents.
    dictionary.filter_extremes(no_below=2, no_above=0.5)
    print(docs[0])

    # Bag-of-words representation of the documents.
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    return corpus, dictionary
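# Minimal usage sketch for bow_corpus() above; the sample strings are
# illustrative. It assumes the module-level imports the function relies on
# (RegexpTokenizer, WordNetLemmatizer, Dictionary) are present and the NLTK
# wordnet corpus is downloaded.
if __name__ == "__main__":
    sample_corpus = [
        "Human machine interface for lab abc computer applications.",
        "A survey of user opinion of computer system response time.",
    ]
    corpus, dictionary = bow_corpus(sample_corpus)
    print(corpus)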
def __init__(self, num_topics=100, min_word_count=20, top_most_common_words=10,
             min_doc_length=40, max_doc_length=1000, random_state=None):
    self.num_topics = num_topics
    self.min_word_count = min_word_count
    self.top_most_common_words = top_most_common_words

    assert max_doc_length > min_doc_length, \
        "max_doc_length must be greater than min_doc_length"

    self.min_doc_length = min_doc_length
    self.max_doc_length = max_doc_length
    self.random_state = random_state

    # natural language processing
    self.stop_words = self.getEnglishStopWords()
    self.bigramizer = Phrases()
def get_topic_extraction_glda(self, message, id):
    # self.load_lda_topic_model()
    tf_vectorizer = CountVectorizer(max_df=1, min_df=1,
                                    vocabulary=self.glda_tf_feature_names)
    docs = []
    logger.propagate = False

    message = re.sub('\n', ' ', message)
    docs = self.message_corpus(message)

    print('Building BiGrams from the message...')
    bigram = Phrases(docs, min_count=2, threshold=2, delimiter=b' ')
    bigram_phraser = Phraser(bigram)
    texts = [bigram_phraser[line] for line in docs]
    bg_message = ' '.join(texts[0])

    tf = tf_vectorizer.fit_transform([bg_message])
    doc_topic = self.glda.transform(tf)

    self.config_dict = dict(self.config.items('TOPIC_LABEL'))
    list_topic_names = eval(self.config_dict['list_topic_names'])

    document_topics = [(list_topic_names[topicid], topicvalue)
                       for topicid, topicvalue in enumerate(doc_topic[0])
                       if topicvalue >= 0.01]
    document_topics = sorted(document_topics, key=lambda score: score[1], reverse=True)
    # print(document_topics)
    return document_topics
def process_docs(self):
    for name in self.f_list:
        longName = self.os_dir + name
        if ".pdf" in name:
            if PDFPage.get_pages(longName, check_extractable=False):
                try:
                    text = self.convert_pdf(longName)
                    text = self.preprocess(text)
                    self.corpus.append(text)
                    self.data.append(name)
                except:
                    print("Unable to parse PDF file: " + longName)
        if ".docx" in name:
            text = docx.process(longName)
            text = self.preprocess(text)
            self.corpus.append(text)
            self.data.append(name)
        elif name.split('.')[-1] == 'doc':
            print("Detected .doc file: " + name +
                  ". Please convert to .docx or .pdf if you want this file to be included.")

    trigrams = Phrases(self.corpus, min_count=1, threshold=2, delimiter=b' ')
    trigram_phraser = Phraser(trigrams)
    trigram_token = []
    for i in self.corpus:
        trigram_token.append(trigram_phraser[i])
    self.corpus = trigram_token

    for x, arr in enumerate(self.corpus):
        self.corpus[x] = np.array(self.corpus[x])
        self.corpus[x] = self.remove_exemptions(self.corpus[x])
        self.corpus[x] = self.corpus[x].tolist()
def make_w2v(series, stopwords=[], size=200, window=5, min_count=5, workers=-1,
             epochs=20, lowercase=True, sg=0, seed=17, cbow_mean=1, alpha=0.025,
             sample=0.001, use_bigrams=True, threshold=10, bigram_min=5):
    # turn the series into a list, lower it, clean it
    sentences = [sentence for sentence in series]
    cleaned = []
    if lowercase:
        for sentence in sentences:
            cleaned_sentence = [word.lower() for word in sentence]
            cleaned_sentence = [word for word in cleaned_sentence if word not in stopwords]
            cleaned.append(cleaned_sentence)
    else:
        for sentence in sentences:
            cleaned_sentence = [word for word in sentence if word not in stopwords]
            cleaned.append(cleaned_sentence)

    # incorporate bigrams
    if use_bigrams:
        bigram = Phrases(cleaned, min_count=bigram_min, threshold=threshold, delimiter=b' ')
        bigram_phraser = Phraser(bigram)
        tokens_list = []
        for sent in cleaned:
            tokens_list.append(bigram_phraser[sent])
        cleaned = tokens_list

    # build the model
    model = Word2Vec(cleaned, size=size, window=window, min_count=min_count,
                     workers=workers, seed=seed, sg=sg, cbow_mean=cbow_mean,
                     alpha=alpha, sample=sample)
    model.train(series, total_examples=model.corpus_count, epochs=epochs)
    model_wv = model.wv

    # clear it to avoid unwanted transference
    del model
    return model_wv
def write_to_file_chartssb(no_delexi_charts: List[str], all_sents: List[List[str]]) -> None:
    with open(os.path.join('chartssb/original_data/', 'chartssb.box'), 'w') as g:
        with open(os.path.join('chartssb/original_data/', 'train.box'), 'w') as train:
            with open(os.path.join('chartssb/original_data/', 'test.box'), 'w') as test:
                with open(os.path.join('chartssb/original_data/', 'valid.box'), 'w') as valid:
                    for chart in no_delexi_charts:
                        chart_descs, _ = turn_chart_info_into_sentences(chart)
                        # print(chart_descs)
                        bigram2 = Phrases(all_sents, min_count=1, threshold=2)
                        bigram2.add_vocab([["Financial", "Groups"],
                                           ["Law", "Firms"],
                                           ["Computer", "Science"]])
                        print("vocab=", bigram2.vocab)
                        chart_infos_sentb = turn_dict_into_sent_b(chart_descs)
                        new_infos = convert_chartssb_to_bigrams(chart_infos_sentb, bigram2)
                        chart_lines_sentb = generate_files_sb(new_infos)
                        len_all_chart_sentences = len(chart_lines_sentb)
                        print("len=", len_all_chart_sentences)
                        g.write(''.join(chart_lines_sentb))
                        for line_idx, chart_line in enumerate(chart_lines_sentb):
                            if line_idx in list(range(5)):
                                # print("test=", line_idx)
                                test.write(chart_line)
                            elif line_idx in list(range(5, 10)):
                                # print("valid=", line_idx)
                                valid.write(chart_line)
                            elif line_idx in list(range(10, len_all_chart_sentences)):
                                # print("train=", line_idx)
                                train.write(chart_line)
def train_word2vec_bigram(word_statements, name='word2vec_fa_model'):
    phrases = Phrases(word_statements, min_count=30, progress_per=10000)
    bigram = Phraser(phrases)
    sentences = bigram[word_statements]

    num_cores = multiprocessing.cpu_count()
    w2v_model = Word2Vec(min_count=20,
                         window=2,
                         size=300,
                         sample=6e-5,
                         alpha=0.03,
                         min_alpha=0.0007,
                         negative=20,
                         workers=num_cores - 1)
    w2v_model.build_vocab(sentences, progress_per=10000)
    w2v_model.train(sentences, total_examples=w2v_model.corpus_count,
                    epochs=30, report_delay=1)
    w2v_model.save(name)
    w2v_model.init_sims(replace=True)
    return w2v_model
def count_phrases(tokens_list):
    '''
    Use bigrams to detect multi-word phrases, e.g. new-york or data-set.
    Although sklearn's CountVectorizer is more straightforward and works better,
    it converts directly to vectors; to keep the original vocabulary, so that
    later TF-IDF and Word2Vec pipelines share a consistent vocabulary, Phrases
    is used instead.
    @param tokens_list: list of tokenized documents, e.g. [['w1-1','w1-2'],['w2-1','w2-2']]
    @return: list of documents with phrases joined, e.g. [['w1-1','w1-2'],['w2-1_w2-2']]
    @note: no longer used after the thesis defense
    '''
    i = 0
    bigram_tokens = []
    bigram = Phrases(tokens_list, min_count=1, threshold=2)
    bigram_phraser = Phraser(bigram)
    for tokens in tokens_list:
        bigram_tokens.append(bigram_phraser[tokens])
        i += 1
        print('[%s] Detecting phrases: %6d / %6d' % (t.now(), i, len(tokens_list)), end='\r')
    print('[%s] Phrase detection finished' % t.now())
    return bigram_tokens
def ngrams(input_docs):
    """
    Add bigrams (and possibly trigrams) to docs (only ones that appear 20 times or more).
    Uncomment the trigram lines for trigram addition.

    :param input_docs: input docs file (gensim format)
    :return: docs file (list of lists) with appended ngrams
    """
    output_docs = input_docs
    bigram = Phrases(output_docs, min_count=20)
    # trigram = Phrases(bigram[output_docs], min_count=20)

    for idx in range(len(output_docs)):
        for bigram_ in bigram[output_docs[idx]]:
            if '_' in bigram_:
                # Token is a bigram, add to document.
                output_docs[idx].append(bigram_)
            # for token in trigram[bigram[bigram_]]:
            #     if '_' in token:
            #         output_docs[idx].append(token)
    return output_docs
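# Minimal usage sketch for ngrams() above, not part of the original code; it
# assumes Phrases is imported as in the function. The toy input below is far
# too small to meet min_count=20, so no bigrams are actually appended; the
# sketch only demonstrates the expected call shape (a list of token lists).
if __name__ == "__main__":
    docs = [["new", "york", "city"], ["new", "york", "times"]] * 3
    docs_with_ngrams = ngrams(docs)
    print(docs_with_ngrams[0])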
def lemmantizator():
    with open('arquivao.txt', 'r') as file:
        docs = file.readlines()

    # Split the documents into tokens.
    tokenizer = RegexpTokenizer(r'\w+')
    for idx in range(len(docs)):
        docs[idx] = docs[idx].lower()  # Convert to lowercase.
        docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

    # Remove numbers, but not words that contain numbers.
    docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

    # Remove words that are three characters long or shorter.
    docs = [[token for token in doc if len(token) > 3] for doc in docs]

    nltk.download('wordnet')
    from nltk.stem.wordnet import WordNetLemmatizer

    lemmatizer = WordNetLemmatizer()
    docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

    # Remove rare and common tokens.
    from gensim.corpora import Dictionary

    # Create a dictionary representation of the documents.
    dictionary = Dictionary(docs)

    # Filter out words that occur in fewer than 20 documents, or in more than 50% of the documents.
    dictionary.filter_extremes(no_below=20, no_above=0.5)

    # Bag-of-words representation of the documents.
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    print('Number of unique tokens: ', len(dictionary))
    print('Number of documents: ', len(corpus))

    # Add bigrams and trigrams to docs (only ones that appear 20 times or more).
    bigram = Phrases(docs, min_count=20)
    docs = [bigram[d] for d in docs]

    with open('bigrammed.txt', 'w') as f:
        for item in docs:
            f.write("%s" % item)
def prep_text_lda(docs, vocab_size=20000):
    """ docs: (pd.Series str) cleaned text """
    english_stopwords = set([s.replace("\'", "") for s in stopwords.words("english")])

    tqdm.pandas(desc="Tokenizing")
    tokenized_docs = docs.progress_apply(lambda x: [w.lower() for w in tokenize(x)])

    bigram = Phrases(tokenized_docs.values.tolist())
    phraser = Phraser(bigram)

    tqdm.pandas(desc="Bigrams")
    bigrammed_docs = tokenized_docs.progress_apply(lambda tokens_: phraser[tokens_])

    id2word = Dictionary(bigrammed_docs.values.tolist())
    id2word.filter_extremes(keep_n=vocab_size, no_above=0.5)
    id2word.filter_tokens(bad_ids=[id2word.token2id[a] for a in english_stopwords
                                   if a in id2word.token2id])
    id2word.compactify()

    tqdm.pandas(desc="Cleaning")
    tokenized = bigrammed_docs.progress_apply(
        lambda doc_tokens: " ".join([w for w in doc_tokens if w in id2word.token2id]))
    reconst_docs = tokenized.apply(lambda x: x.split())

    return id2word, reconst_docs
def convert_features(df):
    bigram_transformer = Phrases(common_texts)
    model = Word2Vec(bigram_transformer[common_texts], min_count=1)
    model.save("word2vec.model")

    for col in df.columns:
        num_unique = len(df[col].unique())
        if np.issubdtype(df[col].dtype, np.number):
            # numerical
            print(col, "[numerical", "#unique =", num_unique, "]")
            continue
        elif num_unique < 60 or (num_unique < 0.01 * df.shape[0] and num_unique < 100):
            # categorical
            print(col, "[categorical", "#unique =", num_unique, "]")
            df = convert_onehot(df, col)
        else:
            # text
            print(col, "[text", "#unique =", num_unique, "]")
            df = convert_word2vec(df, col)
    return df
def phrs_model(sentences):
    '''
    Generate a Phrases model to find potential phrases, and save its phrases to a csv file.

    Input:
        sentences (list of list of words): sentences without stop words
    '''
    model_ph = Phrases(sentences)
    # model_ph.save(PHRS_MODEL_NAME)
    gensim_phrs = model_ph.export_phrases(sentences)
    gensim_phrs = list(set(gensim_phrs))
    gensim_phrs = [g[0].decode("utf-8") for g in gensim_phrs
                   if g[0].split()[0] != g[0].split()[1]]

    with open(PHRS_OUTFILE, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        sent = set()
        for i in gensim_phrs:
            if i not in sent:
                writer.writerow([i])
                sent |= {i}
def train_phrases(paths, out='data/bigram_model.phrases', tokenizer=word_tokenize, **kwargs):
    """
    Train a bigram phrase model on a list of files.
    """
    n = 0
    for path in paths:
        print('Counting lines for {0}...'.format(path))
        n += sum(1 for line in open(path, 'r'))
    print('Processing {0} lines...'.format(n))

    # Change to use less memory. Default is 40m.
    # Build the defaults first and overlay the caller's kwargs
    # (dict.update() returns None, so it cannot be chained).
    defaults = {'max_vocab_size': 40000000, 'threshold': 8.}
    defaults.update(kwargs)
    kwargs = defaults

    print('Training bigrams...')
    bigram = Phrases(_phrase_doc_stream(paths, n, tokenizer=tokenizer), **kwargs)

    print('Saving...')
    bigram.save(out)
def read_data(config):
    """Reads data from a provided csv file with processed doc text."""
    data = config['csv_filenames'][0]
    docwords = []
    file_rownames = []

    with open(data, 'r') as f:
        csv_text = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE,
                              quotechar="", escapechar="\\")
        csv.field_size_limit(sys.maxsize)
        row_num = 0
        for row in csv_text:
            tokens = []
            # Reformat the incoming text
            text = row[-1][2:-2].replace("'", "").split(",")
            for token in text:
                token = token.strip()
                if token != ' ':
                    tokens.append(token)
            # topicvecDir.py needs this nested list format
            # in order to run correctly.
            tokens = [tokens]
            file_rownames.append(data + str(row_num))
            docwords.append(tokens)
            row_num += 1

    # Add bigrams
    for outer_list in docwords:
        bigram = Phrases(outer_list)
        for i in range(len(outer_list)):
            for token in bigram[outer_list[i]]:
                if '_' in token:
                    outer_list[i].append(token)

    return docwords, file_rownames
def __init__(self, fromdate, todate):
    self.fromdate = fromdate
    self.todate = todate

    print('Start reading sentences')
    documents = [
        line.strip() for line in open(PATH + FILENAME).readlines()
        if len(line) > 1 and len(line) < 200
    ]
    sentences = [
        " ".join([w for w in sentence.split() if w not in stopWords])
        for sentence in documents
    ]

    print("start tokenization...")
    # self.corpus = [nltk.word_tokenize(sentence) for sentence in self.sentences]
    self.corpus = [x.split(" ") for x in sentences]
    # print("CORPUS", self.corpus)

    print('Start phrases')
    self.phrases = Phrases(sentences=self.corpus, min_count=25, threshold=50)
    self.bigram = Phraser(self.phrases)
    # for sent in self.bigram[self.sentences]:  # apply model to text corpus
    #     pass
    for index, sentence in enumerate(self.corpus):
        self.corpus[index] = self.bigram[sentence]

    self.model = gensim.models.Word2Vec(**W2V_PARAMETERS)
    self.model.build_vocab(self.corpus)
    print('Build Word2Vec vocabulary')
    self.model.train(self.corpus, total_examples=self.model.corpus_count,
                     epochs=self.model.iter)
    print('Estimated Word2Vec model')
def word2vec_measure():
    article_names = ["expressen", "aftonbladet", "svd", "dn"]
    sentences = []
    for single_article in article_names:
        print(" \n *** " + single_article + " *****")
        articles = db.get_articles(single_article)
        bigram = Phrases()
        for row in articles:
            row = IO.filter_text(row.lower())
            sentence = [word for word in row
                        if word not in stopwords.words('swedish')]
            sentences.append(sentence)
            bigram.add_vocab([sentence])
        print(len(sentences))

        num_features = 300   # Word vector dimensionality
        min_word_count = 5   # Minimum word count
        num_workers = 8      # Number of threads to run in parallel
        context = 5          # `context window` is the maximum distance between the current and predicted word within a sentence.
        downsampling = 1e-3  # Downsample setting for frequent words

        # bigram_model = Word2Vec(bigram[sentences], size=100)
        bigram_model = Word2Vec(bigram[sentences], workers=num_workers,
                                size=num_features, sg=1, min_count=min_word_count,
                                window=context, sample=downsampling)
        word2vec_result = bigram_model.most_similar(
            positive=['muslimska_brödraskapet'], topn=200)

        # filepath = prop.word2vec_count + single_article + ".tsv"
        filepath = prop.word2vec_count + "all_10.tsv"
        IO.write_tuple(word2vec_result, filepath)
def generateTokens(data, data_Full, n_dim, myStopWords):
    phrases = Phrases(data)
    biggram = Phraser(phrases)
    """
    #---------Check multiple tokens for a word
    biggram = Phraser(phrases)
    biggram[reviews[0]]
    """
    # lstReviews = list(train_Dataset.Reviews.apply(lambda x: get_bigrams(x)))
    modelW2V = Word2Vec(biggram[data], size=n_dim, min_count=minFreq, window=2, sg=1)

    """
    Term document frequency for weighted average of features
    """
    tfVectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words=myStopWords)
    nmf = TruncatedSVD(n_components=n_dim)
    tfFeatureSet = nmf.fit_transform(tfVectorizer.fit_transform(data_Full))

    countVectorizer = CountVectorizer(ngram_range=(1, 2), stop_words=myStopWords, max_df=.8)
    nmf = TruncatedSVD(n_components=n_dim)
    cvFeatureSet = nmf.fit_transform(countVectorizer.fit_transform(data_Full))

    tfFeatureSet = mergeFeatureSet(tfFeatureSet, cvFeatureSet)
    # tfFeatureSet = tfVectorizer.transform(data_Full)

    tfidf = dict(
        zip(
            list(map(lambda x: str.replace(x, " ", "_"),
                     tfVectorizer.get_feature_names())),
            tfVectorizer.idf_))
    return modelW2V, tfidf, tfFeatureSet
def preprocess_doc(doc):
    '''Preprocess a document for training.'''
    # Remove some patterns before tokenization.
    # Remove email addresses.
    doc = re.sub(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', '', doc)

    # Tokenize the document.
    # Split the document into tokens.
    tokenizer = RegexpTokenizer(r'\w+')
    doc = doc.lower()  # Convert to lowercase.
    doc = tokenizer.tokenize(doc)  # Split into words.

    # Remove tokens that are not purely alphabetic.
    doc = [token for token in doc if token.isalpha()]

    # Remove words that are only one character.
    doc = [token for token in doc if len(token) > 1]

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    stop_words = stop_words.union(['one', 'ax', 'max'])
    doc = [token for token in doc if token not in stop_words]

    # Lemmatize the document.
    lemmatizer = WordNetLemmatizer()
    doc = [lemmatizer.lemmatize(token) for token in doc]

    # Add bigrams to the doc (only ones that appear 20 times or more).
    # Phrases expects an iterable of token lists, so wrap the single document.
    bigram = Phrases([doc], min_count=20)
    for token in bigram[doc]:
        if '_' in token:
            # Token is a bigram, add to document.
            doc.append(token)

    return doc
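# Illustrative call to preprocess_doc() above, not part of the original code.
# It assumes the imports the function relies on (re, RegexpTokenizer, stopwords,
# WordNetLemmatizer, Phrases) are present and that the NLTK stopwords and
# wordnet corpora are downloaded.
if __name__ == "__main__":
    raw = "Contact us at info@example.com about the 2 new machine learning courses."
    print(preprocess_doc(raw))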
def get_bigrams_from_preprocessed(self, min_count=0.1, threshold=10., scoring='default'):
    """
    Computes bigrams after preprocessing.
    NOTE: overwrites the preprocessed_text_ attribute.

    ------PARAMETERS------
    min_count: minimum count of bigrams to be included
    threshold: scoring threshold for bigrams for inclusion
    scoring: gensim Phrases scoring function used to evaluate bigrams against the threshold
    """
    x = Phrases(self.preprocessed_text_, min_count=min_count,
                threshold=threshold, scoring=scoring)
    x = Phraser(x)
    bigram_token = []
    for sent in self.preprocessed_text_:
        bigram_token.append(x[sent])
    self.preprocessed_text_ = bigram_token
def word2vec_train(self, model_f_name):
    _bigram = 0
    import gensim, logging

    if _bigram:
        bigram = Phrases(RW.texts_ko)
        self.model = word2vec.Word2Vec(bigram[self.texts_ko], **self.config)
    else:
        self.model = word2vec.Word2Vec(self.texts_ko, **self.config)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    """
    outv = KeyedVectors()
    outv.vocab = self.model.wv.vocab
    outv.index2word = self.model.wv.index2word
    outv.syn0 = self.model.syn1neg
    """
    # inout_sim = outv.most_similar('navi')
    # print(inout_sim)

    # fname = str(self.config['size']) + '_' + str(self.config['window']) + '_' + model_f_name
    self.model.save(model_f_name)

    # test
    self.model.init_sims(replace=True)
def __init__(self, df):
    df = df[df['Type'] == 'Article']
    word2vecSamples = list(df['Abstract'])
    stop_words = set(stopwords.words('english'))

    t0 = time()
    data = []
    for i in word2vecSamples:
        temp = []
        for j in word_tokenize(i):
            if j.lower() not in stop_words:
                # if j == 'amino':
                #     print(j)
                temp.append(j.lower().translate(
                    str.maketrans('', '', string.punctuation)))
        data.append(temp)

    self.data = data
    self.bigram_transformer = Phrases(data)
    print("done in %0.3fs." % (time() - t0))
def bigramGenerator(self):
    corpusStream = self.sentenceStream()
    phrases = Phrases(corpusStream, min_count=self.bigramMinCount,
                      threshold=self.thresholdBigram)
    bigram = Phraser(phrases)

    inputStream = self.sentenceStream()
    bigramSentenceList = (bigram[sentence] for sentence in inputStream)

    bigramList = set()
    for bigramSentence in bigramSentenceList:
        for item in bigramSentence:
            if "_" in item:
                bigramList.add(item)
    print("Number of Unique Bigrams = ", len(bigramList))

    for item in sorted(bigramList):
        if not os.path.exists(self.trainingLocation):
            os.makedirs(self.trainingLocation)
        with open(os.path.join(self.trainingLocation, "TC-phrases-bi.txt"), "a") as outFile:
            outFile.write(item + "\n")
def build_word_vec(show_log=True):
    section, year = volume.split(".")
    texts_path = "../arxiv/{0}/{1}/".format(section, year)
    files_list = shared.random_glob(texts_path, n_proc_articles)
    sentences = prepare_sentences(files_list, n_proc_articles)

    if show_log:
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.INFO)

    if config.biGram:
        bigram_transformer = Phrases(sentences, min_count=10)
        sentences = list(bigram_transformer[sentences])

    sentences = shared.plural_filter(sentences)
    return Word2Vec(sentences, min_count=min_count, size=size,
                    window=window, workers=4)
def bigrams_with_gensim(data):
    from gensim.models import Phrases

    bigram = Phrases()
    sentences = []
    for row in data:
        title = row['Headings'].replace('[', '').replace(']', '').replace("'", '')
        title = title + '.'
        # title = title.replace('--', ' -- ')
        sentence = [word for word in nltk.word_tokenize(title.lower())
                    if word not in string.punctuation]
        sentences.append(sentence)
        bigram.add_vocab([sentence])

    bigram_counter = Counter()
    for key in bigram.vocab.keys():
        if key not in stopwords.words("english"):
            spl = re.split(b'_', key)
            spl = [s for s in spl if s != '']
            if len(spl) > 1:
                bigram_counter[key] += bigram.vocab[key]

    print('Bigrams with gensim')
    for key, counts in bigram_counter.most_common(50):
        print('{}: {}'.format(key, counts))

    return bigram