def process_text(corpus, stoplist=None, bigrams=None, trigrams=None, keep_all=False, no_below=10, no_above=0.8):
    """
    Extracts text data from the corpus, cleans and tokenizes it,
    computes the most frequent phrases, creates a dictionary and converts the corpus to a BOW model.
    :param corpus: iterable of raw documents
    :return: processed corpus with phrases, dictionary and BOW corpus
    """
    logging.info("Cleaning and tokenizing dataset")
    text_dataset = clean_and_tokenize(corpus, stoplist=stoplist, keep_all=keep_all)
    if bigrams is not None:
        bi_grams = Phrases(text_dataset, threshold=bigrams, min_count=no_below)
        text_dataset = bi_grams[text_dataset]
    elif trigrams is not None:
        # `bigrams` is None in this branch, so the trigram threshold is reused
        # for the intermediate bigram model
        bi_grams = Phrases(text_dataset, threshold=trigrams)
        tri_grams = Phrases(bi_grams[text_dataset], threshold=trigrams)
        text_dataset = tri_grams[bi_grams[text_dataset]]
    dictionary = Dictionary(text_dataset)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    bow_corpus = [dictionary.doc2bow(text) for text in text_dataset]
    return text_dataset, dictionary, bow_corpus
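A minimal sketch of the Dictionary/BOW pattern the helper above wraps; the toy documents and thresholds are illustrative only and not taken from the original code.

from gensim.corpora import Dictionary

docs = [["human", "interface", "computer"],
        ["survey", "user", "computer", "system", "response"],
        ["user", "interface", "system"]]
dictionary = Dictionary(docs)
# drop tokens that appear in fewer than 2 documents or in more than 80% of them
dictionary.filter_extremes(no_below=2, no_above=0.8)
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]
print(bow_corpus)  # each document as a list of (token_id, count) pairs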
def _build_vocab(self, max_vocab_cnt): all_words = [] for data in self.valid + self.non_valid: all_words.append(data["title"] + data["content"]) vocab = Dictionary(all_words) raw_vocab_size = len(vocab) vocab.filter_extremes(no_below=5) vocab.filter_extremes(keep_n=max_vocab_cnt) len_1_words = list( filter( lambda w: len(w) == 1 and re.match(r"[\x00-\x7f]", w) and w not in ["a", "i"] and True or False, vocab.values())) vocab.filter_tokens(list(map(vocab.token2id.get, len_1_words))) if self.config.use_dict == "seq" and self.config.enable_pad: vocab.token2id[PAD] = len(vocab) vocab.compactify() self.pad_wid = vocab.token2id.get(PAD) self.vocab_seq = vocab # seq dictionary # build bow dictionary self.vocab_bow = copy.deepcopy(vocab) self.vocab_bow.filter_tokens( map(self.vocab_bow.token2id.get, STOPWORDS)) # filter stop words self.vocab_bow.compactify() if self.config.tfidf: tfidf_corpus = [self.vocab_bow.doc2bow(line) for line in all_words] self.tfidf_model = TfidfModel(tfidf_corpus) print("Load corpus with non_valid size %d, valid size %d, " "raw vocab size %d seq vocab size %d, bow vocab size %d" % (len(self.non_valid), len(self.valid), raw_vocab_size, len(self.vocab_seq), len(self.vocab_bow)))
def buildDictionary(self, corpus, txt2tokens, opts):
    '''
    Tokenize texts and add tokens to dictionary.
    :param corpus: Corpus-like or id
    :param txt2tokens: txt2tokens or id
    :param opts: GensimDictBuildOptions
    :return: gensim Dictionary
    '''
    t = clock()
    corpus, txt2tokens = self.resolve(corpus, txt2tokens)
    # fill the dictionary with tokens from corpus texts
    dictionary = Dictionary(documents=None)
    numDocs = 0; numTokens = 0
    for txto in corpus:
        tokens = txt2tokens(txto.text)
        numDocs += 1; numTokens += len(tokens)
        dictionary.doc2bow(tokens, allow_update=True)
    # form filtering options and run filtering
    no_below = opts.docLowerLimit if opts.docLowerLimit is not None else 0
    if opts.docUpperLimit is None: no_above = 1.0
    elif isinstance(opts.docUpperLimit, float): no_above = opts.docUpperLimit
    else: no_above = opts.docUpperLimit/float(numDocs)
    if opts.words2keep is None: keep_n = numTokens
    else: keep_n = opts.words2keep
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    dictionary.compactify()
    # force id2token map building
    someId = next(iter(dictionary.token2id.values()))
    dictionary[someId]
    return GensimDictAdapter(dictionary, corpus.id, txt2tokens.id, opts)
def preprocess(tweets): tweet_list = [preprocess_one(tweet) for tweet in tweets] print("Passed initial Processing...") # Train bigrams/trigrams model only when there is a list of many tweets def n_grams(tweets): ngram = Phrases(tweets) for ind in range(len(tweets)): for word in ngram[tweets[ind]]: if '_' in word: tweets[ind].append(word) return tweets tweet_list = n_grams(tweet_list) print("Passed ngram Processing...") # Use to create Bag-of-Words when possessing a list of tweets dictionary = Dictionary(tweet_list) print("Passed dictionary creation...") # Filter out words that occur less than 10 documents, or more than 50% of the documents. dictionary.filter_extremes(no_below=10, no_above=0.5) corpus = [dictionary.doc2bow(tweet) for tweet in tweet_list] print("Number of Unique Words:", str(len(dictionary))) print("Number of documents:", str(len(corpus))) return tweet_list, dictionary, corpus
def clean_docs(self, docs):
    """Removes unnecessary words (noise), i.e. words that would push our models
    towards the worst-case scenario"""
    # Remove numbers, but not words that contain numbers.
    docs = [[token for token in doc if not token.isnumeric()] for doc in docs]
    # Remove words that are only one character, and stop words.
    docs = [[
        token for token in doc if len(token) > 1 and token not in stop_words
    ] for doc in docs]
    # lemmatizer = WordNetLemmatizer()
    # docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]
    # Add bigrams and trigrams to docs (only ones that appear 20 times or more).
    bigram = Phrases(docs, min_count=20)
    for idx in range(len(docs)):
        for token in bigram[docs[idx]]:
            if '_' in token:
                # Token is a bigram, add to document.
                docs[idx].append(token)
    # Create a dictionary representation of the documents.
    dictionary = Dictionary(docs)
    # Filter out words that occur in fewer than 20 documents, or in more than 50% of the documents.
    dictionary.filter_extremes(no_below=20, no_above=0.5)
    # Bag-of-words representation of the documents.
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    return docs, dictionary, corpus
def vectorize(corpus): tokenized = [Tokenizer.tokenize(doc) for doc in corpus] dictionary = Dictionary(tokenized) dictionary.filter_extremes(no_below=10, no_above=0.66) bows = [dictionary.doc2bow(doc) for doc in tokenized] return dictionary, bows
def __prep_texts(self, include_bigrams=False):
    print("--- Preparing Texts for Model ---\n")
    cleaned_text = str(self.text_column) + "_clean"
    if self.algo == 'gensim':
        doc_lst = self.processed_df[cleaned_text].tolist()
        doc_lst = [word_tokenize(str(doc)) for doc in doc_lst]
        if include_bigrams:
            # Compute bigrams and add them to the docs (as per the linked NPMI paper).
            bigram = Phrases(doc_lst, threshold=10e-5, scoring='npmi')
            for idx in range(len(doc_lst)):
                for token in bigram[doc_lst[idx]]:
                    if '_' in token:
                        # Token is a bigram, add to document.
                        doc_lst[idx].append(token)
        # Create Corpus
        dictionary = Dictionary(doc_lst)
        dictionary.filter_extremes(no_above=0.9)
        corpus = [dictionary.doc2bow(text) for text in doc_lst]
        self.texts = doc_lst
        self.dictionary = dictionary
        self.corpus = corpus
    else:
        doc_lst = self.processed_df[cleaned_text].tolist()
        self.texts = doc_lst
        vectorizer = CountVectorizer(strip_accents='unicode', max_df=0.9, lowercase=True)
        data_vectorized = vectorizer.fit_transform(self.texts)
        self.lda_vectorizer = vectorizer
        self.lda_dtm = data_vectorized
class ExtraWordFilter(object): def __init__(self): self.dct = None self.stopwords = None def fit(self, docs, no_above, **kwargs): segmented_docs = [doc.lower().split() for item in docs for doc in item] self.dct = Dictionary(segmented_docs) self.dct.filter_extremes(no_above=no_above, **kwargs) print("Extra Dct size:{}".format(len(self.dct.token2id))) # print("Dct keys: {}".format(self.dct.token2id.keys())) return self.dct.token2id def transform(self, docs): segmented_docs = [[doc.split() for doc in item] for item in docs] transformed_docs = [[ " ".join([ word for word in doc if word.lower() in self.dct.token2id.keys() or word in string.punctuation ]) for doc in item ] for item in segmented_docs] return transformed_docs def fit_transform(self, docs, no_above, **kwargs): self.fit(docs, no_above, **kwargs) return self.transform(docs)
def doc_embed_charity_notfidf(processed_docs, word_min=5, word_max_perc=.8):
    'Takes a list of preprocessed texts and returns an embedding vector for each document, a dictionary of the words within the corpus, and the GloVe vectors for each word in the corpus'
    # Create dictionary from corpus
    docs_dict = Dictionary(processed_docs)
    docs_dict.filter_extremes(no_below=word_min, no_above=word_max_perc)
    docs_dict.compactify()
    # Convert docs into a sparse matrix (N_docs x N_words in dictionary) where the number in each cell indicates the number of times that word appeared in that document
    docs_corpus = [docs_dict.doc2bow(doc) for doc in processed_docs]
    docs_vecs = np.vstack([sparse2full(c, len(docs_dict)) for c in docs_corpus])
    # Count number of documents and words in dictionary
    num_docs = np.shape(docs_vecs)[0]
    num_words = np.shape(docs_vecs)[1]
    print("Total # of docs: {}".format(num_docs))
    print("Total # of words in dict: {}".format(num_words))
    # For each word in the dict extract its embedding vector (GloVe vectors)
    glove_vecs = np.vstack([nlp(docs_dict[i]).vector for i in range(len(docs_dict))])
    # Sum glove vectors over words in doc
    docs_emb = np.dot(docs_vecs, glove_vecs)
    return docs_emb, docs_dict, glove_vecs
class MyCorpus(object):
    def __init__(self, input_file, K):
        self.K = K
        self.input_file = input_file
        self.dictionary = Dictionary()
        with open(input_file, "rt") as f:
            for line in f:
                self.dictionary.add_documents([line.split()])
        self.dictionary.filter_extremes(no_below=2, no_above=0.5, keep_n=K)

    def __iter__(self):
        with open(self.input_file, "rt") as f:
            for line in f:
                yield self.dictionary.doc2bow(line.rstrip().split())

    def __str__(self):
        s = "MyCorpus(" + str(self.dictionary.num_docs) + " documents, "
        s += str(len(self.dictionary.keys())) + " features, "
        s += str(self.dictionary.num_nnz) + " non-zero entries)"
        return s

    def __repr__(self):
        return "MyCorpus('" + self.input_file + "', " + str(self.K) + ")"
def main():
    doc = get_doc()
    print('doc len:', len(doc))
    train_texts = list(build_texts(doc))
    print('train len:', len(train_texts))
    bigram = gensim.models.Phrases(
        train_texts, min_count=10)  # for bigram collocation detection
    stops = set(stopwords.words('english'))  # nltk stopwords list
    train_texts = process_texts(train_texts, bigram, stops)
    print('bigramed train_texts', len(train_texts))
    vocabulary = Dictionary(train_texts)
    print('vocab size:', len(vocabulary))
    # remove extremes: drop words that appear in fewer than 3 documents or in more than 30% of documents
    vocabulary.filter_extremes(no_below=3, no_above=0.3)
    #vocabulary.filter_n_most_frequent(50)  # Filter out the 50 most common tokens
    # filter_tokens(bad_ids=None, good_ids=None)
    corpus = [vocabulary.doc2bow(text) for text in train_texts]
    print('corpus size:', len(corpus))
    lda = LdaModel(corpus=corpus,
                   id2word=vocabulary,
                   num_topics=10,
                   chunksize=1500,
                   iterations=200,
                   alpha='auto')
    print(
        pd.DataFrame([[word for rank, (word, prob) in enumerate(words)]
                      for topic_id, words in lda.show_topics(
                          formatted=False, num_words=6, num_topics=35)]))
def prep_corpus(docs, additional_stopwords=set(), no_below=5, no_above=0.5):
    print('Building dictionary...')
    dictionary = Dictionary(docs)

    # remove stopwords
    stopwords = nltk_stopwords().union(additional_stopwords)
    stopword_ids = map(dictionary.token2id.get, stopwords)
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()

    # remove short words, len(word) <= 3
    shortword_ids = [tokenid for tokenid, word in dictionary.iteritems() if len(word.split('/')[0]) <= 3]
    dictionary.filter_tokens(shortword_ids)
    dictionary.compactify()

    # remove words that appear only once
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 1]
    dictionary.filter_tokens(once_ids)
    dictionary.compactify()

    # filter extreme values
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
    dictionary.compactify()

    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return dictionary, corpus
def make_dict_and_corpus(tweets, upper_limit, lower_limit): twitter_wakati_texts = wakati_tweets(tweets) dictionary = Dictionary(twitter_wakati_texts) if upper_limit is not None and lower_limit is not None: dictionary.filter_extremes(no_below=lower_limit, no_above=upper_limit) corpus = [dictionary.doc2bow(t) for t in twitter_wakati_texts] return dictionary, corpus
class LdaTransformer(BaseEstimator, TransformerMixin): def __init__(self, dim = 2, column = 'whole'): self.dim = dim self.column = column def fit(self, X, y=None): lda_tokens = X[self.column].apply(lambda x: x.split()) # create Dictionary and train it on text corpus self.lda_dic = Dictionary(lda_tokens) self.lda_dic.filter_extremes(no_below=10, no_above=0.6, keep_n=8000) lda_corpus = [self.lda_dic.doc2bow(doc) for doc in lda_tokens] # create TfidfModel and train it on text corpus self.lda_tfidf = TfidfModel(lda_corpus) lda_corpus = self.lda_tfidf[lda_corpus] # create LDA Model and train it on text corpus self.lda_model = LdaMulticore( lda_corpus, num_topics=self.dim, id2word=self.lda_dic, workers=4, passes=20, chunksize=1000, random_state=0 ) return self def transform(self, X, y=None): lda_emb_len = len(self.lda_model[[]]) lda_corpus = [self.lda_dic.doc2bow(doc) for doc in X[self.column].apply(lambda x: x.split())] lda_corpus = self.lda_tfidf[lda_corpus] lda_que_embs = self.lda_model.inference(lda_corpus)[0] # append lda question embeddings out = np.zeros((len(X), lda_emb_len)) for i in range(lda_emb_len): out[:, i] = lda_que_embs[:, i] return out
def prepare_corpus(tweets_file, corpus_file, dictionary_file, author_topic): stop_words = set(stopwords.words('english')) stop_words.add(u'rt') print('Loading tweets from ' + tweets_file) tweets = pd.read_pickle(tweets_file) if author_topic: tweets = tweets.groupby('user').agg({'text': 'sum'}) print('%d tweets loaded' % len(tweets.index)) dictionary = Dictionary(tweets['text']) stopword_ids = map(dictionary.token2id.get, stop_words) dictionary.filter_tokens(stopword_ids) dictionary.compactify() dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=None) dictionary.compactify() corpus = [dictionary.doc2bow(doc) for doc in tweets['text']] # print(corpus) print("Writing corpus to " + corpus_file) MmCorpus.serialize(corpus_file, corpus) # print(dictionary) print("Writing dictionary to " + dictionary_file) dictionary.save(dictionary_file)
def filtrar_extremos(docs, max_freq=0.5, min_wordcount=2, n_top=3): dictionary = Dictionary(docs) dictionary.filter_extremes(no_below=min_wordcount, no_above=max_freq) dictionary.filter_n_most_frequent(n_top) _ = dictionary[0] return dictionary
def _prepare(self, dataset): docs = dataset dictionary = Dictionary(docs) dictionary.filter_extremes(no_below=2, no_above=0.5) corpus = [dictionary.doc2bow(doc) for doc in docs] _ = dictionary[0] return corpus, dictionary
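Both helpers above finish with `_ = dictionary[0]` before returning. A small illustrative sketch of why, assuming a gensim version where `id2token` is built lazily on first lookup by id:

from gensim.corpora import Dictionary

dictionary = Dictionary([["user", "system"], ["user", "graph"]])
print(dictionary.id2token)  # {} - the reverse mapping is not populated yet
_ = dictionary[0]           # any lookup by id triggers building id2token
print(dictionary.id2token)  # e.g. {0: 'system', 1: 'user', 2: 'graph'}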
class TFIDF():
    def __init__(self):
        pass

    def preprocess_tfidf(self):
        return [process_text(r) for r in get_db_records()]

    def create_tfidf_model(self):
        self.dataset = self.preprocess_tfidf()
        self.dct = Dictionary(self.dataset)
        self.dct.filter_extremes(no_below=50)
        corpus = [self.dct.doc2bow(line) for line in self.dataset]
        self.model = TfidfModel(corpus)

    def infer_tfidf(self):
        def infer(vector):
            # dense dimension is the largest token id + 1
            dim = max(self.dct.keys()) + 1
            text1 = self.model[self.dct.doc2bow(vector)]
            t1 = []
            for d in range(dim):
                t1_val = [i[1] for i in text1 if i[0] == d]
                if len(t1_val) == 1:
                    t1.append(t1_val[0])
                else:
                    t1.append(0)
            return t1
        return infer

    @staticmethod
    def load(filename):
        with open(filename, "rb") as f:
            return pickle.load(f)
def texts2corpus(documents, tfidf=False, stopwords=None, filter_below=5, filter_above=0.5, keep_n=100000, logg=print):
    logg(f'generating {"tfidf" if tfidf else "bow"} corpus and dictionary')

    dictionary = Dictionary(documents, prune_at=None)
    dictionary.filter_extremes(no_below=filter_below, no_above=filter_above, keep_n=keep_n)

    # filter some noise (e.g. special characters)
    if stopwords:
        stopword_ids = [dictionary.token2id[token] for token in stopwords if token in dictionary.token2id]
        dictionary.filter_tokens(bad_ids=stopword_ids, good_ids=None)

    bow_corpus = [dictionary.doc2bow(text) for text in documents]
    if tfidf:
        tfidf_model = TfidfModel(bow_corpus)
        corpus = tfidf_model[bow_corpus]
    else:
        corpus = bow_corpus

    return corpus, dictionary
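A brief, hypothetical call of `texts2corpus` above, assuming the gensim imports used by the function are in scope; the tokenized documents and thresholds are made up for illustration.

tokenized_docs = [["graph", "minors", "survey"],
                  ["graph", "trees"],
                  ["human", "interface"]]
corpus, dictionary = texts2corpus(tokenized_docs, tfidf=True, stopwords=["the", "a"],
                                  filter_below=1, filter_above=1.0)
for doc in corpus:
    print(doc)  # (token_id, tf-idf weight) pairs per document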
def preprocess(docs, no_below=20, no_above=0.7):
    # input is an array of docs; each is one string
    tokenizer = RegexpTokenizer(r'\w+')
    for idx in range(len(docs)):
        docs[idx] = docs[idx].lower()  # Convert to lowercase.
        docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

    # Remove numbers, but not words that contain numbers.
    docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

    # Remove words that are less than three characters.
    docs = [[token for token in doc if len(token) > 2] for doc in docs]

    # Remove short words that are not in the dictionary.
    docs = [[token for token in doc if len(token) > 4 or enchantdict.check(token)] for doc in docs]

    # Lemmatize all words in documents.
    lemmatizer = WordNetLemmatizer()
    docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

    # Delete words based on their frequency in the whole corpus.
    # Create a dictionary representation of the documents.
    dictionary = Dictionary(docs)
    #set_trace()
    # Filter out words that occur in fewer than `no_below` documents, or in more than `no_above` of the documents.
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)

    # According to the filtered dictionary, reconstruct the corpus.
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    return corpus, dictionary
def _create_from_texts(cls, tokenized_texts, name, dataset, settings, minimum_frequency=2): from gensim.corpora import Dictionary as GensimDictionary # build a dictionary of features logger.info("Creating features (including n-grams) from texts") gemsim_dictionary = GensimDictionary(tokenized_texts) # Remove extremely rare features logger.info("Features dictionary contains %d features. Filtering..." % len(gemsim_dictionary.token2id)) gemsim_dictionary.filter_extremes(no_below=minimum_frequency, no_above=1, keep_n=None) gemsim_dictionary.compactify() logger.info("Features Dictionary contains %d features." % len(gemsim_dictionary.token2id)) dict_model = cls(name=name, dataset=dataset, settings=settings) dict_model.save() dict_model._populate_from_gensim_dictionary(gemsim_dictionary) return dict_model
def preprocess(documents, stem=False, vocab_size=10000, oov_token="<OOV>", oov_id=-1):
    """Preprocess documents.

    Args:
      documents: An array of strings, each string representing a document.
      stem: (bool) Whether to use a stemmer. Defaults to False.
      vocab_size: Maximum vocabulary size kept by the dictionary.
      oov_token: Token added to the dictionary to represent out-of-vocabulary words.

    Returns:
      (gensim Dictionary, tokenized documents)
    """
    porter_stemmer = PorterStemmer()

    def process_document(doc):
        tokens = word_tokenize(doc)
        tokens = [token.lower() for token in tokens if token.isalpha()]
        if stem:
            tokens = [porter_stemmer.stem(token) for token in tokens]
        return tokens

    tokenized_docs = list(map(process_document, documents))
    dictionary = Dictionary(tokenized_docs)
    dictionary.filter_extremes(no_below=5, no_above=0.8, keep_n=vocab_size)
    # Add OOV token to dictionary
    dictionary.add_documents([[oov_token]])
    return dictionary, tokenized_docs
def parse_processed_amazon_dataset(task_files, max_words=10000): """ Code inspired by: https://github.com/sclincha/xrce_msda_da_regularization """ datasets = {} dico = GensimDict() print("Parsing", task_files) # First pass on document to build dictionary for fname in task_files: with open(fname, 'r') as f: for l in f: tokens = l.split(' ') tokens_list = [] for tok in tokens[:-1]: ts, tfreq = tok.split(':') freq = int(tfreq) tokens_list += [ts] * freq dico.doc2bow(tokens_list, allow_update=True) # Preprocessing_options dico.filter_extremes(no_below=2, keep_n=max_words) dico.compactify() for fname in task_files: X, Y = [], [] with open(fname, 'r') as f: for docid, l in enumerate(f): tokens = l.split(' ') label_string = tokens[-1] tokens_list = [] for tok in tokens[:-1]: ts, tfreq = tok.split(':') freq = int(tfreq) tokens_list += [ts] * freq count_list = dico.doc2bow(tokens_list, allow_update=False) idx, freqs = list(zip(*count_list)) one_hot = np.zeros(max_words) one_hot[list(idx)] = np.array(freqs) X.append((docid, one_hot)) #Preprocess Label ls, lvalue = label_string.split(':') if ls == "#label#": if lvalue.rstrip() == 'positive': Y.append(1) elif lvalue.rstrip() == 'negative': Y.append(0) else: raise Exception("Invalid Label Value") else: raise Exception('Invalid Format') datasets[os.path.split(os.path.split(fname)[0])[-1]] = (X, Y) return datasets, dico
def prepare_LDA_input(corpus, LDA_model): # Prepare input to LDA model corpus = [clean_text(text).split() for text in corpus] dict_corpus = Dictionary(corpus) dict_corpus.filter_extremes(no_below=5, no_above=0.3, keep_n=None) bow_corpus = [dict_corpus.doc2bow(c) for c in corpus] # Get topic-doc vector LDA_input = [] for doc in bow_corpus: LDA_input.append(LDA_model.get_document_topics(doc)) # Add missing probabilities for doc in LDA_input: index = [] true_index = set([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]) for i in range(len(doc)): index.append(doc[i][0]) new_index = true_index- set(index) for j in new_index: doc.extend([(j, 0.0)]) doc.sort() # Create input matrix LDA_doc = [] for doc in LDA_input: LDA_doc.append(np.asarray([doc[0][1], doc[1][1], doc[2][1], doc[3][1], doc[4][1], doc[5][1], doc[6][1], doc[7][1], doc[8][1], doc[9][1], doc[10][1], doc[11][1]], dtype='float32')) LDA_doc = np.array(LDA_doc) return LDA_doc
class LDATransformer: """Preps data for LDA. TODO: add options to slim down vocab and filter words. Also make the methods more efficient. """ def fit(self, texts): all_words = [] docs = [preprocess_string(d) for d in texts] self.vocab = Dictionary(docs) self.vocab.filter_extremes() return self def transform(self, docs): """TODO: speed up for loop.""" all_docs = [] i = 0 for d in docs: words = preprocess_string(d) id_ct = self.vocab.doc2bow(words) if len(id_ct) < 1: continue else: id, ct = zip(*id_ct) all_docs.extend([(i, j) for j in id]) i += 1 return all_docs
def load_data(fname):
    print('input file name:', fname)

    target = []  # labels
    source = []  # document vectors

    # build the list of documents
    document_list = []
    word_list = []
    for l in open(fname, 'r').readlines():
        sample = l.strip().split(' ', 1)
        label = sample[0]
        target.append([label])  # label
        word_list = preprocess_string(sample[1])  # stopword removal, stemming
        document_list.append(word_list)  # word list per document

    # build the dictionary,
    # excluding very rare and very frequent words
    dct = Dictionary(document_list)
    dct.filter_extremes(no_below=3, no_above=0.6)

    # vectorize each document as a bag of words
    for doc in document_list:
        tmp = dct.doc2bow(doc)  # ex. [(4, 1), (23, 1), ..., (119, 2)]
        dense = list(matutils.corpus2dense([tmp], num_terms=len(dct)).T[0])
        source.append(dense)

    dataset = {}
    dataset['target'] = np.array(target)
    dataset['source'] = np.array(source)

    return dataset  #, max_len, width
class vectorizer:
    num_topics = 5000

    def __init__(self):
        pass

    def fit_transform(self, data):
        data = [simple_preprocess(x, deacc=True) for x in data]
        phrases = Phrases(data, threshold=10)
        self.phraser = Phraser(phrases)
        data = self.phraser[data]
        self.dct = Dictionary(data)
        self.dct.filter_extremes(keep_n=self.num_topics)
        docs_bow = [self.dct.doc2bow(line) for line in data]
        #self.tfidf = TfidfModel(docs_bow)
        #vectors = list(self.tfidf[docs_bow])
        self.lsimodel = None
        # self.lsimodel = LsiModel(corpus=vectors, num_topics=self.num_topics)
        retorno = [convert2dense(x, self.num_topics) for x in docs_bow]
        return retorno

    def transform(self, text):
        text = simple_preprocess(text, deacc=True)
        palavras = self.phraser[text]
        bow = self.dct.doc2bow(palavras)
        # return a dense vector for the single document, mirroring fit_transform
        return convert2dense(bow, self.num_topics)
def indexing(Corpus, keep_n, length, samples):  # samples => number of samples in the corpus to take
    sentences = []  # list of lists of words
    for i in range(len(Corpus)):
        sentences.append(Corpus['Output'][i])  # puts all the lists of words (Output[i]) into the "sentences" list
    dct = Dictionary(sentences)
    # keeps only the top n words; keep_n must be passed as a keyword, otherwise it is read as no_below
    dct.filter_extremes(no_below=0, no_above=1.0, keep_n=keep_n)
    dictionary = dct.token2id  # dictionary now has all the words mapped to a number
    # newsentences will be a list of lists that only includes the top n indexes
    newsentences = []
    for sentence in sentences:  # for each list in the sentences list
        newsentence = [keep_n]  # pad the sentence with the </s> token
        for item in sentence:  # for each word in the sentence
            if item in dictionary:  # check if the word is a top n frequent word
                newsentence.append(dictionary[item])  # append
            else:
                pass  # otherwise do nothing
        newsentences.append(newsentence)  # append the sentence after each word has been iterated through
    # all the sentences in X_train are of length `length`, ie if longer - truncate, if shorter - pad
    X_train = sequence.pad_sequences(newsentences, maxlen=length, value=3001, padding="post", truncating="post")
    temp = []
    for i in range(samples):
        temp.append(one_hot(X_train[i]))
    return temp
def generate_tfidf_commit( repository: Repository, stopwords_: Set[str], min_len, cache=None) -> Tuple[tfidfmodel.TfidfModel, Dictionary, Dict]: if cache is None: cache = dict() texts = list() for commit in repository.commits: if commit.c_hash in cache.keys(): texts.append(cache[commit.c_hash]) else: text = text_pipeline(commit, stopwords_, min_len) texts.append(text) cache[commit.c_hash] = text for issue_ in repository.issues: if issue_.id_ in cache.keys(): texts.append(cache[issue_.id_]) else: text = text_pipeline(issue_, stopwords_, min_len) texts.append(text) cache[issue_.id_] = text dictionary_ = Dictionary(texts) dictionary_.filter_extremes(no_below=3, no_above=0.95) working_corpus = [ dictionary_.doc2bow(text, return_missing=True) for text in texts ] # Convert UNK from explicit dictionary to UNK token (id = -1) working_corpus = [ val[0] + [(-1, sum(val[1].values()))] for val in working_corpus ] return tfidfmodel.TfidfModel(working_corpus, id2word=dictionary_), dictionary_, cache
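A small sketch of the `doc2bow(..., return_missing=True)` behavior that the function above uses to build its explicit UNK bucket; the toy tokens and thresholds are illustrative.

from gensim.corpora import Dictionary

dictionary = Dictionary([["fix", "bug", "parser"], ["fix", "tests"]])
dictionary.filter_extremes(no_below=2, no_above=1.0)  # only "fix" survives here
bow, missing = dictionary.doc2bow(["fix", "typo", "typo"], return_missing=True)
print(bow)      # [(0, 1)] - counts for in-vocabulary tokens
print(missing)  # {'typo': 2} - out-of-vocabulary tokens with their counts
bow.append((-1, sum(missing.values())))  # collapse the OOV mass into a single UNK id of -1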
def embed(sent_words, path_word_ind, path_word_vec, path_embed):
    model = Dictionary(sent_words)
    model.filter_extremes(no_below=min_freq, no_above=1.0, keep_n=max_vocab)
    word_inds = model.token2id
    #print (word_inds)
    # randomly rearrange the indices
    word_inds = tran_dict(word_inds, off=2)
    with open(path_word_ind, 'wb') as f:
        pk.dump(word_inds, f)
    # output
    #print (word_inds)
    with open(path_word_vec, 'rb') as f:
        word_vecs = pk.load(f)
    #print (word_vecs)
    vocab = word_vecs.vocab
    print(word_vecs['A'].shape)  # 200
    vocab_num = min(max_vocab + 2, len(word_inds) + 2)
    embed_mat = np.zeros((vocab_num, embed_len))
    for word, ind in word_inds.items():
        if word in vocab:
            if ind < max_vocab:
                embed_mat[ind] = word_vecs[word]  # embedding values are taken from word_vecs
    print(embed_mat.shape)  # (3571, 200)
    with open(path_embed, 'wb') as f:
        pk.dump(embed_mat, f)
def testFilter(self): d = Dictionary(self.texts) d.filter_extremes(no_below=2, no_above=1.0, keep_n=4) dfs_expected = {0: 3, 1: 3, 2: 3, 3: 3} cfs_expected = {0: 4, 1: 3, 2: 3, 3: 3} self.assertEqual(d.dfs, dfs_expected) self.assertEqual(d.cfs, cfs_expected)
def pipeline_lda(que: pd.DataFrame, dim: int) -> (Dictionary, TfidfModel, LdaMulticore):
    """
    Pipeline for training question embeddings via the LDA algorithm on question titles and bodies

    :param que: raw questions.csv dataset
    :param dim: number of LDA topics (dimension of the resulting embeddings)
    :return: trained Dictionary, TfidfModel and LdaMulticore model
    """
    lda_tokens = que['questions_whole'].apply(lambda x: x.split())
    # create Dictionary and train it on text corpus
    lda_dic = Dictionary(lda_tokens)
    lda_dic.filter_extremes(no_below=10, no_above=0.6, keep_n=8000)
    lda_corpus = [lda_dic.doc2bow(doc) for doc in lda_tokens]
    # create TfidfModel and train it on text corpus
    lda_tfidf = TfidfModel(lda_corpus)
    lda_corpus = lda_tfidf[lda_corpus]
    # create LDA Model and train it on text corpus
    lda_model = LdaMulticore(lda_corpus, num_topics=dim, id2word=lda_dic,
                             workers=4, passes=20, chunksize=1000, random_state=0)
    return lda_dic, lda_tfidf, lda_model
def _create_from_texts(cls, tokenized_texts, name, dataset, settings, minimum_frequency=2): from gensim.corpora import Dictionary as GensimDictionary # build a dictionary logger.info("Building a dictionary from texts") dictionary = GensimDictionary(tokenized_texts) # Remove extremely rare words logger.info("Dictionary contains %d words. Filtering..." % len(dictionary.token2id)) dictionary.filter_extremes(no_below=minimum_frequency, no_above=1, keep_n=None) dictionary.compactify() logger.info("Dictionary contains %d words." % len(dictionary.token2id)) dict_model = cls(name=name, dataset=dataset, settings=settings) dict_model.save() dict_model._populate_from_gensim_dictionary(dictionary) return dict_model
def topic_model(docs): # Create a dictionary representation of the documents. dictionary = Dictionary(docs) # Filter out words that occur less than 20 documents, or more than 50% of the documents. dictionary.filter_extremes(no_below=20, no_above=0.5) # Bag-of-words representation of the documents. corpus = [dictionary.doc2bow(doc) for doc in docs] print('Number of unique tokens: %d' % len(dictionary)) print('Number of documents: %d' % len(corpus)) # Set training parameters. num_topics = 10 chunksize = 2000 passes = 20 iterations = 400 eval_every = None # Don't evaluate model perplexity, takes too much time. # Make a index to word dictionary. temp = dictionary[0] # This is only to "load" the dictionary. id2word = dictionary.id2token print("Training LDA Model ...") model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, alpha='auto', eta='auto', iterations=iterations, num_topics=num_topics, passes=passes, eval_every=eval_every) return model.top_topics(corpus)
def test_run(self, data): dictionary = Dictionary(data) dictionary.filter_extremes(no_above=0.5) bags_of_words = [ dictionary.doc2bow(t) for t in data] #This can take a while to run: lda = LdaModel(bags_of_words, id2word = dictionary, num_topics=30, passes=2) results = self.assemble_topics(lda) return results
def small_word_conv(dataset_path): docs, y, test_docs, test_y = nli2013_train_test_split(dataset_path) logging.info('preprocessing, padding and binarizing data ...') docs = [flatten([sent.split() for sent in doc.split('\n') if sent.strip() != '']) for doc in docs] test_docs = [flatten([sent.split() for sent in doc.split('\n') if sent.strip() != '']) for doc in test_docs] vocab = Dictionary(docs) vocab.filter_extremes(keep_n=5000) bin = LabelBinarizer() x = np.array(pad_sentences([[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id] for s in docs], max_length=100, padding_word=0)) y = bin.fit_transform(y) test_x = np.array(pad_sentences([[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id] for s in test_docs], max_length=100, padding_word=0)) test_y = bin.transform(test_y) logging.info('building model ...') model = Sequential() model.add(Embedding(5001, 300, input_length=100)) model.add(Convolution1D(nb_filter=300, filter_length=7, border_mode='valid', activation='relu', subsample_length=1)) model.add(MaxPooling1D(pool_length=3, stride=1)) model.add(Convolution1D(nb_filter=300, filter_length=7, border_mode='valid', activation='relu', subsample_length=1)) model.add(MaxPooling1D(pool_length=3, stride=1)) model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid', activation='relu', subsample_length=1)) model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid', activation='relu', subsample_length=1)) model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid', activation='relu', subsample_length=1)) model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid', activation='relu', subsample_length=1)) model.add(MaxPooling1D(pool_length=3, stride=1)) model.add(Flatten()) model.add(Dense(1024, activation='relu')) model.add(Dropout(.5)) model.add(Dense(1024, activation='relu')) model.add(Dropout(.5)) model.add(Dense(11, activation='sigmoid')) model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['categorical_accuracy']) model.fit(x, y, batch_size=32, nb_epoch=10, validation_data=[test_x, test_y]) print(accuracy_score(np.argwhere(test_y)[:, 1], model.predict_classes(test_x)))
def build_dictionary(): corpus = CorpusIterator(dir_list=dir_list) dictionary = Dictionary(corpus) dictionary.save_as_text( '/home/andre/Develop/corpora/lsamodel_wordids.txt.bz2') dictionary.filter_extremes(no_below=10, no_above=0.1, keep_n=500000) dictionary.save_as_text( '/home/andre/Develop/corpora/lsamodel_wordids_filtered.txt.bz2')
def testFilterKeepTokens_keepn(self):
    # keep_tokens should also work if the keep_n parameter is used, but only
    # to keep a maximum of n tokens (so if keep_n < len(dictionary), tokens not in
    # keep_tokens are still removed to reduce the size to keep_n!)
    d = Dictionary(self.texts)
    # Note: there are four tokens with freq 3, all the others have frequency 2
    # in self.texts. In order to make the test result deterministic, we add
    # 2 tokens of frequency one
    d.add_documents([['worda'], ['wordb']])
    # this should keep the 4 tokens with freq 3 and the one we want to keep
    d.filter_extremes(keep_n=5, no_below=0, no_above=1.0, keep_tokens=['worda'])
    expected = {'graph', 'trees', 'system', 'user', 'worda'}
    self.assertEqual(set(d.token2id.keys()), expected)
def build_corpora(db): dictionary = Dictionary() corpus = [] for article in db.articles.find(): text = article['clean_text'] dictionary.doc2bow(text, allow_update=True) dictionary.filter_extremes() for article in db.articles.find(): text = article['clean_text'] corpus.append(dictionary.doc2bow(text)) gensim.corpora.MmCorpus.serialize('data/corpus.mm', corpus) dictionary.save('data/cnn.dict') return corpus, dictionary
def dbpedia_convgemb(sample=None, n_procs=None): if not n_procs: n_procs = cpu_count() df = get_dbpedia_data(size=sample) if sample: test_size = int(round(np.sum(5000 * df.category.value_counts().values / 45000))) else: test_size = 5000 * 14 split = StratifiedShuffleSplit(df.category, test_size=test_size) train_split, test_split = next(iter(split)) train_df = df.iloc[train_split] test_df = df.iloc[test_split] train_docs = DataframeSentences(train_df, cols=['title', 'abstract'], flatten=True) vocab = Dictionary(train_docs) vocab.filter_extremes(keep_n=5000) bin = LabelBinarizer() x_train = np.array(pad_sentences([[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id] for s in train_docs], max_length=100, padding_word=0)) y_train = bin.fit_transform(train_df.category.values) test_docs = DataframeSentences(test_df, cols=['title', 'abstract'], flatten=True) x_test = np.array(pad_sentences([[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id] for s in test_docs], max_length=100, padding_word=0)) y_test = bin.transform(test_df.category.values) emb_weights = load_w2v_weights(vocab) model = Sequential() model.add(Embedding(5001, 300, input_length=100, dropout=.2, weights=[emb_weights], trainable=False)) model.add(Convolution1D(nb_filter=50, filter_length=3, border_mode='valid', activation='relu', subsample_length=1)) model.add(MaxPooling1D(pool_length=model.output_shape[1])) model.add(Flatten()) model.add(Dense(100, activation='relu')) model.add(Dropout(.2)) model.add(Dense(14, activation='sigmoid')) model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) model.fit(x_train, y_train) print(accuracy_score(np.argwhere(y_test)[:,1], model.predict_classes(x_test)))
def prep_corpus(docs, additional_stopwords=set(), no_below=5, no_above=0.5): print('Building dictionary...') dictionary = Dictionary(docs) stopwords = nltk_stopwords().union(additional_stopwords) stopword_ids = map(dictionary.token2id.get, stopwords) dictionary.filter_tokens(stopword_ids) dictionary.compactify() dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None) dictionary.compactify() print('Building corpus...') corpus = [dictionary.doc2bow(doc) for doc in docs] return dictionary, corpus
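A hypothetical call of `prep_corpus` above, assuming the nltk stopword list is available; the documents and thresholds are illustrative.

docs = [["the", "quick", "brown", "fox"],
        ["the", "lazy", "dog"],
        ["quick", "brown", "dogs"]]
dictionary, corpus = prep_corpus(docs, additional_stopwords={"fox"}, no_below=1, no_above=1.0)
print(len(dictionary))  # vocabulary size after stopword and frequency filtering
print(corpus[0])        # first document as (token_id, count) pairs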
def main(): global dictionary try: dictionary = Dictionary.load_from_text("persist/reuters_dictionary.txt") #dictionary = Dictionary.load_from_text("persist/wiki_stem-False_keep-100000_nobelow-20_noabove-0.1_wordids.txt.bz2") except: dictionary = Dictionary(ReutersCorpus()) dictionary.filter_extremes() dictionary.save_as_text("persist/reuters_dictionary.txt") models = train_models() if settings["models"]["bow"]: bowmodel = BOWmodel() bowmodel.__out_size = len(dictionary) models["bow"] = bowmodel if settings["models"]["noise"]: noisemodel = NoiseModel(1000) noisemodel.__out_size = 1000 models["noise"] = noisemodel num_train_samples = 21578 - settings["held_out_docs"] test_samples = [] class generate_train_samples(object): first_iteration = True def __iter__(self): count = 0 for document in stream_reuters_documents(): sample = document["content"], "acq" in document["topics"] # todo: maybe try "usa" or "earn" if count > num_train_samples: if self.first_iteration: test_samples.append(sample) else: yield sample count += 1 self.first_iteration = False classifiers = train_classifiers(models, generate_train_samples()) classifications = run_evaluation(classifiers, models, test_samples) #output_results(classifications) return classifications
def create(pathtomapping, pathtocorpus, corpusname, window, numtokeep=50000, save_raw=True, shifts=(1, 5, 10)):
    """
    Creates a Shifted Positive Pointwise Mutual Information matrix.

    :param pathtomapping: The path to the id2word mapping. If this is left empty, the id2word mapping gets
     recreated. Warning: this takes a long time.
    :param pathtocorpus: The path to the corpus folder. The corpus can be spread out over multiple files or folders,
     and is read iteratively.
    :param corpusname: The name of the corpus. Used for saving the files.
    :param window: The window used to consider co-occurrences.
    :param numtokeep: The number of most frequent words to keep. Note that the matrix is non-sparse.
     Because of this, the memory requirements of the code are quadratic.
    :param save_raw: Whether to save the raw co-occurrence matrix as a numpy matrix.
    :param shifts: The shifts to apply to the co-occurrence matrix. Each shifted matrix gets saved as a separate model.
    """
    start = time.time()

    if not pathtomapping:
        id2word = Dictionary(SentenceIter(pathtocorpus), prune_at=None)
        id2word.filter_extremes(no_below=5, keep_n=numtokeep)
        id2word.compactify()
        logger.info("Creating the id2word mapping took {0} seconds".format(time.time() - start))
    else:
        id2word = Dictionary.load(pathtomapping)

    inter = time.time()

    word2id = gensim.utils.revdict(id2word)

    corpus = SentenceIter(pathtocorpus)
    raw = get_cooccur(corpus, word2id, window=window)

    logger.info("Creating raw co-occurrence matrix took {0} seconds".format(time.time() - inter))

    if save_raw:
        np.save('{0}-cooccur.npy'.format(corpusname), raw)

    SPPMIFactory._save_word2id(word2id, "{0}mapping.json".format(corpusname))
    SPPMIFactory._save_freqs(id2word, "{0}freqs.json".format(corpusname))

    raw = SPPMIFactory.raw2pmi(raw)

    for k in shifts:
        sparse = SPPMIFactory.shift_clip_pmi(np.copy(raw), k_shift=k)
        SPPMIFactory._save_sparse_mtr(sparse, "{0}-SPPMI-sparse-{1}-shift.npz".format(corpusname, k))
        del sparse
def train_lda_model(articles, num_topics=10): docs = [article_to_bow(a) for a in articles] dict = Dictionary(docs) dict.filter_extremes() dict.compactify() corpus = [dict.doc2bow(article_to_bow(a)) for a in articles] tfidf = TfidfModel(corpus=corpus, id2word=dict) w_corpus = [tfidf[doc] for doc in corpus] lda = LdaModel(corpus=w_corpus, num_topics=num_topics, update_every=0, passes=20, id2word=dict) return lda, tfidf, dict
def prepare_data(): # returns the corpus object required by learn # skips datasets/dspace/2481.json base = 'datasets/dspace' documents = [] for filename in tqdm(os.listdir(base)): path = os.path.join(base, filename) with open(path) as f: d = json.load(f) abstract = d['abstract'] if abstract is not None: words = tokenize(abstract.split()) documents.append(words) dictionary = Dictionary(documents) dictionary.filter_extremes(no_below=5, no_above=0.3) dictionary.save('lda.dict') corpus = map(dictionary.doc2bow, documents) return corpus
class DictionaryLearner(object):
    '''Learn a gensim dictionary from all available documents.'''

    def __init__(self, n=4):
        '''Initialize a DictionaryLearner instance using a vocabulary of ngrams of size `n`.'''
        self._ngram = NgramTransformer(n)
        self._dictionary = Dictionary()

    def fit(self, documentstorage, filter_extremes=True):
        '''Fit a dictionary using documents from the given documentstorage.'''
        for document in documentstorage.load_iterator(u''):
            text_document = document.text
            ngrams = self._ngram.transform([text_document])
            self._dictionary.add_documents(ngrams)
        if filter_extremes:
            self._dictionary.filter_extremes()

    def get(self):
        return self._dictionary
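A minimal sketch of the incremental add_documents/filter_extremes pattern the class above relies on; `NgramTransformer` and the document storage are project-specific, so plain token lists stand in here.

from gensim.corpora import Dictionary

dictionary = Dictionary()
for batch in ([["new", "york", "city"]], [["new", "year"]]):
    dictionary.add_documents(batch)  # grow the vocabulary incrementally
dictionary.filter_extremes(no_below=1, no_above=1.0)
print(dictionary.token2id)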
def produce(self): doc_n = 0 docs = [] doctokens = [] # AKA gensim "text" stopwords = nltk.corpus.stopwords.words('english') NOALPHA = re.compile('[^a-z]+') def prep_string(my_string,pattern = NOALPHA): return re.sub(pattern, ' ', my_string.strip().lower()) print('Getting src docs') for doc in self.src_doc_generator(): content = re.sub(NOALPHA, ' ', doc) # Do this in the corpus generator? docs.append(content) doctokens.append([token for token in nltk.word_tokenize(content) if token not in stopwords]) doc_n += 1 if doc_n % 1000 == 0: print(doc_n) print('Creating the dictionary') dictionary = Dictionary(doctokens) dictionary.compactify() dictionary.filter_extremes(keep_n=None) if self.dictfile: dictionary.save_as_text(self.dictfile+'.dict', sort_by_word=True) with self.dbi as db: print('Creating DOC') db.create_table('doc') for i, doc in enumerate(docs): db.cur.execute('INSERT INTO doc VALUES (?,?)',(i,doc)) print('Creating WORD') db.create_table('word') for item in dictionary.iteritems(): db.cur.execute('INSERT INTO word (word_id, word_str) VALUES (?,?)',item) print('Creating DOCWORD') db.create_table('docword') for i, tokens in enumerate(doctokens): for item in (dictionary.doc2bow(tokens)): db.cur.execute('INSERT INTO docword (doc_id,word_id,word_count) VALUES (?,?,?)',[i,item[0],item[1]])
def _create_from_texts(cls, tokenized_texts, name, dataset, settings, minimum_frequency=2): from gensim.corpora import Dictionary as GensimDictionary # build a dictionary logger.info("Building a dictionary from texts") dictionary = GensimDictionary(tokenized_texts) # Remove extremely rare words logger.info("Dictionary contains %d words. Filtering..." % len(dictionary.token2id)) dictionary.filter_extremes(no_below=minimum_frequency, no_above=0.5, keep_n=None) dictionary.compactify() logger.info("Dictionary contains %d words." % len(dictionary.token2id)) dict_model = cls(name=name, dataset=dataset, settings=settings) dict_model.save() dict_model._populate_from_gensim_dictionary(dictionary) return dict_model
class TfidfVectorizer(): """ Transform text to tf-idf representation """ def __init__(self): self.base_path = os.path.dirname(__file__) self.dictionary_path = os.path.join(self.base_path, "dictionary") self.tf_idf_model_path = os.path.join(self.base_path, "tfidf") self.stemmer = NepStemmer() self.tf_idf_model = None def get_tokens(self, document): if not self.stemmer: raise Exception("Stemmer not available") return self.stemmer.get_stems(document) def construct_model(self, documents): logging.basicConfig( format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO ) logging.info("Obtaining word tokens") tokens = [self.get_tokens(document) for document in documents] # self.tf_idf_model = TfidfModel(tokens) logging.info("Constructing dictionary") self.dictionary = Dictionary(tokens) self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=1000) self.dictionary.compactify() self.dictionary.save(self.dictionary_path) logging.info("Constructing TF-IDF model") self.tf_idf_model = TfidfModel(dictionary=self.dictionary) self.tf_idf_model.save(self.tf_idf_model_path) def load_data(self): if not self.tf_idf_model: if not os.path.exists(self.tf_idf_model_path): raise Exception('TF-IDF model file not found') self.dictionary = Dictionary.load(self.dictionary_path) self.tf_idf_model = TfidfModel.load(self.tf_idf_model_path) def doc2vector(self, document): """ Returns the sparse tf-idf vector for given document """ tokens = self.get_tokens(document) bag_of_words = self.dictionary.doc2bow(tokens) return (self.tf_idf_model[bag_of_words]) def obtain_feature_vector(self, document): """ Returns a single dense tf-idf vector for a given document """ self.load_data() tf_idf_vector = matutils.sparse2full( self.doc2vector(document), self.no_of_features ).reshape(1, -1) return tf_idf_vector def obtain_feature_matrix(self, documents): """ Returns the tf-idf dense matrix for the given documents """ self.load_data() input_matrix_sparse = [ self.doc2vector(x) for x in documents ] no_of_features = len(self.tf_idf_model.idfs) input_matrix = matutils.corpus2dense( input_matrix_sparse, no_of_features ).transpose() return input_matrix
def tokenize(text):
    return [token for token in simple_preprocess(text) if token not in stop_words]

def iter_wiki(dump_file):
    """Yield each article from the Wikipedia dump, as a `(title, tokens)` 2-tuple."""
    ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    for title, text, pageid in _extract_pages(smart_open(dump_file)):
        text = filter_wiki(text)
        tokens = tokenize(text)
        if len(tokens) < 50 or any(title.startswith(ns + ':') for ns in ignore_namespaces):
            continue  # ignore short articles and various meta-articles
        yield title, tokens

wiki_stream = (tokens for _, tokens in iter_wiki('enwiki-latest-pages-articles.xml.bz2'))

print("making of dictionary started")
wiki_dictionary = Dictionary(wiki_stream)
print("wikipedia dictionary made")

wiki_dictionary.filter_extremes(no_below=10, no_above=0.3, keep_n=200000)
print("...... saving the dictionary")
wiki_dictionary.save('WikiDictionary200k.dict')
print("dictionary saved ........")

# wiki = WikiCorpus('enwiki-latest-pages-articles.xml.bz2')  # make a corpus from wiki dump
# MmCorpus.save_corpus('WikiCorpus.mm', wiki)  # Saving the corpus
if opts.scaling == 'tfidf':
    scaling = 'tfidf'
elif not opts.scaling:
    scaling = None
else:
    raise ValueError("Only tfidf scaling is supported")

word_model = opts.word_model

if word_model:
    logging.info("Building word model")
    corpus = LimitCorpus(WikiCorpus(dump_fn, dictionary=Dictionary()), word_limit)
else:
    corpus = SublexicalizedCorpus(WikiCorpus(dump_fn, dictionary=Dictionary()), order=order, word_limit=word_limit)

voc = Dictionary(corpus)
voc.filter_extremes(no_below=cutoff)
voc.compactify()

bow_corpus = (voc.doc2bow(art) for art in corpus)

tfidf = None
if scaling == 'tfidf':
    tfidf = TfidfModel(bow_corpus)
    bow_corpus = (tfidf[voc.doc2bow(art)] for art in corpus)

model = LsiModel(corpus=bow_corpus, num_topics=num_topics, id2word=voc)
model.save(model_fn)

if tfidf:
    tfidf.save(model_fn + '.tfidf')
class MyCorpus(object): ''' Corpus class for streaming review documents ''' def __init__(self, file_list, file_dir, dictionary = None, mindf = MINDF, maxdf = MAXDF, \ maxwords = MAXWORDS, cluster_words = CLUSTER_WORDS, cluster_ul = CLUSTER_UL): self.file_list = file_list # list of cuisine text files self.file_dir = file_dir # directory of cuisine text files self.maxwords = maxwords # maximum number of words to keep after building dictionary from clusters self.cluster_words = cluster_words # maximum number of words to keep from each cluster self.cluster_ul = cluster_ul # upper proportion of reviews to limit for cluster processing self.mindf = mindf # minimum number of documents to keep word self.maxdf = maxdf # max proportion of documents to keep word self.agglomerate = True # return clusters as single documents (True) or return single reviews (False) if dictionary: self.dictionary = dictionary else: self.dictionary = Dictionary() self._build_dict() def __str__(self): return "<MyCorpus at " + str(hex(id(self))) + ">" def __repr__(self): return self.__str__() def _build_dict(self): for filename in self.file_list: dictionary = dict() num_reviews = 0 with open(os.path.join(self.file_dir, filename), "rt") as f: for line in f: num_reviews += 1 words = line[REVIEW_INDEX:].split() for word in set(words): if word not in dictionary: dictionary[word] = 1 else: dictionary[word] += 1 doc = [item for item in dictionary.items() if dictionary[item[0]] > 2 and dictionary[item[0]] / num_reviews < self.cluster_ul] doc.sort(key = lambda x: -x[1]) doc = [word for word, f in doc] self.dictionary.add_documents([doc[:self.cluster_words]]) print("%s added to corpus dictionary!" % (filename,)) self.dictionary.filter_extremes(self.mindf, self.maxdf, self.maxwords) self.dictionary.save("cuisine_dictionary.gensimDict") def __iter__(self): ''' Iterates through cuisines by combining all reviews for each cuisine into a single processed document. Also stores the length of each processed document ''' if self.agglomerate: for filename in self.file_list: with open(os.path.join(self.file_dir, filename), "rt") as f: doc = " ".join([line[REVIEW_INDEX:].rstrip() for line in f]) yield self.dictionary.doc2bow(doc.split()) else: reviewIDs = set() for filename in self.file_list: with open(os.path.join(self.file_dir, filename), "rt") as f: for line in f: id = line[:RATING_INDEX - 1] if id not in reviewIDs: reviewIDs.update([id]) doc = line[REVIEW_INDEX:].rstrip() yield self.dictionary.doc2bow(doc.split())
def testFilterKeepTokens_keepTokens(self): # provide keep_tokens argument, keep the tokens given d = Dictionary(self.texts) d.filter_extremes(no_below=3, no_above=1.0, keep_tokens=['human', 'survey']) expected = set(['graph', 'trees', 'human', 'system', 'user', 'survey']) self.assertEqual(set(d.token2id.keys()), expected)
def testFilterKeepTokens_unseenToken(self): # do provide keep_tokens argument with unseen tokens, filter_extremes functionality is unchanged d = Dictionary(self.texts) d.filter_extremes(no_below=3, no_above=1.0, keep_tokens=['unknown_token']) expected = set(['graph', 'trees', 'system', 'user']) self.assertEqual(set(d.token2id.keys()), expected)
def main(): parser = ArgumentParser( description="wrapper script for churning datasets of wiki or elasticsearch kind through gensim to produce topic models please see gensim documentation for more information" ) parser.add_argument("-ds", "--dataset", default="wiki", help="What kind of dataset to use. (wiki,es,file)") parser.add_argument("-d", "--dump-file", help="Wiki: bz2 dump file with wiki in it") parser.add_argument("-l", "--limit", help="Wiki: How many documents to extract from wiki") parser.add_argument("--model-id", default="model", help="Filename for created model.") parser.add_argument("--model-type", default="lsi", help="Model type (lsi, lda, word2vec, hdp, vocabulary).") parser.add_argument("--n-topics", default=10, help="Number of topics to model.") parser.add_argument("--n-passes", default=1, help="Number of passes for LDA model.") parser.add_argument("--w2v-size", default=100, help="size of Word2Vec context.") parser.add_argument("--w2v-window", default=5, help="window for Word2Vec.") parser.add_argument("-q", "--query", default=None, help="Elasticsearch: Query to use to fetch documents") parser.add_argument("--index", help="Elasticsearch: index to read from.") parser.add_argument("--doc_type", default="doc", help="Elasticsearch: data type in index.") parser.add_argument("--data-dir", help="Directory to save the generated models and vocabularies into.") parser.add_argument("--vocab", help="Prebuilt Vocabulary file. Use this to avoid having to generate one.") opts = parser.parse_args() model_type = opts.model_type.lower() if model_type not in ["lsi", "lda", "word2vec", "hdp", "vocabulary"]: logging.error("Invalid model type %s" % model_type) parser.print_usage() exit(-1) logging.info("Using model type %s" % model_type) dump_fn = opts.dump_file limit = int(opts.limit) if opts.limit else None data_type = opts.dataset.lower() if data_type not in ["es", "wiki", "file"]: logging.error("Invalid dataset type %s" % data_type) parser.print_usage() exit(-1) limit = None if opts.limit: limit = int(opts.limit) if not dump_fn and data_type in ["wiki"]: logging.error("--dump-file required for wiki dataset") sys.exit(1) query = opts.query index = opts.index doc_type = opts.doc_type if data_type == "es" and index is None: logging.error( "Please be kind to at least specify the index you want to fetch from elasticsearch using the --index parameter" ) sys.exit(1) n_topics = int(opts.n_topics) n_passes = int(opts.n_passes) logging.info("Using %d topics." % n_topics) data_dir = opts.data_dir model_id = opts.model_id model_fn = "%s_%s_%d" % (model_id, model_type, n_topics) if data_dir: model_fn = "%s/%s" % (data_dir, model_fn) if model_type == "word2vec": w2v_size = int(opts.w2v_size) w2v_window = int(opts.w2v_window) model_fn = "%s_w_%s_s_%s" % (model_fn, w2v_window, w2v_size) logging.info("Writing models to %s." 
% model_fn) if data_type == "es": logging.info("Using data type %s with index %s, doc_type %s query %s" % (data_type, index, doc_type, query)) dataset = ElasticsearchDataset( read_index=index, read_doc_type=doc_type, query=query, normalize_func=normalize_es ) elif data_type == "wiki": logging.info("Using data type %s with dump_file %s and limit %s" % (data_type, dump_fn, limit)) dataset = WikipediaDataset(dump_fn=dump_fn, num_articles=limit, normalize_func=normalize_wiki) elif data_type == "file": logging.info("Using data type %s with dump_file %s and limit %s" % (data_type, dump_fn, limit)) dataset = FileDataset(dump_fn=dump_fn, num_articles=limit, normalize_func=normalize_file) vocab_file = opts.vocab vocab = Dictionary() sw = set(stopwords.words("norwegian")) if not vocab_file or model_type == "vocabulary": vocab.add_documents([get_tokenized(page, sw) for page in dataset]) vocab.filter_extremes() vocab.compactify() vocab.save(model_fn + ".vocab") else: vocab = Dictionary.load(vocab_file) if model_type == "vocabulary": return tfidf = TfidfModel(dictionary=vocab) if model_type == "lsi": corpus = IterableDataset(dataset, sw, vocab) model = LsiModel(corpus=tfidf[corpus], num_topics=n_topics, id2word=vocab) elif model_type == "lda": corpus = IterableDataset(dataset, sw, vocab) model = LdaModel(corpus=tfidf[corpus], num_topics=n_topics, passes=n_passes, id2word=vocab) elif model_type == "word2vec": corpus = IterableDataset(dataset, sw, vocab, doc2bow=False) corpus.dictionary = vocab model = Word2Vec(sentences=corpus, window=w2v_window, size=w2v_size) elif model_type == "hdp": corpus = IterableDataset(dataset, sw, vocab) model = HdpModel(corpus=tfidf[corpus], id2word=vocab) logging.info(model) model.save(model_fn)
def testFilter(self): d = Dictionary(self.texts) d.filter_extremes(no_below=2, no_above=1.0, keep_n=4) expected = {0: 3, 1: 3, 2: 3, 3: 3} self.assertEqual(d.dfs, expected)
def testFilterKeepTokens_unchangedFunctionality(self): # do not provide keep_tokens argument, filter_extremes functionality is unchanged d = Dictionary(self.texts) d.filter_extremes(no_below=3, no_above=1.0) expected = {'graph', 'trees', 'system', 'user'} self.assertEqual(set(d.token2id.keys()), expected)
def dbpedia_smallwordconv(sample=None, n_procs=None): if not n_procs: n_procs = cpu_count() df = get_dbpedia_data(size=sample) if sample: test_size = int(round(np.sum(5000 * df.category.value_counts().values / 45000))) else: test_size = 5000 * 14 logging.info('creating train test split ...') split = StratifiedShuffleSplit(df.category, test_size=test_size) train_split, test_split = next(iter(split)) train_df = df.iloc[train_split] test_df = df.iloc[test_split] logging.info('preprocessing, padding and binarizing data ...') train_docs = DataframeSentences(train_df, cols=['title', 'abstract'], flatten=True) vocab = Dictionary(train_docs) vocab.filter_extremes(keep_n=5000) bin = LabelBinarizer() x_train = np.array(pad_sentences([[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id] for s in train_docs], max_length=100, padding_word=0)) y_train = bin.fit_transform(train_df.category.values) test_docs = DataframeSentences(test_df, cols=['title', 'abstract'], flatten=True) x_test = np.array(pad_sentences([[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id] for s in test_docs], max_length=100, padding_word=0)) y_test = bin.transform(test_df.category.values) logging.info('building model ...') model = Sequential() model.add(Embedding(5001, 300, input_length=100)) model.add(Convolution1D(nb_filter=300, filter_length=7, border_mode='valid', activation='relu', subsample_length=1)) model.add(MaxPooling1D(pool_length=3, stride=1)) model.add(Convolution1D(nb_filter=300, filter_length=7, border_mode='valid', activation='relu', subsample_length=1)) model.add(MaxPooling1D(pool_length=3, stride=1)) model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid', activation='relu', subsample_length=1)) model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid', activation='relu', subsample_length=1)) model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid', activation='relu', subsample_length=1)) model.add(Convolution1D(nb_filter=300, filter_length=3, border_mode='valid', activation='relu', subsample_length=1)) model.add(MaxPooling1D(pool_length=3, stride=1)) model.add(Flatten()) model.add(Dense(1024, activation='relu')) model.add(Dropout(.5)) model.add(Dense(1024, activation='relu')) model.add(Dropout(.5)) model.add(Dense(14, activation='sigmoid')) model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['categorical_accuracy']) model.fit(x_train, y_train, batch_size=32, nb_epoch=5, validation_data=[x_test, y_test]) print(accuracy_score(np.argwhere(y_test)[:,1], model.predict_classes(x_test)))
def calculate_lda(dataset_raw, n_topics=10, lda_model_name="", mallet=True, mallet_path="/Users/verasazonova/no-backup/JARS/mallet-2.0.7/bin/mallet", dataname="none"): with open(dataname+"_log.txt", 'a') as fout: if dataset_raw.include_date: dates = [text[1] for text in dataset_raw] dataset = [normalize_words(text[0].split(), dataset_raw.stoplist) for text in dataset_raw] else: dates = ["" for _ in dataset_raw] dataset = dataset_raw bi_grams = Phrases(dataset, threshold=3) dataset = bi_grams[dataset] dictionary = Dictionary(dataset) dictionary.filter_extremes(no_below=1, no_above=0.9) bow_corpus = [dictionary.doc2bow(text) for text in dataset] fout.write("# Topics: %s\n" % n_topics) if not os.path.isfile(lda_model_name): if mallet: lda_model = LdaMallet(mallet_path, corpus=bow_corpus, num_topics=n_topics, id2word=dictionary, workers=4, optimize_interval=10, iterations=1000) lda_model_name = "lda_model_mallet_%s_%i" % (dataname, n_topics) else: lda_model = LdaModel(bow_corpus, id2word=dictionary, num_topics=n_topics, distributed=False, chunksize=2000, passes=5, update_every=10, alpha='asymmetric', eta=0.1, decay=0.5, eval_every=10, iterations=1000, gamma_threshold=0.001) lda_model_name = "lda_model_%s_%i" % (dataname, n_topics) lda_model.save(lda_model_name) else: if mallet: lda_model = LdaMallet.load(lda_model_name) else: lda_model = LdaModel.load(lda_model_name) topic_definition = [] for i, topic in enumerate(lda_model.show_topics(n_topics, num_words=20, formatted=False)): fout.write("%i \n" % i) topic_list = [] freq_list = [] a_list = [] for tup in topic: topic_list.append(tup[1]) freq_list.append(dictionary.dfs[ dictionary.token2id[tup[1]] ] ) a_list.append(tup[0]) fout.write( "%s\n\n" % repr((sorted(zip(topic_list, freq_list), key=itemgetter(1) )))) topic_definition.append("%i, %s" %(i, repr(" ".join(sorted(topic_list)))[2:-1])) fout.write("Total number of documents: %i\n" % dictionary.num_docs ) earliest_date = dateutil.parser.parse("Sun Jun 08 00:00:00 +0000 2014") a = [tup for tup in sorted(zip(bow_corpus, dates), key=get_date ) if dateutil.parser.parse(tup[1]) > earliest_date] print len(a) print a[len(a)-1] latest_date = dateutil.parser.parse(a[len(a)-1][1]) num_bins = 100 time_span = latest_date - earliest_date print time_span time_bin = time_span / num_bins print time_bin bin_lows = [earliest_date] bin_high = earliest_date + time_bin counts = [[0 for _ in range(n_topics)] for _ in range(num_bins+1)] i=0 for text in a: topic_assignments = lda_model[text[0]] date_str = text[1] if date_str is not None: cur_date = dateutil.parser.parse(date_str) if cur_date >= bin_high: i+=1 bin_lows.append(bin_high) bin_high = bin_lows[len(bin_lows)-1] + time_bin #counts[i][max(topic_assignments, key=itemgetter(1))[0]] += 1 for tup in topic_assignments: counts[i][tup[0]] += tup[1] fout.write("Number of documents assigned mostly to the topic: \n") fout.write("%s\n" % counts) a = 1.*np.array(counts) np.savetxt("mpeketoni_cnts.txt", a) with open("mpeketoni_bins.txt", 'w') as fout: for date in bin_lows: fout.write("%s\n" % date) with open("mpeketoni_labels.txt", 'w') as fout: for label in topic_definition: fout.write("%s\n" % label) return a, bin_lows, topic_definition