class process_corpus(object):
    def __init__(self, sql=None, lemmatize=False, first_sentences=False, n_sentences=10):
        self.sql = sql
        self.first_sentences = first_sentences
        self.n_sentences = n_sentences
        self.wordnet = WordNetLemmatizer()
        self.pstemmer = PorterStemmer()
        self.lemmatize = lemmatize
        self.dictionary = Dictionary(self.iterrecords())
        print('dictionary before:', self.dictionary.token2id)
        once_ids = [tokenid for tokenid, docfreq in self.dictionary.dfs.items()
                    if docfreq == 1]
        self.dictionary.filter_tokens(once_ids)
        self.dictionary.compactify()
        print('dictionary after filtering:', self.dictionary.token2id)

    def __iter__(self):
        # generate the document tokens and create a bow vector using the dictionary
        self.cl = 0
        for tokens in self.iterrecords():
            self.cl += 1
            yield self.dictionary.doc2bow(tokens)

    def iterrecords(self):
        # generate document tokens for the dictionary
        self.index = []
        cursor.execute(self.sql)
        ct = 0
        for doc in cursor:
            print(ct)
            self.index.append(str(doc[0]).strip())
            doc = doc[1]
            # print(to_beautiful(doc[1]))
            if self.first_sentences:
                doc = get_first_n_sentences_from_document(doc, self.n_sentences)
            tokens = clean_text_by_word(doc)
            ct += 1
            yield tokens  # or whatever tokenization suits you

    def __len__(self):
        return self.cl
def build_tag_vectors(tag_directory_path):
    """Loads tag files and builds a sparse vector for each song.

    Parameters
    ----------
    tag_directory_path : str
        Path of the directory containing the tag files.

    Returns
    -------
    id_vec_mapping : dict
        Maps song id => list[tuple(tagId, count)].
    dictionary : gensim Dictionary
        Dictionary containing all tags and their ids.
    """
    dictionary = Dictionary()
    for f in listdir(tag_directory_path):
        with open(tag_directory_path + "/" + f, 'r') as tags:
            tokens = tags.read().split(sep=' ')
            dictionary.add_documents([tokens])
    dictionary.filter_extremes(no_below=2, no_above=0.5)
    dictionary.compactify()

    id_vec_mapping = {}
    for f in listdir(tag_directory_path):
        song_id = f[0:-4]
        with open(tag_directory_path + "/" + f, 'r') as tags:
            tokens = tags.read().split(sep=' ')
            sparse_vec = dictionary.doc2bow(tokens)
            add_to_dictionary(id_vec_mapping, (song_id, sparse_vec))

    return id_vec_mapping, dictionary
def _create_from_texts(cls, tokenized_texts, name, dataset, settings, minimum_frequency=2):
    from gensim.corpora import Dictionary as GensimDictionary

    # build a dictionary
    logger.info("Building a dictionary from texts")
    dictionary = GensimDictionary(tokenized_texts)

    # remove extremely rare words
    logger.info("Dictionary contains %d words. Filtering..." % len(dictionary.token2id))
    dictionary.filter_extremes(no_below=minimum_frequency, no_above=1, keep_n=None)
    dictionary.compactify()
    logger.info("Dictionary contains %d words." % len(dictionary.token2id))

    dict_model = cls(name=name, dataset=dataset, settings=settings)
    dict_model.save()

    dict_model._populate_from_gensim_dictionary(dictionary)

    return dict_model
def buildDictionary(self, corpus, txt2tokens, opts):
    '''
    Tokenize texts and add tokens to the dictionary.
    :param corpus: Corpus-like or id
    :param txt2tokens: txt2tokens or id
    :param opts: GensimDictBuildOptions
    :return: gensim Dictionary
    '''
    t = clock()
    corpus, txt2tokens = self.resolve(corpus, txt2tokens)

    # fill the dictionary with tokens from corpus texts
    dictionary = Dictionary(documents=None)
    numDocs = 0
    numTokens = 0
    for txto in corpus:
        tokens = txt2tokens(txto.text)
        numDocs += 1
        numTokens += len(tokens)
        dictionary.doc2bow(tokens, allow_update=True)

    # form filtering options and run filtering
    no_below = opts.docLowerLimit if opts.docLowerLimit is not None else 0
    if opts.docUpperLimit is None:
        no_above = 1.0
    elif isinstance(opts.docUpperLimit, float):
        no_above = opts.docUpperLimit
    else:
        no_above = opts.docUpperLimit / float(numDocs)
    keep_n = numTokens if opts.words2keep is None else opts.words2keep
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    dictionary.compactify()

    # force building of the id2token map
    someId = next(iter(dictionary.token2id.values()))
    dictionary[someId]

    return GensimDictAdapter(dictionary, corpus.id, txt2tokens.id, opts)
def fit(self, corpus):
    self._verify_corpus(corpus)
    self.N = len(corpus)

    tokens = self.preprocessor.transform(corpus)
    self.observed_tokens = tokens.apply(len).sum()

    vocab = Dictionary(tokens)
    vocab.filter_extremes(no_above=self.max_df, no_below=self.min_df,
                          keep_n=self.vocab_size)
    vocab.compactify()
    self.vocab = vocab

    self.corpus_as_tokens = tokens
    self.corpus_as_bow = [self.vocab.doc2bow(doc) for doc in tokens]
    self.corpus_as_csr = corpus2csc(self.corpus_as_bow,
                                    num_terms=len(self.vocab)).T

    self.lengths = [len(d) for d in self.corpus_as_bow]
    self.num_empty_docs = self.lengths.count(0)

    time_now = time.localtime()
    self.created_on = time.strftime("%d %b %Y %H:%M:%S", time_now)

    return self
def tf_idf_weight(spacy_contexts):
    """
    @param spacy_contexts: spaCy-processed contexts

    Returns a list of dicts; each dict corresponds to one document and maps
    its words to their tf-idf weights.
    """
    docs_dict = Dictionary(spacy_contexts)
    docs_dict.compactify()

    docs_corpus = [docs_dict.doc2bow(doc) for doc in spacy_contexts]
    model_tfidf = TfidfModel(docs_corpus, id2word=docs_dict)
    docs_tfidf = model_tfidf[docs_corpus]

    # Generate a list of dicts with k,v = "word": tfidf_frequency;
    # each dict contains the words from one document (sentence).
    doc_tfidf_dicts = []
    for doc in docs_tfidf:
        d = dict()
        for term, freq in doc:
            d[docs_dict[term]] = freq
        doc_tfidf_dicts.append(d)

    return doc_tfidf_dicts
def doc_embed_charity_notfidf(processed_docs, word_min=5, word_max_perc=.8):
    '''Takes a list of preprocessed texts and returns an embedding vector for each
    document, a dictionary of the words within the corpus, and the GloVe vectors
    for each word in the corpus.'''

    # Create dictionary from corpus
    docs_dict = Dictionary(processed_docs)
    docs_dict.filter_extremes(no_below=word_min, no_above=word_max_perc)
    docs_dict.compactify()

    # Convert docs into a sparse matrix (N_docs x N_words in dictionary) where the
    # number in each cell is how many times that word appeared in that document.
    docs_corpus = [docs_dict.doc2bow(doc) for doc in processed_docs]
    docs_vecs = np.vstack([sparse2full(c, len(docs_dict)) for c in docs_corpus])

    # Count the number of documents and the number of words in the dictionary
    num_docs = np.shape(docs_vecs)[0]
    num_words = np.shape(docs_vecs)[1]
    print("Total # of docs: {}".format(num_docs))
    print("Total # of words in dict: {}".format(num_words))

    # For each word in the dictionary, extract its embedding (GloVe) vector
    glove_vecs = np.vstack([nlp(docs_dict[i]).vector for i in range(len(docs_dict))])

    # Sum GloVe vectors over the words in each doc
    docs_emb = np.dot(docs_vecs, glove_vecs)

    return docs_emb, docs_dict, glove_vecs
def get_corpus_dict(self, recalculate=False, from_scratch=True):
    if not os.path.isfile(self.paths.trigram_dictionary_filepath) or recalculate:
        if not from_scratch:
            raise ValueError(
                'No corpus Dictionary file exists but from_scratch is False')

        print('Building trigram dict...')
        trigram_docs = LineSentence(self.paths.trigram_corpus_filepath)

        # learn the dictionary by iterating over all of the docs
        trigram_dictionary = Dictionary(trigram_docs)

        # filter tokens that are very rare or too common from the dictionary
        # (filter_extremes) and reassign integer ids (compactify)
        trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
        trigram_dictionary.compactify()

        trigram_dictionary.save(self.paths.trigram_dictionary_filepath)
        print('Done!')
    else:
        print('Loading trigram dict...')
        trigram_dictionary = Dictionary.load(self.paths.trigram_dictionary_filepath)

    return trigram_dictionary
def _build_vocab(self, max_vocab_cnt):
    all_words = []
    for data in self.valid + self.non_valid:
        all_words.append(data["title"] + data["content"])

    vocab = Dictionary(all_words)
    raw_vocab_size = len(vocab)

    vocab.filter_extremes(no_below=5)
    vocab.filter_extremes(keep_n=max_vocab_cnt)
    # drop single ASCII characters, except "a" and "i"
    len_1_words = list(filter(
        lambda w: len(w) == 1 and re.match(r"[\x00-\x7f]", w) and w not in ["a", "i"],
        vocab.values()))
    vocab.filter_tokens(list(map(vocab.token2id.get, len_1_words)))
    if self.config.use_dict == "seq" and self.config.enable_pad:
        vocab.token2id[PAD] = len(vocab)
    vocab.compactify()
    self.pad_wid = vocab.token2id.get(PAD)
    self.vocab_seq = vocab  # seq dictionary

    # build bow dictionary
    self.vocab_bow = copy.deepcopy(vocab)
    self.vocab_bow.filter_tokens(
        map(self.vocab_bow.token2id.get, STOPWORDS))  # filter stop words
    self.vocab_bow.compactify()
    if self.config.tfidf:
        tfidf_corpus = [self.vocab_bow.doc2bow(line) for line in all_words]
        self.tfidf_model = TfidfModel(tfidf_corpus)

    print("Load corpus with non_valid size %d, valid size %d, "
          "raw vocab size %d seq vocab size %d, bow vocab size %d"
          % (len(self.non_valid), len(self.valid), raw_vocab_size,
             len(self.vocab_seq), len(self.vocab_bow)))
def create_LDA_dict():
    # ONE TIME USE, to create and save the LDA model
    trigram_dictionary_filepath = '../Dataset/trigram_dict_all.dict'
    trigram_reviews = LineSentence('../Dataset/trigram_transformed_reviews_all.txt')

    # learn the dictionary by iterating over all of the reviews
    trigram_dictionary = Dictionary(trigram_reviews)

    # filter tokens that are very rare or too common from the dictionary
    # (filter_extremes) and reassign integer ids (compactify)
    trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()
    trigram_dictionary.save(trigram_dictionary_filepath)
    print('LDA dict saved.')

    trigram_bow_filepath = '../Models/trigram_bow_corpus_all.mm'
    MmCorpus.serialize(
        trigram_bow_filepath,
        trigram_bow_generator('../Dataset/trigram_transformed_reviews_all.txt'))
    trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

    lda_model_filepath = '../Models/lda_model_all'  # lda_model_all_30, lda_model_10topic
    # created LDA models with 10, 30 and 50 topics; found 30 gives the best result
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        lda = LdaMulticore(
            trigram_bow_corpus,
            num_topics=30,  # 10, 30, 50
            id2word=trigram_dictionary,
            workers=8)
    lda.save(lda_model_filepath)
    print('LDA model saved.')
class MiCorpus:
    """
    Iterable: each iteration yields bag-of-words vectors, one per document.
    Processes one document at a time using generators, so the full corpus is
    never loaded into RAM.
    """

    def __init__(self, directorio, lenguaje, otros=None):
        self.directorio = directorio
        self.lenguaje = lenguaje
        self.otros = otros
        self.ngramas = model_ngrams(
            iter_sentences(self.directorio, self.lenguaje, self.otros))
        self.diccionario = Dictionary(
            iter_documents(self.ngramas, self.directorio, self.lenguaje, self.otros))
        self.diccionario.filter_extremes(no_above=0.8)
        self.diccionario.filter_tokens(
            bad_ids=(tokid for tokid, freq in self.diccionario.dfs.items()
                     if freq == 1))
        self.diccionario.compactify()

    def __iter__(self):
        """
        CorpusConsultivos is a streamed iterable.
        """
        for tokens in iter_documents(self.ngramas, self.directorio,
                                     self.lenguaje, self.otros):
            yield self.diccionario.doc2bow(tokens)
def prepare_corpus(tweets_file, corpus_file, dictionary_file, author_topic):
    stop_words = set(stopwords.words('english'))
    stop_words.add(u'rt')

    print('Loading tweets from ' + tweets_file)
    tweets = pd.read_pickle(tweets_file)
    if author_topic:
        tweets = tweets.groupby('user').agg({'text': 'sum'})
    print('%d tweets loaded' % len(tweets.index))

    dictionary = Dictionary(tweets['text'])
    stopword_ids = map(dictionary.token2id.get, stop_words)
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=None)
    dictionary.compactify()

    corpus = [dictionary.doc2bow(doc) for doc in tweets['text']]
    # print(corpus)

    print("Writing corpus to " + corpus_file)
    MmCorpus.serialize(corpus_file, corpus)

    # print(dictionary)
    print("Writing dictionary to " + dictionary_file)
    dictionary.save(dictionary_file)
def parse_processed_amazon_dataset(task_files, max_words=10000):
    """
    Code inspired by:
    https://github.com/sclincha/xrce_msda_da_regularization
    """
    datasets = {}
    dico = GensimDict()

    print("Parsing", task_files)
    # First pass over the documents to build the dictionary
    for fname in task_files:
        with open(fname, 'r') as f:
            for l in f:
                tokens = l.split(' ')
                tokens_list = []
                for tok in tokens[:-1]:
                    ts, tfreq = tok.split(':')
                    freq = int(tfreq)
                    tokens_list += [ts] * freq
                dico.doc2bow(tokens_list, allow_update=True)

    # Preprocessing options
    dico.filter_extremes(no_below=2, keep_n=max_words)
    dico.compactify()

    for fname in task_files:
        X, Y = [], []
        with open(fname, 'r') as f:
            for docid, l in enumerate(f):
                tokens = l.split(' ')
                label_string = tokens[-1]
                tokens_list = []
                for tok in tokens[:-1]:
                    ts, tfreq = tok.split(':')
                    freq = int(tfreq)
                    tokens_list += [ts] * freq
                count_list = dico.doc2bow(tokens_list, allow_update=False)

                idx, freqs = list(zip(*count_list))
                one_hot = np.zeros(max_words)
                one_hot[list(idx)] = np.array(freqs)
                X.append((docid, one_hot))

                # Preprocess the label
                ls, lvalue = label_string.split(':')
                if ls == "#label#":
                    if lvalue.rstrip() == 'positive':
                        Y.append(1)
                    elif lvalue.rstrip() == 'negative':
                        Y.append(0)
                    else:
                        raise Exception("Invalid Label Value")
                else:
                    raise Exception('Invalid Format')

        datasets[os.path.split(os.path.split(fname)[0])[-1]] = (X, Y)

    return datasets, dico
def _create_from_texts(cls, tokenized_texts, name, dataset, settings, minimum_frequency=2):
    from gensim.corpora import Dictionary as GensimDictionary

    # build a dictionary of features
    logger.info("Creating features (including n-grams) from texts")
    gensim_dictionary = GensimDictionary(tokenized_texts)

    # remove extremely rare features
    logger.info("Features dictionary contains %d features. Filtering..."
                % len(gensim_dictionary.token2id))
    gensim_dictionary.filter_extremes(no_below=minimum_frequency, no_above=1, keep_n=None)
    gensim_dictionary.compactify()
    logger.info("Features dictionary contains %d features."
                % len(gensim_dictionary.token2id))

    dict_model = cls(name=name, dataset=dataset, settings=settings)
    dict_model.save()

    dict_model._populate_from_gensim_dictionary(gensim_dictionary)

    return dict_model
def dic_tr(clean_revs_file):
    tri_rv = LineSentence(clean_revs_file)
    tri_dict = Dictionary(tri_rv)
    tri_dict.filter_extremes(no_below=5, no_above=0.3)
    tri_dict.compactify()
    tri_dict.save(trigram_dict_path)
def fetch_dict():
    global dictionary
    dictionary = Dictionary([i for i in my_dictionary])
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items()
                if docfreq == 1]
    dictionary.filter_tokens(once_ids)
    dictionary.compactify()
    dictionary.save("Topic/dic.loc")
    return dictionary
def build_vocabulary_and_corpus():
    '''
    Build the vocabulary and the stem sequences for each type of entity.
    '''
    # Vocabulary (shared by questions and answers).
    v = Dictionary()

    # Stemmer.
    stemmer = PorterStemmer()

    # Tokenizer.
    tokenizer = TweetTokenizer()

    # Read indexes.
    user_index, question_index, answer_index, comment_index = read_indexes()

    # Question and answer corpora.
    q = {}
    a = {}

    # Read entities.
    with open(entity_path, 'rb') as obj:
        entities = pickle.load(obj)

    # Browse questions and answers to build the vocabulary first.
    for e in entities:
        # Question or answer.
        if e['type'] == 'Q' or e['type'] == 'A':
            # String content.
            title = str(e['title']).lower()
            content = str(e['content']).lower()

            # Tokenize.
            d = tokenizer.tokenize(title + content)

            # Stem words.
            d = [stemmer.stem(s) for s in d]

            # Update the vocabulary.
            v.add_documents([d])

            # Question.
            if e['type'] == 'Q':
                q[question_index[e['id']]] = d

            # Answer.
            if e['type'] == 'A':
                a[answer_index[e['id']]] = d

    # Write question corpus.
    with open(os.path.join(data_path, 'q.corpus'), 'wb') as f:
        pickle.dump(q, f)

    # Write answer corpus.
    with open(os.path.join(data_path, 'a.corpus'), 'wb') as f:
        pickle.dump(a, f)

    # Filter and write the vocabulary for analysis.
    v.filter_extremes(no_below=1000, keep_n=10000)
    v.compactify()
    v.save(os.path.join(data_path, "raw_vocabulary.gensim"))
def docs_to_dict(docs, **kw):
    """Convert docs to a Dictionary and a BOW corpus, filtering common/rare words.

    Returns (dictionary, BOW)."""
    no_below = kw.pop("no_below", .02)
    no_above = kw.pop("no_above", 0.9)
    d = Dictionary(docs)
    d.filter_extremes(no_below=no_below, no_above=no_above, **kw)
    d.compactify()
    return d, docs.apply(d.doc2bow)
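# --- Illustrative usage sketch (added; not from the original source). ---
# docs_to_dict calls docs.apply(...), so it presumably expects `docs` to be a
# pandas Series of token lists; the toy documents and loose thresholds below
# are assumptions for demonstration only.
import pandas as pd

toy_docs = pd.Series([["human", "machine", "interface"],
                      ["graph", "minors", "survey"],
                      ["human", "graph", "interface"]])
toy_dict, toy_bow = docs_to_dict(toy_docs, no_below=1, no_above=1.0)
print(len(toy_dict))      # vocabulary size after filtering
print(toy_bow.tolist())   # one list of (token_id, count) pairs per document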
def get_corpus(df):
    words = clean_text(df["combined_text"].values)
    bigram = bigrams(words)
    bigram = [bigram[tweet] for tweet in words]
    id2word = Dictionary(bigram)
    id2word.filter_extremes(no_below=50, no_above=0.40)
    id2word.compactify()
    corpus = [id2word.doc2bow(text) for text in bigram]
    return corpus, id2word, bigram
def _get_docs_dict(self, docs):
    docs_dict = Dictionary(docs)
    # CAREFUL: for a small corpus
    docs_dict.filter_extremes(no_below=5, no_above=0.2)
    # docs_dict.filter_extremes(no_below=5)
    # after some tokens have been removed, close the gaps in the ids
    docs_dict.compactify()
    print('docs_dict', docs_dict)
    return docs_dict
def generate_dictionary(input_file_path, applyExtreem=True, no_below=5, no_above=0.4):
    lineSentence = LineSentence(input_file_path)
    dictionary = Dictionary(lineSentence)
    if applyExtreem:
        dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    dictionary.compactify()
    return dictionary
def main(subreddit):
    const = get_constants(subreddit)
    if os.path.exists(const['CORPUS']):
        print("Loading preexisting corpus...")
        corpus = util.load_pickle(const['CORPUS'])
    else:
        print("Getting and writing dictionary...")
        with open(const['OUTPUTS'], "r") as f:
            num_lines = sum(1 for line in f)
        with open(const['OUTPUTS'], "r") as f:
            dicts = (json.loads(comment) for comment in tqdm(f, total=num_lines))
            if const["INTERVAL"] is not None:
                corpuses = [[] for interval in const["ALL_INTERVALS"]]
                for comment in dicts:
                    i = get_interval_idx(comment["score"])
                    corpuses[i].append(normalize_text(comment["body"], const['STEMMING']))
                for i, interval in enumerate(const["ALL_INTERVALS"]):
                    util.write_pickle(corpuses[i], get_interval_fname(subreddit, interval))
                corpus = corpuses[0]
            else:
                corpus = [normalize_text(comment["body"], const['STEMMING'])
                          for comment in dicts]

    gdict = Dictionary(corpus)
    gdict.filter_extremes(no_above=const['NO_ABOVE_1'], no_below=const['NO_BELOW'])
    gdict.compactify()
    util.write_pickle(gdict.token2id, const['INDICES'])
    util.write_pickle(gdict, const['DICTS'])

    print("Generating word co-occurrences...")
    cooccurgen.run(
        word_gen(corpus, gdict, subreddit, len(corpus)),
        gdict.token2id,
        4,
        const['COUNTS'])
    print("Generating PPMI vectors...")
    ppmigen.run(subreddit, cds=True)
    print("Generating SVD vectors...")
    makelowdim.run(const['INDICES'], const['PPMI'], const['VECS'])
def prep_corpus(docs, additional_stopwords=set(), no_below=2, no_above=0.05):
    dictionary = Dictionary(docs)

    stopwords = nltk_stopwords().union(additional_stopwords)
    stopword_ids = map(dictionary.token2id.get, stopwords)
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()

    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
    dictionary.compactify()

    corpus = [dictionary.doc2bow(doc) for doc in docs]
    return dictionary, corpus
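# --- Illustrative usage sketch (added; not from the original source). ---
# Shows how the output of prep_corpus can feed a gensim LdaModel. It assumes
# the surrounding module provides the nltk_stopwords() helper used above; the
# toy documents and parameters are assumptions, not the author's settings.
from gensim.models import LdaModel

toy_docs = [["cat", "sits", "mat"],
            ["dog", "chases", "cat"],
            ["dog", "sits", "mat"]]
toy_dictionary, toy_corpus = prep_corpus(toy_docs, no_below=1, no_above=1.0)
lda = LdaModel(corpus=toy_corpus, id2word=toy_dictionary, num_topics=2, passes=5)
print(lda.print_topics())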
def get_dictionary(documents: Dict[int, List[str]]) -> Dictionary:
    if os.path.exists(DICTIONARY_FILE_NAME):
        print(f"loading dictionary from {DICTIONARY_FILE_NAME}")
        gensim_dict = Dictionary.load(DICTIONARY_FILE_NAME)
    else:
        print("creating dictionary")
        gensim_dict = Dictionary()
        gensim_dict.add_documents(documents.values())
        gensim_dict.compactify()
        print(f"saving dictionary to {DICTIONARY_FILE_NAME}")
        gensim_dict.save(DICTIONARY_FILE_NAME)
    return gensim_dict
class SentenceCorpus(TextCorpus):
    def __init__(self, sentences, max_size=None):
        self.metadata = False
        self.sentences = sentences
        self.dictionary = Dictionary(self.get_texts(), prune_at=max_size)
        self.dictionary.compactify()
        self.bows = [self.dictionary.doc2bow(tokens)
                     for tokens in self.get_texts()]

    def get_texts(self):
        for sentence in self.sentences:
            yield sentence.tokens
class BaseWordFilter:
    def __init__(self, documents: List[str], labels: List, stopwords=None, **vocab_options):
        self._filtered_words = []
        self._labels = labels
        self.__generate_vocab(documents, **vocab_options)
        self.__vectorize_documents(documents, stopwords)

    def __generate_vocab(self, docs, vocab_size=2000, no_below=100, no_above=0.9):
        doc_tokens = [simple_preprocess(d) for d in docs]
        self._words = Dictionary(doc_tokens)
        self._words.filter_extremes(no_below=no_below, no_above=no_above,
                                    keep_n=vocab_size)
        self._words.compactify()

    def __vectorize_documents(self, docs, stopwords):
        vocab = {w: i for i, w in enumerate(self._words.values())}
        vectorizer = CountVectorizer(stop_words=stopwords, vocabulary=vocab)
        self._doc_vecs = vectorizer.fit_transform(docs)

    def fit(self):
        pass

    def save_filter(self, file='models/filter.txt'):
        with open(file, 'wt') as f:
            for word in self.filtered_words:
                f.write('%s\n' % word)

    @property
    def words(self):
        return self._words

    @property
    def filtered_words(self):
        return self._filtered_words

    @property
    def doc_vecs(self):
        return self._doc_vecs

    @property
    def labels(self):
        return self._labels
def lsa(corpus, size=8):
    dic = Dictionary(corpus)
    dic.filter_extremes(no_below=5, no_above=0.8)
    dic.filter_n_most_frequent(remove_n=10)
    dic.compactify()
    index_corpus = [dic.doc2bow(sent) for sent in corpus]
    tfidf = TfidfModel(index_corpus, dictionary=dic)
    normed_corpus = [tfidf[sent] for sent in index_corpus]
    lsi = LsiModel(normed_corpus, num_topics=size)
    return [[x[1] for x in lsi[sent]] for sent in normed_corpus]
def get_dict(pro_texts):
    # create a dictionary
    dictionary = Dictionary(pro_texts)
    # filter out words that appear in fewer than 5 documents
    # and words that appear in more than 30% of the documents
    dictionary.filter_extremes(no_below=5, no_above=0.3, keep_n=None)
    # remove gaps in ids after filtering
    dictionary.compactify()
    # create the bow representation of the data
    corpus = [dictionary.doc2bow(text) for text in pro_texts]
    return dictionary, corpus
def Gensim_Dic(sentences, tmp_fname):
    dct = Dictionary(sentences)
    a = []
    for w in stopwords:
        if w in dct.token2id.keys():
            a.append(dct.token2id[w])
    dct.filter_extremes(no_below=10)
    dct.filter_tokens(bad_ids=a)
    dct.compactify()
    dct.save_as_text(tmp_fname)
class SentenceCorpus(TextCorpus):
    def __init__(self, sentences, no_below=3, no_above=0.8, max_size=None):
        self.metadata = False
        self.sentences = sentences
        self.dictionary = Dictionary(self.get_texts(), prune_at=max_size)
        self.dictionary.filter_extremes(no_below=no_below, no_above=no_above,
                                        keep_n=max_size)
        self.dictionary.compactify()
        self.bows = [self.dictionary.doc2bow(tokens)
                     for tokens in self.get_texts()]

    def get_texts(self):
        for sentence in self.sentences:
            yield sentence.tokens
def prep_corpus(docs, additional_stopwords=set(), no_below=5, no_above=0.5):
    print('Building dictionary...')
    dictionary = Dictionary(docs)

    # remove stopwords
    stopwords = nltk_stopwords().union(additional_stopwords)
    stopword_ids = map(dictionary.token2id.get, stopwords)
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()

    # remove short words: len(word) <= 3
    shortword_ids = [tokenid for tokenid, word in dictionary.items()
                     if len(word.split('/')[0]) <= 3]
    dictionary.filter_tokens(shortword_ids)
    dictionary.compactify()

    # remove words that appear only once
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items()
                if docfreq == 1]
    dictionary.filter_tokens(once_ids)
    dictionary.compactify()

    # filter extreme values
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
    dictionary.compactify()

    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return dictionary, corpus
def prep_corpus(docs, additional_stopwords=set(), no_below=5, no_above=0.5):
    print('Building dictionary...')
    dictionary = Dictionary(docs)
    stopwords = nltk_stopwords().union(additional_stopwords)
    stopword_ids = map(dictionary.token2id.get, stopwords)
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
    dictionary.compactify()

    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return dictionary, corpus
def fetch_dict():
    print("Fetching Dictionary...", end=' ')
    try:
        dictionary = Dictionary().load("Topic/dic.tm")
        print("Dictionary loaded!")
    except IOError:
        print("Dictionary not found, building Dictionary...")
        dictionary = Dictionary(i for i in MyDictionary())
        once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items()
                    if docfreq == 1]
        dictionary.filter_tokens(once_ids)
        dictionary.compactify()
        print("\rDictionary Built!")
        print(dictionary)
        dictionary.save("Topic/dic.tm")
    return dictionary
def extract_topics(words):
    word_id_map = Dictionary([words])
    word_id_map.filter_tokens([id for id, occurrence in word_id_map.dfs.items()
                               if occurrence == 2])
    word_id_map.compactify()
    deals_corpus = [word_id_map.doc2bow(words)]
    lda = LdaModel(corpus=deals_corpus, id2word=word_id_map, num_topics=15,
                   update_every=1, chunksize=1000, passes=1)
    topics = []
    for i in range(15):
        tokens = lda.print_topic(i).split('+')
        topic_scores = []
        for token in tokens:
            score, token_val = token.split('*')
            topic_scores.append((token_val, score))
        topics.append(topic_scores)
    return topics
def create(pathtomapping, pathtocorpus, corpusname, window, numtokeep=50000,
           save_raw=True, shifts=(1, 5, 10)):
    """
    Creates a Shifted Positive Pointwise Mutual Information matrix.

    :param pathtomapping: The path to the id2word mapping. If this is left empty,
     the id2word mapping gets recreated. Warning: this takes a long time.
    :param pathtocorpus: The path to the corpus folder. The corpus can be spread
     out over multiple files or folders, and is read iteratively.
    :param corpusname: The name of the corpus. Used for saving the files.
    :param window: The window used to consider co-occurrences.
    :param numtokeep: The number of most frequent words to keep. Note that the
     matrix is non-sparse, so the memory requirements of the code are quadratic.
    :param save_raw: Whether to save the raw co-occurrence matrix as a numpy matrix.
    :param shifts: The shifts to apply to the co-occurrence matrix. Each shifted
     matrix gets saved as a separate model.
    """
    start = time.time()

    if not pathtomapping:
        id2word = Dictionary(SentenceIter(pathtocorpus), prune_at=None)
        id2word.filter_extremes(no_below=5, keep_n=numtokeep)
        id2word.compactify()
        logger.info("Creating the word2id took {0} seconds".format(time.time() - start))
    else:
        id2word = Dictionary.load(pathtomapping)

    inter = time.time()
    word2id = gensim.utils.revdict(id2word)

    corpus = SentenceIter(pathtocorpus)
    raw = get_cooccur(corpus, word2id, window=window)
    logger.info("Creating raw co-occurrence matrix took {0} seconds".format(time.time() - inter))

    if save_raw:
        np.save('{0}-cooccur.npy'.format(corpusname), raw)

    SPPMIFactory._save_word2id(word2id, "{0}mapping.json".format(corpusname))
    SPPMIFactory._save_freqs(id2word, "{0}freqs.json".format(corpusname))

    raw = SPPMIFactory.raw2pmi(raw)

    for k in shifts:
        sparse = SPPMIFactory.shift_clip_pmi(np.copy(raw), k_shift=k)
        SPPMIFactory._save_sparse_mtr(sparse, "{0}-SPPMI-sparse-{1}-shift.npz".format(corpusname, k))
        del sparse
def train_lda_model(articles, num_topics=10):
    docs = [article_to_bow(a) for a in articles]
    dict = Dictionary(docs)
    dict.filter_extremes()
    dict.compactify()
    corpus = [dict.doc2bow(article_to_bow(a)) for a in articles]
    tfidf = TfidfModel(corpus=corpus, id2word=dict)
    w_corpus = [tfidf[doc] for doc in corpus]
    lda = LdaModel(corpus=w_corpus, num_topics=num_topics, update_every=0,
                   passes=20, id2word=dict)
    return lda, tfidf, dict
def get_topics_lda(tokens, n_topics=10):
    """
    Using the `gensim` package for LDA. LDA is a little better than LSA as it
    provides a reasonable mixture of topics (Wikipedia). `gensim` is a package
    for topic modeling only, so for a particular topic modeling task it is a
    lighter option to install and run. It can also be run distributed and
    updated over an existing model.

    :param tokens: Preprocessed tokens for faster dictionary building
    :param n_topics: Number of topics to decompose data to
    :return: list() of topics
    """
    dict_file = 'resources/deals.dict'
    if not os.path.isfile(dict_file):
        print("Dictionary file does not exist. Creating one")
        dictionary = Dictionary(tokens)
        freq1 = [id for id, freq in dictionary.dfs.items() if freq == 1]
        dictionary.filter_tokens(freq1)
        dictionary.compactify()
        dictionary.save(dict_file)
    dictionary = Dictionary.load(dict_file)
    # print(dictionary)

    corpus_file = 'resources/deals.mm'
    if not os.path.isfile(corpus_file):
        print("Corpus file does not exist. Creating one")
        corpus = [dictionary.doc2bow(token) for token in tokens]
        MmCorpus.serialize(corpus_file, corpus)
    mm = MmCorpus(corpus_file)
    # print(mm)

    # tfidf = TfidfModel(mm)
    # corpus_tfidf = tfidf[mm]

    lda = LdaModel(corpus=mm, id2word=dictionary, num_topics=n_topics,
                   update_every=1, chunksize=1000, passes=1)

    topics = []
    for i in range(0, n_topics):
        words = lda.print_topic(i).split('+')
        topic = []
        for word in words:
            score, w = word.split('*')
            topic.append((w, score))
        topics.append(topic)

    return topics
def produce(self):
    doc_n = 0
    docs = []
    doctokens = []  # AKA gensim "text"
    stopwords = nltk.corpus.stopwords.words('english')

    NOALPHA = re.compile('[^a-z]+')

    def prep_string(my_string, pattern=NOALPHA):
        return re.sub(pattern, ' ', my_string.strip().lower())

    print('Getting src docs')
    for doc in self.src_doc_generator():
        content = re.sub(NOALPHA, ' ', doc)  # Do this in the corpus generator?
        docs.append(content)
        doctokens.append([token for token in nltk.word_tokenize(content)
                          if token not in stopwords])
        doc_n += 1
        if doc_n % 1000 == 0:
            print(doc_n)

    print('Creating the dictionary')
    dictionary = Dictionary(doctokens)
    dictionary.compactify()
    dictionary.filter_extremes(keep_n=None)
    if self.dictfile:
        dictionary.save_as_text(self.dictfile + '.dict', sort_by_word=True)

    with self.dbi as db:
        print('Creating DOC')
        db.create_table('doc')
        for i, doc in enumerate(docs):
            db.cur.execute('INSERT INTO doc VALUES (?,?)', (i, doc))

        print('Creating WORD')
        db.create_table('word')
        for item in dictionary.items():
            db.cur.execute('INSERT INTO word (word_id, word_str) VALUES (?,?)', item)

        print('Creating DOCWORD')
        db.create_table('docword')
        for i, tokens in enumerate(doctokens):
            for item in dictionary.doc2bow(tokens):
                db.cur.execute(
                    'INSERT INTO docword (doc_id,word_id,word_count) VALUES (?,?,?)',
                    [i, item[0], item[1]])
def _create_from_texts(cls, tokenized_texts, name, dataset, settings, minimum_frequency=2):
    from gensim.corpora import Dictionary as GensimDictionary

    # build a dictionary
    logger.info("Building a dictionary from texts")
    dictionary = GensimDictionary(tokenized_texts)

    # remove extremely rare words
    logger.info("Dictionary contains %d words. Filtering..." % len(dictionary.token2id))
    dictionary.filter_extremes(no_below=minimum_frequency, no_above=0.5, keep_n=None)
    dictionary.compactify()
    logger.info("Dictionary contains %d words." % len(dictionary.token2id))

    dict_model = cls(name=name, dataset=dataset, settings=settings)
    dict_model.save()

    dict_model._populate_from_gensim_dictionary(dictionary)

    return dict_model
class ArticlesCollection:
    """Class which holds all articles (perhaps over several years) -- with the
    ability to perform LDA on it."""

    def __init__(self, year_range, text_output_dirpath, lang=DE_LANG):
        self.year_range = year_range
        self.text_output_dirpath = text_output_dirpath
        self.lang = lang
        self.articles = []
        self.bow_corpus = None
        self.identifier = ''
        self.wordsids_filepath = ''
        self.bowmm_filepath = ''
        self.tfidf_filepath = ''
        self.number_of_docs = 0
        self.number_of_tokens = 0
        self.number_of_types = 0

        # gensim data structures
        self.dictionary = None

        # Read in collection & clean it & start LDA process
        self._read_collection()
        self._collection_identifier()
        self._set_filepaths()
        self._create_dictionary()
        self._create_bow_representation()
        self._set_number_of_docs()
        self._set_number_of_tokens()
        self._set_number_of_types()

        # Create tf*idf matrix if requested.
        if USE_TFIDF:
            self._create_tfidf_matrix()

    def show_lda(self):
        """Show latent topics found."""
        model = None

        # Only use tf*idf input if requested.
        corpus = self.bow_corpus
        if USE_TFIDF:
            corpus = MmCorpus(self.tfidf_filepath)

        # k = number of documents = number of topics (for now)
        num_topics = self.number_of_docs
        if NUM_TOPICS != -1:
            num_topics = NUM_TOPICS

        print('Number of docs presented: ' + str(self.number_of_docs))
        print('Number of origin. tokens: ' + str(self.number_of_tokens))
        print('Number of original types: ' + str(self.number_of_types))
        print('Number of types at usage: ' + str(len(self.dictionary.keys())))
        print('Number of topics to find: ' + str(num_topics))
        print('Number of topics to show: ' + str(TOPICS_DISPLAY))

        if MODEL == 'LdaMallet':
            model = LdaMallet(PATH_TO_MALLET_BIN, corpus=corpus,
                              num_topics=num_topics, id2word=self.dictionary,
                              iterations=ITERATIONS)
        elif MODEL == 'HdpModel':
            model = HdpModel(corpus, self.dictionary)
        else:
            model = LdaModel(corpus=corpus, id2word=self.dictionary,
                             num_topics=num_topics, iterations=ITERATIONS,
                             update_every=1, chunksize=10, passes=1,
                             distributed=False)
            '''
            More possible options above:
            chunksize=1, update_every=1, decay=0.5,
            '''

        if MODEL == 'LdaModel' or MODEL == 'LdaMallet':
            topic_number = 0
            for topic in model.show_topics(topics=TOPICS_DISPLAY,
                                           topn=WORDS_DISPLAY,
                                           formatted=True):
                topic_number += 1
                print('Topic#' + str(topic_number) + ': ', topic)
        else:  # For MODEL 'HdpModel'
            for topic in model.print_topics(topics=TOPICS_DISPLAY,
                                            topn=WORDS_DISPLAY):
                print(topic)

    def _set_number_of_types(self):
        """Set number of types (from tokens)."""
        self.number_of_types = len(set(list(itertools.chain(*self.articles))))

    def _set_number_of_tokens(self):
        """Set number of tokens gotten in all documents."""
        self.number_of_tokens = sum(len(article) for article in self.articles)

    def _set_number_of_docs(self):
        """Set number of docs found in the collection read in."""
        self.number_of_docs = len(self.articles)

    def _set_filepaths(self):
        """Sets filepaths for intermediate data."""
        # Filepaths necessary for topic modeling
        self.wordsids_filepath = WORDSIDS_DIR + self.identifier + '_' + 'wordsids.txt'
        self.bowmm_filepath = BOWMM_DIR + self.identifier + '_' + 'bow.mm'
        self.tfidf_filepath = TFIDF_DIR + self.identifier + '_' + 'tfidf.mm'

    def _create_dictionary(self):
        """Create a mapping of ids and surface forms (=words)."""
        print('Create dictionary of collection.')
        self.dictionary = Dictionary(self.articles)
        self.dictionary.filter_extremes(no_below=NO_BELOW, no_above=NO_ABOVE)
        self.dictionary.save_as_text(self.wordsids_filepath)
        self.dictionary.compactify()
        print(self.dictionary)

    def _create_bow_representation(self):
"""Create bag-of-words representation of collection, and save it in Matrix Matrix format to disk.""" print('Create bag-of-words matrix representation.') self.bow_corpus = [self.dictionary.doc2bow(article) for article in self.articles] MmCorpus.serialize(self.bowmm_filepath, self.bow_corpus) def _create_tfidf_matrix(self): """Create TF-IDF matrix and save it in Matrix Matrix format to disk""" print('Create TF-IDF matrix of collection.') tfidf = TfidfModel(self.bow_corpus, id2word=self.dictionary, normalize=True) MmCorpus.serialize(self.tfidf_filepath, tfidf[self.bow_corpus]) print('Number of documents:', tfidf.num_docs) def _collection_identifier(self): """Collection id is important for the caching files and the file naming of the corresponding files.""" start_year = self.year_range[0] end_year = self.year_range[-1] if start_year == end_year: self.identifier = str(start_year) + '_' + self.lang else: self.identifier = str(start_year) + '-' + str(end_year) + \ '_' + self.lang def _read_collection(self): """Iterate through all years in order to get all articles read in.""" for year in self.year_range: # Not every single yearbook is available. try: self._read_book(year) except: print('Skip (inexistent) yearbook ' + str(year) + '.') def _read_book(self, year): """Read in a a single book and save its articles.""" filepath = sac_filepath(year, lang=self.lang) print('Read in yearbook ' + str(year) + '.') sac_xml = etree.parse(SAC_XML_DIR + filepath) sac_xml_articles_list = sac_xml.xpath('.//article') # For each article for sac_xml_article in sac_xml_articles_list: # Prepare file to write out words sac_xml_article_no = sac_xml_article.attrib['n'] out_filename = str(year) + '-' + str(self.lang) + '-' \ + sac_xml_article_no + '.txt' out_filepath = self.text_output_dirpath + sep + out_filename print(out_filepath) out_filehdl = open(out_filepath, 'w') article_word_list = [] sac_xml_sentences_list = \ sac_xml_article.xpath('.//s[@lang=\'' + \ self.lang + '\']') # For each sentence (in the article) for sac_xml_sentence in sac_xml_sentences_list: sac_xml_words_list = \ sac_xml_words_list = sac_xml_sentence.xpath('.//w') # For each word (in the sentence of the article) for sac_xml_word in sac_xml_words_list: word = None try: if WITH_POS_FILTER is False: if WITH_LEMMATA: word = sac_xml_word.attrib['lemma'].lower() if self._is_lemma_bogus(word): word = sac_xml_word.text.lower() if WITH_LEMMATA is False: word = sac_xml_word.text.lower() elif WITH_POS_FILTER: word = self._get_pos_filtered_word(sac_xml_word) except: pass # Don't add stop words, in any case if not word in STOPWORDS[self.lang] \ and word is not None and len(word) >= MIN_WORDLEN: article_word_list.append(self.\ _normalize_word(word).\ encode(ENCODING)) # Save article as bag-of-words (of the sentences) self.articles.append(article_word_list) out_filehdl.write(' '.join(article_word_list)) out_filehdl.close() def _get_pos_filtered_word(self, sac_xml_word): """ Get word by PoS filter """ # There are words without PoS tags, i. e. try try: if sac_xml_word.attrib['pos'] \ in POS_FILTER[self.lang]: if WITH_LEMMATA: word = sac_xml_word.attrib['lemma'].lower() if self._is_lemma_bogus(word): return sac_xml_word.text.lower() else: return sac_xml_word.attrib['lemma'].lower() else: return sac_xml_word.text.lower() else: return None except: return None def _is_lemma_bogus(self, lemma): """ Return true if the lemma is not useful for LDA, otherwise false. 
""" for bogus_symbol in SURFACE_TRIGGERS: if bogus_symbol in lemma: return True # That's the last resort return False def _normalize_word(self, word_to_normalize): """ This function helps to normalize words, because of encoding issues of some LDA tools ... @return: Normalized word as str type """ # Transform umlauts to ASCII friendly form word = word_to_normalize.replace(u"ä","ae").replace(u"ö","oe"). \ replace(u"ü","ue").replace(u"ß","ss") return word def __str__(self): """ Return a string which shows document number, number of words and number of types. """ ret_string = '' art_number = 0 for article in self.articles: art_number += 1 ret_string += 'Doc#' + str(art_number) + ': ' ret_string += str(len(article)) + ' [' + \ str(len(set((article)))) + ']' ret_string += '\n' return ret_string
class TfidfVectorizer():
    """ Transform text to tf-idf representation """

    def __init__(self):
        self.base_path = os.path.dirname(__file__)
        self.dictionary_path = os.path.join(self.base_path, "dictionary")
        self.tf_idf_model_path = os.path.join(self.base_path, "tfidf")

        self.stemmer = NepStemmer()
        self.tf_idf_model = None

    def get_tokens(self, document):
        if not self.stemmer:
            raise Exception("Stemmer not available")

        return self.stemmer.get_stems(document)

    def construct_model(self, documents):
        logging.basicConfig(
            format='%(asctime)s:%(levelname)s:%(message)s',
            level=logging.INFO
        )

        logging.info("Obtaining word tokens")
        tokens = [self.get_tokens(document) for document in documents]
        # self.tf_idf_model = TfidfModel(tokens)

        logging.info("Constructing dictionary")
        self.dictionary = Dictionary(tokens)
        self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=1000)
        self.dictionary.compactify()
        self.dictionary.save(self.dictionary_path)

        logging.info("Constructing TF-IDF model")
        self.tf_idf_model = TfidfModel(dictionary=self.dictionary)
        self.tf_idf_model.save(self.tf_idf_model_path)

    def load_data(self):
        if not self.tf_idf_model:
            if not os.path.exists(self.tf_idf_model_path):
                raise Exception('TF-IDF model file not found')

            self.dictionary = Dictionary.load(self.dictionary_path)
            self.tf_idf_model = TfidfModel.load(self.tf_idf_model_path)

    def doc2vector(self, document):
        """ Returns the sparse tf-idf vector for a given document """
        tokens = self.get_tokens(document)
        bag_of_words = self.dictionary.doc2bow(tokens)

        return self.tf_idf_model[bag_of_words]

    def obtain_feature_vector(self, document):
        """ Returns a single dense tf-idf vector for a given document """
        self.load_data()

        # number of features = number of terms the model knows about
        no_of_features = len(self.tf_idf_model.idfs)

        tf_idf_vector = matutils.sparse2full(
            self.doc2vector(document),
            no_of_features
        ).reshape(1, -1)

        return tf_idf_vector

    def obtain_feature_matrix(self, documents):
        """ Returns the dense tf-idf matrix for the given documents """
        self.load_data()

        input_matrix_sparse = [self.doc2vector(x) for x in documents]

        no_of_features = len(self.tf_idf_model.idfs)

        input_matrix = matutils.corpus2dense(
            input_matrix_sparse,
            no_of_features
        ).transpose()

        return input_matrix
elif not opts.scaling:
    scaling = None
else:
    raise ValueError("Only tfidf scaling is supported")

word_model = opts.word_model

if word_model:
    logging.info("Building word model")
    corpus = LimitCorpus(WikiCorpus(dump_fn, dictionary=Dictionary()), word_limit)
else:
    corpus = SublexicalizedCorpus(WikiCorpus(dump_fn, dictionary=Dictionary()),
                                  order=order, word_limit=word_limit)

voc = Dictionary(corpus)
voc.filter_extremes(no_below=cutoff)
voc.compactify()

bow_corpus = (voc.doc2bow(art) for art in corpus)

tfidf = None

if scaling == 'tfidf':
    tfidf = TfidfModel(bow_corpus)
    bow_corpus = (tfidf[voc.doc2bow(art)] for art in corpus)

model = LsiModel(corpus=bow_corpus, num_topics=num_topics, id2word=voc)
model.save(model_fn)

if tfidf:
    tfidf.save(model_fn + '.tfidf')
def main():
    parser = ArgumentParser(
        description="wrapper script for churning datasets of wiki or elasticsearch kind "
                    "through gensim to produce topic models; please see the gensim "
                    "documentation for more information")
    parser.add_argument("-ds", "--dataset", default="wiki",
                        help="What kind of dataset to use. (wiki,es,file)")
    parser.add_argument("-d", "--dump-file", help="Wiki: bz2 dump file with wiki in it")
    parser.add_argument("-l", "--limit", help="Wiki: How many documents to extract from wiki")
    parser.add_argument("--model-id", default="model", help="Filename for created model.")
    parser.add_argument("--model-type", default="lsi",
                        help="Model type (lsi, lda, word2vec, hdp, vocabulary).")
    parser.add_argument("--n-topics", default=10, help="Number of topics to model.")
    parser.add_argument("--n-passes", default=1, help="Number of passes for LDA model.")
    parser.add_argument("--w2v-size", default=100, help="size of Word2Vec context.")
    parser.add_argument("--w2v-window", default=5, help="window for Word2Vec.")
    parser.add_argument("-q", "--query", default=None,
                        help="Elasticsearch: Query to use to fetch documents")
    parser.add_argument("--index", help="Elasticsearch: index to read from.")
    parser.add_argument("--doc_type", default="doc", help="Elasticsearch: data type in index.")
    parser.add_argument("--data-dir",
                        help="Directory to save the generated models and vocabularies into.")
    parser.add_argument("--vocab",
                        help="Prebuilt Vocabulary file. Use this to avoid having to generate one.")

    opts = parser.parse_args()

    model_type = opts.model_type.lower()
    if model_type not in ["lsi", "lda", "word2vec", "hdp", "vocabulary"]:
        logging.error("Invalid model type %s" % model_type)
        parser.print_usage()
        exit(-1)

    logging.info("Using model type %s" % model_type)

    dump_fn = opts.dump_file
    limit = int(opts.limit) if opts.limit else None

    data_type = opts.dataset.lower()
    if data_type not in ["es", "wiki", "file"]:
        logging.error("Invalid dataset type %s" % data_type)
        parser.print_usage()
        exit(-1)

    if not dump_fn and data_type in ["wiki"]:
        logging.error("--dump-file required for wiki dataset")
        sys.exit(1)

    query = opts.query
    index = opts.index
    doc_type = opts.doc_type
    if data_type == "es" and index is None:
        logging.error(
            "Please be kind to at least specify the index you want to fetch from "
            "elasticsearch using the --index parameter")
        sys.exit(1)

    n_topics = int(opts.n_topics)
    n_passes = int(opts.n_passes)
    logging.info("Using %d topics." % n_topics)

    data_dir = opts.data_dir
    model_id = opts.model_id
    model_fn = "%s_%s_%d" % (model_id, model_type, n_topics)

    if data_dir:
        model_fn = "%s/%s" % (data_dir, model_fn)

    if model_type == "word2vec":
        w2v_size = int(opts.w2v_size)
        w2v_window = int(opts.w2v_window)
        model_fn = "%s_w_%s_s_%s" % (model_fn, w2v_window, w2v_size)

    logging.info("Writing models to %s." % model_fn)
    if data_type == "es":
        logging.info("Using data type %s with index %s, doc_type %s query %s"
                     % (data_type, index, doc_type, query))
        dataset = ElasticsearchDataset(read_index=index,
                                       read_doc_type=doc_type, query=query,
                                       normalize_func=normalize_es)
    elif data_type == "wiki":
        logging.info("Using data type %s with dump_file %s and limit %s"
                     % (data_type, dump_fn, limit))
        dataset = WikipediaDataset(dump_fn=dump_fn, num_articles=limit,
                                   normalize_func=normalize_wiki)
    elif data_type == "file":
        logging.info("Using data type %s with dump_file %s and limit %s"
                     % (data_type, dump_fn, limit))
        dataset = FileDataset(dump_fn=dump_fn, num_articles=limit,
                              normalize_func=normalize_file)

    vocab_file = opts.vocab
    vocab = Dictionary()
    sw = set(stopwords.words("norwegian"))

    if not vocab_file or model_type == "vocabulary":
        vocab.add_documents([get_tokenized(page, sw) for page in dataset])
        vocab.filter_extremes()
        vocab.compactify()
        vocab.save(model_fn + ".vocab")
    else:
        vocab = Dictionary.load(vocab_file)

    if model_type == "vocabulary":
        return

    tfidf = TfidfModel(dictionary=vocab)

    if model_type == "lsi":
        corpus = IterableDataset(dataset, sw, vocab)
        model = LsiModel(corpus=tfidf[corpus], num_topics=n_topics, id2word=vocab)
    elif model_type == "lda":
        corpus = IterableDataset(dataset, sw, vocab)
        model = LdaModel(corpus=tfidf[corpus], num_topics=n_topics,
                         passes=n_passes, id2word=vocab)
    elif model_type == "word2vec":
        corpus = IterableDataset(dataset, sw, vocab, doc2bow=False)
        corpus.dictionary = vocab
        model = Word2Vec(sentences=corpus, window=w2v_window, size=w2v_size)
    elif model_type == "hdp":
        corpus = IterableDataset(dataset, sw, vocab)
        model = HdpModel(corpus=tfidf[corpus], id2word=vocab)

    logging.info(model)
    model.save(model_fn)