def merge_dictionaries(dictionaries_path, merged_dictionary_path=None):
    dict_paths = list(iglob(dictionaries_path))
    final_dictionary = Dictionary.load(dict_paths[0])
    for dict_path in dict_paths[1:]:
        dictionary = Dictionary.load(dict_path)
        final_dictionary.merge_with(dictionary)
    if merged_dictionary_path:
        final_dictionary.save(merged_dictionary_path)
    return final_dictionary
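# A minimal usage sketch for merge_dictionaries above, assuming the *.dict files
# were produced by gensim's Dictionary.save; the glob pattern and output path are
# illustrative assumptions, not taken from the original snippet.
from glob import iglob
from gensim.corpora.dictionary import Dictionary

merged = merge_dictionaries('models/*.dict', merged_dictionary_path='models/merged.dict')
print(len(merged))  # number of unique tokens after the merge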
def loadModelfromFile(modelPath, readOnly=False):
    if readOnly:
        lda_model = LdaModel.load(fname=modelPath, mmap='r')
        dictionary = Dictionary.load(fname=modelPath.replace('.topic', '.dict'), mmap='r')
    else:
        lda_model = LdaModel.load(fname=modelPath)
        dictionary = Dictionary.load(fname=modelPath.replace('.topic', '.dict'))
    print('loaded LDA model from {0}'.format(modelPath))
    return lda_model, dictionary
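# Hedged usage sketch for loadModelfromFile, assuming the model was saved as
# 'model.topic' with a companion 'model.dict' next to it (paths are assumptions):
lda_model, dictionary = loadModelfromFile('models/model.topic', readOnly=True)
print(lda_model.num_topics, len(dictionary))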
def analyze_top_dfs(tokendict, tagdict, cutoff_factor=1):
    '''Provided gensim dictionaries `tokendict` and `tagdict`, show the top word frequencies.'''
    if type(tokendict) == str:
        tokendict = Dictionary.load(tokendict)
    if type(tagdict) == str:
        tagdict = Dictionary.load(tagdict)
    max_tag_df = max(tagdict.dfs.iteritems(), key=operator.itemgetter(1))
    sorted_dfs = sorted(tokendict.dfs.iteritems(), key=operator.itemgetter(1), reverse=True)
    print "count threshold: %-15s\t%d" % (tagdict[max_tag_df[0]], max_tag_df[1])
    print "----------------------------------------------"
    for tup in sorted_dfs[:100]:
        if tup[1] > max_tag_df[1] * cutoff_factor:
            print "%-15s\t%d" % (tokendict[tup[0]][:15], tup[1])
        else:
            break
def load(lsi_path=None, id2word_path=None, index_path=None):
    """
    If specified, attempts to load a gensim LsiModel from `lsi_path`,
    a gensim Dictionary from `id2word_path`, or a gensim MatrixSimilarity
    index from `index_path`.

    Parameters
    ----------
    lsi_path: str
        File path from which the LsiModel should be loaded.
    id2word_path: str
        File path from which the Dictionary should be loaded.
    index_path: str
        File path from which the MatrixSimilarity index should be loaded.
    """
    if lsi_path is not None:
        from gensim.models import LsiModel
        if not os.path.exists(lsi_path):
            raise IOError(
                'The provided file path to the LsiModel was not found. '
                'Please ensure that the argument is the correct path.')
        return LsiModel.load(lsi_path)
    if id2word_path is not None:
        from gensim.corpora.dictionary import Dictionary
        if not os.path.exists(id2word_path):
            raise IOError(
                'The provided file path to the Dictionary was not found. '
                'Please ensure that the argument is the correct path.')
        return Dictionary.load(id2word_path)
    if index_path is not None:
        from gensim.similarities import MatrixSimilarity
        if not os.path.exists(index_path):
            raise IOError(
                'The provided file path to the MatrixSimilarity index was not found. '
                'Please ensure that the argument is the correct path.')
        return MatrixSimilarity.load(index_path)
def main(args):
    if args.corpus_type != "wiki":
        if args.processed_corpus_save_path is not None:
            raise ValueError("Processed corpus saving only supported "
                             "for 'wiki' corpus type")

    kwargs = {}
    if args.dictionary_path is not None:
        kwargs["dictionary"] = Dictionary.load(args.dictionary_path)
    if args.dictionary_out_path is not None:
        kwargs["dictionary_save_path"] = args.dictionary_out_path
    if args.corpus_type == "wiki" and args.processed_corpus_save_path is not None:
        kwargs["sentences_save_path"] = args.processed_corpus_save_path

    logging.debug("Building corpus")
    corpus = CORPUS_TYPES[args.corpus_type](args.corpus_path, **kwargs)
    documents = corpus.get_texts()

    logging.debug("Now beginning VSM construction with Word2Vec")
    model = Word2Vec(
        sentences=documents,
        vocab_path=args.vocab_path,
        window=args.window_size,
        drop_capitals=args.drop_capitals,
        min_count=args.minimum_token_count,
        size=args.vector_dimensions,
        workers=multiprocessing.cpu_count(),
    )

    model.save(args.out_path)
    if args.vocab_out_path is not None:
        model.save_vocab(args.vocab_out_path)
def _load(self):
    modeldir = self._workdir.joinpath("ldamodel_{}".format(self._name))
    if not modeldir.exists():
        return False
    self._lda = LdaMulticore.load(str(modeldir))
    self._dictionary = Dictionary.load(
        str(self._workdir.joinpath("dictionary_{}.gz".format(self._name))))
def __init__(self, topics=10, worker=3, pretrained_model=None, dictionary=None):
    """
    Initialize LDA model training.

    Args:
        topics -- number of topics
        worker -- degree of parallelism, usually the number of cores minus one
        pretrained_model -- previously trained model; since online updates are
                            supported, the model from the last run can be loaded
        dictionary -- word-to-ID mapping paired with the model, used to convert
                      tokens to IDs during training
    Example:
        >>> lda = LDA(topics=20, worker=2, pretrained_model=model_file, dictionary=dictionary_file)
        >>> corpus = read_file(corpus_file)  # [['word1', 'word2'], ['word3', 'word4']]
        >>> lda.update(corpus)
        >>> lda.save(model_file, dictionary_file)
        >>> topics = lda.inference(['word5', 'word6'])
    """
    self._topics = topics
    self._workers = worker
    self._model = None
    self._common_dictionary = None
    if pretrained_model and dictionary:  # was `common_dictionary`, which is undefined here
        self._model = LdaModel.load(pretrained_model)
        self._common_dictionary = Dictionary.load(dictionary)
def plot_dict_hist(gdict):
    '''Provided gensim dict `gdict`, plot histogram statistics.'''
    if type(gdict) == str:
        gdict = Dictionary.load(gdict)
    sorted_dfs = sorted(gdict.dfs.iteritems(), key=operator.itemgetter(1), reverse=True)
    y = [tup[1] for tup in sorted_dfs]
    x = arange(0, len(y))
    plt.figure(figsize=(8, 5))
    plt.loglog(x, y)
    plt.grid()
    plt.xlabel("Token rank")
    plt.ylabel("Document count")
    cdf = np.empty(len(y))
    delta(y, cdf)  # external helper: fills `cdf` with cumulative document counts
    cdf /= np.max(cdf)  # normalize
    x50 = x[cdf > 0.50][0]
    x80 = x[cdf > 0.80][0]
    x90 = x[cdf > 0.90][0]
    x95 = x[cdf > 0.95][0]
    plt.axvline(x50, color='c')
    plt.axvline(x80, color='g')
    plt.axvline(x90, color='r')
    plt.axvline(x95, color='k')
    print "50%\t", x50
    print "80%\t", x80
    print "90%\t", x90
    print "95%\t", x95
def load_model(self, username):
    if username not in self.models:
        self.models[username] = models.LdaModel.load(
            self.get_model_path(username=username))
    if username not in self.dictionaries:
        self.dictionaries[username] = Dictionary.load(
            self.get_dictionary_path(username=username))
class Corpus(object):
    def __init__(self, path, dict_path):
        self.dictionary = Dictionary()
        add_to_dict = True
        if dict_path and os.path.exists(dict_path):
            print('loading dictionary')
            self.dictionary = self.dictionary.load(dict_path)
            add_to_dict = False
        self.train = self.tokenize(os.path.join(path, 'train.txt'), add_to_dict)
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'), add_to_dict)
        self.test = self.tokenize(os.path.join(path, 'test.txt'), add_to_dict)
        if dict_path and not os.path.exists(dict_path):
            self.dictionary.save(dict_path)

    def tokenize(self, path, add_to_dict):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        all_words = list(
            chain.from_iterable([
                sent.split() + ['<eos>']
                for sent in open(path).read().split('\n')
            ]))
        if add_to_dict:
            self.dictionary.add_documents([all_words])
        return torch.LongTensor(self.dictionary.doc2idx(all_words))
def prune_dictionary(src_dictionary_path, dest_dictionary_path=None,
                     no_below=None, no_above=None, keep_n=None):
    dictionary = Dictionary.load(src_dictionary_path)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    if dest_dictionary_path:
        dictionary.save(dest_dictionary_path)
    return dictionary
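# Illustrative call to prune_dictionary; the thresholds mirror gensim's
# filter_extremes defaults, and the file paths are assumptions rather than
# values from the original snippet.
pruned = prune_dictionary('models/full.dict',
                          dest_dictionary_path='models/pruned.dict',
                          no_below=5, no_above=0.5, keep_n=100000)
print(len(pruned))  # vocabulary size after pruning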
def representation(self):
    if not self.model:
        print("LOAD MODEL...")
        self.model = LsiModel.load(
            os.path.join(self.preprocessor.source.path,
                         self.preprocessor.source.info + '.model'))
        self.dictionary = Dictionary.load(
            os.path.join(self.preprocessor.source.path,
                         self.preprocessor.source.info + '.dic'))
def getDictionary(word_corpus, useSavedTill):
    if useSavedTill >= USESAVED.dictionary:
        common_logger.info("loading dictionary from file")
        dictionary = Dictionary.load(file_lda_gensim_dictionary)
        return dictionary
    else:
        common_logger.info("Creating dictionary from corpus")
        dictionary = Dictionary(word_corpus.values())
        common_logger.info("saving dictionary")
        dictionary.save(file_lda_gensim_dictionary)
        return dictionary
def __init__(self, examples, vocab, lda_vocab_path, lda_model_path, args):
    self.data = examples
    self.vocab = vocab
    self.args = args
    self.item_vocab = load_item_vocab(args)
    self.lda_vocab = Dictionary.load(lda_vocab_path)
    self.lda_model = LdaMulticore.load(lda_model_path)
    self.sent_lim = [
        self.args.cp_sentNum, self.args.desc_sentNum,
        self.args.require_sentNum, self.args.benefit_sentNum
    ]
def f4(berita):
    from Sastrawi.Stemmer import StemmerFactory
    from Sastrawi.StopWordRemover import StopWordRemoverFactory
    import gensim
    from gensim import corpora
    from gensim.corpora.dictionary import Dictionary
    from gensim.models.ldamodel import LdaModel
    from gensim.matutils import cossim as cs
    import os

    # os.chdir('D:/[Projects]/corpus/wiki2')
    id2word = Dictionary.load(os.path.join(dir_path, 'wiki_mini.dict'))
    mm_corp = corpora.MmCorpus(os.path.join(dir_path, 'wiki_mini_bow.mm'))
    lda = LdaModel.load(os.path.join(dir_path, 'lda_model_mini_wiki.model'))

    stopword_factory = StopWordRemoverFactory.StopWordRemoverFactory()
    stemmer_factory = StemmerFactory.StemmerFactory()
    stopwords = stopword_factory.create_stop_word_remover()
    stemmer = stemmer_factory.create_stemmer()

    judul = stemmer.stem(berita['judul'])
    judul = stopwords.remove(judul)
    bow_judul = id2word.doc2bow(judul.lower().split())
    lda_judul = lda[bow_judul]

    sentences = berita['kalimat_bersih']
    berita['skor']['f4'] = []
    skor = berita['skor']['f4']

    # The distance can't use JSD yet because no solution has been found for
    # matrices of different sizes, possibly because the LDA dictionary is
    # still small and therefore has few entries.
    for kalimat in sentences:
        bow_kalimat = id2word.doc2bow(kalimat.lower().split())
        lda_kalimat = lda[bow_kalimat]
        skor_cs = cs(bow_kalimat, bow_judul)
        skor.append(skor_cs)
        # print(lda_kalimat)
        # print(lda_judul)
        # print(kalimat)
        # print(judul)
        # distance = jsd(lda_kalimat, lda_judul)
        # if -1 < distance < 1:
        #     skor.append(distance - 1)
        # else:
        #     skor.append(0)
    return berita
def __init__(self, corpus, wiki_dict, wordfile, vocab_size=200000, window_size=5):
    self.w2id_dict = util.load_worddict(wordfile, vocab_size)
    self.window_size = window_size

    print('Starting loading Wiki Corpus...', end='')
    wiki_d = Dictionary.load(wiki_dict)
    self.wiki_corpus = WikiCorpus(corpus, dictionary=wiki_d)
    print('[done]')
def filter_extremes_wrapper(gdict, no_below=1, no_above=1.0, keep_n=None, save_pickle=None):
    '''Given unfiltered gensim dict `gdict`, wrap filter_extremes.'''
    if type(gdict) == str:
        gdict = Dictionary.load(gdict)
    print "Before filtering:", gdict
    gdict.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    print "After filtering:", gdict
    if save_pickle:
        print "\nsaving..."
        gdict.save(save_pickle)
    return gdict
def get_vocab(tweets=None):
    if 'vocab_sentiment' in os.listdir('.'):
        if not tweets:
            print("Loading vocabulary...")
            vocab = Dictionary.load('vocab_sentiment')
            print("Loaded vocabulary")
            return vocab
        response = input('Vocabulary found. Do you want to load it? (Y/n): ')
        if response.lower() in ['n', 'no', 'nah', 'nono', 'nahi', 'nein']:
            if not tweets:
                tweets, labels = export()
                del labels
            return create_vocab(tweets)
        else:
            print("Loading vocabulary...")
            vocab = Dictionary.load('vocab_sentiment')
            print("Loaded vocabulary")
            return vocab
    else:
        if not tweets:
            tweets, labels = export()
            del labels
        return create_vocab(tweets)
def load_data():
    '''Loads the already-processed data, with all of the nested lists properly
    reformatted as lists, and loads the dictionaries.'''
    df = pd.read_csv('data/processed_full.tsv', sep='\t')
    df['english_tokens'] = df['english_tokens'].apply(
        lambda x: x.strip("['']").split("', '"))
    df['french_tokens'] = df['french_tokens'].apply(
        lambda x: x.strip("['']").split("', '"))
    df['english_bow'] = df['english_bow'].apply(str_to_int)
    df['french_bow'] = df['french_bow'].apply(str_to_int)
    df['english_padded'] = df['english_padded'].apply(str_to_int)
    df['french_padded'] = df['french_padded'].apply(str_to_int)
    df = df.drop('Unnamed: 0', axis=1)
    eng = Dictionary.load('data/Dictionaries/eng')
    fren = Dictionary.load('data/Dictionaries/fren')

    # create ML data
    X_eng = np.vstack(df['english_padded'].values)
    y_fren = np.vstack(df['french_padded'].values)
    y_fren = y_fren.reshape(*y_fren.shape, 1)
    X_eng = X_eng.reshape(*X_eng.shape, 1)
    return df, eng, fren, X_eng, y_fren
def pre_processing():
    global vocab, model
    try:
        model = load_model('SentimentAnalysis/model_nn.h5')
    except IOError:
        if 'model_nn.tar.gz' not in os.listdir('SentimentAnalysis'):
            raise IOError("Could not find Sentiment Analysis model. Ensure model "
                          "is present in: ./SentimentAnalysis")
        else:
            process = subprocess.Popen("cd SentimentAnalysis/; "
                                       "tar -zxf model_nn.tar.gz; cd ..",
                                       shell=True, stdout=subprocess.PIPE)
            process.wait()
            model = load_model('/content/PClub-Project-master/SentimentAnalysis/model_nn.h5')
    vocab = Dictionary.load('SentimentAnalysis/vocab_sentiment')
def main():
    logformat = '%(asctime)s %(name)-12s: %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=logformat)
    kera = NOB_kera()
    es = Elasticsearch(port=9201)
    mod = LdaModel.load(modelfile)
    vocab = Dictionary.load(vocabulary)
    tfidf = TfidfModel(dictionary=vocab)
    results = []
    for (topics, topicid) in get_doc_topics(mod, mod.num_topics,
                                            num_words_from_topic, vocab, tfidf):
        res = es.search(index='wiki4',
                        body={"query": {"match": {"_all": topics}}},
                        size=num_results_from_es)
        results.append({'topics': topics, 'result': res, 'topicid': topicid})
    results = add_keywords(results, kera)
    df = pd.DataFrame(results)
    df.to_csv('nowiki_4_with_kera_250_topics.csv', encoding='utf-8')
def main(coursesList):
    lda = LDA.load("./best_model.lda")
    dictionary = Dictionary.load("best_model.lda.id2word")
    bigrams = Phraser.load("./bigram_model.pkl")
    trigrams = Phraser.load("./trigram_model.pkl")

    text_clean = [doc.split(' ') for doc in coursesList['description']]
    corpus = [dictionary.doc2bow(text) for text in text_clean]
    create_vector_topics(lda, corpus, dictionary, coursesList)
    courses_topic = config.matrix_courses_topic.to_numpy()
    # lda, dictionary, bigrams, trigrams = create_LDA_model(coursesList)
    # courses_topic = config.matrix_courses_topic.to_numpy()

    cursor.execute("select id from auth_group")
    id_groups = cursor.fetchall()
    for i in id_groups:
        cursor.execute(
            "select distinct studyplan_id from students where group_id = %(id)s ",
            {'id': i[0]})
        studyplan_id = cursor.fetchall()
        for j in studyplan_id:
            subject_list = pd.DataFrame(columns=['id_subject', 'description'])
            subject_list = WordProcessing.word_processing(
                get_work_program(j[0], subject_list))
            # for k in subject_list:
            token_stud_prog = [
                program.split(' ') for program in subject_list['description']
            ]
            # token_stud_prog = add_n_grams(token_stud_prog, bigrams, trigrams)
            prog_corp = [
                dictionary.doc2bow(program) for program in token_stud_prog
            ]
            topic_prog = lda.get_document_topics(prog_corp)
            for l in range(0, len(topic_prog)):
                profile_student = np.zeros(config.num_lda_topic)
                dense_topic_prog = np.zeros(config.num_lda_topic)
                for m in topic_prog[l]:
                    dense_topic_prog[m[0]] += m[1]
                # mask = np.argsort(dense_topic_prog)[::-1][:1]
                # profile_student[mask] += 1
                profile_student = dense_topic_prog
                cosine_similarities = linear_kernel(
                    profile_student.reshape(1, -1), courses_topic).flatten()
                top_courses = np.where(cosine_similarities >= 0.2)[0]
                print(subject_list.loc[l, 'id_subject'])
                # print(top_courses)
                print(coursesList.loc[top_courses, 'name':'link'])
def latentDir():
    # just a quick test of whether the LDA model can actually be loaded here
    import gensim
    from gensim import corpora
    from gensim.corpora.dictionary import Dictionary
    from gensim.models.ldamodel import LdaModel
    import os

    os.chdir('D:/[Projects]/corpus/wiki2')
    mm_corp = corpora.MmCorpus('./LDA/wiki_mini_bow.mm')
    id2word = Dictionary.load('./LDA/wiki_mini.dict')
    lda = LdaModel.load('./LDA/lda_model_mini_wiki.model')
    if lda is not None:
        print('LDA model loaded successfully')
    else:
        print('failed to load LDA model')
    return
def __init__(self, examples, tokenizer, lda_vocab_path, lda_model_path, args):
    self.data = examples
    self.tokenizer = tokenizer

    # add new special tokens
    self.spec_tokens = load_special_tokens(args)
    self.tokenizer.additional_special_tokens = self.spec_tokens
    self.tokenizer.add_tokens(self.spec_tokens)

    self.args = args
    self.item_vocab = load_item_vocab(args)
    self.lda_vocab = Dictionary.load(lda_vocab_path)
    self.lda_model = LdaMulticore.load(lda_model_path)
    self.sent_lim = [
        self.args.cp_sentNum, self.args.desc_sentNum,
        self.args.require_sentNum, self.args.benefit_sentNum
    ]
    self.text_fields = self.data[0]._fields[:4]
def questions_to_keywords(questions, per_question):
    with open('corpus.json') as bowfile:
        bow = json.load(bowfile)
    dictionary = Dictionary.load('dictionary.dict')

    if per_question:
        keywords_per_question = []
        for question in questions:
            words = preprocess_question(question)
            keywords_per_question.append(
                tf_idf_keywords(words, bow, dictionary))
        return keywords_per_question
    else:
        for i in range(len(questions)):
            question = questions[i]
            words = preprocess_question(question)
            questions[i] = words
        questions = [word for question in questions for word in question]
        return tf_idf_keywords(questions, bow, dictionary)
def corpus_tfidf():
    path = ""
    corpus = MmCorpus(path + "corpus.mm")
    id2word = Dictionary.load(path + 'corpus.mm.dict')

    # TF-IDF the corpus
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    tfidf.save("5_topics_tfidf_only.model")

    # better model: models.LdaMulticore(corpus_tfidf, num_topics=5, id2word=id2word, passes=2, workers=4)
    lda_model_tfidf = models.LdaModel(corpus_tfidf, num_topics=5, id2word=id2word)
    # a measure of how good the model is; lower is better
    print('\nPerplexity: ', lda_model_tfidf.log_perplexity(corpus))
    for idx, topic in lda_model_tfidf.print_topics(-1):
        print('Topic: {} Word: {}'.format(idx, topic))
    lda_model_tfidf.save(path + "5_topics_test.model")
    # lda_model_tfidf.wv.save(path + "5_topics_test_kv.model")  # LdaModel has no `wv` attribute; this call would raise AttributeError
def buildDictionary(force=False):
    """Build a dictionary in which each post corresponds to a document."""
    global globalDict
    if force or not isfile(dictName):
        postids = getPostids()
        numPosts = len(postids)
        count = 0
        for postid in postids:
            if count % 100 == 0:
                print "Added %d out of %d to dictionary: %s" % (count, numPosts, time.strftime("%H:%M:%S"))
            addPostToDict(postid)
            count += 1
    else:
        globalDict = Dictionary.load(dictName)
    # Filter out extremely common words
    globalDict.filter_extremes(no_below=2, no_above=0.5)
def __init__(self, analyzed_items_path=None, dictionary_path=None,
             corpus_path=None, tfidf_model_path=None):
    if dictionary_path:
        self.dictionary = Dictionary.load(dictionary_path)
    else:
        self.dictionary = None

    if analyzed_items_path:
        self.analyzed_items_path = analyzed_items_path
    else:
        self.analyzed_items_path = None

    if corpus_path:
        self.corpus = MmCorpus(corpus_path)
    else:
        self.corpus = None

    if tfidf_model_path:
        self.tfidf_model = TfidfModel.load(tfidf_model_path)
    else:
        self.tfidf_model = None
def build_lda_model(self, topics: int = 20):
    ignore_words = [
        'like', 'know', 'f**k', 'f*****g', 'want', 'shit', 'know', 'sure',
        'isn', 'CHANBOARD', 'think', 'people', 'good', 'time', 'going',
        'WEBLINK', 'got', 'way', ''
    ]
    filename = op.join(self.input_dir, f'{self.board}.dictionary')
    dictionary: Dictionary = Dictionary.load(filename)
    documents = ReadThreads(
        self.board, input_dir=self.input_dir, file_type='phrases',
        return_func=lambda x, y: dictionary.doc2bow(
            [w for w in y.split() if w not in ignore_words]
        )
    )
    lda = LdaMulticore(
        documents, id2word=dictionary, num_topics=topics, iterations=2)

    filename = op.join(self.input_dir, f'{self.board}.lda')
    lda.save(filename)

    return lda
def matcher_attribute_descriptions(path, text1, text2):
    # using gensim LDA
    temp_file = datapath(path + 'lda_model')
    lda = LdaModel.load(temp_file)
    dictionary = Dictionary.load(path + 'dict')
    # common_dictionary = Dictionary(common_texts)
    # print(lda.print_topics(5))

    text1 = rm_special_chars(text1)
    text2 = rm_special_chars(text2)
    text1 = text1.split()
    text2 = text2.split()
    corpus = [text1, text2]
    # print(corpus)
    corpus = [dictionary.doc2bow(text) for text in corpus]
    # print(corpus)

    vector1 = lda[corpus[0]]
    vector2 = lda[corpus[1]]
    from pprint import pprint
    # pprint(vector1)
    # pprint(vector2)

    vector1 = sorted(vector1, key=lambda x: x[1], reverse=True)
    vector2 = sorted(vector2, key=lambda x: x[1], reverse=True)
    print(vector1)
    print(vector2)

    topics1 = [(dictionary[tup[0]], tup[1]) for tup in vector1]
    topics2 = [(dictionary[tup[0]], tup[1]) for tup in vector2]
    print(topics1)
    print(topics2)
    return
def _create_dictionary(self, mongo_client):
    """
    Creates the gensim Dictionary (gensim.corpora.dictionary.Dictionary) or loads it
    if it already exists, and sets the object's dictionary property.

    :param mongo_client: server.db.MongoClientContext
    """
    from gensim.corpora.dictionary import Dictionary

    if self._resource_exists(self.dictionary_file):
        self.logger().debug(
            "Dictionary file found, loading it [%s]" % self._create_resource_path(self.dictionary_file))
        self._dictionary = Dictionary.load(self._create_resource_path(self.dictionary_file))
    else:
        self.logger().debug("Dictionary file not found, creating a new Dictionary file")
        self._dictionary = Dictionary()
        documents = []
        for doc in [di for d in mongo_client.scrappers_collections() for di in d.find()]:
            documents.append(self.tokenize_sentence(doc[self.considerable_doc_property]))
        self.logger().debug("Adding %d documents to dictionary (will skip existing ones)" % len(documents))
        self._dictionary.add_documents(documents)
        self._dictionary.save(self._create_resource_path(self.dictionary_file))
def questions_to_keywords(questions, per_question):
    with open('corpus.json') as bowfile:
        bow = json.load(bowfile)
    dictionary = Dictionary.load('dictionary.dict')

    if per_question:
        keywords_per_question = []
        for question in questions:
            question = question.lower()
            question = re.sub("'", ' ', question).replace('_', ' ').replace(' -', ' ')
            question = re.sub(r'[^A-Za-z^-]', ' ', question)
            question = re.sub(r'\s+', ' ', question)
            words = [
                word for word in question.split()
                if word not in stopwords.words('dutch')
            ]
            keywords_per_question.append(
                tf_idf_keywords(words, bow, dictionary))
        return keywords_per_question
    else:
        for i in range(len(questions)):
            question = questions[i]
            question = question.lower()
            question = re.sub("'", ' ', question).replace('_', ' ').replace(' -', ' ')
            question = re.sub(r'[^A-Za-z^-]', ' ', question)
            question = re.sub(r'\s+', ' ', question)
            words = [
                word for word in question.split()
                if word not in stopwords.words('dutch')
            ]
            questions[i] = words
        questions = [word for question in questions for word in question]
        return tf_idf_keywords(questions, bow, dictionary)
def update(self, name, n=500, method='FastICA'):
    settings = self._setstorage.load(encode_name(name))
    clusterer = Clusterer(settings)

    # load the models
    dictionary = Dictionary.load(os.path.join(DICTIONARY_PATH, settings[DICTIONARY]))
    ngram_size = len(dictionary[0])
    transformer = NgramTransformer(ngram_size)
    ldamodel = LdaModel.load(os.path.join(LDA_PATH, settings[LDA_MODEL]))

    # get the input
    segments = self._segstorage.load(name=settings[SEGMENT_NAME], limit=int(n))
    documents = [s.value for s in segments]

    # prepare args
    kwargs = {'dictionary': dictionary,
              'ngramtransformer': transformer,
              'ldamodel': ldamodel,
              'method': method}
    Xt = clusterer.fit_transform(documents, **kwargs)
    labels = clusterer.assign_labels(documents)
    data = self._make_data(Xt, labels, documents)
    return json.dumps({'result': 'OK', 'data': data})
# print title_corpus
# print title_corpus.dictionary
# description_corpus = Corpus_Column(fname, "FullDescription")
# print len(description_corpus)
# for word in description_corpus.get_texts():
#     a = 5
# joblib.dump(cnt, path_join(cache_dir, "counter_train_desc_nltk"), compress=3)

cnt = joblib.load(path_join(cache_dir, "counter_train_desc_nltk"))
for word, freq in cnt.most_common(10):  # [:-100:-1]
    print word, freq

# MmCorpus.serialize(path_join(cache_dir, "train_desc_nltk_corpus.pickle1"), description_corpus)
# description_corpus.dictionary.save(path_join(cache_dir, "train_desc_nltk_dic.pickle"))
dicti = Dictionary.load(path_join(cache_dir, "train_desc_nltk_dic.pickle"))
# dicti = description_corpus.dictionary
print dicti
# print description_corpus
# print description_corpus.dictionary
# print files.dictionary

# id2token = dicti.id2token
i = 0
for k, v in sorted(dicti.dfs.items(), key=operator.itemgetter(1), reverse=True):
    if i < 10:
        print dicti[k], v, "ID:", k
    i = i + 1

k = 0
print "printing token", k
print dicti[k], dicti.dfs[k], "ID:", k  # was `id2token[k]`, undefined because the assignment above is commented out
    fscore_np = np.asarray(fscore)
    mean_jaccard.append(np.mean(jacc_np))
    mean_bleu.append(np.mean(bleu_np))
    mean_cos.append(np.mean(cos_np))
    mean_fscore.append(np.mean(fscore_np))
    return np.max(np.asarray(mean_bleu)), np.max(
        np.asarray(mean_jaccard)), np.max(np.asarray(mean_cos)), np.max(
            np.asarray(mean_fscore))


GH_IDs, SO_IDs, GH_annotation_intersect, GH_annotation_union, SO_annotation_intersect, SO_annotation_union = load_annotations()

path = "/home/norberteke/PycharmProjects/Thesis/data/"

dictionary = Dictionary.load(path + 'GH_full_processed_Dictionary.dict')
corpus = MmCorpus(datapath(path + 'corpus_processed_GH_full.mm'))

texts = []
with open(path + 'GH_full_processed_corpus.csv', 'r') as f:
    reader = csv.reader(f)
    texts = list(reader)

terms = []
for (key, value) in dictionary.iteritems():
    terms.append(value)


def write_results_to_file(path, lda_model, max_bleu, max_jaccard, max_cos, max_fscore):
    with open(path, 'a') as f:
def fit(self):
    self._lda = LdaModel(corpus=self._corpus,
                         id2word=self._dictionary,
                         num_topics=self._num_topics,
                         distributed=True)

def get(self):
    return self._lda


def usage():
    print 'usage: ldalearner.py [segment_name] [dictionary_name] [resulting_model_name]'
    sys.exit(0)


if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    args = sys.argv[1:]
    if len(args) != 3:
        usage()
    segment_name = unicode(args[0])
    dict_path = os.path.join(DICTIONARY_PATH, args[1])
    dictionary = Dictionary.load(dict_path)
    model_path = os.path.join(LDA_PATH, args[2])
    corpus = SegmentCorpus(segment_name, dictionary, MongoSegmentStorage())
    learner = LdaLearner(corpus, dictionary)
    learner.fit()
    learner.get().save(model_path)
from __future__ import division
from collections import Counter, defaultdict
from gensim.corpora.dictionary import Dictionary
from lib.iterators import row_stream
from itertools import izip
from itertools import combinations
import networkx as nx

common, usefulness = defaultdict(int), defaultdict(int)
total = Dictionary.load("../working/titledict.pickle")

num_eng = 4
for eid in xrange(num_eng):
    for row in row_stream("../data/pruned_Train_%d.csv" % eid):
        ID, title, body, tags = row
        title_tokens = title.split()
        tags = set(tags.split())
        for token in title_tokens:
            if token in tags:
                common[token] += 1

for (hash_id, count) in total.dfs.iteritems():
    token = total[hash_id]
    usefulness[token] = common[token] / count

'''
Tag==>Tag recommender
'''
G = nx.Graph()
num_eng = 4
for eid in xrange(num_eng):
    for row in row_stream("../data/pruned_Train_%d.csv" % eid):
def load(self):
    if os.path.exists(self._lexicon_path):
        self.lexicon = Dictionary.load(self._lexicon_path)
    if os.path.exists(self._tfidf_path):
        self.tfidf = TfidfModel().load(self._tfidf_path)
def load(self):
    self._lda = LdaModel.load(self._model_file)
    self._dictionary = Dictionary.load(self._dict_file)
            break
    return segments


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Cluster segments')
    parser.add_argument('clustermodel', type=unicode,
                        help='The clusterer model to use.')
    args = parser.parse_args()

    setstorage = MongoSettingsStorage()
    docstorage = MongoDocumentStorage()
    segstorage = MongoSegmentStorage()

    logger.info('Loading clusterer model')
    settings = setstorage.load(encode_name(args.clustermodel))
    dictionary = Dictionary.load(os.path.join(DICTIONARY_PATH, settings[DICTIONARY]))
    ngram_size = len(dictionary[0])
    transformer = NgramTransformer(ngram_size)
    ldamodel = LdaModel.load(os.path.join(LDA_PATH, settings[LDA_MODEL]))
    logger.info('Clusterer model loaded!')

    kwargs = {'dictionary': dictionary,
              'ngramtransformer': transformer,
              'ldamodel': ldamodel,
              'method': 'LDA'}

    logger.info('Fitting clusterer')
    clusterer = Clusterer(settings)
    texts, labels = clusterer.get_training_data()
    clusterer.fit(texts, labels, **kwargs)
def scorer(model, dic):
    tfidf = TfidfModel.load(model)
    dictionary = Dictionary.load(dic)

    def score(words):
        return tfidf[dictionary.doc2bow(words)]

    return score
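# Sketch of how the closure returned by scorer might be used; the model and
# dictionary paths and the token list are assumptions, not from the original:
score = scorer('models/tfidf.model', 'models/corpus.dict')
print(score(['topic', 'model', 'gensim']))  # [(token_id, tf-idf weight), ...]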
def main(param_file=None):
    # setup
    p, base_path, output_dir = tools.setup(param_file)
    model_path = path.join(base_path, p['result_path'], p['model_label'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # train the model on the small marketing corpus
    preprocess = []
    if 'stoplist' in p.as_dict():
        stoplist = open(path.join(base_path, p['stoplist'])).readlines()
        stoplist = [unicode(s.strip(), encoding='utf-8').lower() for s in stoplist]

        def remove_stopwords(sentence):
            return [word for word in sentence if not word in stoplist]
        preprocess.append(remove_stopwords)

    if 'stemmer' in p.as_dict():
        stemmer = Stemmer.Stemmer(p['stemmer'])
        preprocess.append(stemmer.stemWords)

    if not p['model_label']:
        cor = TextFilesCorpus(path.join(base_path, p['corpus_path']),
                              no_below=p['no_below'],
                              no_above=p['no_above'],
                              preprocess=preprocess)
        dictionary = cor.dictionary
        pre = LogEntropyModel(cor, id2word=dictionary, normalize=True)
        lsi = LsiModel(pre[cor], id2word=dictionary, num_topics=p['num_topics'])
    else:
        dictionary = Dictionary.load(path.join(model_path, p['dict_name']))
        pre = SaveLoad.load(path.join(model_path, 'pre.model'))
        lsi = LsiModel.load(path.join(model_path, 'lsi.model'))
        lsi.num_topics = p['num_topics']

    test_cor_path = path.join(base_path, p['test_cor_path'])
    test_answers, gold_answers, ratings = [], [], []

    flist = glob.glob(path.join(test_cor_path, 'corpus_3', '*.txt'))
    for file in flist:
        match = re.search('data3_(\d)_\d+.txt', file)
        ratings.append(int(match.group(1)))
        with open(file) as f:
            doc = string.join(map(string.strip, f.readlines()))
            doc = utils.tokenize(doc, lower=True)
            for func in preprocess:
                doc = func(doc)
            corpus = lsi[pre[dictionary.doc2bow(doc)]]
            test_answers.append(corpus)

    flist = glob.glob(path.join(test_cor_path, 'corpus_3_golden', '*.txt'))
    for file in flist:
        with open(file) as f:
            doc = string.join(map(string.strip, f.readlines()))
            doc = utils.tokenize(doc, lower=True)
            for func in preprocess:
                doc = func(doc)
            corpus = lsi[pre[dictionary.doc2bow(doc)]]
            gold_answers.append(corpus)

    sim = MatrixSimilarity(test_answers)[gold_answers]
    mean_sim = np.mean(sim, axis=0)
    print 'pearsons corrcoef: %f' % np.corrcoef(ratings, mean_sim)[0, 1]
    print 'spearmans r: %f with p: %f' % stats.spearmanr(ratings, mean_sim)
def get_dictionary():
    return Dictionary.load(DICTIONARY_FILE)
out stopwords without an explicit list.

@author: dedan
'''

from __future__ import division
from gensim.corpora.dictionary import Dictionary
import pylab as plt
import numpy as np

min_freq = 1000
n_words = 200

stoplist = open('/Users/dedan/projects/mpi/data/stoplists/german_stoplist.txt').readlines()
stoplist = [unicode(s.strip(), encoding='utf-8').lower() for s in stoplist]

dic = Dictionary.load('/Users/dedan/projects/mpi/data/results/20110628-170809/dic.dict')

# word frequency distribution of the dictionary
freqs = np.array(dic.dfs.values())
freqs = freqs[freqs > min_freq]
plt.figure()
plt.subplot(3, 1, 1)
plt.hist(freqs, bins=100)
plt.title('distribution of word frequencies with frequency > %s' % min_freq)

# most frequent words in the dictionary
freqs = np.array([dic.dfs[dic.token2id[key]] for key in dic.token2id.keys()])
words = dic.token2id.keys()
idx = np.argsort(freqs)
freqs = freqs[idx[-n_words:]]