def test_saveAsText(self):
    """`Dictionary` can be saved as textfile."""
    tmpf = get_tmpfile('save_dict_test.txt')
    small_text = [
        ["prvé", "slovo"],
        ["slovo", "druhé"],
        ["druhé", "slovo"]]

    d = Dictionary(small_text)

    d.save_as_text(tmpf)
    with codecs.open(tmpf, 'r', encoding='utf-8') as file:
        serialized_lines = file.readlines()
        self.assertEqual(serialized_lines[0], u"3\n")
        self.assertEqual(len(serialized_lines), 4)

        # We do not know which word will have which index
        self.assertEqual(serialized_lines[1][1:], u"\tdruhé\t2\n")
        self.assertEqual(serialized_lines[2][1:], u"\tprvé\t1\n")
        self.assertEqual(serialized_lines[3][1:], u"\tslovo\t3\n")

    d.save_as_text(tmpf, sort_by_word=False)
    with codecs.open(tmpf, 'r', encoding='utf-8') as file:
        serialized_lines = file.readlines()
        self.assertEqual(serialized_lines[0], u"3\n")
        self.assertEqual(len(serialized_lines), 4)
        self.assertEqual(serialized_lines[1][1:], u"\tslovo\t3\n")
        self.assertEqual(serialized_lines[2][1:], u"\tdruhé\t2\n")
        self.assertEqual(serialized_lines[3][1:], u"\tprvé\t1\n")
def main():
    parser = ArgumentParser()
    parser.add_argument('-d', '--dataset')
    parser.add_argument('-p', '--dataset-path', default=default_dataset_path())
    parser.add_argument('-o', '--output')
    opts = parser.parse_args()

    dataset_name = opts.dataset
    dataset_path = opts.dataset_path
    out_fn = opts.output

    if not out_fn:
        logging.error('--output argument required ...')
        parser.print_usage()
        sys.exit(1)

    if not dataset_name:
        logging.error('--dataset argument required ...')
        parser.print_usage()
        sys.exit(1)

    if dataset_name == 'newsgroups':
        corpus = (preprocess_ng(doc)
                  for doc in newsgroups.iterator(download_file(newsgroups.NEWSGROUPS_ARCHIVE_URL,
                                                               dataset_path)))
    elif dataset_name == 'ndt':
        dataset = NDTDataset(dataset_path=dataset_path)
        dataset.install()

        corpus = (preprocess_ndt(doc) for doc in dataset)
    else:
        logging.error('Unknown dataset %s ...' % dataset_name)
        sys.exit(1)

    d = Dictionary(corpus)
    d.save_as_text(out_fn, sort_by_word=False)
def produce(self):
    print('Getting src docs')
    docs = []
    doctokens = []  # aka Gensim's "text"
    stopwords = nltk.corpus.stopwords.words('english')
    for doc in self.src_doc_generator():
        (doc_id, doc_label, doc_str) = doc
        docs.append(doc)
        doctokens.append([token for token in nltk.word_tokenize(doc_str) if token not in stopwords])
        if len(docs) % 1000 == 0:
            print(len(docs))

    print('Creating the dictionary')
    dictionary = Dictionary(doctokens)
    #dictionary.compactify()
    #dictionary.filter_extremes(keep_n=None)
    if self.dictfile:
        dictionary.save_as_text(self.dictfile + '.dict', sort_by_word=True)

    with self.dbi as db:
        print('Creating WORD')  # aka Gensim's "dictionary"
        db.create_table('word')
        for word_id, word_str in dictionary.iteritems():
            db.cur.execute('INSERT INTO word (word_id, word_str) VALUES (?,?)', (word_id, word_str))

        print('Creating DOC and DOCWORD')
        db.create_table('doc')
        db.create_table('docword')
        for doc_idx, doc in enumerate(docs):
            db.cur.execute('INSERT INTO doc (doc_index, doc_id, doc_label, doc_str) VALUES (?,?,?,?)',
                           (doc_idx, doc[0], doc[1], doc[2]))
            doc_id = doc[0]
            for word_id, word_count in dictionary.doc2bow(doctokens[doc_idx]):
                word_str = dictionary.get(word_id)  # Is this valid? I believe it is.
                db.cur.execute('INSERT INTO docword (doc_index, doc_id, word_id, word_str, word_count) VALUES (?,?,?,?,?)',
                               (doc_idx, doc_id, word_id, word_str, word_count))
def test_saveAsText_and_loadFromText(self):
    """`Dictionary` can be saved as textfile and loaded again from textfile."""
    tmpf = get_tmpfile('dict_test.txt')
    d = Dictionary(self.texts)
    d.save_as_text(tmpf)

    # does the file exist
    self.assertTrue(os.path.exists(tmpf))

    d_loaded = Dictionary.load_from_text(get_tmpfile('dict_test.txt'))
    self.assertNotEqual(d_loaded, None)
    self.assertEqual(d_loaded.token2id, d.token2id)
def test_saveAsText_and_loadFromText(self):
    """`Dictionary` can be saved as textfile and loaded again from textfile."""
    tmpf = get_tmpfile('dict_test.txt')
    for sort_by_word in [True, False]:
        d = Dictionary(self.texts)
        d.save_as_text(tmpf, sort_by_word=sort_by_word)
        self.assertTrue(os.path.exists(tmpf))

        d_loaded = Dictionary.load_from_text(tmpf)
        self.assertNotEqual(d_loaded, None)
        self.assertEqual(d_loaded.token2id, d.token2id)
def build_dictionary():
    corpus = CorpusIterator(dir_list=dir_list)
    dictionary = Dictionary(corpus)
    dictionary.save_as_text(
        '/home/andre/Develop/corpora/lsamodel_wordids.txt.bz2')
    dictionary.filter_extremes(no_below=10, no_above=0.1, keep_n=500000)
    dictionary.save_as_text(
        '/home/andre/Develop/corpora/lsamodel_wordids_filtered.txt.bz2')
def create_dictionary(doc_iterator, dict_file, as_text=False):
    """
    Creates a gensim.corpora.Dictionary object from a given document iterator and
    serializes it to the given dict_file (filename) in a memory-efficient way.

    @Params:
    as_text - flag: dictionary saved as text (default: binary)
    """
    d = Dictionary(doc.strip().lower().split() for doc in doc_iterator)
    if as_text:
        d.save_as_text(dict_file)
    else:
        d.save(dict_file)
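# --- Hedged usage sketch for create_dictionary() above. The sample documents,
# --- file names, and reload calls are illustrative assumptions, not from the source.
from gensim.corpora import Dictionary

sample_docs = ["First toy document", "Second toy document"]
create_dictionary(iter(sample_docs), "corpus.dict")                    # binary, via Dictionary.save()
create_dictionary(iter(sample_docs), "corpus_dict.txt", as_text=True)  # plain text, via save_as_text()

# Each save format has a matching loader in gensim:
d_bin = Dictionary.load("corpus.dict")
d_txt = Dictionary.load_from_text("corpus_dict.txt")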
def Gensim_Dic(sentences, tmp_fname):
    dct = Dictionary(sentences)
    # collect the ids of stopwords that made it into the dictionary
    a = []
    for w in stopwords:
        if w in dct.token2id.keys():
            a.append(dct.token2id[w])
    dct.filter_extremes(no_below=10)
    dct.filter_tokens(bad_ids=a)
    dct.compactify()
    dct.save_as_text(tmp_fname)
def get_dictionary(self):
    tmp_fname = self.path + "lda.dictionary"
    if os.path.exists(tmp_fname):
        return Dictionary.load_from_text(tmp_fname)
    else:
        print("Creating dictionary.")
        docs_by_id = read_ap.get_processed_docs()
        docs = [doc for doc_id, doc in docs_by_id.items()]
        dictionary = Dictionary(docs)
        dictionary.save_as_text(tmp_fname)
        return dictionary
def get_dictionary(self):
    tmp_fname = self.path + self.model_type + "_dictionary"
    if os.path.exists(tmp_fname):
        return Dictionary.load_from_text(tmp_fname)
    else:
        print("Creating dictionary.")
        docs_by_id = read_ap.get_processed_docs()
        docs = [doc for doc_id, doc in docs_by_id.items()]
        dictionary = Dictionary(docs)
        dictionary.filter_extremes(no_below=20, no_above=0.5)
        dictionary.save_as_text(tmp_fname)
        return dictionary
def main():
    global dictionary
    try:
        dictionary = Dictionary.load_from_text("persist/reuters_dictionary.txt")
        #dictionary = Dictionary.load_from_text("persist/wiki_stem-False_keep-100000_nobelow-20_noabove-0.1_wordids.txt.bz2")
    except:
        dictionary = Dictionary(ReutersCorpus())
        dictionary.filter_extremes()
        dictionary.save_as_text("persist/reuters_dictionary.txt")

    models = train_models()

    if settings["models"]["bow"]:
        bowmodel = BOWmodel()
        bowmodel.__out_size = len(dictionary)
        models["bow"] = bowmodel

    if settings["models"]["noise"]:
        noisemodel = NoiseModel(1000)
        noisemodel.__out_size = 1000
        models["noise"] = noisemodel

    num_train_samples = 21578 - settings["held_out_docs"]
    test_samples = []

    class generate_train_samples(object):
        first_iteration = True

        def __iter__(self):
            count = 0
            for document in stream_reuters_documents():
                sample = document["content"], "acq" in document["topics"]  # todo: maybe try "usa" or "earn"
                if count > num_train_samples:
                    if self.first_iteration:
                        test_samples.append(sample)
                    else:
                        yield sample
                count += 1
            self.first_iteration = False

    classifiers = train_classifiers(models, generate_train_samples())

    classifications = run_evaluation(classifiers, models, test_samples)
    #output_results(classifications)
    return classifications
def build_vocab():
    start = time.time()
    test_path = os.path.join(config.DATA_PATH, 'test.csv')
    train_path = os.path.join(config.DATA_PATH, 'train.csv')
    normalized_text_path = os.path.join(config.PROCESSED_PATH, 'normalized_comments.txt')
    bigram_path = os.path.join(config.PROCESSED_PATH, 'bigram')
    bigram_comments_path = os.path.join(config.PROCESSED_PATH, 'bigram_commnets.txt')

    if config.PROCESSED_PATH not in os.listdir(config.DATA_PATH):
        try:
            os.mkdir(config.PROCESSED_PATH)
        except OSError:
            pass

    vocab = {}
    train_df = read_file(train_path)
    test_df = read_file(test_path)

    print('tokenizing vocab file')
    texts = np.concatenate([train_df.comment_text.fillna('N/A').values,
                            test_df.comment_text.fillna('N/A').values])

    with open(normalized_text_path, 'w') as f:
        processed_text = parallelize_dataframe(texts, tokenizer)
        for line in processed_text:
            f.write(line + '\n')

    gc.collect()

    lines = LineSentence(normalized_text_path)
    bigram = Phrases(lines)
    bigram.save(bigram_path)
    phraser = Phraser(bigram)

    with open(bigram_comments_path, 'w', encoding='utf_8') as f:
        for comment in lines:
            comm = u' '.join(phraser[comment])
            f.write(comm + '\n')

    comments = LineSentence(bigram_comments_path)
    bigram_dict = Dictionary(comments)
    bigram_dict.filter_extremes(no_below=config.THRESHOLD)
    bigram_dict.save_as_text(config.VOCAB_PATH)
    bigram_dict.add_documents([['<pad>']])

    with open(os.path.join(config.ROOT, 'src', 'config.py'), 'a') as f:
        f.write('VOCAB_SIZE = {}'.format(len(bigram_dict)))

    print('time passed: {} minutes'.format((time.time() - start) / 60))
def get_data_tokenizer(fromdate, todate):
    print 'Starting get and save data from mysql-server into local folder....'
    fromdate = fromdate + ' 00:00:00'
    todate = todate + ' 23:59:59'
    connection = my_connection.getConnection()
    cursor = connection.cursor()
    query = 'SELECT id, vntokenizer, catid FROM news WHERE create_time BETWEEN ' + '\'' + fromdate + '\' AND \'' + todate + '\';'
    print query
    cursor.execute(query)
    rows = cursor.fetchall()

    count = 0
    token_dictionary = Dictionary()
    data = dict()
    for row in rows:
        id = row[0]
        tokenizer = row[1]
        catid = row[2]
        if tokenizer != None:
            tokenizer = tokenizer.lower()
            count += 1
            print count
            print tokenizer
            token_list = tokenizer.split(' ')
            valid_token_list = list()
            for token in token_list:
                if my_util.check_valid_token(token):
                    valid_token_list.append(token)
            token_dictionary.add_documents([valid_token_list])
            if catid == my_catid:
                data[id] = valid_token_list

    my_connection.closeConnection(connection)

    # save dictionary and data into text file
    token_dictionary.save_as_text('..' + parameter.FILE_DICTIONARY)
    fb = open('..' + parameter.FILE_DATA, 'wb')
    pickle.dump(data, fb)
    fb.close()
    print 'Done get and save data from mysql-server!'
def prepare_word_embedding():
    """Construct vocabulary file and word embedding file."""
    df = pd.read_csv(
        "data/raw/train.csv", usecols=["original_phrase1", "original_phrase2", "ytrue"]
    )

    model = KeyedVectors.load_word2vec_format(
        "/data/mayu-ot/Data/Model/GoogleNews-vectors-negative300.bin.gz", binary=True
    )

    CUSTOM_FILTERS = [
        lambda x: x.lower(),
        strip_punctuation,
        strip_multiple_whitespaces,
        strip_numeric,
    ]

    doc = [preprocess_string(x, CUSTOM_FILTERS) for x in df.values[:, :2].ravel()]

    dct = Dictionary(doc)
    bad_ids = []
    for k, v in dct.iteritems():
        if v not in model:
            bad_ids.append(k)
    dct.filter_tokens(bad_ids)
    dct.compactify()

    for k, v in dct.iteritems():
        print(k, v)
        if k == 10:
            break

    dct.save_as_text("data/processed/dictionary.txt")

    word_emb = np.ones((len(dct), 300))
    for k, v in dct.iteritems():
        word_emb[k] = model[v]
    np.save("data/processed/word2vec", word_emb)
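# --- Hedged follow-up sketch: reload the artifacts written by prepare_word_embedding()
# --- and look up one vector. The token "phrase" is a hypothetical example;
# --- np.save above writes "data/processed/word2vec.npy".
from gensim.corpora import Dictionary
import numpy as np

dct = Dictionary.load_from_text("data/processed/dictionary.txt")
word_emb = np.load("data/processed/word2vec.npy")
vec = word_emb[dct.token2id["phrase"]]  # 300-d vector for a token kept in the dictionary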
def create_LDA_model(self):
    trigram_articles = LineSentence(self.trigram_articles_filepath)

    trigram_dictionary = Dictionary(trigram_articles)
    trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()
    trigram_dictionary.save_as_text(self.trigram_dictionary_filepath)
    # trigram_dictionary = Dictionary.load(self.trigram_dictionary_filepath)

    MmCorpus.serialize(self.trigram_bow_filepath,
                       self.trigram_bow_generator(self.trigram_articles_filepath,
                                                  trigram_dictionary))
    trigram_bow_corpus = MmCorpus(self.trigram_bow_filepath)
    print(trigram_bow_corpus)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=20,
                           id2word=trigram_dictionary,
                           workers=3)
    lda.save(self.lda_model_filepath)
def produce(self):
    doc_n = 0
    docs = []
    doctokens = []  # AKA gensim "text"
    stopwords = nltk.corpus.stopwords.words('english')

    NOALPHA = re.compile('[^a-z]+')

    def prep_string(my_string, pattern=NOALPHA):
        return re.sub(pattern, ' ', my_string.strip().lower())

    print('Getting src docs')
    for doc in self.src_doc_generator():
        content = re.sub(NOALPHA, ' ', doc)  # Do this in the corpus generator?
        docs.append(content)
        doctokens.append([token for token in nltk.word_tokenize(content) if token not in stopwords])
        doc_n += 1
        if doc_n % 1000 == 0:
            print(doc_n)

    print('Creating the dictionary')
    dictionary = Dictionary(doctokens)
    dictionary.compactify()
    dictionary.filter_extremes(keep_n=None)
    if self.dictfile:
        dictionary.save_as_text(self.dictfile + '.dict', sort_by_word=True)

    with self.dbi as db:
        print('Creating DOC')
        db.create_table('doc')
        for i, doc in enumerate(docs):
            db.cur.execute('INSERT INTO doc VALUES (?,?)', (i, doc))

        print('Creating WORD')
        db.create_table('word')
        for item in dictionary.iteritems():
            db.cur.execute('INSERT INTO word (word_id, word_str) VALUES (?,?)', item)

        print('Creating DOCWORD')
        db.create_table('docword')
        for i, tokens in enumerate(doctokens):
            for item in dictionary.doc2bow(tokens):
                db.cur.execute('INSERT INTO docword (doc_id, word_id, word_count) VALUES (?,?,?)',
                               [i, item[0], item[1]])
def main():
    parser = argparse.ArgumentParser(description='creates an id2author mapping gensim dictionary, a document->authorid contributions MatrixMarket file and a binary article title file from a given WikiMedia *-pages-meta-history dump (considering only articles in mainspace!)')
    parser.add_argument('--history-dump', type=argparse.FileType('r'), help='path to input WikiMedia *-pages-meta-history file (.xml/.xml.bz2)', required=True)
    parser.add_argument('--id2author', type=argparse.FileType('w'), help='path to output text id2author dictionary (.txt/.txt.bz2)', required=True)
    parser.add_argument('--contribs', type=argparse.FileType('w'), help='path to output MatrixMarket contributions .mm file; also creates a binary article title file CONTRIBS.metadata.cpickle', required=True)
    parser.add_argument('--contribution-value', choices=CONTRIBUTION_VALUE_FUNCTIONS, help='calculated per-contribution value; choices: {}'.format(CONTRIBUTION_VALUE_FUNCTIONS.keys()), required=True)
    parser.add_argument("--namespace-prefixes", type=argparse.FileType('r'), help='file of namespace prefixes to ignore')

    args = parser.parse_args()
    input_history_dump_path = args.history_dump.name
    output_id2author_path = args.id2author.name
    output_contribs_path = args.contribs.name
    contribution_value = args.contribution_value
    namespace_prefixes = read_lines(args.namespace_prefixes.name) if args.namespace_prefixes else ()

    logger.info('running with:\n{}'.format(pformat({'input_history_dump_path': input_history_dump_path,
                                                    'output_id2author_path': output_id2author_path,
                                                    'output_contribs_path': output_contribs_path,
                                                    'contribution_value': contribution_value,
                                                    'namespace_prefixes': namespace_prefixes})))

    # build the id2author dictionary: maps author names of registered, non-bot authors to IDs and vice versa
    with smart_open(input_history_dump_path) as history_dump_file:
        logger.info('generating author->id mappings')
        history_dump = xml_dump.Iterator.from_file(history_dump_file)
        # use gensim's id2word dictionary as the id2author dictionary: authors correspond to terms
        id2author = Dictionary(get_revision_authors_of_pages(history_dump, namespace_prefixes))
        logger.info('found {} different authors'.format(len(id2author)))
        logger.info('removing non-registered authors')
        remove_from_dictionary(id2author, is_registered_user)
        logger.info('reduced to {} registered authors'.format(len(id2author)))
        logger.info('removing bots')
        remove_from_dictionary(id2author, is_not_bot_user)
        logger.info('reduced to {} registered non-bot authors'.format(len(id2author)))
        id2author.compactify()
        id2author.save_as_text(output_id2author_path)

    # compute & store (author ID, revision value) entries for revisions of valid authors across all articles
    with smart_open(input_history_dump_path) as history_dump_file:
        logger.info('generating MatrixMarket representation per revision: (docid, authorid, value of revision)')
        history_dump = xml_dump.Iterator.from_file(history_dump_file)
        revision_value_fun = CONTRIBUTION_VALUE_FUNCTIONS[contribution_value]
        doc_auth_contribs = MetadataCorpus(get_revision_values(get_revisions_of_pages(history_dump, namespace_prefixes),
                                                               id2author, revision_value_fun))
        MmWriter.write_corpus(output_contribs_path, corpus=doc_auth_contribs, num_terms=len(id2author),
                              index=False, progress_cnt=10000, metadata=True)
def main(argv=None):
    if argv is None:
        argv = sys.argv

    print('Creating speech serialized corpus')
    # Create the speech corpus, it is inside the rawfile as a json format:
    # "id0": {"text": [" "], "url": "http://www.americanrhetoric.com/"}
    with open(RAWFILE, 'r') as f:
        speech_dict = json.load(f)
    with open(RAWIDS, 'r') as f:
        id_dict = json.load(f)

    # We also need to make sure that the article ids are saved in the correct
    # format so that the gensimple engine can understand it, like this:
    # "int": ["url", "title"],
    texts = []
    article_dict = {}
    counter = 0
    for key, value in speech_dict.items():
        texts.append([token for token in value['text']])
        article_dict[str(counter)] = [value['url'], id_dict[key]['title']]
        counter += 1

    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)

    dictionary = Dictionary(texts)
    dictionary.save_as_text(DICTFILE)
    corpus = [dictionary.doc2bow(text) for text in texts]
    MmCorpus.serialize(MMFILE, corpus)
    print('Speech serialized corpus created')

    # Now run LSI on TF-IDF
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)

    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)

    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
class Index(object): """define an index instance along with its associated methods""" def __init__(self, stops, minsize=3): """initialize index variables""" self.ix = None self.tokenizer = StandardAnalyzer(stoplist=stops, minsize=minsize) self.umls = umls.UMLSLookup() self.term_dict = {} self.token2cuis = {} self.concept_dict = {"__NULL__": 0} self.synsets = {} def get_doc_ids(self, corpus_path, corpus_name): """get doc ids from corpus""" if "OHSUMED" in corpus_name: docs = safir_utils.gen_trec_doc(corpus_path) elif "TREC_CDS" in corpus_name: docs = safir_utils.gen_cds_doc(corpus_path) return [docno for docno, doc in docs] def only_digits(self, token): """check whether input token contains only digits and/or punctuation""" return all(char.isdigit() or char in string.punctuation for char in token) def preprocess_text(self, text, tags=False, remove_digits=True): """preprocess text: tokenize docs, lowerize text, remove words with length < min_size, remove tags, remove only-digits tokens and remove stopwords""" if tags: # remove tags text = strip_tags(text) if remove_digits: # tokenize and remove digits-only tokens text = [ token.text for token in self.tokenizer(text) if not self.only_digits(token.text) ] else: # tokenize and keep digits-only tokens text = [token.text for token in self.tokenizer(text)] # return preprocessed doc return text def preprocess_corpus(self, corpus_path, corpus_name, out_corpus, out_ids): """preprocess corpus: apply preprocess_text to each doc within corpus""" if "OHSUMED" in corpus_name: docs = safir_utils.gen_trec_doc(corpus_path) elif "TREC_CDS" in corpus_name: docs = safir_utils.gen_cds_doc(corpus_path) # tokenize docs print("pre processing docs...") #pproc_corpus = [self.preprocess_text(doc) for docno, doc in docs] pproc_corpus = [] doc_ids = [] # iterate over docs and store pre processed docs and docnos for docno, doc in docs: pproc_corpus.append(self.preprocess_text(doc)) doc_ids.append(docno) print("pre processing finished!") # store pproc_corpus print("store pre processed corpus in {}".format(out_corpus)) with open(out_corpus, 'w') as outf: json.dump(pproc_corpus, outf) # store docnos print("store doc_ids in {}".format(out_ids)) with open(out_ids, 'w') as outf: json.dump(doc_ids, outf) # return pproc_corpus and doc_ids return pproc_corpus, doc_ids def load_pproc_corpus(self, fname): """load stored pre processed corpus""" with open(fname, 'r') as inf: pproc_corpus = json.load(inf) return pproc_corpus def load_doc_ids(self, fname): """load stored doc ids""" with open(fname, 'r') as inf: doc_ids = json.load(inf) return doc_ids def index_corpus(self, pproc_corpus, fname): """index pre processed corpus using gensim dictionary - fast doc2bow, doc2idx conversion""" self.ix = Dictionary(pproc_corpus) self.ix.save_as_text(fname) return True def load_index(self, fname): """load stored index""" self.ix = Dictionary.load_from_text(fname) return True def build_term_dict(self, pproc_corpus, fname, dict_size=131072, remove_digits=True, min_df=2, max_df=0.5): """create term dictionary""" ttf = {} # filter terms with df lower than 2 and greater than 0.5 (in %) and store their ttf for doc in tqdm(pproc_corpus): # get doc in bow format bow = self.ix.doc2bow(doc) for idx, tf in bow: if self.ix.dfs[idx] >= 2 and self.ix.dfs[ idx] / self.ix.num_docs <= 0.5: if idx in ttf: ttf[idx] += tf else: ttf[idx] = tf # convert ttf dict into counter and keep dict_size most frequent terms count = Counter(ttf).most_common(dict_size) # create term dict - two-levels encoding (i.e. 
self.term_dict[self.ix.token2id[token]]) for idx, ttf in count: self.term_dict[idx] = len(self.term_dict) # store term dictionary with open(fname, 'w') as outf: json.dump(self.term_dict, outf) return True def load_term_dict(self, fname): """load term dictionary""" with open(fname, 'r') as inf: self.term_dict = json.load(inf) # convert keys from str back to int - json stores dict keys as str self.term_dict = { int(ix_term): dict_term for ix_term, dict_term in self.term_dict.items() } return True def get_pos2token(self, text): """split text into tokens and return {pos: [token, ["__NULL__"]]}""" pos2token = {} tokens = text.split( ) # split on whitespaces as text has been already pre processed # set text index index = text.index running_offset = 0 # loop over tokens for token in tokens: token_offset = index(token, running_offset) token_len = len(token) # update running offset running_offset = token_offset + token_len pos2token[token_offset] = [self.ix.token2id[token], ["__NULL__"] ] # note: ["__NULL__"] is for later use return pos2token def associate_token2cuis(self, pos2token, terms_candidate_cuis): """return list of (token, [cui1, cui2, ...]) pairs given token position and candidate concepts""" for term_cuis in terms_candidate_cuis: # get positional information start = term_cuis[0]['start'] # check whether 'start' matches with any pos2token key if start in pos2token: # update ["__NULL__"] with candidate cuis pos2token[start][1] = [concept['cui'] for concept in term_cuis] # return pos2token values only - i.e. (term, [cui1, cui2, ...]) pairs return list(pos2token.values()) def map_token2cuis(self, fname, threshold=1.0, stypes_fname=None): """map candidate cuis to each token in the index""" terms_str = ' '.join(list(self.ix.token2id.keys())) # split term_str into substrings of length <= 999999 - max length allowed by scipy parser substrs = wrap(terms_str, width=999999, break_long_words=False, break_on_hyphens=False) if stypes_fname is not None: # load user-specified UMLS semantic types print("user-specified UMLS semantic types for QuickUMLS enabled") semtypes = ','.join(safir_utils.load_semtypes(stypes_fname)) else: # keep default QuickUMLS semantic types semtypes = None # initialize QuickUMLS server server = QuickUMLS(window=1, threshold=threshold, semtypes=semtypes) server.launch_quickumls() # initialize concept matcher matcher = get_quickumls_client() token2cuis = [] # extract concepts from substrs for substr in substrs: terms_candidate_cuis = matcher.match(substr) # get position dict: {pos: [token, ["__NULL__"]]} given substr pos2token = self.get_pos2token(substr) # associate each token with its candidate concepts token2cuis += self.associate_token2cuis(pos2token, terms_candidate_cuis) # close connection with QuickUMLS server server.close_quickumls() # store token2cuis as dict self.token2cuis = dict(token2cuis) # store token2cuis with open(fname, 'w') as outf: json.dump(self.token2cuis, outf) return True def load_token2cuis(self, fname): """load token2cuis""" with open(fname, 'r') as inf: self.token2cuis = json.load(inf) # convert keys from str back to int - json stores dict keys as str self.token2cuis = { int(token): cuis for token, cuis in self.token2cuis.items() } return True def update_concept_dict(self, cui): """update concept dictionary""" if cui in self.concept_dict: return True else: self.concept_dict[cui] = len(self.concept_dict) return True def load_concept_dict(self, fname): """load concept dictionary""" with open(fname, 'r') as inf: self.concept_dict = json.load(inf) return 
True def update_synsets(self, cui, idx): """update synonyms set""" if self.concept_dict[ cui] in self.synsets: # add term to set of synonyms for the given cui self.synsets[self.concept_dict[cui]].add(self.term_dict[idx]) return True elif self.concept_dict[cui] != self.concept_dict[ "__NULL__"]: # initialize set of synsets for given cui self.synsets[self.concept_dict[cui]] = {self.term_dict[idx]} return True else: # do not update synsets return False def load_synsets(self, fname): """load synsets""" with open(fname, 'r') as inf: self.synsets = json.load(inf) # convert keys from str back to int - json stores dict keys as str self.synsets = {int(cui): syns for cui, syns in self.synsets.items()} return True def get_sense_pairs(self): """return senses as (term, cui) 2-dim np array""" syns = [ list(itertools.product(self.synsets[cui], [cui])) for cui in self.synsets ] synp = [list(itertools.combinations(syn, 2)) for syn in syns] return np.array(list(itertools.chain.from_iterable(synp))) def s_wsd(self, doc, table_name, query=False): """shallow word-sense disambiguation: disambiguate polysemous terms based on shallow word-concept connectivity within UMLS""" doc_cuis = {} # convert doc into doc2idx format doc2idx = self.ix.doc2idx(doc) # get cuis from doc tokens for idx in doc2idx: if idx in self.token2cuis and self.token2cuis[idx] != ["__NULL__"]: for cui in self.token2cuis[idx]: if cui in doc_cuis: # increase cui count doc_cuis[cui] += 1 else: # initialize cui count doc_cuis[cui] = 1 # perform shallow word-sense disambiguation enc_doc = [] for idx in doc2idx: if idx in self.term_dict: # disambiguate only for terms contained within self.term_dict max_edges = 0 # relative maximum connections (edges) if len(self.token2cuis[idx]) == 1: # monosemous term ref_cui = self.token2cuis[idx][0] if not query: # update concept dict and synsets self.update_concept_dict(ref_cui) self.update_synsets(ref_cui, idx) # encode (term, cui) pair enc_doc.append( [self.term_dict[idx], self.concept_dict[ref_cui]]) else: # polysemous term candidates = [] # loop over cadidate concepts for subj_cui in self.token2cuis[idx]: num_edges = 0 # number of edges if doc_cuis[ subj_cui] == 1: # subj_cui is only associated with current term (idx) obj_cuis = list( set(doc_cuis.keys()).difference({subj_cui})) else: # subj_cui is associated with other terms in the doc too obj_cuis = list(doc_cuis.keys()) num_edges += self.umls.compute_num_edges( subj_cui, obj_cuis, table_name) # verify connectivity if num_edges > max_edges: # set candidates to subj_cui candidates = [subj_cui] # update max_edges max_edges = num_edges else: # append subj_cui to candidates candidates.append(subj_cui) # keep head candidate - when disambiguation is not complete, it allows to get the most likely concept based on QuickUMLS ordering ref_cui = candidates[0] if not query: # update concept dict and synsets self.update_concept_dict(ref_cui) self.update_synsets(ref_cui, idx) # encode (term, cui) pair enc_doc.append( [self.term_dict[idx], self.concept_dict[ref_cui]]) else: # term oov continue return enc_doc def encode_corpus(self, pproc_corpus, corpus_name, ecorpus_fname, t2c_fname, cdict_fname, syn_fname, threshold=0.7, stypes_fname=None): """perform semantic indexing and encode corpus""" print("map UMLS concepts to (indexed) tokens") self.map_token2cuis(t2c_fname, threshold=threshold, stypes_fname=stypes_fname) # get UMLS concepts mapped to (indexed) tokens ix_concepts = { cui for cuis in self.token2cuis.values() for cui in cuis if cui != "__NULL__" } # create sql 
table to store relations between concepts associated to indexed tokens - allows for fast accessing compared to MRREL table print( "create table to store UMLS relations between concepts associated to (indexed) tokens - fast access is enabled by indexes" ) self.umls.restrict_to_ix_concepts(ix_concepts, corpus_name) # create indexes to speed up requests self.umls.create_index("CUI1_" + corpus_name, ["CUI1"], corpus_name) # create index for subject column self.umls.create_index("CUI2_" + corpus_name, ["CUI2"], corpus_name) # create index for object column self.umls.create_index( "CUI1_CUI2_" + corpus_name, ["CUI1", "CUI2"], corpus_name) # create multicolumn index (subj, obj) # encode corpus print("disambiguate polysemous tokens and encode corpus") enc_corpus = [ self.s_wsd(doc, corpus_name, query=False) for doc in tqdm(pproc_corpus) ] # store synsets as dict of lists - enables json encoding self.synsets = {cui: list(syns) for cui, syns in self.synsets.items()} # store semantic data and encoded corpus with open(ecorpus_fname, 'w') as outf: json.dump(enc_corpus, outf) with open(cdict_fname, 'w') as outf: json.dump(self.concept_dict, outf) with open(syn_fname, 'w') as outf: json.dump(self.synsets, outf) # return encoded corpus return enc_corpus def load_enc_corpus(self, fname): """load encoded corpus""" with open(fname, 'r') as inf: enc_corpus = json.load(inf) return enc_corpus def preprocess_query(self, query): """pre process query""" pproc_query = self.preprocess_text(query) return pproc_query def encode_query(self, pproc_query, corpus_name): """disambiguate polysemous terms and encode query""" enc_query = self.s_wsd(pproc_query, corpus_name, query=True) if not enc_query: print("query does not contain known terms") return None else: return np.array(enc_query) def project_query(self, query, corpus_name, word_embs, proj_weights, concept_embs=None): """project encoded query into dense vector of size [1, doc_embs]""" enc_query = self.encode_query(self.preprocess_query(query), corpus_name) if enc_query is None: return None else: if concept_embs is None: # only terms are considered return np.matmul(proj_weights, np.mean(word_embs[enc_query[:, 0]], axis=0)) else: # terms + concepts are considered (i.e. senses) return np.matmul( proj_weights, np.mean(np.add(word_embs[enc_query[:, 0]], concept_embs[enc_query[:, 1]]), axis=0)) def semantic_search(self, doc_ids, docs, query_ids, queries, ranking_folder, ranking_name): """perform search over queries using neural semantic models and return ranking""" doc_ids = np.array(doc_ids) print("compute similarities between docs and queries") similarities = cosine_similarity(docs, queries) out = open(ranking_folder + '/' + ranking_name + '.txt', 'w') for i in tqdm(range(similarities.shape[1])): rank = np.argsort(-similarities[:, i])[:1000] docs_rank = doc_ids[rank] qid = query_ids[i] if qid.isdigit( ): # cast to integer - this operation avoids storing topic ids as '0##' instead of '##' qid = str(int(qid)) # convert to int and then back to str for j in range(len(docs_rank)): out.write('%s %s %s %d %f %s\n' % (qid, 'Q0', docs_rank[j], j, similarities[rank[j]][i], ranking_name)) out.close() return True
def create_dictionary(self): YELP_DATASET_DIR = config.get("YELP_DATASET_DIR", None) SAVE_REVIEWS_BY_CATEGORY_DIRECTORY = config.get( "SAVE_REVIEWS_BY_CATEGORY_DIRECTORY", None) SAVE_DICTIONARY_DIR = config.get("SAVE_DICTIONARY_DIR", None) SAVE_BAG_OF_WORDS_DIR = config.get("SAVE_BAG_OF_WORDS_DIR", None) SAVE_N_BAG_OF_WORDS_DOCS_PER_FILE = int( config.get("SAVE_N_BAG_OF_WORDS_DOCS_PER_FILE", 25000)) if not (YELP_DATASET_DIR and SAVE_REVIEWS_BY_CATEGORY_DIRECTORY and SAVE_DICTIONARY_DIR and SAVE_BAG_OF_WORDS_DIR and SAVE_DICTIONARY_DIR): print( "config keys are not set correctly in the config file: socialconfig.py" ) exit(0) SAVE_UNFILTERED_DICTIONARY_DIR = os.path.join(SAVE_DICTIONARY_DIR, "Unfiltered") if not os.path.exists( SAVE_REVIEWS_BY_CATEGORY_DIRECTORY) and not os.path.isdir( SAVE_REVIEWS_BY_CATEGORY_DIRECTORY): raise ("Directory {d} does not exist".format( d=SAVE_REVIEWS_BY_CATEGORY_DIRECTORY)) if not (os.path.exists(SAVE_BAG_OF_WORDS_DIR) and os.path.isdir(SAVE_BAG_OF_WORDS_DIR)): os.makedirs(SAVE_BAG_OF_WORDS_DIR) if not (os.path.exists(SAVE_UNFILTERED_DICTIONARY_DIR) and os.path.isdir(SAVE_UNFILTERED_DICTIONARY_DIR)): os.makedirs(SAVE_UNFILTERED_DICTIONARY_DIR) for pardir, sub_dirs, files in os.walk( SAVE_REVIEWS_BY_CATEGORY_DIRECTORY): if len(files) > 0: error_count = 0 review_docs = [] negative_docs = [] positive_docs = [] doc_count = 0 docs_per_file = SAVE_N_BAG_OF_WORDS_DOCS_PER_FILE file_num = str((doc_count / docs_per_file) + 1) for file in files: if "yelp_reviews_" in file and "category" in pardir: reviews = get_reviews_iterable( os.path.join(pardir, file)) yelp_category = pardir.split('/')[-1] CATEGORY_SPECIFIC_BAG_OF_WORDS_DIR = os.path.join( SAVE_BAG_OF_WORDS_DIR, yelp_category) if not (os.path.exists( CATEGORY_SPECIFIC_BAG_OF_WORDS_DIR) and os.path .isdir(CATEGORY_SPECIFIC_BAG_OF_WORDS_DIR)): os.makedirs(CATEGORY_SPECIFIC_BAG_OF_WORDS_DIR) fname = os.path.join( SAVE_BAG_OF_WORDS_DIR, yelp_category, "{cat}_file_{file_num}.txt".format( cat=yelp_category, file_num=file_num)) bow_file = open(fname, 'w') print( "Writing docs (in bag of words form) for {cat} to directory: {d}" .format(cat=yelp_category, d=os.path.join(SAVE_BAG_OF_WORDS_DIR, yelp_category))) for review in reviews: try: review_dict = ujson.loads(review) except: error_count += 1 pass adjs = review_dict.get("adjectives", None) rating = int(review_dict.get("rating", -1)) if adjs: doc_count += 1 bow_file.write( ujson.dumps(adjs.encode("utf-8")) + "\n") review_docs.append(adjs.strip().split()) if (doc_count % docs_per_file) == 0: if bow_file: bow_file.close() file_num = str((doc_count / docs_per_file) + 1) fname = os.path.join( SAVE_BAG_OF_WORDS_DIR, yelp_category, "{cat}_file_{file_num}.txt".format( cat=yelp_category, file_num=file_num)) bow_file = open(fname, 'w') if rating: if rating > 3: positive_docs.append(adjs.strip().split()) elif rating < 3: negative_docs.append(adjs.strip().split()) else: pass print("Wrote {total} docs in {cat} category".format( total=str(doc_count), cat=yelp_category)) dictionary = Dictionary(review_docs) CATEGORY_SPECIFIC_DICT_DIR = os.path.join( SAVE_UNFILTERED_DICTIONARY_DIR, yelp_category) POSITIVE_SUB_DIR = os.path.join(CATEGORY_SPECIFIC_DICT_DIR, "positive") NEGATIVE_SUB_DIR = os.path.join(CATEGORY_SPECIFIC_DICT_DIR, "negative") if not (os.path.exists(CATEGORY_SPECIFIC_DICT_DIR) and os.path.isdir(CATEGORY_SPECIFIC_DICT_DIR)): os.makedirs(CATEGORY_SPECIFIC_DICT_DIR) os.makedirs(POSITIVE_SUB_DIR) os.makedirs(NEGATIVE_SUB_DIR) dictionary.save( os.path.join( 
CATEGORY_SPECIFIC_DICT_DIR, "{yelp_category}_dict.dict".format( yelp_category=yelp_category))) dictionary.save_as_text( os.path.join( CATEGORY_SPECIFIC_DICT_DIR, "{yelp_category}_dict.txt".format( yelp_category=yelp_category))) sorted_doc_freqs = sorted(dictionary.dfs.items(), key=lambda x: x[1], reverse=True) # print("Will save file in:\n " + os.path.join(CATEGORY_SPECIFIC_DICT_DIR,"{yelp_category}_dict.txt".format(yelp_category=yelp_category))) with open( os.path.join( CATEGORY_SPECIFIC_DICT_DIR, "{yelp_category}_words_doc_frequencies.txt".format( yelp_category=yelp_category)), 'w') as df_file: for (token_id, doc_freq) in sorted_doc_freqs: df_file.write( str( dictionary.get(token_id, "Unknown").encode( 'utf-8')) + " " + str(doc_freq) + "\n") del dictionary del review_docs del sorted_doc_freqs pos_dictionary = Dictionary(positive_docs) del positive_docs neg_dictionary = Dictionary(negative_docs) del negative_docs pos_dictionary.save( os.path.join( POSITIVE_SUB_DIR, "{yelp_category}_pos_dict.dict".format( yelp_category=yelp_category))) pos_dictionary.save_as_text( os.path.join( POSITIVE_SUB_DIR, "{yelp_category}_pos_dict.txt".format( yelp_category=yelp_category))) sorted_pos_doc_freqs = sorted(pos_dictionary.dfs.items(), key=lambda x: x[1], reverse=True) with open( os.path.join( POSITIVE_SUB_DIR, "{yelp_category}_pos_words_doc_frequencies.txt". format(yelp_category=yelp_category)), 'w') as df_file: for (token_id, doc_freq) in sorted_pos_doc_freqs: df_file.write( str( pos_dictionary.get(token_id, "Unknown").encode( 'utf-8')) + " " + str(doc_freq) + "\n") del pos_dictionary del sorted_pos_doc_freqs neg_dictionary.save( os.path.join( NEGATIVE_SUB_DIR, "{yelp_category}_neg_dict.dict".format( yelp_category=yelp_category))) neg_dictionary.save_as_text( os.path.join( NEGATIVE_SUB_DIR, "{yelp_category}_neg_dict.txt".format( yelp_category=yelp_category))) sorted_neg_doc_freqs = sorted(neg_dictionary.dfs.items(), key=lambda x: x[1], reverse=True) with open( os.path.join( NEGATIVE_SUB_DIR, "{yelp_category}_neg_words_doc_frequencies.txt". format(yelp_category=yelp_category)), 'w') as df_file: for (token_id, doc_freq) in sorted_neg_doc_freqs: df_file.write( str( neg_dictionary.get(token_id, "Unknown").encode( 'utf-8')) + " " + str(doc_freq) + "\n") del neg_dictionary del sorted_neg_doc_freqs print( "{count} {cat} reviews were discarded because of parsing errors" .format(count=error_count, cat=yelp_category)) print("Created dictionary for {cat} tokens".format( cat=yelp_category))
if len(sys.argv) < 3:
    print 'Usage: \n python train.py wiki.zh.chs.seg.utf.stop lda.model'
    sys.exit(1)
inp, outp = sys.argv[1:3]

logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s', level=logging.INFO)

logging.info('Loading training set...')
fp = codecs.open(inp, 'r', encoding='utf8')
train = []
for line in fp:
    train.append(line.split())
fp.close()

logging.info('Preparing corpus...')
dictionary = Dictionary(train)
dictionary.save_as_text('wiki.dictionary.bz2')
corpus = [dictionary.doc2bow(text) for text in train]
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
del train, tfidf

logging.info('Training...')
lda = LdaModel(corpus_tfidf, id2word=dictionary, num_topics=200)
#lda = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=100, workers=2)

logging.info('Saving LDA model...')
lda.save(outp)
class TextClassifier(object): LABEL_TO_INDEX = {'auto':0, 'business':1, 'sports':2} INDEX_TO_LABEL = {0:'auto', 1:'business', 2:'sports'} def __init__(self, dict_file=None, model_file=None): if dict_file: self.dictionary = Dictionary.load_from_text(dict_file) else: self.dictionary = Dictionary() if model_file: self.model = joblib.load(model_file) else: self.model = None def expand_sent_terms(self, sent, ngrams=[2]): expd_sent = list(sent) ngram_terms = self._get_ngram_terms(sent, ngrams) expd_sent.extend(ngram_terms) return expd_sent def sentence_to_bow(self, sent): if self.dictionary: return self.dictionary.doc2bow(sent) else: return None def bow_to_feature_vec(self, bow_corpus): data = [] rows = [] cols = [] line_count = 0 for bow_sent in bow_corpus: for elem in bow_sent: rows.append(line_count) cols.append(elem[0]) data.append(elem[1]) line_count += 1 return csr_matrix( (data, (rows,cols)), shape=(line_count, len(self.dictionary))) def load_text(self, data_file, train=False): term_corpus = [] labels = [] with open(data_file) as fin: for line in fin: parts = line.strip().decode('utf8').split('\t') if len(parts) < 2: continue label = parts[0] sent = parts[1].split() # Expand sentence with more features. sent = self.expand_sent_terms(sent, [2]) # Save sentences and labels. term_corpus.append(sent) labels.append(self.LABEL_TO_INDEX[label]) # Update dictionary. if train: self.dictionary.add_documents([sent]) if train: # Compacitify dictionary. self.dictionary.filter_extremes(no_below=5, no_above=0.6, keep_n=None) self.dictionary.compactify() # Change text format corpus to bow format. bow_corpus = [] for sent in term_corpus: sent_bow = self.dictionary.doc2bow(sent) bow_corpus.append(sent_bow) return bow_corpus, labels def _get_ngram_terms(self, words, ngrams): terms = [] for i in range(1, len(words)): # Bigram terms. if 2 in ngrams and (i - 1) >= 0: terms.append('%s_%s' % (words[i - 1], words[i])) # Trigram terms. if 3 in ngrams and (i - 2) >= 0: terms.append( '%s_%s_%s' % (words[i - 2], words[i - 1], words[i])) return terms def dump_dict(self, dict_file): self.dictionary.save_as_text(dict_file) def dump_model(self, model_file): if self.model: joblib.dump(self.model, model_file) def train(self, x_list, y_list, model='lr'): X_train, X_test, y_train, y_test = train_test_split(x_list, y_list, test_size=0.3) if model == 'lr': self.model = LogisticRegression(C=1.0, multi_class='multinomial', penalty='l2', solver='sag', tol=0.1) else: logging.error('Unknown model name!') return self.model.fit(X_train, y_train) score = self.model.score(X_train, y_train) print("Evaluation on train set : %.4f" % score) score = self.model.score(X_test, y_test) print("Evaluation on test set : %.4f" % score) def predict(self, X): return self.model.predict(X) def predict_proba(self, X): return self.model.predict_proba(X) def eval(self, X, y): score = self.model.score(X, y) print("Evaluation on validation set : %.4f" % score)
topics = 20

dictionary.filter_extremes(no_below=no_below, no_above=no_above)

logger.info("Making Corpus...")
corpus = [dictionary.doc2bow(text) for text in tqdm(texts)]

#========================================================================
# LDA Calculate
#========================================================================
logger.info("LDA Calculation...")
lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=topics)

#========================================================================
# Model Save
#========================================================================
logger.info("Dictionary & LDA Model Save...")
dictionary.save_as_text('../model/1111_gensim_dict_below10_above08')
with open('../model/1111_LDA_20topics_gensim__below10_above08', mode='wb') as f:
    pkl.dump(lda, f)

for topic in lda.show_topics(num_topics=-1):
    print(f'topics: {topic}\n')

# LDA Value write to Train & Test
mx = np.zeros((len(texts), topics))

# Get LDA Topic Value from corpus
logger.info("Get LDA Value from corpus...")
arg_list = []
for i, bow in tqdm(enumerate(corpus)):
    # Pararell ===
epoch = int(sys.argv[3])
batch = int(sys.argv[4])

n_hidden = 128

df = pd.read_csv(data_file, names=('question', 'answer'), dtype='object')

q_maxlen = df['question'].map(len).max()
a_maxlen = df['answer'].map(len).max()

rpad_blank = lambda size: (lambda s: s.ljust(size, ' '))

que = df['question'].map(rpad_blank(q_maxlen))
ans = df['answer'].map(rpad_blank(a_maxlen))

dic = Dictionary([list(' '.join(df.values.flatten()))])
dic.save_as_text(f'{data_file}.dic')

one_hot = lambda s: np.eye(len(dic))[dic.doc2idx(list(s))]

x = np.array([one_hot(q) for q in que])
y = np.array([one_hot(a) for a in ans])

model = Sequential()
# encoder
model.add(LSTM(n_hidden, input_shape=(q_maxlen, len(dic))))
# decoder
model.add(RepeatVector(a_maxlen))
model.add(LSTM(n_hidden, return_sequences=True))
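# --- Hedged sketch: invert the character-level one-hot encoding for a predicted
# --- answer from the seq2seq model above. `pred` (shape: a_maxlen x len(dic)) is
# --- a hypothetical model output, not part of the original snippet.
decode = lambda pred: ''.join(dic[i] for i in pred.argmax(axis=1))
# e.g. decode(model.predict(x[:1])[0]) would give a blank-padded answer string
# once the model is completed and trained.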
segmentor.load_with_lexicon(cws_model_path, LTP_DATA_DIR + '/user_dict.txt')  # load the segmentation model; the second argument is the path to your external lexicon file
postagger = Postagger()  # initialize instance
postagger.load(pos_model_path)  # load the POS tagging model

combain_comtent = []
for file in file_list:
    combain_comtent.append(get_content(file))
segmentor.release()  # release the model

dictionary = Dictionary(combain_comtent)
corpus = [dictionary.doc2bow(text) for text in combain_comtent]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=31)

# save the dictionary
dictionary.save_as_text(write_path + "dictionary.txt")
# save the LDA model
lda.save(write_path + "model")

for file in lda.print_topics(31):
    print(file[0])

topic_list = []
for i in lda.get_document_topics(corpus):
    listj = []
    for j in i:
        listj.append(j[1])
    topic_list.append(listj.index(max(listj)))

file_dict = {}
class Dataset(object): ''' Create dataset for training supervised model ''' def __init__(self, config): self.config = config self.train_data = None self.test_data = None self.val_data = None self.vocab = None self.word_embeddings = None def get_pandas_df(self, filename): ''' Load the data into Pandas.DataFrame object This will be used to convert data to torchtext object ''' with open(filename, 'r', encoding='utf-8') as datafile: data = [line.strip().split(' ', maxsplit=1) for line in datafile] data_text = list(map(lambda x: x[1], data)) data_label = list(map(lambda x: x[0], data)) full_df = pd.DataFrame({"text": data_text, "label": data_label}) return full_df def load_data(self, train_file, test_file, dataname, embed_file=None, val_file=None): ''' Loads the data from files Sets up iterators for training, validation and test data Also create vocabulary and word embeddings based on the data Inputs: embed_file (String): absolute path to file containing word embeddings (GloVe/Word2Vec) train_file (String): absolute path to training file test_file (String): absolute path to test file val_file (String): absolute path to validation file ''' # load embeddings voc_file = dataname + '_vocab.txt' new_embed = dataname + '_embed.pkl' train_X, train_Y = read_labeled(train_file) test_X, test_Y = read_labeled(test_file) val_X = None val_Y = None if val_file: val_X, val_Y = read_labeled(val_file) else: sp = int(len(train_X) * 0.8) train_X, val_X = (train_X[:sp], train_X[sp:]) train_Y, val_Y = (train_Y[:sp], train_Y[sp:]) train_X = [doc_padding(x, self.config.max_sen_len) for x in train_X] test_X = [doc_padding(x, self.config.max_sen_len) for x in test_X] val_X = [doc_padding(x, self.config.max_sen_len) for x in val_X] if os.path.isfile(voc_file): self.vocab = Dictionary.load_from_text(voc_file) else: self.vocab = Dictionary(train_X) special_tokens = {'<pad>': 0, '<unk>': 1} self.vocab.patch_with_special_tokens(special_tokens) self.vocab.save_as_text(voc_file) # build vocab train_X = [self.vocab.doc2idx(x, 1) for x in train_X] test_X = [self.vocab.doc2idx(x, 1) for x in test_X] val_X = [self.vocab.doc2idx(x, 1) for x in val_X] # transform words to index if os.path.isfile(new_embed): self.word_embeddings = torch.load(new_embed) else: embeds = Vectors(embed_file, unk_init=lambda x: torch.Tensor( np.random.normal(scale=0.6, size=(x.size())))) self.word_embeddings = weight_matrix(self.vocab, embeds) torch.save(self.word_embeddings, new_embed) self.train_data = (train_X, train_Y) self.test_data = (test_X, test_Y) self.val_data = (val_X, val_Y) print("Loaded {} training examples".format(len(train_X))) print("Loaded {} test examples".format(len(test_X))) print("Loaded {} validation examples".format(len(val_X))) def train_iterator(self): return batch_iter(*self.train_data, self.config.batch_size) def test_iterator(self): return batch_iter(*self.test_data, self.config.batch_size, False) def val_iterator(self): return batch_iter(*self.val_data, self.config.batch_size, False)
class ArticlesCollection: """Class which holds all articles (perhaps over several years) -- with ability to perform LDA on it.""" def __init__(self, year_range, text_output_dirpath, lang=DE_LANG): self.year_range = year_range self.text_output_dirpath = text_output_dirpath self.lang = lang self.articles = [] self.bow_corpus = None self.identifier = '' self.wordsids_filepath = '' self.bowmm_filepath = '' self.tfidf_filepath = '' self.number_of_docs = 0 self.number_of_tokens = 0 self.number_of_types = 0 # gensim data structures self.dictionary = None # Read in collection & clean it & start LDA process self._read_collection() self._collection_identifier() self._set_filepaths() self._create_dictionary() self._create_bow_representation() self._set_number_of_docs() self._set_number_of_tokens() self._set_number_of_types() # Create tf*idf matrix if requested. if USE_TFIDF: self._create_tfidf_matrix() def show_lda(self): """Show latent topics found.""" model = None # Only use tf*idf input if requested. corpus = self.bow_corpus if USE_TFIDF: corpus = MmCorpus(self.tfidf_filepath) # k = number of documents = number of topics (for now) num_topics = self.number_of_docs if NUM_TOPICS != -1: num_topics = NUM_TOPICS print('Number of docs presented: ' + str(self.number_of_docs)) print('Number of origin. tokens: ' + str(self.number_of_tokens)) print('Number of original types: ' + str(self.number_of_types)) print('Number of types at usage: ' + str(len(self.dictionary.\ keys()))) print('Number of topics to find: ' + str(num_topics)) print('Number of topics to show: ' + str(TOPICS_DISPLAY)) if MODEL == 'LdaMallet': model = LdaMallet(PATH_TO_MALLET_BIN, corpus=corpus, num_topics=num_topics, id2word=self.dictionary, iterations=ITERATIONS) elif MODEL == 'HdpModel': model = HdpModel(corpus, self.dictionary) else: model = LdaModel(corpus=corpus, id2word=self.dictionary, num_topics=num_topics, iterations=ITERATIONS, update_every=1, chunksize=10, passes=1, distributed=False) ''' More possible options above: chunksize=1, update_every=1, decay=0.5, ''' if MODEL == 'LdaModel' or MODEL == 'LdaMallet': topic_number = 0 for topic in model.show_topics(topics=TOPICS_DISPLAY, topn=WORDS_DISPLAY, formatted=True): topic_number += 1 print('Topic#' + str(topic_number) + ': ', topic) else: # For MODEL 'HdpModel' for topic in model.print_topics(topics=TOPICS_DISPLAY, \ topn=WORDS_DISPLAY): print topic def _set_number_of_types(self): """Set number of types (from tokens).""" self.number_of_types = len(set(list(itertools.\ chain(*self.articles)))) def _set_number_of_tokens(self): """Set number of tokens gotten in all documents.""" self.number_of_tokens = sum(len(article) \ for article in self.articles) def _set_number_of_docs(self): """Set number of docs found in collection read in.""" self.number_of_docs = len(self.articles) def _set_filepaths(self): """Sets filepaths for intermediate data.""" # Filepaths necessary for topic modeling self.wordsids_filepath = WORDSIDS_DIR + self.identifier + \ '_' + 'wordsids.txt' self.bowmm_filepath = BOWMM_DIR + self.identifier + '_' + \ 'bow.mm' self.tfidf_filepath = TFIDF_DIR + self.identifier + '_' + \ 'tfidf.mm' def _create_dictionary(self): """Create a mapping of ids and surface froms (=words).""" print('Create dictionary of collection.') self.dictionary = Dictionary(self.articles) self.dictionary.filter_extremes(no_below=NO_BELOW, no_above=NO_ABOVE) self.dictionary.save_as_text(self.wordsids_filepath) self.dictionary.compactify() print(self.dictionary) def _create_bow_representation(self): 
"""Create bag-of-words representation of collection, and save it in Matrix Matrix format to disk.""" print('Create bag-of-words matrix representation.') self.bow_corpus = [self.dictionary.doc2bow(article) for article in self.articles] MmCorpus.serialize(self.bowmm_filepath, self.bow_corpus) def _create_tfidf_matrix(self): """Create TF-IDF matrix and save it in Matrix Matrix format to disk""" print('Create TF-IDF matrix of collection.') tfidf = TfidfModel(self.bow_corpus, id2word=self.dictionary, normalize=True) MmCorpus.serialize(self.tfidf_filepath, tfidf[self.bow_corpus]) print('Number of documents:', tfidf.num_docs) def _collection_identifier(self): """Collection id is important for the caching files and the file naming of the corresponding files.""" start_year = self.year_range[0] end_year = self.year_range[-1] if start_year == end_year: self.identifier = str(start_year) + '_' + self.lang else: self.identifier = str(start_year) + '-' + str(end_year) + \ '_' + self.lang def _read_collection(self): """Iterate through all years in order to get all articles read in.""" for year in self.year_range: # Not every single yearbook is available. try: self._read_book(year) except: print('Skip (inexistent) yearbook ' + str(year) + '.') def _read_book(self, year): """Read in a a single book and save its articles.""" filepath = sac_filepath(year, lang=self.lang) print('Read in yearbook ' + str(year) + '.') sac_xml = etree.parse(SAC_XML_DIR + filepath) sac_xml_articles_list = sac_xml.xpath('.//article') # For each article for sac_xml_article in sac_xml_articles_list: # Prepare file to write out words sac_xml_article_no = sac_xml_article.attrib['n'] out_filename = str(year) + '-' + str(self.lang) + '-' \ + sac_xml_article_no + '.txt' out_filepath = self.text_output_dirpath + sep + out_filename print(out_filepath) out_filehdl = open(out_filepath, 'w') article_word_list = [] sac_xml_sentences_list = \ sac_xml_article.xpath('.//s[@lang=\'' + \ self.lang + '\']') # For each sentence (in the article) for sac_xml_sentence in sac_xml_sentences_list: sac_xml_words_list = \ sac_xml_words_list = sac_xml_sentence.xpath('.//w') # For each word (in the sentence of the article) for sac_xml_word in sac_xml_words_list: word = None try: if WITH_POS_FILTER is False: if WITH_LEMMATA: word = sac_xml_word.attrib['lemma'].lower() if self._is_lemma_bogus(word): word = sac_xml_word.text.lower() if WITH_LEMMATA is False: word = sac_xml_word.text.lower() elif WITH_POS_FILTER: word = self._get_pos_filtered_word(sac_xml_word) except: pass # Don't add stop words, in any case if not word in STOPWORDS[self.lang] \ and word is not None and len(word) >= MIN_WORDLEN: article_word_list.append(self.\ _normalize_word(word).\ encode(ENCODING)) # Save article as bag-of-words (of the sentences) self.articles.append(article_word_list) out_filehdl.write(' '.join(article_word_list)) out_filehdl.close() def _get_pos_filtered_word(self, sac_xml_word): """ Get word by PoS filter """ # There are words without PoS tags, i. e. try try: if sac_xml_word.attrib['pos'] \ in POS_FILTER[self.lang]: if WITH_LEMMATA: word = sac_xml_word.attrib['lemma'].lower() if self._is_lemma_bogus(word): return sac_xml_word.text.lower() else: return sac_xml_word.attrib['lemma'].lower() else: return sac_xml_word.text.lower() else: return None except: return None def _is_lemma_bogus(self, lemma): """ Return true if the lemma is not useful for LDA, otherwise false. 
""" for bogus_symbol in SURFACE_TRIGGERS: if bogus_symbol in lemma: return True # That's the last resort return False def _normalize_word(self, word_to_normalize): """ This function helps to normalize words, because of encoding issues of some LDA tools ... @return: Normalized word as str type """ # Transform umlauts to ASCII friendly form word = word_to_normalize.replace(u"ä","ae").replace(u"ö","oe"). \ replace(u"ü","ue").replace(u"ß","ss") return word def __str__(self): """ Return a string which shows document number, number of words and number of types. """ ret_string = '' art_number = 0 for article in self.articles: art_number += 1 ret_string += 'Doc#' + str(art_number) + ': ' ret_string += str(len(article)) + ' [' + \ str(len(set((article)))) + ']' ret_string += '\n' return ret_string
class Classifier(object): """ 新建对象或者调用trainModel方法,可以生成Classifier模型 调用predict方法,可以预测新的日志文件类型及其置信度 $DATA/models/l1file_info.csv:记录原始样本文件信息(暂时不要?) $DATA/l1cache/: 存储各样本文件。目录结构就是被管服务器原始结构 """ __corpusCacheFile = os.path.join(G.projectModelPath, 'corpuscache.1') l1_dbf = os.path.join(G.projectModelPath, 'metadata.1') __MaxLines = G.cfg.getint('Classifier', 'MaxLines') def __init__(self, model_file=''): self.model_file = model_file self.model_id = 0 if model_file == G.productFileClassifierModel else 1 self.common_filenames, self.l1_structure = ([], []) self.ruleSet = None # 处理文件正则表达式 self.statsScope = None # 样本文件字符数、字数统计值(均值、标准差、中位数)的最小-最大值范围 self.dictionary = None # 字典对象(Gensim Dictionary) self.model = None # 聚类模型(Kmeans) self.categories = None # 聚类的类型(名称,数量占比,分位点距离,边界点距离) if os.path.exists(model_file): # 从模型文件装载模型 self.ruleSet, self.dictionary, self.statsScope, self.model, self.categories = joblib.load( model_file) else: G.log.warning('No model loaded!') # 重新训练模型 def reCluster(self): for folder in [G.l1_cache, G.l2_cache, G.outputs]: if os.path.exists(folder): shutil.rmtree(folder) time.sleep(3) for folder in [G.l1_cache, G.l2_cache, G.outputs]: os.mkdir(folder) common_files, file2merged = Util.mergeFilesByName( G.l0_inputs, G.l1_cache) self.__dbFilesSampled(file2merged) results = self.trainModel(k_=35) self.__saveModel() # model saved to file Util.clearModel(self.model_id) self.dbUpdCategories() db = Util.dbConnect() if not db: return cursor = db.cursor() classified_common_files, unclassified_common_files = self.splitResults( results) Util.dbFilesMerged(cursor, file2merged, classified_common_files, unclassified_common_files) Util.mergeFilesByClass(cursor, G.l2_cache) # 同类文件合并到to_目录下 wildcard_log_files = Util.genGatherList(cursor) # 生成采集文件列表 Util.dbWildcardLogFiles(cursor, wildcard_log_files) db.commit() db.close() def __dbFilesSampled(self, file2merged): db = Util.dbConnect() if not db: return cursor = db.cursor() for file_fullname, anchor_name, anchor_colRange, common_file_fullname in file2merged: file_fullname = file_fullname.replace('\\', '/') host = file_fullname[len(G.l0_inputs):].strip('/') host, filename = host.split('/', 1) archive_path, filename = os.path.split(filename) remote_path = '/' + archive_path if archive_path[ 1] != '_' else archive_path[0] + ':' + archive_path[2:] host = '"%s"' % host.strip('/') archive_path = '"%s"' % archive_path remote_path = '"%s"' % remote_path file_fullname = '"%s"' % file_fullname filename = '"%s"' % filename sql = 'INSERT INTO files_sampled (file_fullname,host,archive_path,filename,remote_path) VALUES(%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE file_fullname=%s' % ( file_fullname, host, archive_path, filename, remote_path, file_fullname) cursor.execute(sql) db.commit() db.close() def __iter__(self): self.category_id = 0 return self # 返回类别的(名称,数量占比,分位点到中心距离,边界到分位点距离) def __next__(self): i = self.category_id if i >= len(self.categories[0]): raise StopIteration self.category_id += 1 return self.categories[0][i], self.categories[1][i], self.categories[ 2][i], self.categories[3][i] def __len__(self): return len(self.categories[0]) def __getitem__(self, item): if item < -len(self.categories[0]) or item >= len(self.categories[0]): raise IndexError return self.categories[0][item], self.categories[1][ item], self.categories[2][item], self.categories[3][item] def __setitem__(self, key, value): if key < -len(self.categories[0]) or key >= len(self.categories[0]): raise IndexError name = str(value) if name in self.categories[0]: raise ValueError 
self.categories[0][key] = name self.dbUpdCategories() def dbUpdCategories(self): db = Util.dbConnect() if db: # proceed only if the database connection succeeded cursor = db.cursor() c = self.categories for category_id, (name, percent, boundary, quantile) in enumerate( zip(c[0], c[1], c[2], c[3])): name = '"%s"' % name sql = 'INSERT INTO file_class (model_id, category_id, name, quantile, boundary, percent) VALUES(%d, %d, %s, %e,%e, %f) ON DUPLICATE KEY UPDATE name=%s,quantile=%e,boundary=%e,percent=%f' % ( self.model_id, category_id, name, quantile, boundary, percent, name, quantile, boundary, percent) cursor.execute(sql) db.commit() db.close() # Train the model and save it under $models/xxx.mdl. dataset: absolute/relative path of the sample files, or an iterable stream of sample text def trainModel(self, dataset_path=G.l1_cache, k_=0): """ Train and generate the K-Means model :param dataset_path: source path containing the merged log files, or an iterable char stream :param k_: K-Means parameter; 0 means auto-detect """ rule_sets = [] # replacement, stop-word and k-shingle rules for text preprocessing for ruleset_name in sorted([ section for section in G.cfg.sections() if section.split('-')[0] == 'RuleSet' ]): replace_rules, stop_words, k_list = [], [], [] for key, value in G.cfg.items(ruleset_name): if key == 'stopwords': stop_words = value.split(',') elif key == 'k-shingles': k_list = eval(value) else: replace_from, replace_to = value.split('TO') replace_rules.append( (re.compile(replace_from.strip(), re.I), replace_to.strip())) rule_sets.append((ruleset_name, replace_rules, stop_words, k_list)) # Try the rule sets one by one to vectorize the samples and determine the cluster count K for self.ruleSet in rule_sets: corpus_fp = self.__buildDictionary(dataset_path) # build the dictionary and collect per-document structure info if len(self.dictionary) < G.cfg.getint( 'Classifier', 'LeastTokens'): # dictionary too small, resample corpus_fp.close() self.__clearCache() G.log.info('Too few tokens[%d], Re-sample with next RuleSet.', len(self.dictionary)) continue corpus_fp.seek(0) vectors = self.__buildVectors( corpus_fp, self.dictionary.num_docs) # build the doc * (dictionary + stats) matrix corpus_fp.close() # close the cache file # start_k = self.__findStartK(vectors) # quickly locate a starting K with a reasonably balanced distribution # if start_k is None: # clustering too unbalanced, resample with the next rule set # continue start_k = min(50, int(vectors.shape[0] / 100)) k_ = k_ if k_ else self.__pilotClustering(vectors, start_k) # pilot-cluster over several K values and return the best one if k_ != 0: # a suitable K was found, stop trying rule sets break self.__clearCache() # clear the cache built for this rule set else: raise UserWarning( 'Cannot generate qualified corpus by all RuleSets') # Cluster again to obtain the model (vector count, centers and distances) and the assignment of vectors to categories self.model, percents, boundaries, quantiles = self.__buildModel( k_, vectors) names = ['fc%d' % i for i in range(len(percents))] self.categories = [names, percents, boundaries, quantiles] results = self.__getResult(vectors) return results # Build the dictionary and cache the tokenized documents at the same time def __buildDictionary(self, new_dataset_path): self.dictionary = Dictionary() # Load the previously processed, cached corpus cache_fp = open(self.__corpusCacheFile, mode='a+t', encoding='utf-8') # create or open the corpus cache file if cache_fp.tell() != 0: if os.path.exists(self.l1_dbf): self.ruleSet, self.common_filenames, self.l1_structure, self.statsScope = joblib.load( self.l1_dbf) cache_fp.seek(0) cached_documents = len(self.common_filenames) for lines, line_ in enumerate(cache_fp): if lines < cached_documents: self.dictionary.add_documents([line_.split()]) G.log.info('%d cached documents loaded.', lines) # Continue with the newly added corpus for document in self.__buildDocument(new_dataset_path): self.dictionary.add_documents([document]) cache_fp.write(' '.join([word for word in document]) + '\n') if self.dictionary.num_docs < G.cfg.getint( 'Classifier', 'LeastFiles'): # too few documents, clustering is pointless cache_fp.close() self.__clearCache() raise UserWarning('Too few documents[%d] for clustering' % 
self.dictionary.num_docs) # Drop rare tokens and compactify the dictionary num_token = len(self.dictionary) no_below = int( min(G.cfg.getfloat('Classifier', 'NoBelow'), int(self.dictionary.num_docs / 50))) self.dictionary.filter_extremes(no_below=no_below, no_above=0.999, keep_n=G.cfg.getint( 'Classifier', 'KeepN')) self.dictionary.compactify() G.log.info( 'Dictionary built with [%s](%d tokens, reduced from %d), from %d files (%d words)', self.ruleSet[0], len(self.dictionary), num_token, self.dictionary.num_docs, self.dictionary.num_pos) statistics = np.array(self.l1_structure)[:, 1:7] statistics[statistics > 500] = 500 # cap outliers so abnormally large values do not distort the result self.statsScope = np.min(statistics, axis=0), np.max(statistics, axis=0) joblib.dump((self.ruleSet, self.common_filenames, self.l1_structure, self.statsScope), self.l1_dbf) # persist the metadata for later runs return cache_fp # Preprocess the dataset, yielding the token list of one file at a time. def __buildDocument(self, dataset_path): amount_files, failed_files, file_fullname = 0, 0, '' G.log.info('Start Converting documents from ' + dataset_path) processed_files = os.path.join(G.projectModelPath, 'buildDocument.dbf') processed = [] if not os.path.exists(processed_files) else joblib.load( processed_files) for dir_path, dir_names, file_names in os.walk(dataset_path): for file_name in file_names: try: file_fullname = os.path.join(dir_path, file_name) if file_fullname in processed: continue amount_files += 1 if amount_files % 50 == 0: G.log.info('Converted %d[%d failed] files:\t%s', amount_files, failed_files, file_fullname) processed.append(file_fullname) yield self.__file2doc(file_fullname) except Exception as err: failed_files += 1 G.log.warning('Failed to convert\t%s, ignored.\t%s', file_fullname, str(err)) continue joblib.dump(processed, processed_files) G.log.info('Converted %d files,%d failed', amount_files, failed_files) return # a bare return ends the generator; raising StopIteration here would turn into a RuntimeError under PEP 479 # Apply the rule set to match and transform each line, then turn the file into a token list def __file2doc(self, file_fullname, encoding='utf-8'): document = [] line_idx, lc, lw = 0, [], [] G.log.debug('Converting ' + file_fullname) for line_idx, line in enumerate( open(file_fullname, 'r', encoding=encoding)): words = G.getWords(line, rule_set=self.ruleSet) document += words # accumulate the token list lc.append(len(line)) lw.append(len(words)) if line_idx > self.__MaxLines: break line_idx += 1 # Compute the per-file statistics subtotal_chars = list( np.histogram(np.array(lc), bins=[0, 40, 80, 120, 160, 200, 1000 ])[0] / line_idx) subtotal_words = list( np.histogram(np.array(lw), bins=[0, 4, 8, 12, 16, 20, 100])[0] / line_idx) stats = [ np.mean(lc), np.mean(lw), np.std(lc), np.std(lw), np.median(lc), np.median(lw) ] doc_structure = [line_idx] + stats + subtotal_chars + subtotal_words # Aggregate and keep the metadata self.common_filenames.append(file_fullname) self.l1_structure.append(doc_structure) return document # Build clustering vectors from the token lists and the document-structure statistics def __buildVectors(self, corpus, rows): cols = len(self.dictionary) # Build the tf-idf bag-of-words and document vectors tfidf_model = TfidfModel(dictionary=self.dictionary, normalize=True) vectors = np.zeros((rows, cols)) for doc_idx, document in enumerate(corpus): if type(document) == str: document = document.split() for (word_idx, tf_idf_value ) in tfidf_model[self.dictionary.doc2bow(document)]: vectors[doc_idx, word_idx] = tf_idf_value # put the tf-idf weight into the vector # Normalize the tf-idf vectors by each document's line count so that documents stay comparable l1_fd = np.array( self.l1_structure)[-rows:, :] # [[line count, mean/std/median, 12 bucket ratios of character and word counts]] lines = l1_fd[:, 0:1] vectors /= lines # normalize the document-structure data and build the vectors min_, max_ = self.statsScope statistics = l1_fd[:, 1:7] statistics[statistics > 500] = 500 # cap outliers so abnormally large values do not distort the result statistics = (statistics - min_) / (max_ - min_) * 0.01 # each of the 6 statistic columns gets roughly 1% weight subtotal = l1_fd[:, 7:] * 0.005 # each of the 12 subtotal columns gets roughly 0.5% weight cols += 
len(self.l1_structure[0]) if rows > 300: G.log.info('[%d*%d]Vectors built' % (rows, cols)) return np.hstack((statistics, subtotal, vectors)) # Starting from k=64, binary-search for a K where the top-5 clusters hold less than the configured share of the samples @staticmethod def __findStartK(vectors): k_from, k_, k_to = 5, 64, 0 while k_ < min(G.cfg.getint('Classifier', 'MaxCategory'), len(vectors)): kmeans = KMeans(n_clusters=k_).fit(vectors) # cluster n = min(5, int(k_ * 0.1) + 1) top5_ratio = sum([ v for (k, v) in Counter(kmeans.labels_).most_common(n) ]) / vectors.shape[0] G.log.debug( 'locating the starter. k=%d, SSE= %e, Top%d labels=%d%%', k_, kmeans.inertia_, n, top5_ratio * 100) if top5_ratio < G.cfg.getfloat('Classifier', 'Top5Ratio'): # search downwards if k_ - k_from < 4: # close to the lower bound, rough starting point found G.log.info('start k=%d', k_from) return k_from k_to = k_ - 1 k_ = k_from + int((k_ - k_from) / 2) else: # search upwards if k_ < k_to < k_ + 4: # close to the upper bound, rough starting point found G.log.info('start k=%d', k_) return k_ k_from = k_ + 1 if k_to > 0: # an upper bound exists k_ = k_to - int((k_to - k_) / 2) else: # no upper bound yet k_ *= 2 if kmeans.inertia_ < 1e-5: # samples already perfectly separated, yet still unbalanced break G.log.info('No starter found') return None # not found, resample # Cluster over a range of K and record each run's SSE (sum of squared errors) as the basis for an elbow-style choice of k @staticmethod def __pilotClustering(vectors, k_from=1, k_to=G.cfg.getint('Classifier', 'MaxCategory')): norm_factor = vectors.shape[1] * vectors.shape[ 0] # normalization factor over rows (samples) and columns (dictionary width), keeping different runs comparable termination_inertia = G.cfg.getfloat( 'Classifier', 'NormalizedTerminationInertia') * norm_factor cfg_q = G.cfg.getfloat('Classifier', 'Quantile') k_, pilot_list = 0, [ ] # [(k_, inertia, criterion, top5_percent, bad_percent)]; the criterion is the local maximum of the first derivative of the inertia change rate # Cluster from k_from to k_to and fill pilot_list for k_ in range(k_from, k_to): kmeans = KMeans(n_clusters=k_, tol=1e-5).fit(vectors) # pilot clustering if k_ < k_from + 2: pilot_list.append([k_, kmeans.inertia_, 0, 0, 0]) continue retry = 0 # re-cluster a few times so that inertia keeps decreasing for retry in range(5): # if inertia grew because of noise, re-cluster a few times inertia = kmeans.inertia_ if inertia <= pilot_list[-1][1]: break G.log.debug('retries=%d, inertia=%e', retry + 1, inertia) kmeans = KMeans(n_clusters=k_).fit(vectors) else: inertia = pilot_list[-1][1] pilot_list[-1][2] = pilot_list[-2][1] / pilot_list[-1][ 1] - pilot_list[-1][1] / inertia a = pilot_list[-1] G.log.info( 'pilot clustering. 
(k,inertia,criteria,top5,bad)=\t%d\t%e\t%.3f\t%.3f\t%.3f', pilot_list[-1][0], pilot_list[-1][1], pilot_list[-1][2], pilot_list[-1][3], pilot_list[-1][4]) top5_percent = sum([ v for (k, v) in Counter(kmeans.labels_).most_common(5) ]) / len(kmeans.labels_) # Compute the share of bad points, i.e. points farther out than twice the 0.8-quantile distance v_scores = -np.array([kmeans.score([v]) for v in vectors]) groups = pd.DataFrame({ 'C': kmeans.labels_, 'S': v_scores }).groupby('C') c_quantiles_double = 2 * np.array( [groups.get_group(i)['S'].quantile(cfg_q) for i in range(k_)]) bad_samples = 0 for idx, score in enumerate(v_scores): if score > c_quantiles_double[kmeans.labels_[idx]]: bad_samples += 1 bad_percent = bad_samples / len(v_scores) pilot_list.append([k_, inertia, None, top5_percent, bad_percent]) if inertia < termination_inertia: # inertia is already small enough and candidates exist, no need to keep increasing k break # Pick the local maxima of the criterion from the pilot list pilot_list = np.array(pilot_list)[1:-1, :] # drop the first and last rows, whose criterion cannot be computed pilot_list = pilot_list[pilot_list[:, 3] < G.cfg.getfloat( 'Classifier', 'Top5Ratio')] # drop rows whose top-5 share exceeds the threshold pilot_list = pilot_list[argrelextrema(pilot_list[:, 2], np.greater)] # keep the local maxima criteria = pilot_list[:, 2].tolist() if not criteria: # no local maximum found return None max_top_n, idx_ = [], 0 while criteria[idx_:]: idx_ = criteria.index(max(criteria[idx_:])) max_top_n.append(pilot_list[idx_]) idx_ += 1 G.log.debug( 'topN k=\n%s', '\n'.join([ '%d\t%e\t%.3f\t%.3f\t%.3f' % (k, i, c, t, b) for k, i, c, t, b in max_top_n ])) products = [k * c for k, i, c, t, b in max_top_n] idx_ = products.index(max(products)) preferred = max_top_n[idx_][0] G.log.info('pilot-clustering[k:%d] finished. preferred k=(%d)', k_, preferred) return preferred # Cluster again and derive each cluster's center, quantile distance, boundary distance and share of samples @staticmethod def __buildModel(k_, vectors): # Cluster once more and group the results. KMeans does not support cosine distance kmeans = KMeans(n_clusters=k_, n_init=20, max_iter=500).fit(vectors) norm_factor = -vectors.shape[1] # normalize by dictionary width groups = pd.DataFrame({ 'C': kmeans.labels_, 'S': [kmeans.score([v]) / norm_factor for v in vectors] }).groupby('C') percents = groups.size() / len(vectors) # share of each cluster in the total number of vectors cfg_q = G.cfg.getfloat('Classifier', 'Quantile') quantiles = np.array([ groups.get_group(i)['S'].quantile(cfg_q, interpolation='higher') for i in range(k_) ]) boundaries = groups['S'].agg('max').values # distance of the farthest point in each cluster quantiles2 = quantiles * 2 boundaries[boundaries > quantiles2] = quantiles2[ boundaries > quantiles2] # cap boundaries that lie too far out boundaries[boundaries < 1e-100] = 1e-100 # avoid zero boundaries quantiles = boundaries - quantiles quantiles[quantiles < 1e-100] = 1e-100 # avoid 0/0 G.log.info( 'Model(k=%d) built. 
inertia=%e, max proportion=%.2f%%, max quantile=%e, max border=%e', k_, kmeans.inertia_, max(percents) * 100, max(quantiles), max(boundaries)) return kmeans, percents, boundaries, quantiles # Save the model, including text-format copies for inspection def __saveModel(self): joblib.dump((self.ruleSet, self.dictionary, self.statsScope, self.model, self.categories), G.projectFileClassifierModel) self.dictionary.save_as_text( os.path.join(G.logsPath, 'FileDictionary.csv')) category_names, percents, boundaries, quantiles = self.categories l2_fd = pd.DataFrame({ '类名': category_names, '占比': percents, '分位点到边界': quantiles, '边界点': boundaries }) # column headers (in Chinese): category name, percent, quantile-to-boundary, boundary l2_fd.to_csv(os.path.join(G.logsPath, 'FileCategories.csv'), sep='\t', encoding='GBK') G.log.info( 'Model is built and saved to %s, %s and Database: FileDictionary.csv, FileCategories.csv successful.', G.projectFileClassifierModel, G.logsPath) def splitResults(self, results): classified_files, unclassified_files = [], [] for common_name, category, category_name, confidence, distance in zip( self.common_filenames, results[0], results[1], results[2], results[3]): if confidence < G.minConfidence: # confidence too low, leave the file unclassified unclassified_files.append(common_name) else: classified_files.append([ self.model_id, common_name, category, category_name, confidence, distance ]) return classified_files, unclassified_files # Classify a single sample file; returns the predicted category and confidence def predictFile(self, file_fullname, encoding='utf-8'): """ :param file_fullname: log file to be predicted :param encoding: encoding of the file :return: None if processing the file fails, otherwise a tuple of predicted category index, name, confidence and distance-to-center. confidence > 1 means nearer than the 0.8-quantile to the center, < 0 means outside the boundary """ if self.model is None: raise UserWarning('Failed to predict: the model does not exist!') try: document = self.__file2doc(file_fullname, encoding=encoding) # turn the file into a token list vectors = self.__buildVectors([document], 1) categories, names, confidences, distances = self.__getResult( vectors) # predict the category and compute the confidence return categories[0], names[0], confidences[0], distances[0] except Exception as err: G.log.warning('Failed to predict\t%s, ignored.\t%s', file_fullname, str(err)) return None # Classify all sample files under a directory; returns file names, categories and confidences def predictFiles(self, dataset_path, encoding='utf-8'): """ :param dataset_path: path containing the files to be predicted :param encoding: encoding of the files :return: list of file names, predicted category index, name, confidence and distance-to-center. 
confidence > 1 means nearer than the 0.8-quantile to the center, < 0 means outside the boundary """ if self.model is None: raise UserWarning('Failed to predict: the model does not exist!') corpus = [] start_ = len(self.common_filenames) amount_files, failed_files, file_fullname = 0, 0, '' G.log.info('Start processing documents from ' + dataset_path) for dir_path, dir_names, file_names in os.walk(dataset_path): try: for file_name in file_names: file_fullname = os.path.join(dir_path, file_name) amount_files += 1 if amount_files % 50 == 0: G.log.info('Processed %d files, failed %d', amount_files, failed_files) corpus.append( self.__file2doc(file_fullname, encoding=encoding)) # turn the file into a token list except Exception as err: failed_files += 1 G.log.warning('Failed to process\t%s, ignored.\t%s', file_fullname, str(err)) continue G.log.info('Converted %d files,%d(%d%%) failed', amount_files, failed_files, failed_files / max(amount_files, 1) * 100) vectors = self.__buildVectors(corpus, len(corpus)) categories, category_names, confidences, distances = self.__getResult( vectors) # predict the categories and compute the confidences files = self.common_filenames[start_:] return files, list(categories), category_names, list( confidences), distances # Predict the category and compute the confidence. < 0 means outside the boundary (clearly wrong); > 1 means closer than the quantile point (highly reliable) def __getResult(self, vectors): c_names, c_percents, c_boundaries, c_quantiles = self.categories norm_factor = -vectors.shape[1] # normalize by dictionary width predicted_labels = self.model.predict(vectors) # predict each record's category with the clustering model predicted_names = [c_names[label] for label in predicted_labels] confidences = [] distances = [] for i, v in enumerate(vectors): distance = self.model.score([v]) / norm_factor distances.append(distance) category = predicted_labels[i] confidences.append( (c_boundaries[category] - distance) / c_quantiles[category]) confidences = np.array(confidences, copy=False) confidences[confidences > 99.9] = 99.9 confidences[confidences < -99.9] = -99.9 return predicted_labels, predicted_names, confidences, distances # Remove the cached files def __clearCache(self): for f in [self.__corpusCacheFile, self.l1_dbf]: try: if os.path.exists(f): os.remove(f) except Exception as err: G.log.warning('Failed to clear %s. %s' % (f, str(err))) continue
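The confidence returned by __getResult is the distance to the cluster center rescaled so that the 0.8-quantile point maps to 1 and the cluster boundary maps to 0. A standalone sketch of that arithmetic, with made-up numbers, follows.

# Hedged sketch of the confidence arithmetic used above (assumed toy values):
# boundary is the capped farthest distance in a cluster, quantile_gap is
# boundary minus the 0.8-quantile distance, as stored in self.categories.
def confidence(distance, boundary, quantile_gap):
    # 1.0 at the 0.8-quantile, 0.0 at the boundary, negative outside it.
    return (boundary - distance) / quantile_gap

boundary, quantile_gap = 1.0, 0.5            # assumed values; quantile point sits at 0.5
assert confidence(0.5, boundary, quantile_gap) == 1.0   # exactly on the quantile
assert confidence(1.0, boundary, quantile_gap) == 0.0   # exactly on the boundary
assert confidence(1.5, boundary, quantile_gap) < 0      # outside the boundary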
class DocDataset(Dataset): def __init__(self, taskname, txtPath=None, lang="zh", tokenizer=None, stopwords=None, no_below=5, no_above=0.1, hasLable=False, rebuild=False, use_tfidf=False): cwd = os.getcwd() txtPath = os.path.join( cwd, 'data', f'{taskname}_lines.txt') if txtPath is None else txtPath tmpDir = os.path.join(cwd, 'data', taskname) self.txtLines = [ line.strip('\n') for line in open(txtPath, 'r', encoding='utf-8') ] self.dictionary = None self.bows, self.docs = None, None self.use_tfidf = use_tfidf self.tfidf, self.tfidf_model = None, None if not os.path.exists(tmpDir): os.mkdir(tmpDir) if not rebuild and os.path.exists(os.path.join(tmpDir, 'corpus.mm')): self.bows = gensim.corpora.MmCorpus( os.path.join(tmpDir, 'corpus.mm')) if self.use_tfidf: self.tfidf = gensim.corpora.MmCorpus( os.path.join(tmpDir, 'tfidf.mm')) self.dictionary = Dictionary.load_from_text( os.path.join(tmpDir, 'dict.txt')) self.docs = pickle.load( open(os.path.join(tmpDir, 'docs.pkl'), 'rb')) self.dictionary.id2token = { v: k for k, v in self.dictionary.token2id.items() } # id2token is not populated by default, so rebuild the reverse mapping explicitly else: if stopwords is None: stopwords = set([ l.strip('\n').strip() for l in open(os.path.join(cwd, 'data', 'stopwords.txt'), 'r', encoding='utf-8') ]) # self.txtLines is the list of raw strings, without any preprocessing. # self.docs is the list of token lists. print('Tokenizing ...') if tokenizer is None: tokenizer = globals()[LANG_CLS[lang]](stopwords=stopwords) self.docs = tokenizer.tokenize(self.txtLines) self.docs = [line for line in self.docs if line != []] # build dictionary self.dictionary = Dictionary(self.docs) #self.dictionary.filter_n_most_frequent(remove_n=20) # self.dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=1994) # use Dictionary to drop irrelevant tokens self.dictionary.compactify() self.dictionary.id2token = { v: k for k, v in self.dictionary.token2id.items() } # id2token is not populated by default, so rebuild the reverse mapping explicitly 
# convert to BOW representation self.bows, _docs = [], [] for doc in self.docs: _bow = self.dictionary.doc2bow(doc) if _bow != []: _docs.append(list(doc)) self.bows.append(_bow) self.docs = _docs if self.use_tfidf == True: self.tfidf_model = TfidfModel(self.bows) self.tfidf = [self.tfidf_model[bow] for bow in self.bows] # serialize the dictionary gensim.corpora.MmCorpus.serialize( os.path.join(tmpDir, 'corpus.mm'), self.bows) self.dictionary.save_as_text(os.path.join(tmpDir, 'dict.txt')) pickle.dump(self.docs, open(os.path.join(tmpDir, 'docs.pkl'), 'wb')) if self.use_tfidf: gensim.corpora.MmCorpus.serialize( os.path.join(tmpDir, 'tfidf.mm'), self.tfidf) self.vocabsize = len(self.dictionary) self.numDocs = len(self.bows) print(f'Processed {len(self.bows)} documents.') def __getitem__(self, idx): bow = torch.zeros(self.vocabsize) if self.use_tfidf: item = list(zip(*self.tfidf[idx])) else: item = list( zip(*self.bows[idx] )) # bow = [[token_id1,token_id2,...],[freq1,freq2,...]] bow[list(item[0])] = torch.tensor(list(item[1])).float() txt = self.docs[idx] return txt, bow def __len__(self): return self.numDocs def collate_fn(self, batch_data): texts, bows = list(zip(*batch_data)) return texts, torch.stack(bows, dim=0) def __iter__(self): for doc in self.docs: yield doc def show_dfs_topk(self, topk=20): ndoc = len(self.docs) dfs_topk = sorted([(self.dictionary.id2token[k], fq) for k, fq in self.dictionary.dfs.items()], key=lambda x: x[1], reverse=True)[:topk] for i, (word, freq) in enumerate(dfs_topk): print(f'{i+1}:{word} --> {freq}/{ndoc} = {(1.0*freq/ndoc):>.13f}') return dfs_topk def show_cfs_topk(self, topk=20): ntokens = sum([v for k, v in self.dictionary.cfs.items()]) cfs_topk = sorted([(self.dictionary.id2token[k], fq) for k, fq in self.dictionary.cfs.items()], key=lambda x: x[1], reverse=True)[:topk] for i, (word, freq) in enumerate(cfs_topk): print( f'{i+1}:{word} --> {freq}/{ntokens} = {(1.0*freq/ntokens):>.13f}' ) def topk_dfs(self, topk=20): ndoc = len(self.docs) dfs_topk = self.show_dfs_topk(topk=topk) return 1.0 * dfs_topk[-1][-1] / ndoc
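__getitem__ above densifies a sparse gensim BoW document with a zip(*bow) transpose before handing it to torch. A hedged, torch-free illustration of the same trick on a toy document:

# Minimal illustration of the zip(*bow) trick used in DocDataset.__getitem__:
# a gensim BoW document is a list of (token_id, count) pairs, and zip(*...)
# splits it into the id row and the count row used to fill the dense vector.
bow = [(0, 2), (3, 1), (7, 4)]            # assumed toy document
token_ids, counts = zip(*bow)             # (0, 3, 7) and (2, 1, 4)
dense = [0.0] * 10                        # assumed vocabulary size of 10
for idx, cnt in zip(token_ids, counts):
    dense[idx] = float(cnt)
# dense == [2.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0]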
class SenseClassifier(object): DATE_PTN = re.compile( u'(((19)*9\d|(20)*[01]\d)\-?)?((0[1-9]|1[012])\-?)([012]\d|3[01])') LABEL_TO_INDEX = { 'movie': 0, 'episode': 1, 'enter': 2, 'cartoon': 3, 'game': 4 } INDEX_TO_LABEL = { 0: 'movie', 1: 'episode', 2: 'enter', 3: 'cartoon', 4: 'game' } def __init__(self, dict_file=None, model_file=None): if dict_file: self.dictionary = Dictionary.load_from_text(dict_file) else: self.dictionary = Dictionary() if model_file: self.model = joblib.load(model_file) else: self.model = None def dictionary_size(self): return len(self.dictionary) def expand_sent_terms(self, sent, center, rm_kw=False): expd_sent = list(sent) # Expand with ngram and position_term features. if center >= 0: ngram_terms = self._get_ngram_terms(sent, center) expd_sent.extend(ngram_terms) posi_terms = self._get_posi_terms(sent, center) expd_sent.extend(posi_terms) # Remove the keyword itself. if rm_kw and center >= 0: del expd_sent[center] return expd_sent def sentence_to_bow(self, sent): if self.dictionary: return self.dictionary.doc2bow(sent) else: return None def bow_to_feature_vec(self, bow_corpus): data = [] rows = [] cols = [] line_count = 0 for bow_sent in bow_corpus: for elem in bow_sent: rows.append(line_count) cols.append(elem[0]) data.append(elem[1]) line_count += 1 return csr_matrix((data, (rows, cols)), shape=(line_count, len(self.dictionary))) def load_text(self, data_file, train=False): term_corpus = [] labels = [] with open(data_file) as fin: for line in fin: parts = line.strip().decode('utf8').split('\t') if len(parts) < 3: continue label, keyword = parts[0:2] orig_sent = parts[2:] if train: keyword_count = sum( [1 if x == keyword else 0 for x in orig_sent]) if keyword_count != 1: continue # Normalize special terms. sent = [ '@date@' if self.DATE_PTN.match(term) else term for term in orig_sent ] # Expand sentence with more features. center = sent.index(keyword) sent = self.expand_sent_terms(sent, center, True) # Save sentences and labels. term_corpus.append(sent) labels.append(self.LABEL_TO_INDEX[label]) # Update dictionary. if train: self.dictionary.add_documents([sent]) if train: # Compacitify dictionary. self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=None) self.dictionary.compactify() # Change text format corpus to bow format. 
bow_corpus = [] for sent in term_corpus: sent_bow = self.dictionary.doc2bow(sent) bow_corpus.append(sent_bow) return bow_corpus, labels WINDOW_SIZE = 3 def _get_posi_terms(self, words, center): terms = [] for i in range(self.WINDOW_SIZE): offset = (i + 1) left_posi = center - offset if left_posi >= 0: terms.append('%s-%d' % (words[left_posi], offset)) right_posi = center + offset if right_posi < len(words): terms.append('%s+%d' % (words[right_posi], offset)) return terms NGRAM_WINDOW_SIZE = 10 def _get_ngram_terms(self, words, center): terms = [] for i in range(1, self.NGRAM_WINDOW_SIZE): offset = (i + 1) left_posi = center - offset if left_posi >= 0: terms.append('%s_%s' % (words[left_posi], words[left_posi + 1])) right_posi = center + offset if right_posi < len(words): terms.append('%s_%s' % (words[right_posi - 1], words[right_posi])) return terms def dump_dict(self, dict_file): self.dictionary.save_as_text(dict_file) def dump_model(self, model_file): if self.model: joblib.dump(self.model, model_file) def train(self, x_list, y_list, model='lr'): X_train, X_test, y_train, y_test = train_test_split(x_list, y_list, test_size=0.3) if model == 'lr': self.model = LogisticRegression(C=1.0, multi_class='multinomial', penalty='l2', solver='sag', tol=0.1) else: logging.error('Unknown model name!') return self.model.fit(X_train, y_train) score = self.model.score(X_train, y_train) print("Evaluation on train set : %.4f" % score) score = self.model.score(X_test, y_test) print("Evaluation on test set : %.4f" % score) def predict(self, X): return self.model.predict(X) def predict_proba(self, X): return self.model.predict_proba(X) def eval(self, X, y): score = self.model.score(X, y) print("Evaluation on validation set : %.4f" % score)
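The _get_posi_terms expansion above encodes each neighbour's signed offset from the keyword as an extra feature token, so the bag-of-words model keeps limited word-order information. A small self-contained illustration (the sentence and keyword position are made up, and the helper below reproduces the class's logic for demonstration only):

def posi_terms(words, center, window=3):
    # same logic as SenseClassifier._get_posi_terms, reproduced for illustration
    terms = []
    for i in range(window):
        offset = i + 1
        if center - offset >= 0:
            terms.append('%s-%d' % (words[center - offset], offset))
        if center + offset < len(words):
            terms.append('%s+%d' % (words[center + offset], offset))
    return terms

sent = ['watch', 'the', 'new', 'movie', 'trailer', 'online']
center = 3   # position of the keyword 'movie'
assert posi_terms(sent, center) == ['new-1', 'trailer+1', 'the-2', 'online+2', 'watch-3']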
def phrases(): unigram_sentences = LineSentence(unigram_sentences_filepath) bigram_model = Phrases(unigram_sentences) bigram_model.save(bigram_model_filepath) bigram_model = Phrases.load(bigram_model_filepath) bigram_sentences_filepath = intermediate_directory + 'bigram_model_all.txt' with open(bigram_sentences_filepath, 'w', encoding='utf_8') as f: for unigram_sentence in unigram_sentences: bigram_sentence = u' '.join(bigram_model[unigram_sentence]) f.write(bigram_sentence + '\n') # one sentence per line, so LineSentence can re-read the file below bigram_sentences = LineSentence(bigram_sentences_filepath) trigram_model_filepath = intermediate_directory + 'trigram_sentences_all' trigram_model = Phrases(bigram_sentences) trigram_model.save(trigram_model_filepath) trigram_model = Phrases.load(trigram_model_filepath) trigram_sentences_filepath = intermediate_directory + 'trigram_sentences_all.txt' with open(trigram_sentences_filepath, 'w', encoding='utf_8') as f: for bigram_sentence in bigram_sentences: trigram_sentence = ' '.join(trigram_model[bigram_sentence]) f.write(trigram_sentence + '\n') trigram_sentences = LineSentence(trigram_sentences_filepath) ### STOP WORDS REMOVAL ### trigram_reviews_filepath = intermediate_directory + 'trigram_transformed_reviews_all.txt' with open(trigram_reviews_filepath, 'w', encoding='utf_8') as f: for parsed_review in nlp.pipe(line_review('data/'), batch_size=10000, n_threads=4): # lemmatize the text, removing punctuation and whitespace unigram_review = [token.lemma_ for token in parsed_review if not punct_space(token)] # apply the first-order and second-order phrase models bigram_review = bigram_model[unigram_review] trigram_review = trigram_model[bigram_review] trigram_review = [term for term in trigram_review if term not in STOP_WORDS and term != '-PRON-' and term != '‘' and term != '’' and term != "'s" and term != "’s"] # join the transformed review and write it as one line in the new file (kept outside the list comprehension so the whole corpus ends up in a single file) trigram_review = ' '.join(trigram_review) # print(trigram_review) f.write(trigram_review + '\n') ###### BAG OF WORDS CREATION ###### trigram_reviews = LineSentence(trigram_reviews_filepath) # learn the dictionary by iterating over all of the reviews trigram_dictionary = Dictionary() trigram_dictionary.add_documents(trigram_reviews) # prune the vocabulary (a hard cap such as keep_n=10000 could be added here) trigram_dictionary.filter_extremes(no_below=10, no_above=0.4) trigram_dictionary.compactify() trigram_dictionary.save_as_text(trigram_dictionary_filepath)
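A hedged follow-up sketch, reusing the file-path variables assumed above: reload the saved trigram dictionary and stream the transformed reviews into a bag-of-words corpus for downstream topic modelling.

# Hedged sketch; trigram_dictionary_filepath and trigram_reviews_filepath are
# the module-level names assumed by phrases() above.
from gensim.corpora import Dictionary
from gensim.models.word2vec import LineSentence

trigram_dictionary = Dictionary.load_from_text(trigram_dictionary_filepath)
trigram_bow = (trigram_dictionary.doc2bow(review)
               for review in LineSentence(trigram_reviews_filepath))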
""" make_dic.py 1. Read csv file 2. Make dictionary 3. Update dictionary 4. Save dictionary into a txt file """ import pandas as pd from gensim.corpora import Dictionary # Read csv file df = pd.read_csv("livedoor_news.csv") # 辞書 dct = Dictionary() for i, news in enumerate(df["news"]): # Update dictionary with new documents dct.add_documents([news.split()]) dct.save_as_text("vocab.txt")
class Vocab: def __init__(self): self.dictionary = Dictionary() self.dictionary.token2id['<UNK>'] = -1 self.dictionary.id2token[-1] = '<UNK>' self.dictionary.dfs[-1] = 0 def set(self, corpus, prune_at=2000000): self.dictionary.add_documents(corpus, prune_at) def prune(self, **kwargs): # pruning is best applied after all updates are done; otherwise tokens dropped here that reappear in later update docs will produce wrong counts if self.dictionary.dfs == {}: raise ValueError('no vocab to filter; build vocab first') no_below = kwargs.get('no_below', 5) no_above = kwargs.get('no_above', 0.7) keep_n = kwargs.get('keep_n', 100000) keep_tokens = kwargs.get('keep_tokens', None) if keep_tokens: keep_tokens.append('<UNK>') else: keep_tokens = ['<UNK>'] preprune_count = sum([df for _, df in self.dictionary.dfs.items()]) self.dictionary.filter_extremes(no_below, no_above, keep_n, keep_tokens) postprune_count = sum([df for _, df in self.dictionary.dfs.items()]) self.dictionary.dfs[-1] = preprune_count - postprune_count # add '<UNK>' back (it gets pruned because of its 0 initial document frequency) self.dictionary.token2id['<UNK>'] = -1 self.dictionary.id2token[-1] = '<UNK>' def update(self, docs, prune_at=2000000): self.dictionary.add_documents(docs, prune_at) def transform(self, docs, transform_to='ids', with_unk=True): if transform_to == 'ids': for doc in docs: yield self.dictionary.doc2idx(doc) elif transform_to == 'bow': for doc in docs: if with_unk: yield self.doc2bow(doc) else: yield self.dictionary.doc2bow(doc) else: raise ValueError('unknown transformation format') def fit_transform(self, docs, transform_to='ids', prune_at=2000000, filter_vocab=False, **kwargs): self.set(docs, prune_at) if filter_vocab: self.prune(**kwargs) yield from self.transform(docs, transform_to) def merge(self, other): # accept either another Vocab or a plain gensim Dictionary self.dictionary.merge_with(other.dictionary if isinstance(other, Vocab) else other) def save(self, fname, as_text=False, sort_by_word=False): if as_text: self.dictionary.save_as_text(fname, sort_by_word) else: self.dictionary.save(fname) def load(self, fname, from_text=False): if from_text: self.dictionary = Dictionary.load_from_text(fname) else: self.dictionary = Dictionary.load(fname) def __len__(self): return len(self.dictionary) def __iter__(self): return iter(self.dictionary) def keys(self): return list(self.dictionary.token2id.values()) def __str__(self): return str(self.dictionary) def __getitem__(self, tokenid): return self.dictionary[tokenid] def doc2bow(self, document): # note: slight variation on gensim's BoW conversion that maps out-of-vocabulary tokens to the '<UNK>' id (-1) if isinstance(document, string_types): raise TypeError( "doc2bow expects an array of unicode tokens on input, not a single string" ) # Construct (word, frequency) mapping. counter = defaultdict(int) for w in document: if w in self.dictionary.token2id: counter[self.dictionary.token2id[w]] += 1 else: counter[-1] += 1 # return (tokenid, count) pairs in ascending id order counter = sorted(iteritems(counter)) return counter
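A hedged usage sketch for the Vocab wrapper above, with a made-up corpus, showing how out-of-vocabulary tokens fall back to the -1 '<UNK>' id in doc2bow (assuming the module-level imports the class relies on, such as Dictionary, string_types, iteritems and defaultdict, are in place).

# Hedged sketch; corpus contents are made up.
vocab = Vocab()
vocab.set([['to', 'be', 'or', 'not', 'to', 'be']])
bow = vocab.doc2bow(['to', 'be', 'wibble'])
# 'wibble' was never seen, so it is counted under the -1 '<UNK>' id:
# bow[0] == (-1, 1), followed by the ids of 'be' and 'to' in ascending order.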
dictionary = Dictionary() dictionary.add_documents(wiki.get_texts(), prune_at=None) print(' Building dictionary took %s' % formatTime(time.time() - t0)) print(' %d unique tokens before pruning.' % len(dictionary)) sys.stdout.flush() # keep_words = 100000 # The initial dictionary is huge (~8.75M words in my Wikipedia dump), # so let's filter it down. We want to keep the words that are neither # very rare nor overly common. To do this, we keep only words that # appear in at least 50 articles, but in no more than 15% of all # documents. A hard cap on the dictionary size (the commented-out # keep_words, passed as keep_n) could also be applied, but is left # unlimited here. dictionary.save_as_text('./data/dictionary_full.txt.bz2') dictionary.filter_extremes(no_below=50, no_above=0.15, keep_n=None) print(' %d unique tokens after pruning.' % len(dictionary)) # Write out the dictionary to disk. # For my run, this file is 769KB when compressed. # TODO -- This text format lets you peruse it, but you can # compress it better as binary... dictionary.save_as_text('./data/dictionary.txt.bz2') else: # Nothing to do here. print('') # ======== STEP 2: Convert Articles To Bag-of-words ======== # Now that we have our finalized dictionary, we can create bag-of-words # representations for the Wikipedia articles. This means taking another
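A hedged sketch of reading the pruned dictionary back: gensim's text loader goes through its smart_open-based file helper, which should open the .bz2 file transparently, so the same path can be reused.

# Hedged sketch; './data/dictionary.txt.bz2' is the path written above.
from gensim.corpora import Dictionary

dictionary = Dictionary.load_from_text('./data/dictionary.txt.bz2')
print('%d unique tokens loaded.' % len(dictionary))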
def store_contents(data_path, save_path, datasource, processOnlyFilesinOriginalQrels, num_workers=None): """Preprocess and store a corpus of documents in sqlite. Args: data_path: Root path to directory (or directory of directories) of files containing json encoded documents (must have `id` and `text` fields). save_path: Path to output sqlite db. datasource: Name of the dataset; used to look up topic ranges, qrels and output file names. processOnlyFilesinOriginalQrels: If True, only index documents listed in the original qrels. num_workers: Number of parallel processes to use when reading docs. """ if os.path.isfile(save_path): raise RuntimeError('%s already exists! Not overwriting.' % save_path) print(save_path) print(data_path) docIds = [] # list of TREC DocID docIdToDocIndex = {} # key is DocID, value is docIndex docIndex = 0 workers = ProcessPool(num_workers) files = [] if processOnlyFilesinOriginalQrels: topicData = TRECTopics(datasource, start_topic[datasource], end_topic[datasource]) qrelDocList = topicData.qrelDocIdLister( qrelAddress[datasource], save_path, topic_original_qrels_doc_list_file_name) files = [] for docId in qrelDocList: fileid = docId + '.txt' files.append(os.path.join(data_path, fileid)) #files = [f for f in iter_files(data_path) if os.path.splitext(os.path.basename(f))[0] in qrelDocList] print("Number of unique documents in the qrels", len(files)) else: files = [f for f in iter_files(data_path)] dictionary = Dictionary() count = 0 with tqdm(total=len(files)) as pbar: for pairs in tqdm(workers.imap_unordered(get_contents, files)): count += len(pairs) dictionary.add_documents([ pairs[0][1].split() ]) # pairs[0][0] --> docId, pairs[0][1] --> documentContent docIdToDocIndex[pairs[0][0]] = docIndex docIds.append(pairs[0][0]) docIndex = docIndex + 1 pbar.update() print("Number of documents:", docIndex, len(docIds), len(docIdToDocIndex)) total_documents = len(docIds) metadata = {} metadata['docIdToDocIndex'] = docIdToDocIndex metadata['docIndexToDocId'] = docIds # protocol 2 for version compatibility pickle.dump(metadata, open(save_path + meta_data_file_name[datasource], 'wb'), protocol=2) # keep only words that appear in at least 20 documents, # capped at the dictionary_features_number most frequent tokens dictionary.filter_extremes(no_below=20, keep_n=dictionary_features_number) dictionary.compactify() dictionary.save_as_text(save_path + dictionary_name) dictionary = Dictionary.load_from_text(save_path + dictionary_name) start_time = time.time() corpus_bow_stream = stream_corpus(data_path, dictionary, files) MmCorpus.serialize(save_path + corpus_bow_file_name, corpus_bow_stream, progress_cnt=10000) corpus_bow = MmCorpus(save_path + corpus_bow_file_name) model_tfidf = TfidfModel(corpus_bow, id2word=dictionary, normalize=True) model_tfidf.save(save_path + corpus_tfidf_model_file_name) corpus_tfidf = model_tfidf[corpus_bow] # apply model MmCorpus.serialize(save_path + corpus_tfidf_file_name, corpus_tfidf, progress_cnt=1000) # Load the tf-idf corpus back from disk. corpus_tfidf = MmCorpus(save_path + corpus_tfidf_file_name) #n_items = len(dictionary) #print(corpus_tfidf) # CSR matrix construction phase indptr = [0] indices = [] data = [] # processing took 9:26s with tqdm(total=total_documents) as pbar: for doc in corpus_tfidf: for (index, values) in doc: indices.append(index) data.append(values) indptr.append(len(indices)) pbar.update() start = time.time() sparse_matrix = sp.csr_matrix((data, indices, indptr), dtype=float) # saving took 01:21s sp.save_npz(save_path + csr_matrix_file_name[datasource], sparse_matrix) print("Finished in:", time.time() - start)
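A hedged sketch of reloading the artifacts written by store_contents for later retrieval experiments. The path variables are the module-level names assumed above, and some_doc_id is a hypothetical TREC document id used only for illustration.

# Hedged sketch; save_path, dictionary_name, corpus_tfidf_file_name,
# csr_matrix_file_name, meta_data_file_name and datasource are the names
# assumed by store_contents above; some_doc_id is hypothetical.
import pickle
import scipy.sparse as sp
from gensim.corpora import Dictionary, MmCorpus

dictionary = Dictionary.load_from_text(save_path + dictionary_name)
corpus_tfidf = MmCorpus(save_path + corpus_tfidf_file_name)
tfidf_matrix = sp.load_npz(save_path + csr_matrix_file_name[datasource])
metadata = pickle.load(open(save_path + meta_data_file_name[datasource], 'rb'))
row = tfidf_matrix[metadata['docIdToDocIndex'][some_doc_id]]  # one document's tf-idf vector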