class MyCorpus(corpora.TextCorpus):
    def __init__(self, input=None, path='../processed_papers'):
        self.path = path
        # Deliberately bypass TextCorpus.__init__, which would try to build the
        # dictionary immediately; the dictionary is built explicitly below.
        super(corpora.TextCorpus, self).__init__()
        self.input = input
        self.dictionary = Dictionary()
        self.metadata = False
        self.dictionary.add_documents(self.get_texts())

    def __len__(self):
        return len(os.listdir(self.path))

    def get_texts(self):
        files = os.listdir(self.path)
        counter = 0
        for fl in files:
            # print(counter, ': ', fl)
            if counter % 1000 == 0:
                print(counter)
            counter += 1
            text = ''
            with open(os.path.join(self.path, fl)) as f:
                json_data = json.load(f)
            if json_data["title"] is not None:
                text += json_data["title"] + " "
            for val in json_data["abstract_sentences"].values():
                if val is not None:
                    text += val + " "
            for val in json_data['body_sentences'].values():
                if val is not None:
                    text += val + " "
            yield ie_preprocess(text)
def build_dictionary():
    dictionary = Dictionary()
    for line in open(wiki_index.ARTICLES_FILE):
        dictionary.add_documents([line.lower().split()])
    dictionary.filter_extremes(no_below=2, no_above=0.5)
    dictionary.save(DICTIONARY_FILE)
    return dictionary
class CorpusOfMethodContents(TextCorpus):
    def __init__(self):
        self.mapMethodFQNtoIndex = {}
        self.methodFqns = []
        self.methodContents = []
        TextCorpus.__init__(self)

    def addDocument(self, methodFqn, words):
        if methodFqn not in self.mapMethodFQNtoIndex:
            self.methodFqns.append(methodFqn)
            # Index of the entry just appended; the original computed
            # len(self.mapMethodFQNtoIndex) - 1, which is off by one.
            self.mapMethodFQNtoIndex[methodFqn] = len(self.methodFqns) - 1
            self.methodContents.append(words)
            self.dictionary.doc2bow(words, allow_update=True)
        else:
            self.methodContents[self.mapMethodFQNtoIndex[methodFqn]] = words
            self.dictionary = Dictionary()
            self.dictionary.add_documents(self.get_texts())

    def getMethodContentsForFqn(self, fqn):
        if fqn in self.mapMethodFQNtoIndex:
            return self.methodContents[self.mapMethodFQNtoIndex[fqn]]
        return None

    def get_texts(self):
        for content in self.methodContents:
            yield content
class Corpus(object): def __init__(self, path, dict_path): self.dictionary = Dictionary() add_to_dict = True if dict_path and os.path.exists(dict_path): print('loading dictionary') self.dictionary = self.dictionary.load(dict_path) add_to_dict = False self.train = self.tokenize(os.path.join(path, 'train.txt'), add_to_dict) self.valid = self.tokenize(os.path.join(path, 'valid.txt'), add_to_dict) self.test = self.tokenize(os.path.join(path, 'test.txt'), add_to_dict) if dict_path and not os.path.exists(dict_path): self.dictionary.save(dict_path) def tokenize(self, path, add_to_dict): """Tokenizes a text file.""" assert os.path.exists(path) all_words = list( chain.from_iterable([ sent.split() + ['<eos>'] for sent in open(path).read().split('\n') ])) if add_to_dict: self.dictionary.add_documents([all_words]) return torch.LongTensor(self.dictionary.doc2idx(all_words))
def download_dictionary(corpus_name: str, target_path: str) -> Dictionary:
    """
    Download dictionary only for a corpus from UCI website

    :param corpus_name: name of UCI corpus
    :param target_path: output directory for dictionary file
    :return: gensim Dictionary
    """
    url_root = "https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/"
    target_path = os.path.join(target_path, "uci", "raw")
    if not os.path.exists(target_path):
        print("creating target path: {}".format(target_path))
        os.makedirs(target_path)
    vocab_file = os.path.join(target_path, "vocab.{}.txt".format(corpus_name))
    print("downloading {} vocab file to: {}".format(corpus_name, vocab_file))
    urllib.request.urlretrieve(url_root + "vocab.{}.txt".format(corpus_name), filename=vocab_file)
    dictionary = Dictionary()
    with open(vocab_file) as f:
        for line in f:
            dictionary.add_documents([[line.strip()]])
    dictionary.compactify()
    return dictionary
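# Hedged usage sketch for download_dictionary above (not part of the original
# snippet): "kos" is one of the UCI bag-of-words corpora and "./data" is an
# illustrative output directory.
uci_dictionary = download_dictionary("kos", "./data")
print("loaded {} vocabulary terms".format(len(uci_dictionary)))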
def topic_model(df_train, df_test, topic_count=10): ## general remove text df_train['tweet'] = df_train['tweet'].map(general_text_processing) df_test['tweet'] = df_test['tweet'].map(general_text_processing) ## remove stop words df_train['tweet'] = df_train['tweet'].map(remove_stop_words) df_test['tweet'] = df_test['tweet'].map(remove_stop_words) ## gensim lda from gensim.corpora.dictionary import Dictionary from gensim.models.ldamodel import LdaModel dictionary = Dictionary() for t in df_train.tweet.values.tolist(): #print(t) dictionary.add_documents([t.split()]) #for t in df_test['tweet'].values.tolist() : #print(t) # print(t[0].split()) #print(dictionary.doc2bow(t.split())) train_doc2_corupus = [ dictionary.doc2bow(text.split()) for text in df_train['tweet'].values.tolist() ] #print(train_doc2_corupus) lda_model = LdaModel(train_doc2_corupus, num_topics=topic_count) """ fill topics """ df_test = fill_lda_result(df_test, lda_model, dictionary, topic_count) df_train = fill_lda_result(df_train, lda_model, dictionary, topic_count) """ return """ return df_train, df_test
def build_dictionary_from_splits(splits_template, column, n, save_pickle=None):
    '''Build dictionary from splits. If `save_pickle` is provided, then save.'''
    unfiltered_dict = Dictionary()
    for eid in xrange(n):
        unfiltered_dict.add_documents(csv_isolator("../../data/proc_Train_%d.csv" % eid, column))
    print "Before filtering,", unfiltered_dict
    if save_pickle:
        print "\nsaving..."
        unfiltered_dict.save(save_pickle)
    return unfiltered_dict
class SublexicalizedCorpus(TextCorpus): def __init__(self, base_corpus, order=3, word_limit=None, clean_func=mahoney_clean, create_dictionary=True, n_proc=1): self.order = order self.clean_func = clean_func self.base_corpus = base_corpus self.word_limit = word_limit self.n_proc = n_proc super(SublexicalizedCorpus, self).__init__() self.dictionary = Dictionary() if create_dictionary: self.dictionary.add_documents(self.get_texts()) def get_texts(self): a_count = 0 t_count = 0 texts = ((text, self.clean_func, self.order) for text in self.base_corpus.get_texts()) pool = multiprocessing.Pool(self.n_proc) start = time.clock() prev = start for group in chunkize(texts, chunksize=10 * self.n_proc, maxsize=100): for tokens in pool.imap_unordered(process, group): a_count += 1 cur = time.clock() if cur - prev > 60: logging.info("Sublexicalized %d in %d seconds, %.0f t/s" % (t_count, cur - start, t_count*1. / (cur - start))) prev = cur t_count += len(tokens) yield tokens if self.word_limit and t_count > self.word_limit: break pool.terminate() end = time.clock() logging.info("Sublexicalizing %d finished in %d seconds, %.0f t/s" % (t_count, end - start, t_count*1. / (end - start))) self.length = t_count
def __init__(self, fname, dictionary=None):
    """
    Initialize the corpus. Unless a dictionary is provided, this scans the
    corpus once, to determine its vocabulary.
    """
    self.fname = fname
    self.metadata = False
    if dictionary is None:
        dictionary = Dictionary()
        for text in self.get_texts():
            dictionary.add_documents([text])
    self.dictionary = dictionary
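# Hedged sketch (names are assumptions, not from the original snippet): if the
# class owning the __init__ above is e.g. `MyTextCorpus`, passing a pre-built
# dictionary skips the one-off vocabulary scan described in the docstring.
prebuilt = Dictionary.load('my_corpus.dict')            # hypothetical path
corpus_fast = MyTextCorpus('docs.txt', dictionary=prebuilt)
corpus_scan = MyTextCorpus('docs.txt')                  # scans once to build the vocab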
def initialize_lda():
    path = os.path.join("../data", "train.csv")
    dct = Dictionary(common_texts)
    corpus = [dct.doc2bow(text) for text in common_texts]
    with open(path, 'r') as file:
        csv_file = csv.DictReader(file)
        for row in csv_file:
            row = dict(row)
            new_texts = [row['story'].split()]
            dct.add_documents(new_texts)
            corpus += [dct.doc2bow(text) for text in new_texts]
    lda = models.ldamodel.LdaModel(corpus, num_topics=50)
    lda.save(os.path.join("lda_model", "model"))
    dct.save_as_text(os.path.join("lda_model", "dictionary"))
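# Hedged sketch (not in the original snippet) showing how the artifacts saved by
# initialize_lda above could be reloaded to infer topics for a new story; the
# paths mirror the ones used in initialize_lda, the example text is arbitrary.
import os
from gensim import models
from gensim.corpora.dictionary import Dictionary

lda = models.ldamodel.LdaModel.load(os.path.join("lda_model", "model"))
dct = Dictionary.load_from_text(os.path.join("lda_model", "dictionary"))
bow = dct.doc2bow("once upon a time there was a dragon".split())
print(lda.get_document_topics(bow))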
class TextCorpus(gensim.corpora.TextCorpus):
    """A corpus class which makes some minor extensions to the Gensim
    `TextCorpus` implementation:

    - Support loading of pre-built dictionary
    """

    def __init__(self, input=None, dictionary=None, dictionary_save_path=None,
                 pre_tokenized=False, lowercase=False):
        # Deliberately skip gensim.corpora.TextCorpus.__init__ so the dictionary
        # is only built below when one was not supplied.
        super(gensim.corpora.TextCorpus, self).__init__()
        self.input = input
        self.metadata = False
        self.pre_tokenized = pre_tokenized
        self.lowercase = lowercase

        if dictionary is None:
            self.dictionary = Dictionary()
            if input is not None:
                self.dictionary.add_documents(self.get_texts())
            else:
                logging.warning("No input document stream provided; "
                                "assuming dictionary will be "
                                "initialized in some other way.")
        else:
            self.dictionary = dictionary

        if dictionary_save_path is not None:
            self.dictionary.save(dictionary_save_path)

    def get_texts(self):
        length = 0
        # Input should have one document (sentence, for the word2vec case) per line
        for line in getstream(self.input):
            length += 1
            if self.pre_tokenized:
                if not isinstance(line, unicode):
                    line = unicode(line, encoding='utf8', errors='strict')
                yield line
            else:
                yield gensim.utils.tokenize(line, lowercase=self.lowercase)
        self.length = length
def topic_model(df_train, df_test, topic_count=10): ## general remove text df_train['tweet'] = df_train['tweet'].fillna("") df_test['tweet'] = df_test['tweet'].fillna("") df_train['tweet'] = df_train['tweet'].map(general_text_processing) df_test['tweet'] = df_test['tweet'].map(general_text_processing) ## remove stop words df_train['tweet'] = df_train['tweet'].map(remove_stop_words) df_test['tweet'] = df_test['tweet'].map(remove_stop_words) ## gensim lda dictionary = Dictionary() for t in df_train.tweet.values.tolist(): #print(t) dictionary.add_documents([t.split()]) #for t in df_test['tweet'].values.tolist() : #print(t) # print(t[0].split()) #print(dictionary.doc2bow(t.split())) train_doc2_corupus = [ dictionary.doc2bow(text.split()) for text in df_train['tweet'].values.tolist() ] #print(train_doc2_corupus) print("Started LDA") lda_model = LdaModel(train_doc2_corupus, num_topics=topic_count, iterations=30) print("Completed LDA") """ fill topics """ df_test = fill_lda_result(df_test, lda_model, dictionary, topic_count) df_train = fill_lda_result(df_train, lda_model, dictionary, topic_count) """ return """ print('LDA Completed') return df_train, df_test
class FolderCorpus(corpora.TextCorpus):
    def __init__(self, filepaths, preprocess=[], dictionary=None):
        self.filepaths = filepaths
        self.preprocess = preprocess
        self.metadata = None
        self.dictionary = Dictionary()
        self.dictionary.add_documents(self.get_texts())
        self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=500000)
        self.dictionary.compactify()

    def get_texts(self):
        for path in self.filepaths:
            with codecs.open(path, encoding='utf8') as f:
                raw_text = f.read()
            raw_text = raw_text.lower()
            for filt in self.preprocess:
                raw_text = filt(raw_text)
            text = list(utils.tokenize(raw_text, deacc=True, lowercase=True))
            yield text
class ArchiveCorpus(corpora.TextCorpus):
    def __init__(self, datafile, preprocess=[], dictionary=None):
        self.datafile = datafile
        self.preprocess = preprocess
        self.metadata = None
        if dictionary:
            self.dictionary = dictionary
        else:
            self.dictionary = Dictionary()
            if datafile is not None:
                self.dictionary.add_documents(self.get_texts())
                self.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=500000)

    def get_texts(self):
        with utils.smart_open(self.datafile) as inputfile:
            for line in inputfile:
                for f in self.preprocess:
                    line = f(line)
                text = list(utils.tokenize(line, deacc=True, lowercase=True))
                yield text
def main(): parser = argparse.ArgumentParser() parser.add_argument("--use_domain", action="store_true") parser.add_argument("--update", action="store_true") parser.add_argument("--save_interval", type=int, default=100) args = parser.parse_args() if args.update: common_dict = Dictionary.load_from_text("./common_dict.txt") else: common_dict = Dictionary() for i, url in enumerate(sys.stdin): print("url " + str(i)) text = fetch_contents_from_url(url.strip(), use_domain=args.use_domain) if not text: continue word_list = doc2word_list(text) common_dict.add_documents([word_list]) if i % args.save_interval == args.save_interval - 1: common_dict.save_as_text("./common_dict.txt") common_dict.save_as_text("./common_dict.txt")
def construct_test(tagger):
    f = open(cfg.PATH_TO_VGR_domain_text2)
    g = open(cfg.PATH_TO_X_TEST, 'w')
    line = f.readline()
    word_dic = Dictionary()
    char_dic = Dictionary()
    word_dic.add_documents([["UNK", "EOS"]])
    char_dic.add_documents([["UNK", "BOW"]])
    while line:
        sentence = _tokenize(line, tagger)
        g.write(" ".join(sentence) + "\n")
        word_dic.add_documents([sentence])
        char_dic.add_documents([list(line)])
        line = f.readline()
    f.close()  # the original wrote `f.close` without parentheses, which never closes the file
    g.close()
    return list(word_dic.itervalues()), list(char_dic.itervalues())
class Vocab(): def __init__(self): self.dic = Dictionary() self.dic.add_documents([[u'<UNK>']]) def construct(self, input_file): f = codecs.open(input_file, 'r', 'utf-8') sentences = [] for line in f: line = line.strip().split() sentences.append(line) self.dic.add_documents(sentences) f.close() self.dic.id2token = {v: k for k, v in self.dic.token2id.items()} def word2id(self, input_file, output_file): f = codecs.open(input_file, 'r', 'utf-8') g = open(output_file, 'w') for line in f: line = line.strip().split() line = map(lambda x: str(self.dic.token2id[x]), line) line = u" ".join(line) + u"\n" g.write(line) f.close() g.close() def id2word(self, input_file, output_file): f = open(input_file, 'r') g = codecs.open(output_file, 'w', 'utf-8') for line in f: line = line.strip().split() line = map(lambda x: self.dic.id2token.get(int(x), u'#'), line) line = u" ".join(line) + u"\n" g.write(line) f.close() g.close()
def buildDict(self): batchiter = BatchIterBert(self.trainDataIter, filling_last_batch=False, postProcessor=xonlyBatchProcessor, batch_size=1) common_dictionary = Dictionary(batchiter) print(len(common_dictionary)) if self.testReaderargs: print('update vocab from test set') batchiter = BatchIterBert(self.testDataIter, filling_last_batch=False, postProcessor=xonlyBatchProcessor, batch_size=1) common_dictionary.add_documents(batchiter) print(len(common_dictionary)) common_dictionary.filter_extremes(no_below=self.dict_no_below, no_above=self.dict_no_above, keep_n=self.dict_keep_n) self.dictProcess = DictionaryProcess(common_dictionary) self.postProcessor.dictProcess = self.dictProcess self.vocab_dim = len(self.dictProcess) self.have_dict = True if 1: count_list = [] self.trainDataIter._reset_iter() batchiter = BatchIterBert(self.trainDataIter, filling_last_batch=False, postProcessor=xonlyBatchProcessor, batch_size=1) for item in batchiter: current_count = sum(item) count_list.append(current_count) #print(current_count) print(sum(count_list) / len(count_list))
def create_vocab(tweets):
    print("Building vocabulary...")
    vocab = Dictionary()
    vocab.add_documents(tweets)
    vocab.save('vocab_sentiment')
    return vocab
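# Hedged usage sketch for create_vocab above (not part of the original snippet):
# `tweets` must be an iterable of token lists, and reloading uses the same
# 'vocab_sentiment' path the function saves to.
from gensim.corpora.dictionary import Dictionary

tweets = [["great", "coffee", "today"], ["traffic", "was", "awful"]]
vocab = create_vocab(tweets)
reloaded = Dictionary.load('vocab_sentiment')
bow_vectors = [reloaded.doc2bow(tweet) for tweet in tweets]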
help='File name to give the dictionary upon saving') args = parser.parse_args() input_path = args.input_path output_name = args.output_name CHUNK_SIZE = args.chunk_size # Stream in documents from path rdr = lmd.Reader(input_path) gnr = rdr.stream_data(get_meta=True) # Build a dictionary out of the validation documents dictionary = Dictionary() docs = rdr.stream_data(threaded=True) doc_chunks = chunks(docs, size=CHUNK_SIZE) # Progress in chunks for chunk in doc_chunks: print("Adding ", CHUNK_SIZE, " docs") tokenized = [[ tok.lower_ for tok in doc if not tok.is_stop and tok.is_alpha ] for doc in tokenizer.pipe( [item for item in chunk if language(item) == 'en'], batch_size=CHUNK_SIZE)] dictionary.add_documents(tokenized) # Keep only 2**16 most frequent tokens dictionary.filter_extremes(keep_n=2**16) dictionary.compactify() dictionary.save(output_name)
# >>> DataFrame['column'].apply(str.lower).apply(word_tokenize)
# Also we added the START and the END symbols to the sentences.
english_sents = [START] + df['English'].apply(str.lower).apply(word_tokenize) + [END]
indo_sents = [START] + df['Indonesian'].apply(str.lower).apply(word_tokenize) + [END]

# We're sort of getting the data into the shape we want.
# But now it's still too humanly readable and redundant.
## Cut-away: Computers like it to be simpler, more concise. -_-|||
print('First English sentence:', english_sents[0])
print('First Indo sentence:', indo_sents[0])

english_vocab = Dictionary([['<s>'], ['</s>'], ['UNK']])
english_vocab.add_documents(english_sents)
indo_vocab = Dictionary([['<s>'], ['</s>'], ['UNK']])
indo_vocab.add_documents(indo_sents)

# First ten words in the vocabulary.
print('First 10 Indonesian words in Dictionary:\n', sorted(indo_vocab.items())[:10])
print()
print('First 10 English words in Dictionary:\n', sorted(english_vocab.items())[:10])
class MultiVectorizer(): reserved = ["<PAD>", "<UNK>"] embedding_matrix = None embedding_word_vector = {} glove = False def __init__(self, reserved=None, min_occur=1, glove_path=None, tokenizer=None, embedding_size=300): self.mi_occur = min_occur self.embedding_size = embedding_size self.nlp = spacy.load("en") if tokenizer is None: self.tokenizer = English().Defaults.create_tokenizer(self.nlp) else: self.tokenizer = tokenizer if glove_path is not None: self.load_glove(glove_path) self.glove = True if reserved is not None: self.vocabulary = Dictionary([self.reserved.extend(reserved)]) else: self.vocabulary = Dictionary([self.reserved]) def get_vocabulary_size(self): return len(self.vocabulary.token2id.items()) def load_glove(self, glove_file_path): f = open(glove_file_path, encoding="utf-8") for line in tqdm(f): value = line.split(" ") word = value[0] coef = np.array(value[1:], dtype='float32') self.embedding_word_vector[word] = coef f.close() def get_embedding_matrix(self): return self.embedding_matrix def is_word(self, string_value): if self.embedding_word_vector.get(string_value): return True def get_vocabulary(self): return self.vocabulary def get_word_id(self, word): return self.vocabulary.token2id[word] def get_word_from_id(self, index): return self.vocabulary.id2token[index] def fit_document(self, documents): document_tokens = [] for document in documents: section_tokens = [] for section in document: sentence_tokens = [] for sentence in section: tokens = self.tokenizer(sentence.lower()) word_str_tokens = list(map(convert_to_string, tokens)) sentence_tokens.append(word_str_tokens) self.vocabulary.add_documents(sentence_tokens) section_tokens.append(sentence_tokens) document_tokens.append(section_tokens) return document_tokens def fit_samples_with_sentences(self, samples): output_tokens = [] for sample in samples: sentence_tokens = [] for sentence in sample: tokens = self.tokenizer(sentence.lower()) word_str_tokens = list(map(convert_to_string, tokens)) sentence_tokens.append(word_str_tokens) self.vocabulary.add_documents(sentence_tokens) output_tokens.append(sentence_tokens) return output_tokens def fit(self, X): if type(X[0]) == list: x_tokens = self.fit_samples_with_sentences(X) #self.fit_document(X) else: x_tokens = self.fit_text(X) self.vocabulary.filter_extremes(no_below=self.mi_occur, no_above=1.0, keep_tokens=self.reserved) if self.glove: print("Vocabulary Size:",self.get_vocabulary_size()) self.embedding_matrix = np.zeros((self.get_vocabulary_size(), self.embedding_size)) for word, i in tqdm(self.vocabulary.token2id.items()): if word == "<PAD>": embedding_value = np.zeros((1, self.embedding_size)) elif word == "<UNK>": sd = 1/np.sqrt(self.embedding_size) np.random.seed(seed=42) embedding_value = np.random.normal(0, scale=sd, size=[1, self.embedding_size]) else: embedding_value = self.embedding_word_vector.get(word) if embedding_value is None: embedding_value = self.embedding_word_vector.get("<UNK>") if embedding_value is not None: self.embedding_matrix[i] = embedding_value return self.transform(x_tokens) def fit_text(self, X): x_tokens = [] for x in X: if x is not None: # x_tokens.append(word_tokenize(x.lower())) tokens = self.tokenizer(x.lower()) word_str_tokens = list(map(convert_to_string, tokens)) x_tokens.append(word_str_tokens) self.vocabulary.add_documents(x_tokens) return x_tokens def transform(self, X): return self.transform_list_of_list(X) def transform_list_of_list(self, samples): samples_tokens = [] for sample in samples: encoded_tokens = 
self.transform_section(sample) samples_tokens.append(encoded_tokens) return samples_tokens def transform_document(self, documents): document_tokens = [] for document in documents: section_tokens = [] encoded_tokens = [] for section in document: if type(section) == str: encoded_tokens.append(section) if len(encoded_tokens) == len(document): section_tokens.append(encoded_tokens) section_tokens = self.transform_section(section_tokens) else: encoded_tokens = self.transform_section(section) section_tokens.append(encoded_tokens) document_tokens.append(section_tokens) return document_tokens def transform_section(self, X): if hasattr(self, "limit"): return [[i if i < self.limit else self.reserved.index("<UNK>") for i in self.vocabulary.doc2idx(x, unknown_word_index=self.reserved.index("<UNK>"))] for x in X] else: return [self.vocabulary.doc2idx(x, unknown_word_index=self.reserved.index("<UNK>")) for x in X] def inverse_transform(self, X): return [[ self.vocabulary[i] for i in x ] for x in X] def save(self, file_path="./vecorizer.vec"): with open(file_path, "wb") as handle: pickle.dump(self, handle, protocol=pickle.HIGHEST_PROTOCOL) return file_path @classmethod def load(cls, file_path): with open(file_path, "rb") as handle: self = pickle.load(handle) return self
def construct_vocab():
    f = open(cfg.PATH_TO_ENG_Y_TRAIN)
    g = open(cfg.PATH_TO_ENG_Y_TEST)
    word_dic = Dictionary()
    char_dic = Dictionary()
    target_dic = Dictionary()
    word_dic.add_documents([["UNK", "EOS"]])
    char_dic.add_documents([["UNK", "BOW"]])
    target_dic.add_documents([["UNK", "BOW"]])
    line = f.readline()
    while line:
        sentence = _tokenize(line)
        word_dic.add_documents([sentence])
        char_dic.add_documents([get_chars(line)])
        target_dic.add_documents([sentence])
        target_dic.add_documents([get_chars(line)])
        line = f.readline()
    f.close()  # the original wrote `f.close` without parentheses, which never closes the file
    line = g.readline()
    while line:
        sentence = _tokenize(line)
        word_dic.add_documents([sentence])
        char_dic.add_documents([get_chars(line)])
        line = g.readline()
    g.close()
    return list(word_dic.itervalues()), list(char_dic.itervalues()), list(target_dic.itervalues())
class MultiVectorizer(): reserved = ["<PAD>", "<UNK>"] embedding_matrix = None embedding_word_vector = {} glove = False def __init__(self, reserved=None, min_occur=1, use_bert=False, glove_path=None, tokenizer=None, embedding_size=300): self.mi_occur = min_occur self.embedding_size = embedding_size self.use_bert = use_bert self.nlp = spacy.load("en") if tokenizer is None: self.tokenizer = English().Defaults.create_tokenizer(self.nlp) else: self.tokenizer = tokenizer if glove_path is not None: self.load_glove(glove_path) self.glove = True if reserved is not None: self.vocabulary = Dictionary([self.reserved.extend(reserved)]) else: self.vocabulary = Dictionary([self.reserved]) def get_vocabulary_size(self): if not self.use_bert: return len(self.vocabulary.token2id.items()) else: return len(self.tokenizer.vocab.keys()) def load_glove(self, glove_file_path): f = open(glove_file_path, encoding="utf-8") for line in tqdm(f): value = line.split(" ") word = value[0] coef = np.array(value[1:], dtype='float32') self.embedding_word_vector[word] = coef f.close() def get_embedding_matrix(self): return self.embedding_matrix def is_word(self, string_value): if self.embedding_word_vector.get(string_value): return True def get_vocabulary(self): if not self.use_bert: return self.vocabulary else: return self.tokenizer.vocab def get_word_id(self, word): if not self.use_bert: return self.vocabulary.token2id[word] else: return self.tokenizer.vocab[word] def get_word_from_id(self, index): if not self.use_bert: return self.vocabulary.id2token[index] else: return self.tokenizer.inv_vocab[index] def fit_document(self, documents): document_tokens = [] for document in documents: section_tokens = [] for section in document: sentence_tokens = [] for sentence in section: tokens = self.tokenizer(sentence.lower()) word_str_tokens = list(map(convert_to_string, tokens)) sentence_tokens.append(word_str_tokens) self.vocabulary.add_documents(sentence_tokens) section_tokens.append(sentence_tokens) document_tokens.append(section_tokens) return document_tokens def fit_bert_sentences(self, samples, remove_stop_words=True): output_tokens = [] vocab = [] stop_words = set(stopwords.words('english')) for sample in tqdm(samples): sentence_tokens = [] for sentence in sample: tokens = self.tokenizer.tokenize(sentence.lower()) tokens = [w for w in tokens if not w in stop_words] tokens = ["[CLS]"] + tokens + ["[SEP]"] sentence_tokens.append(tokens) vocab.append(tokens) output_tokens.append(sentence_tokens) #self.vocabulary.add_documents(vocab) return output_tokens def fit_samples_with_sentences(self, samples, remove_stop_words=True): output_tokens = [] vocab = [] for sample in tqdm(samples): sentence_tokens = [] for sentence in sample: tokens = self.tokenizer(sentence.lower()) if remove_stop_words: tokens = [token for token in tokens if not token.is_stop] word_str_tokens = list(map(convert_to_string, tokens)) sentence_tokens.append(word_str_tokens) vocab.append(word_str_tokens) output_tokens.append(sentence_tokens) self.vocabulary.add_documents(vocab) return output_tokens def fit(self, X, remove_stop_words=True, list_of_lists=False): if list_of_lists: if not self.use_bert: x_tokens = self.fit_samples_with_sentences(X,remove_stop_words=remove_stop_words) #self.fit_document(X) else: x_tokens = self.fit_bert_sentences(X, remove_stop_words=remove_stop_words) else: x_tokens = self.fit_text(X) self.vocabulary.filter_extremes(no_below=self.mi_occur, no_above=1.0, keep_tokens=self.reserved) unknown_words = [] if self.glove: #spell = Spellchecker() 
print("Vocabulary Size:",self.get_vocabulary_size()) self.embedding_matrix = np.zeros((self.get_vocabulary_size(), self.embedding_size)) for word, i in tqdm(self.vocabulary.token2id.items()): if word == "<PAD>": embedding_value = np.zeros((1, self.embedding_size)) elif word == "<UNK>": sd = 1/np.sqrt(self.embedding_size) np.random.seed(seed=42) embedding_value = np.random.normal(0, scale=sd, size=[1, self.embedding_size]) else: embedding_value = self.embedding_word_vector.get(word) if embedding_value is None: embedding_value = self.embedding_word_vector.get(self.correct_word(word)) if embedding_value is None: unknown_words.append(word) embedding_value = self.embedding_word_vector.get("<UNK>") if embedding_value is not None: self.embedding_matrix[i] = embedding_value print("Number of unknown words:",len(unknown_words)) unknown_words_df = pd.DataFrame() unknown_words_df["Unknown Words"] = unknown_words encoded_tokens = self.transform(x_tokens, list_of_lists=list_of_lists) return encoded_tokens def fit_text(self, X, remove_stop_words=True): output_tokens = [] for sample in tqdm(X): tokens = self.tokenizer(sample.lower()) if remove_stop_words: tokens = [token for token in tokens if not token.is_stop] word_str_tokens = list(map(convert_to_string, tokens)) output_tokens.append(word_str_tokens) self.vocabulary.add_documents(output_tokens) return output_tokens def correct_word(self, word): return word def transform(self, X, list_of_lists=False): if list_of_lists: if not self.use_bert: return self.transform_list_of_list(X) else: return self.transform_bert(X) else: return self.transform_text(X) def transform_list_of_list(self, samples): samples_tokens = [] for sample in samples: encoded_tokens = self.transform_text(sample) samples_tokens.append(encoded_tokens) return samples_tokens def transform_document(self, documents): document_tokens = [] for document in documents: section_tokens = [] encoded_tokens = [] for section in document: if type(section) == str: encoded_tokens.append(section) if len(encoded_tokens) == len(document): section_tokens.append(encoded_tokens) section_tokens = self.transform_text(section_tokens) else: encoded_tokens = self.transform_text(section) section_tokens.append(encoded_tokens) document_tokens.append(section_tokens) return document_tokens def transform_bert(self, samples): samples_tokens = [] for sample in samples: encoded_sentences = [] for sentence_tokens in sample: encoded_tokens = self.tokenizer.convert_tokens_to_ids(sentence_tokens) encoded_sentences.append(encoded_tokens) samples_tokens.append(encoded_sentences) return samples_tokens def transform_text(self, X): if hasattr(self, "limit"): return [[i if i < self.limit else self.reserved.index("<UNK>") for i in self.vocabulary.doc2idx(x, unknown_word_index=self.reserved.index("<UNK>"))] for x in X] else: return [self.vocabulary.doc2idx(x, unknown_word_index=self.reserved.index("<UNK>")) for x in X] def inverse_transform(self, X): return [[ self.vocabulary[i] for i in x ] for x in X] def save(self, file_path="./vecorizer.vec"): with open(file_path, "wb") as handle: pickle.dump(self, handle, protocol=pickle.HIGHEST_PROTOCOL) return file_path @classmethod def load(cls, file_path): with open(file_path, "rb") as handle: self = pickle.load(handle) return self
class Vocab(): def __init__(self): self.dic = Dictionary() self.dic.add_documents([[u'<UNK>', u',']]) def construct(self, input_file): f = codecs.open(input_file, 'r', 'utf-8') sentences = [] for line in f: line = line.strip().split() sentences.append(line) self.dic.add_documents(sentences) f.close() self.dic.id2token = {v: k for k, v in self.dic.token2id.items()} def load_cond(self, input_file, cond_length, unk): """Get a list of unique conditions""" f = codecs.open(input_file, 'r', 'utf-8') conditions = [] lines = f.readlines() if lines[-1].strip() == '': print("deleted the last element:", lines[-1]) lines = lines[:-1] lines = list(set(lines)) for line in lines: line = map(int, line.strip().split()) line = padding(line, cond_length, unk) if not line in conditions: conditions.append(line) self.cond = np.array(conditions) self.n_cond = len(conditions) def choice_cond(self, num): return self.cond[np.random.choice(len(self.cond), num)] def word2id(self, input_file, output_file): def get_id(dic, key): if key in dic: return str(dic[key]) else: ret = [] key = list(key) for k in key: ret.append(str(dic.get(k, 0))) return u" ".join(ret) f = codecs.open(input_file, 'r', 'utf-8') g = open(output_file, 'w') for line in f: line = line.strip().split() line = map(lambda x: get_id(self.dic.token2id, x), line) line = u" ".join(line) + u"\n" g.write(line) f.close() g.close() def id2word(self, input_file, output_file): f = open(input_file, 'r') g = codecs.open(output_file, 'w', 'utf-8') for line in f: line = line.strip().split() line = map(lambda x: self.dic.id2token.get(int(x), u'#'), line) line = u" ".join(line) + u"\n" g.write(line) f.close() g.close()
def train_LDA(base_path, table_paths, batch_size, limit, use_dictionary=False, **kwargs): model_name = dic2name(kwargs) print("Model: ", model_name) topic_num = kwargs['tn'] # Pass 1 get the dictionary if use_dictionary == 'True': dic = Dictionary.load( join(LDA_CACHE, 'dictionary_{}'.format(model_name))) else: dic = Dictionary([]) b = 0 for corpus in corpus_iter(base_path, table_paths, batch_size, limit, **kwargs): dic.add_documents(corpus) print('Dictionary batch {}: current dic size {}'.format( b, len(dic))) b += 1 # save dictionary dic.save(join(LDA_CACHE, 'dictionary_{}'.format(model_name))) print("Dictionary size", len(dic)) # Pass 2 train LDA whole_corpus = corpus_iter(base_path, table_paths, batch_size, limit, **kwargs) first_batch = next(whole_corpus) first_bow = [dic.doc2bow(text, allow_update=False) for text in first_batch] #print(first_bow) lda = LdaModel(first_bow, id2word=dic, num_topics=topic_num, minimum_probability=0.0) batch_no = 0 print('LDA update batch {}'.format(batch_no)) for batch in whole_corpus: batch_bow = [dic.doc2bow(text, allow_update=False) for text in batch] #print(corpus_bow) lda.update(batch_bow) batch_no += 1 print('LDA update batch {}'.format(batch_no)) # Save model to disk. temp_file = join(LDA_CACHE, "model_{}".format(model_name)) lda.save(temp_file) print( "Training from {} done. Batch_size: {}, long str tokenization threshold: {}, numerical representations: {}.\ \nTotal size of dictionary: {}".format(table_paths, batch_size, kwargs['thr'], kwargs['num'], len(dic))) return
class TextCorpus(interfaces.CorpusABC):
    """
    Helper class to simplify the pipeline of getting bag-of-words vectors (= a
    gensim corpus) from plain text.

    This is an abstract base class: override the `get_texts()` method to match
    your particular input.

    Given a filename (or a file-like object) in constructor, the corpus object
    will be automatically initialized with a dictionary in `self.dictionary` and
    will support the `iter` corpus method. You must only provide a correct
    `get_texts` implementation.
    """

    def __init__(self, input=None):
        super(TextCorpus, self).__init__()
        self.input = input
        self.dictionary = Dictionary()
        self.metadata = False
        if input is not None:
            self.dictionary.add_documents(self.get_texts())
        else:
            logger.warning("No input document stream provided; assuming "
                           "dictionary will be initialized some other way.")

    def __iter__(self):
        """
        The function that defines a corpus.

        Iterating over the corpus must yield sparse vectors, one for each document.
        """
        for text in self.get_texts():
            if self.metadata:
                yield (self.dictionary.doc2bow(text[0], allow_update=False), text[1])
            else:
                yield self.dictionary.doc2bow(text, allow_update=False)

    def getstream(self):
        return getstream(self.input)

    def get_texts(self):
        """
        Iterate over the collection, yielding one document at a time. A document
        is a sequence of words (strings) that can be fed into `Dictionary.doc2bow`.

        Override this function to match your input (parse input files, do any
        text preprocessing, lowercasing, tokenizing etc.). There will be no
        further preprocessing of the words coming out of this function.
        """
        # Instead of raising NotImplementedError, let's provide a sample implementation:
        # assume documents are lines in a single file (one document per line).
        # Yield each document as a list of lowercase tokens, via `utils.tokenize`.
        length = 0
        for lineno, line in enumerate(getstream(self.input)):
            length += 1
            yield utils.tokenize(line, lowercase=True)
        self.length = length

    def __len__(self):
        return self.length  # will throw if corpus not initialized
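# Minimal subclass sketch for the abstract TextCorpus above: only get_texts() is
# overridden, as the class docstring requires. The one-document-per-line input
# format and the file name are assumptions for illustration.
class LineCorpus(TextCorpus):
    def get_texts(self):
        for line in getstream(self.input):
            yield line.lower().split()

# corpus = LineCorpus('documents.txt')  # builds corpus.dictionary on construction
# bows = list(corpus)                   # iterating yields doc2bow vectors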
class LDA(object):
    def __init__(self, topics=10, worker=3, pretrained_model=None, dictionary=None):
        """
        Initialize LDA model training.

        Args:
            topics -- number of topics
            worker -- parallelism; usually the number of cores minus one
            pretrained_model -- a pre-trained model; online updates are supported,
                                so a previously trained model can be loaded
            dictionary -- words must be mapped to IDs for training, so the model
                          is paired with an ID-mapping dictionary
        Example:
            >>> lda = LDA(topics = 20, worker = 2, pretrained_model = model_file, dictionary = dictionary_file)
            >>> corpus = read_file(corpus_file)  # [['word1', 'word2'], ['word3', 'word4']]
            >>> lda.update(corpus)
            >>> lda.save(model_file, dictionary_file)
            >>> topics = lda.inference(['word5', 'word6'])
        """
        self._topics = topics
        self._workers = worker
        self._model = None
        self._common_dictionary = None
        # The original tested the undefined name `common_dictionary`; the
        # intended check is the `dictionary` argument.
        if pretrained_model and dictionary:
            self._model = LdaModel.load(pretrained_model)
            self._common_dictionary = Dictionary.load(dictionary)

    def save(self, model_file, dictionary_file):
        """Save the trained model together with its dictionary.

        Args:
            model_file -- model file
            dictionary_file -- dictionary file
        Returns:
            None
        """
        if self._model:
            self._model.save(model_file)
        if self._common_dictionary:
            self._common_dictionary.save(dictionary_file)

    def update(self, corpus=[[]]):
        """Online update: refine an existing model with new documents.

        Args:
            corpus -- list of documents used for the update
        """
        if not self._model and len(corpus) > 0:
            self._common_dictionary = Dictionary(corpus)
            corpus_data = [self._common_dictionary.doc2bow(sentence) for sentence in corpus]
            self._model = LdaModel(corpus_data, self._topics)
        elif self._model and len(corpus) > 0:
            self._common_dictionary.add_documents(corpus)
            new_corpus_data = [self._common_dictionary.doc2bow(sentence) for sentence in corpus]
            self._model.update(new_corpus_data)

    def inference(self, document=[]):
        """Infer the topic distribution of a new document.

        Args:
            document -- the document, i.e. a list of words
        Returns:
            list of topic probabilities
        """
        if self._model:
            doc = [self._common_dictionary.doc2bow(document)]
            return self._model.get_document_topics(doc)
        return []

    @property
    def model(self):
        return self._model

    @property
    def dictionary(self):
        return self._common_dictionary
else: logging.info('no calculated files found, recomputing...') logging.info('loading files...') with open(data_dark_file, 'r') as f1, open(data_clean_file, 'r') as f2: logging.info('loading dark text...') dark_text = [line.split() for line in f1.readlines()] logging.info('loading clean text...') clean_text = [line.split() for line in f2.readlines()] logging.info('load file done') if os.path.exists(dict_file): dictionary = Dictionary.load(dict_file) else: logging.info('creating the dictionary...') dictionary = Dictionary(dark_text) dictionary.add_documents(clean_text) dictionary.save(dict_file) dictionary = filter_dict(args.vocab_size, dictionary, get_keep_tokens(dictionary)) logging.info('dictionary created') logging.info('building neighbor unigrams...') if os.path.exists(file_unigram_dark) and os.path.exists( file_unigram_dark_all): unigram_dark = np.load(file_unigram_dark) unigram_dark_all = np.load(file_unigram_dark_all) else: unigram_dark, unigram_dark_all = get_neighbor_unigram( dictionary, dark_text, args.num_neighbors)
class TextCorpus(interfaces.CorpusABC): """ Helper class to simplify the pipeline of getting bag-of-words vectors (= a gensim_package corpus) from plain text. This is an abstract base class: override the `get_texts()` and `__len__()` methods to match your particular input. Given a filename (or a file-like object) in constructor, the corpus object will be automatically initialized with a dictionary in `self.dictionary` and will support the `iter` corpus method. You must only provide a correct `get_texts` implementation. """ def __init__(self, input=None): super(TextCorpus, self).__init__() self.input = input self.dictionary = Dictionary() self.metadata = False if input is not None: self.dictionary.add_documents(self.get_texts()) else: logger.warning("No input document stream provided; assuming " "dictionary will be initialized some other way.") def __iter__(self): """ The function that defines a corpus. Iterating over the corpus must yield sparse vectors, one for each document. """ for text in self.get_texts(): if self.metadata: yield self.dictionary.doc2bow(text[0], allow_update=False), text[1] else: yield self.dictionary.doc2bow(text, allow_update=False) def getstream(self): return utils.file_or_filename(self.input) def get_texts(self): """ Iterate over the collection, yielding one document at a time. A document is a sequence of words (strings) that can be fed into `Dictionary.doc2bow`. Override this function to match your input (parse input files, do any text preprocessing, lowercasing, tokenizing etc.). There will be no further preprocessing of the words coming out of this function. """ # Instead of raising NotImplementedError, let's provide a sample implementation: # assume documents are lines in a single file (one document per line). # Yield each document as a list of lowercase tokens, via `utils.tokenize`. with self.getstream() as lines: for lineno, line in enumerate(lines): if self.metadata: yield utils.tokenize(line, lowercase=True), (lineno, ) else: yield utils.tokenize(line, lowercase=True) def __len__(self): if not hasattr(self, 'length'): # cache the corpus length self.length = sum(1 for _ in self.get_texts()) return self.length
class TrainModel(BaseModel): _TRAIN_EXTENSION = 'csv' _PKL_EXT = 'pkl' def __init__(self, train_path, pkl_path): super(TrainModel, self).__init__() self._validate_path(train_path, self._TRAIN_EXTENSION) self._validate_path(pkl_path, self._PKL_EXT) self._train_path = train_path self._pkl_path = pkl_path self.dictionary = Dictionary() self.weights = None def __call__(self, *args, **kwargs): if self._pkl_path is not None: pass pass def train(self): """Pkl path returned empty, so we need to bring in the contents from the train path, and create a new model to be saved off :return: """ print("Reading in training set") train_set = pd.read_csv(self._train_path) print("Tokenizing training set") train_tokenized = self.get_tokens(train_set) tokens_weights = defaultdict(dict) # add the tokens to the dictionary to keep track if what words we have found for cat, tokens in train_tokenized: print(f"\nBeginning training process for {cat}") print("Adding tokens to dictionary") self.dictionary.add_documents(tokens.values()) print("Determining weights.") tokens_weights[cat]['weights'] = self._create_training_weights(tokens=tokens.values()) print("Finding intersections.") tokens_weights[cat]['intersections'] = self._get_intersections(tokens.values()) print(f"Intersections for {cat}: {tokens_weights[cat]['intersections']}") self.weights = tokens_weights self._write(self) print("Successfully trained model.") def get_tokens(self, train_set): return ( (cat, self._tokenize_doc(self._get_training_paragraphs(train_set, cat))) for cat in self._get_train_categories(train_set) ) def read(self): """Read from the pkl_path. If there are contents, return it. No training needed. :return: """ try: with open(self._pkl_path, "rb") as f: pkl = pickle.load(f) except IOError: # File doesn't exist return None else: return pkl def _validate_path(self, path, ext): if not path.split('.')[-1] == ext: raise IncorrectExtensionError(f"{self._pkl_path} does not have correct extension. Expected {ext}") def _write(self, train): with open(self._pkl_path, "wb") as f: pickle.dump(train, f) @staticmethod def _get_training_paragraphs(train_set, category): """Iterates over df, and returns all paragraphs where data_key_friendly_name matches self._category :param train_set: pd.DataFrame() :return: list() """ return ( row.paragraph_text for _, row in train_set[train_set.data_key_friendly_name == category].iterrows() ) @staticmethod def _get_train_categories(train_set): if 'data_key_friendly_name' not in train_set.columns: raise UnexpectedColumnError( f"'data_key_friendly_name' not found in columns, got {train_set.columns} instead") return ( cat for cat in train_set.data_key_friendly_name.unique().tolist() if cat != 'Unknown Share Repurchase Data' ) @staticmethod def _get_intersections(tokens): """Find all unique processed words from the training paragraphs. 
:param tokens: list of lists :return: """ intersections = set() for token in tokens: if not intersections: intersections = set(token) else: intersections = intersections.intersection(set(token)) return list(intersections) @staticmethod def _create_training_weights(tokens): """Creates a list of normalized weights to apply to the tf-idf model once it has been determined :param tokens: list of lists :return: dict() {word: weight} 0 <= word <= 1 """ # create count of all words in training set counts = Counter() for doc in tokens: for word in doc: counts[word] += 1 # Grab the min and max counts in the set min_counts = counts.most_common()[-1] max_counts = counts.most_common(1)[0] min_max = min_counts[1], max_counts[1] diff = min_max[1] - min_max[0] return {word: ((count - min_max[0]) / diff) for word, count in counts.items()}
class DefaultJsonCorpus(object): """ A default JSON corpus based on gensim TextCorpus. It assumes a file or list of JSON as input. The methods provided by gensim TextCorpus are needed for the GenSim training. Any corpus provided to DocumentSimilarity should provide the methods given in this class. """ def __init__(self, input=None,create_dictionary=True): super(DefaultJsonCorpus, self).__init__() self.input = input self.dictionary = Dictionary() self.metadata = False if create_dictionary: self.dictionary.add_documents(self.get_texts()) def __iter__(self): for text in self.get_texts(): yield self.dictionary.doc2bow(text, allow_update=False) def getstream(self): return utils.file_or_filename(self.input) def __len__(self): if not hasattr(self, 'length'): # cache the corpus length self.length = sum(1 for _ in self.get_texts()) return self.length def get_json(self): if isinstance(self.input,list): for j in self.input: yield j else: with self.getstream() as lines: for line in lines: line = line.rstrip() j = json.loads(line) yield j def get_texts(self,raw=False): """ yield raw text or tokenized text """ for j in self.get_json(): text = j["text"] if raw: yield text else: yield utils.tokenize(text, deacc=True, lowercase=True) def get_meta(self): """ return a json object with meta data for the documents. It must return: id - id for this document optional title and tags. Tags will be used as base truth used to score document similarity results. """ doc_id = 0 for j in self.get_json(): m = copy.deepcopy(j) m['id'] = long(m['id']) m['corpus_seq_id'] = doc_id doc_id += 1 yield m def get_dictionary(self): return self.dictionary
class SublexicalizedCorpus(TextCorpus): def __init__(self, base_corpus, order=3, word_limit=None, clean_func=mahoney_clean, create_dictionary=True, n_proc=1): self.order = order self.clean_func = clean_func self.base_corpus = base_corpus self.word_limit = word_limit self.n_proc = n_proc super(SublexicalizedCorpus, self).__init__() self.dictionary = Dictionary() if create_dictionary: self.dictionary.add_documents(self.get_texts()) def get_texts(self): a_count = 0 t_count = 0 texts = ((text, self.clean_func, self.order) for text in self.base_corpus.get_texts()) pool = multiprocessing.Pool(self.n_proc) start = time.clock() prev = start for group in chunkize(texts, chunksize=10 * self.n_proc, maxsize=100): for tokens in pool.imap_unordered(process, group): a_count += 1 cur = time.clock() if cur - prev > 60: logging.info("Sublexicalized %d in %d seconds, %.0f t/s" % (t_count, cur - start, t_count * 1. / (cur - start))) prev = cur t_count += len(tokens) yield tokens if self.word_limit and t_count > self.word_limit: break pool.terminate() end = time.clock() logging.info("Sublexicalizing %d finished in %d seconds, %.0f t/s" % (t_count, end - start, t_count * 1. / (end - start))) self.length = t_count
class DefaultJsonCorpus(object): """ A default JSON corpus based on gensim TextCorpus. It assumes a file or list of JSON as input. The methods provided by gensim TextCorpus are needed for the GenSim training. Any corpus provided to DocumentSimilarity should provide the methods given in this class. """ def __init__(self, input=None): super(DefaultJsonCorpus, self).__init__() self.input = input self.dictionary = Dictionary() self.metadata = False self.dictionary.add_documents(self.get_texts()) def __iter__(self): for text in self.get_texts(): yield self.dictionary.doc2bow(text, allow_update=False) def getstream(self): return utils.file_or_filename(self.input) def __len__(self): if not hasattr(self, 'length'): # cache the corpus length self.length = sum(1 for _ in self.get_texts()) return self.length def get_json(self): if isinstance(self.input, list): for j in self.input: yield j else: with self.getstream() as lines: for line in lines: line = line.rstrip() j = json.loads(line) yield j def get_texts(self, raw=False): """ yield raw text or tokenized text """ for j in self.get_json(): text = j["text"] if raw: yield text else: yield utils.tokenize(text, deacc=True, lowercase=True) def get_meta(self): """ return a json object with meta data for the documents. It must return: id - id for this document optional title and tags. Tags will be used as base truth used to score document similarity results. """ doc_id = 0 for j in self.get_json(): m = copy.deepcopy(j) m['id'] = long(m['id']) m['corpus_seq_id'] = doc_id doc_id += 1 yield m def get_dictionary(self): return self.dictionary
class LDA(object): def __init__(self, topics=10, worker=3, pretrained_model=None, dictionary=None): """ lda模型训练初始化。 Args: topics -- 指定主题个数 worker -- 并行化参数,一般为core数量减一 pretrained_model -- 预训练的模型,由于支持在线更新,所以可以加载上次训练的模型 dictionary -- 训练时词需要转换成ID,所以跟模型配套有一个ID映射的词典 Example: >>> lda = LDA(topics = 20, worker = 2, pretrained_model = model_file, dictionary = dictionary_file) >>> corpus = read_file(corpus_file) # [['word1', 'word2'], ['word3', 'word4']] >>> lda.update(corpus) >>> lda.save(model_file, dictionary_file) >>> topics = lda.inference(['word5', 'word6']) """ self._topics = topics self._workers = worker self._model = None self._common_dictionary = None if pretrained_model and common_dictionary: self._model = LdaModel.load(pretrained_model) self._common_dictionary = Dictionary.load(dictionary) def save(self, model_file, dictionary_file): """ 保存训练的模型,同时保存对应的词典 Args: model_file -- 模型文件 dictionary_file -- 词典文件 Returns: 无 """ if self._model: self._model.save(model_file) if self._common_dictionary: self._common_dictionary.save(dictionary_file) def update(self, corpus=[[]]): """ 在线更新,在已有模型的基础上在线更新 Args: corpus -- 用于更新的文档列表 """ if not self._model and len(corpus) > 0: self._common_dictionary = Dictionary(corpus) corpus_data = [ self._common_dictionary.doc2bow(sentence) for sentence in corpus ] self._model = LdaModel(corpus_data, self._topics) elif self._model and len(corpus) > 0: self._common_dictionary.add_documents(corpus) new_corpus_data = [ self._common_dictionary.doc2bow(sentence) for sentence in corpus ] self._model.update(new_corpus_data) def inference(self, document=[]): """ 对新文档推断其话题分布 Args: document -- 文档,其实是词列表 Returns: 话题分布列表 """ if self._model: doc = [self._common_dictionary.doc2bow(document)] return self._model.get_document_topics(doc) return [] @property def model(self): return self._model @property def dictionary(self): return self._common_dictionary
class Similarities(object): """ Class for text similarities stuff """ def __init__(self, mongo_conn_rec, stopwords=None): self._stopwords = set(stopwords) if stopwords is not None else set() self._mongo_connection_record = mongo_conn_rec self._lsi_mapping = dict() self._sim_index = None self._dictionary = None self._lsimodel = None self._run_transformers() @staticmethod def logger(): """ Scrapper's specific logger instance. Use this to log inside scrappers. :return: Returns a logging.Logger('openews.scrappers') instance. """ return logging.getLogger('openews.language') @property def considerable_doc_property(self): """ The document property to use for training. this is the actually data we take from the MongoDB documents to parse and train. :return: str """ return 'title' @property def dictionary_file(self): """ The filename to use when serializing gensim.corpora.dictionary.Dictionary to disk. :return: str """ return "openews.processors.dict" @property def dictionary(self): """ The used Dictionary. :return: gensim.corpora.dictionary.Dictionary """ return self._dictionary @property def lsi_model(self): """ The used LSI model. :return: gensim.models.lsimodel.LsiModel """ return self._lsimodel @property def similarity_index(self): """ The similarity index instance :return: gensim.similarities.docsim.MatrixSimilarity """ return self._sim_index @property def similarity_threshold(self): """ The similarity threshold. Anything above or equals to this value will be considered as similar document. :return: float """ return server_app.config['SIMILARITY_THRESHOLD'] @property def lsi_index_mapping(self): """ A mapping between the LSI model index (key) and the documents (Collection the document is in, document) :return: dict """ return self._lsi_mapping @staticmethod def _create_resource_path(resource_file): """ Creates a absolute path to resource_file based on the given system's temp directory. :param resource_file: str :return: str """ return os.path.join(tempfile.gettempdir(), resource_file) def _resource_exists(self, resource_file): """ Checks if resource_file exists in the given system's temp directory. :param resource_file: str :return: bool """ return os.path.isfile(self._create_resource_path(resource_file)) def _run_transformers(self): """ Runs all the transformer methods listed providing the MongoDB client context instance. """ with MongoClientContext(self._mongo_connection_record) as client: self._create_dictionary(client) self._create_lsi_similarity_index(client) def _create_dictionary(self, mongo_client): """ Creates the gensim Dictionary (gensim.corpora.dictionary.Dictionary) or loads it if it already exists and sets the object's dictionary property. 
:param mongo_client: server.db.MongoClientContext """ from gensim.corpora.dictionary import Dictionary if self._resource_exists(self.dictionary_file): self.logger().debug( "Dictionary file found, loading it [%s]" % self._create_resource_path(self.dictionary_file)) self._dictionary = Dictionary.load(self._create_resource_path(self.dictionary_file)) else: self.logger().debug("Dictionary file not found, creating a new Dictionary file") self._dictionary = Dictionary() documents = [] for doc in [di for d in mongo_client.scrappers_collections() for di in d.find()]: documents.append(self.tokenize_sentence(doc[self.considerable_doc_property])) self.logger().debug("Adding %d documents to dictionary (will skip existing ones)" % len(documents)) self._dictionary.add_documents(documents) self._dictionary.save(self._create_resource_path(self.dictionary_file)) def _create_lsi_similarity_index(self, mongo_client): """ Creates a Similarity index based on LSI model from the available dictionary. Sets the object's lsi_model and similarity_index object properties. """ from gensim.models import LsiModel from gensim.similarities import MatrixSimilarity self._lsi_mapping.clear() bow_corpus = [] for idx, tp in enumerate([(c, di) for c in mongo_client.scrappers_collections() for di in c.find()]): self._lsi_mapping[idx] = tp bow_corpus.append(self.sentence_to_bow(tp[1][self.considerable_doc_property])) self._lsimodel = LsiModel(bow_corpus, id2word=self.dictionary) self._sim_index = MatrixSimilarity(self._lsimodel[bow_corpus]) def calculate_similarities(self): """ Find / calculate similarities between documents in the index. Returns a defaultdict with the key as the LSI index and the value is a list of tuples with the following values (LSI model Index, similarity threshold - numpy.float32) tuple :return: defaultdict(list) """ similarities = defaultdict(list) if not self.lsi_index_mapping: return for idx, tp in sorted(self.lsi_index_mapping.items(), key=itemgetter(0)): sentence = tp[1][self.considerable_doc_property] bow = self.sentence_to_bow(sentence) latent_space_vector = self.lsi_model[bow] sim_vector = self.similarity_index[latent_space_vector] sorted_mapped_vector = list(sorted(enumerate(sim_vector), key=itemgetter(1))) for sit in [v for v in sorted_mapped_vector if v[0] != idx and v[1] >= self.similarity_threshold and tp[0].name != self.lsi_index_mapping[v[0]][0].name]: if sit[0] not in similarities: similarities[idx].append(sit) for s in similarities.items(): main_sentence = self.lsi_index_mapping[s[0]][1][self.considerable_doc_property] print("[%s] %s:" % (self.lsi_index_mapping[s[0]][0].name, main_sentence)) for sm in s[1]: print("\t[%f][%s]: %s" % (sm[1], self._lsi_mapping[sm[0]][0].name, self.lsi_index_mapping[sm[0]][1][self.considerable_doc_property])) return similarities def store_similarities(self, update=False): """ Stores the similarities to the database :param update: True to update existing, False to delete and add new items """ with MongoClientContext(self._mongo_connection_record) as client: pass def tokenize_sentence(self, sentence): """ Tokenize a sentence (see 'tokenized_corpus_sentences' method on what tokenization in this context means). :param sentence: str :return: a list """ excluded = set(chain(self._stopwords, string.punctuation)) return [w.lower() for w in word_tokenize(sentence) if w.lower() not in excluded] def sentence_to_bow(self, sentence): """ Transforms a string sentence to a VSM bag-of-words representation. 
:param sentence: str :return: list of tuples """ return self.dictionary.doc2bow(self.tokenize_sentence(sentence))
class Corpus(object): def __init__(self, path, save_data, max_len=16): self.train = os.path.join(path, "train") self.valid = os.path.join(path, "valid") self._save_data = save_data self.train_sents = [] self.train_labels = [] self.valid_sents = [] self.valid_labels = [] self.max_len = max_len self.dict = Dictionary() self.l = Labels() def parse_data_from_file(self, _file, is_train=True): _sents, _labels = [], [] for sentence in open(_file): label, _, _words = sentence.replace('\xf0', ' ').partition( ' ') #特定格式:类别 文档,可改写该段代码 label = label.split(":")[0] words = _words.lower().strip().split() if len(words) > self.max_len: words = words[:self.max_len] _sents += [words] _labels += [label] if is_train: self.train_sents.extend(_sents) self.train_labels.extend(_labels) self.l(self.train_labels) self.build_dict(self.train_sents) else: self.valid_sents.extend(_sents) self.valid_labels.extend(_labels) def parse_data_from_dir(self, dirs, is_train=True, lines_are_documents=True): _sents, _labels = [], [] dirs = os.path.expanduser(dirs) for label in sorted(os.listdir(dirs)): d = os.path.join(dirs, label) if not os.path.isdir(d): continue for root, _, fnames in sorted(os.walk(d)): for fname in sorted(fnames): path = os.path.join(root, fname) with open(path, 'rt') as f: if lines_are_documents: for line in f: _sents += [line.lower().strip().split()] _labels += [label] else: _sents += [f.read().strip().split()] _labels += [label] if is_train: self.train_sents.extend(_sents) self.train_labels.extend(_labels) self.l(self.train_labels) self.build_dict(self.train_sents) else: self.valid_sents.extend(_sents) self.valid_labels.extend(_labels) def build_dict(self, _sents): self.dict.add_documents(_sents) def build_vocab(self): for key in self.dict.token2id.keys(): self.dict.token2id[key] += 1 def save(self): self.parse_data_from_file(self.train, is_train=True) self.parse_data_from_file(self.valid, is_train=False) #self.parse_data_from_dir("./data/corpus/data",lines_are_documents=False) self.build_vocab() data = { 'max_len': self.max_len, 'dict': { 'train': self.dict.token2id, 'vocab_size': len(self.dict), 'label': self.l.word2idx, 'label_size': len(self.l), }, 'train': { 'doc': word2idx(self.train_sents, self.dict.token2id), 'label': [self.l.word2idx[l] for l in self.train_labels] }, 'valid': { 'doc': word2idx(self.valid_sents, self.dict.token2id), 'label': [self.l.word2idx[l] for l in self.valid_labels] } } torch.save(data, self._save_data) print('Finish dumping the data to file - [{}]'.format(self._save_data)) print('words length - [{}]'.format(len(self.dict))) print('label size - [{}]'.format(len(self.l))) print('train_src length - [{}]'.format(len(data['train']['doc']))) print('valid_src length - [{}]'.format(len(data['valid']['doc'])))
def topic_model(df_train, df_test, topic_count=10, cached=True):
    lda_train_save_file = '../data/lsa_train.csv'
    lda_test_save_file = '../data/lsa_test.csv'
    if os.path.exists(lda_train_save_file) and cached:
        # The original dropped these frames on the floor; the cached results
        # must be returned for the early exit to have any effect.
        return pd.read_csv(lda_train_save_file), pd.read_csv(lda_test_save_file)

    ### cleanup
    # parallel_proces(test_src, '../data/training_user_tweet_processed.csv')
    ## general remove text
    # df_train['tweet'] = df_train['tweet'].fillna("")
    # df_test['tweet'] = df_test['tweet'].fillna("")
    # df_train['tweet'] = df_train['tweet'].map(general_text_processing)
    # df_test['tweet'] = df_test['tweet'].map(general_text_processing)
    """
    Parallel tweet.
    """
    # df_test['tweet'] = parallelize(df_test, clean_tweet)
    # df_train['tweet'] = parallelize(df_train, clean_tweet)
    # df_train['tweet'] = df_train['tweet'].map(clean_tweet)
    # df_test['tweet'] = df_test['tweet'].map(clean_tweet)
    ## remove stop words
    # df_train['tweet'] = df_train['tweet'].map(remove_stop_words)
    # df_test['tweet'] = df_test['tweet'].map(remove_stop_words)

    ## gensim lda: here each entry of df_train['tweet'] is already a token list
    # dictionary = Dictionary()
    # for t in df_train.tweet.values.tolist():
    #     dictionary.add_documents([t.split()])
    dictionary = Dictionary()
    for t in df_train.tweet.values.tolist():
        dictionary.add_documents([t])

    train_doc2_corupus = [
        dictionary.doc2bow(text) for text in df_train['tweet'].values.tolist()
    ]
    # train_doc2_corupus = [dictionary.doc2bow(text.split()) for
    #                       text in df_train['tweet'].values.tolist()]
    print("Started LDA")
    lda_model = LdaModel(train_doc2_corupus, num_topics=topic_count, iterations=30)
    print("Completed LDA")

    """
    fill topics
    """
    df_test = fill_lda_result_2(df_test, lda_model, dictionary, topic_count)
    df_train = fill_lda_result_2(df_train, lda_model, dictionary, topic_count)

    """
    Save the file
    """
    df_train.to_csv(lda_train_save_file, index=False)
    df_test.to_csv(lda_test_save_file, index=False)

    """
    return
    """
    print('LDA Completed')
    return df_train, df_test