class MySentences(object):
    """Iterate a directory of text files:
    lines -> sentences (['...', '...', ...])
    -> word lists ([['w1', 'w2'], ['w3', 'w4'], ...])
    -> phrased sentences ([['a_b', 'c_d'], ...])"""

    def __init__(self, dirname, common_terms):
        self.dirname = dirname
        self.bigram = Phrases(min_count=2, threshold=5, common_terms=common_terms)

    def __iter__(self):
        for root, dirs, files in os.walk(self.dirname):
            for filename in files:
                file_path = root + '/' + filename
                with open(file_path, 'rb') as f:
                    # read all lines in the file as a list
                    readlines = f.readlines()
                sentence_stream = [parse_sent(doc) for doc in readlines]
                # grow the phrase model with the new sentences ...
                self.bigram.add_vocab(sentence_stream)
                # bigram = Phrases(sentence_stream, min_count=2,
                #                  threshold=5, common_terms=common_terms)
                # ... then emit the phrased version of the same sentences
                sentence_stream = list(self.bigram[sentence_stream])
                for sent in sentence_stream:
                    yield sent
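A minimal usage sketch for the iterator above, assuming parse_sent and a corpus directory exist as in the original project; the directory name and the common_terms set are made up for illustration:

from gensim.models import Word2Vec

# hypothetical example: stream phrased sentences straight into Word2Vec;
# the bigram model keeps growing as the directory is walked
common_terms = frozenset(['of', 'with', 'without', 'and', 'or', 'the', 'a'])
sentences = MySentences('corpus_dir/', common_terms=common_terms)
model = Word2Vec(sentences, min_count=2, workers=4)
model.save('w2v_with_bigrams.model')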
def load_shit(file_paths, save_path):
    for i, path in enumerate(file_paths):
        # first iteration
        if i == 0:
            print('[info] initializing phrase model')
            with open(path) as f:
                reader = ndjson.reader(f)
                reader = extract_text(reader)
                # initialize phrase model
                phrases = Phrases(reader, delimiter=b" ")
        # every other iteration
        else:
            if i % 1000 == 0:
                progress = (i / len(file_paths)) * 100
                print('[info] processed {}% files'.format(round(progress, 1)))
            with open(path) as f:
                reader = ndjson.reader(f)
                reader = extract_text(reader)
                # show the model new data
                phrases.add_vocab(reader)
    # save model after iterations are done
    phrases.save(save_path)
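A sketch of how the loader might be called, assuming newline-delimited JSON shards on disk and the extract_text helper from the original module; the glob pattern and model path are made up:

import glob

# gather the .ndjson shards and build the phrase model incrementally
paths = sorted(glob.glob('data/*.ndjson'))
load_shit(paths, 'phrases.model')

# the saved model can later be reloaded, e.g. phrases = Phrases.load('phrases.model')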
def trainSOPhrase(g_DataQueue, g_FinishRead, savePath, priorPhrasePath):
    """
    :param g_DataQueue: global queue holding records read from the database
    :param g_FinishRead: flag signalling that the database has been read completely
    :param savePath: where to save the trained phrase model
    :param priorPhrasePath: path of the previously trained phrase model
    :return:
    """
    count = 0
    phrase = Phrases(None, min_count=10, threshold=15)
    if priorPhrasePath is None:
        priorPhraser = None
    else:
        priorPhraser = Phraser(Phrases.load(priorPhrasePath))
    while g_FinishRead.value == 0 or (not g_DataQueue.empty()):
        data = g_DataQueue.get()
        count += len(data)
        print("have processed:", count)
        words = []
        reSub0 = re.compile(
            r"(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]"
        )  # URL
        reSub1 = re.compile(
            r"[()\"{},:/-]|[^a-z]'|'[^a-z;?.!]|'$")  # replace with " "
        reSub2 = re.compile(
            r"'[.?;!]")  # replace with "."; mainly handles possessives and the tricky cases around single quotes
        reSplit1 = re.compile(r"\.[^a-z0-9]|[?!;]")  # split into sentences to extract words
        for t in data:
            if t[0] is not None:
                st = re.sub(reSub0, " ", t[0].lower())
                st = re.sub(reSub1, ".", st)
                st = re.sub(reSub2, ".", st)
                for sentence in re.split(reSplit1, st):
                    sen_word = sentence.split()
                    if len(sen_word) > 6:
                        words.append(sen_word)
            if t[1] is not None:
                st = re.sub(reSub0, " ", t[1].lower())
                st = re.sub(reSub1, ".", st)
                st = re.sub(reSub2, ".", st)
                for sentence in re.split(reSplit1, st):
                    sen_word = sentence.split()
                    if len(sen_word) > 6:
                        words.append(sen_word)
        del data
        gc.collect()
        # train phrases
        if priorPhraser is None:  # first round of training
            phrase.add_vocab(words)
        else:  # already trained once: look for longer phrases on top of the old ones
            phrase.add_vocab(priorPhraser[words])
        del words
        # print(len(phrase.vocab))
        gc.collect()
    phrase.save(savePath)
def update(self, new_corpus, count, wrkers, sze, wndow):
    sentences = Corpus_Sentence_Extractor(new_corpus)
    bigram = Phrases.load(self.models + 'bigram_model')
    trigram = Phrases.load(self.models + 'trigram_model')
    bigram.add_vocab(sentences)
    trigram.add_vocab(bigram[sentences])
    self.train(sentences, trigram, self.word2vec, count, wrkers, sze, wndow)
def trainPhrase(g_DataQueue, g_FinishRead, savePath, priorPhrasePath):
    count = 0
    phrase = Phrases(None, min_count=15, threshold=10, max_vocab_size=40000000)
    if priorPhrasePath is None:
        priorPhraser = None
    else:
        priorPhraser = Phraser(Phrases.load(priorPhrasePath))
    while g_FinishRead.value == 0 or (not g_DataQueue.empty()):
        words = g_DataQueue.get()
        if priorPhraser is None:  # first round of training
            phrase.add_vocab(words)
        else:  # already trained once: look for longer phrases on top of the old ones
            phrase.add_vocab(priorPhraser[words])
        del words
        gc.collect()
    phrase.save(savePath)
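A sketch of how a queue consumer like trainPhrase might be wired up with multiprocessing; read_db_batches is a hypothetical producer standing in for whatever reads the database:

from multiprocessing import Process, Queue, Value

g_DataQueue = Queue(maxsize=100)     # batches of tokenized sentences
g_FinishRead = Value('i', 0)         # set to 1 once the database is fully read

consumer = Process(target=trainPhrase,
                   args=(g_DataQueue, g_FinishRead, 'phrase_1.model', None))
consumer.start()

for batch in read_db_batches():      # hypothetical producer
    g_DataQueue.put(batch)
g_FinishRead.value = 1               # signal the consumer to drain the queue and stop

consumer.join()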
def salient_bigrams(phrases: Phrases):
    """Finds the most salient bigrams

    Args:
        phrases (Phrases): Phrases class set up for bigram search
    """
    for slice in read_corpus():
        phrases.add_vocab(read_slice(slice))
        # evaluate all previous corpus slices
        found = set()
        total_bigrams_encountered = 0
        for previous_slice in read_corpus():
            for phrase, score in phrases.export_phrases(read_slice(previous_slice)):
                found.add((phrase, score))
                total_bigrams_encountered += 1
            if previous_slice == slice:
                break
        found = sorted(found, key=lambda element: element[1], reverse=True)
        # no bigrams found?
        if len(found) == 0:
            output(slice, "")
        # log the top ten bigrams
        for phrase, score in found[:10]:
            output(slice, "{phrase}, {score}".format(phrase=phrase, score=score))
        # log the total counts
        output(
            slice, """
            Total bigrams: {total}
            Unique bigrams: {unique}
            Median score: {median}
            Max score: {max}
            Min score: {min}
            """.format(total=total_bigrams_encountered,
                       unique=len(found),
                       median=found[len(found) // 2] if len(found) != 0 else 0,
                       max=found[0] if len(found) != 0 else 0,
                       min=found[-1] if len(found) != 0 else 0))
        # will log a time if command line args were enabled
        Timer.try_to_time()
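A plausible driver for salient_bigrams, assuming read_corpus, read_slice, output and Timer are defined elsewhere in the same module; the min_count/threshold values are illustrative only:

# hypothetical invocation
bigram_phrases = Phrases(min_count=5, threshold=10.0)
salient_bigrams(bigram_phrases)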
else:
    nlp = None

# TODO:
# 1. Create n-gram features for full text similarity search.
#    This will create a problem of two documents being similar in
#    everything but the subject of research. We want documents
#    to be similar in subject of research.
# 2. Need to reliably determine the topic. Seems hard to solve
#    this in an ad-hoc fashion. Need to train LDA on paper abstracts
#    first.
if nlp is not None:
    tokenized = nlp(full_text)
    phrases_model.add_vocab([token for token, pos in s] for s in tokenized)
    phrase_voc.add_documents(nlp.chunks(t) for t in tokenized)
    print("\n")

#%%
pickle.dump(phrases_model, open("gensim_phrase.pkl", "wb"))
pickle.dump(phrase_voc, open("gensim_chunk_dict.pkl", "wb"))

#%%
common_dict = Dictionary()
for loc, filecontent in get_files_content(files):
seg = Segmenter()
# vocab = Dictionary()
phrases = Phrases()

text_path = sys.argv[1]

def get_data(text_path):
    for line in open(text_path, "r"):
        line = line.strip()
        if line:
            data = json.loads(line)
            yield data['abstract']

for ind, text in enumerate(get_data(text_path)):
    segments = seg(text, segment_len=1, segment_overlap=0)
    phrases.add_vocab(segments)
    # vocab.add_documents(segments, prune_at=2000000)
    if ind % 10000 == 0:
        print(f"\rProcessed: {ind}", end="")
        break  # NOTE: appears to be a debugging leftover; remove to process the full corpus

# vocab.filter_extremes(no_below=5, no_above=0.5, keep_n=2000000)
# vocab.save("academic.dict")
phrases.save("academic.phrases")
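Once academic.phrases is saved, the model can be reloaded and frozen into a lighter-weight Phraser before transforming new segments; some_text below is a made-up placeholder for a new abstract:

from gensim.models.phrases import Phrases, Phraser

phrases = Phrases.load("academic.phrases")
phraser = Phraser(phrases)               # frozen model, faster to apply
segments = seg(some_text, segment_len=1, segment_overlap=0)
phrased = [phraser[segment] for segment in segments]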
def salient_trigrams(phrases: Phrases):
    """Finds the most salient trigrams

    Args:
        phrases (Phrases): Phrases class set up for bigram search
    """
    trigram = Phrases()
    for slice in read_corpus():
        # prepare the bigram model on all slices up to and including this one
        for previous_slice in read_corpus():
            phrases.add_vocab(read_slice(previous_slice))
            if previous_slice == slice:
                break
        # transform sentences into possible bigrams
        bigram_phraser = Phraser(phrases)

        def bigrammed(slice: str):
            for sent in read_slice(slice):
                yield bigram_phraser[sent]

        trigram.add_vocab(bigrammed(slice))
        # evaluate all previous corpus slices
        found = set()
        total_trigrams_encountered = 0
        for previous_slice in read_corpus():
            for phrase, score in trigram.export_phrases(bigrammed(previous_slice)):
                if phrase.count(b'_') == 2:
                    found.add((phrase, score))
                    total_trigrams_encountered += 1
                elif b'_' in phrase:
                    print(phrase)
            if previous_slice == slice:
                break
        found = sorted(found, key=lambda element: element[1], reverse=True)
        # no trigrams found?
        if len(found) == 0:
            output(slice, "")
        # log the top ten trigrams
        for phrase, score in found[:10]:
            output(slice, "{phrase}, {score}".format(phrase=phrase, score=score))
        # log the total counts
        output(
            slice, """
            Total trigrams: {total}
            Unique trigrams: {unique}
            Median score: {median}
            Max score: {max}
            Min score: {min}
            """.format(total=total_trigrams_encountered,
                       unique=len(found),
                       median=found[len(found) // 2] if len(found) != 0 else 0,
                       max=found[0] if len(found) != 0 else 0,
                       min=found[-1] if len(found) != 0 else 0))
        # will log a time if command line args were enabled
        Timer.try_to_time()
class GensimTokenizer:
    def __init__(self, dictionary, phraser=None, patch_dict=PATCH_DICT):
        """Wrap a Gensim Dictionary, phrase detector, and special tokens for creating tokenization from OWT

        Args:
            dictionary: The gensim dictionary mapping vocabulary to IDs and back
            phraser: If provided, use gensim's phrase detector to lump common concepts together
            patch_dict: Patch the dictionary with special tokens
        """
        self.dictionary = dictionary
        self.phraser = Phrases([[]]) if phraser is None else phraser
        self.patch_dict = patch_dict

    @classmethod
    def from_file(cls, dict_fname, phraser_fname=None):
        """Load tokenizer information from a dictionary file (generated by gensim
        dictionary.save) and a phraser file."""
        d = Dictionary.load(str(dict_fname))
        if phraser_fname is not None:
            p = Phraser.load(phraser_fname)
        else:
            p = Phraser(Phrases([[]]))
        return cls(d, p)

    def add_document_from_fname(self, fname):
        """For training, add the contents of a text file to the dictionary"""
        print(f"Adding {fname}")
        tokens = self.phraser[file2tokens(fname)]
        self.dictionary.add_documents(tokens)

    def add_to_phraser_from_fname(self, fname):
        """Detect common phrases from fname for bigramming purposes"""
        print(f"Adding {fname} to phraser")
        tokens = file2tokens(fname)
        self.phraser.add_vocab(tokens)

    def get_dictionary(self):
        return self.dictionary

    def token2id(self, word):
        """Convert a token into an id, converting to UNK ID as necessary"""
        d = self.dictionary
        return d.token2id.get(word, d.token2id["<UNK>"])

    def tokens2ids(self, tokens):
        """Convert a list of tokens into ids, converting to UNK as necessary"""
        return [self.token2id(tok) for tok in tokens]

    def tokenize(self, s: str):
        """Convert a sentence into its tokens"""
        return self.phraser[process_line(s)[0]]

    def tokenize_batch(self, lines: List[str]):
        """Convert a batch of lines into their tokens"""
        return self.phraser[[process_line(line)[0] for line in lines]]

    def encode(self, s):
        """Encode a single sentence into IDs"""
        sent_tokens = self.tokenize(s)
        return self.tokens2ids(sent_tokens)

    def decode(self, ids):
        """Alias for `ids2tokens`"""
        return self.ids2tokens(ids)

    def id2token(self, id):
        """Convert an id to a token"""
        d = self.dictionary
        if id == -1:
            return "<STOPWRD>"  # Account for post processing
        return d[id]  # Add error handling if bad id

    def ids2tokens(self, ids):
        """Convert iterable of ids to tokens"""
        return [self.id2token(id) for id in ids]

    def set_outdir(self, outdir):
        """Useful when training in parallel. If set, will save contents to outdir"""
        self.outdir = Path(outdir)

    def patch(self, vocab_size, new_vocab, no_below=15, no_above=0.8):
        """Patch the tokenizer with a manually specified list of tokens, after training"""
        print("Patching with special tokens...")
        self.dictionary.patch_with_special_tokens(self.patch_dict)
        print("Filtering vocabulary...")
        self.dictionary.filter_extremes(no_below=no_below, no_above=no_above,
                                        keep_n=vocab_size)
        print(f"Adding {len(new_vocab)} new words to dictionary...")
        new_vocab = self.tokenize_batch(new_vocab)
        self.dictionary.add_documents(new_vocab)
        print(f"Done patching. New vocab size = {self.n_vocab()}")
        return new_vocab

    def save(self, outfile):
        self.dictionary.save(outfile)

    def n_vocab(self):
        return len(self.vocab)

    @cached_property
    def vocab(self):
        return list(self.dictionary.keys())

    @cached_property
    def token_vocab(self):
        return list(self.dictionary.values())

    def __len__(self):
        return self.n_vocab()

    def encode_sentences_from_fname(self, fname):
        """Tokenize all the sentences from a text file"""
        outlist = []
        ind_offsets = []
        new_start = 0
        with open(fname, 'r') as fp:
            for line in fp.readlines():
                if is_good_line(line):
                    sents = self.phraser[process_line(line)]
                    for sent in sents:
                        ids = self.tokens2ids(sent)
                        outlist += ids
                        new_start = new_start + len(ids)
                        ind_offsets.append(new_start)
        return np.asarray(outlist, dtype=np.int32), np.asarray(ind_offsets, dtype=np.uint64)

    def encode_and_save_for_mp(self, fname):
        """Save sentences from fname. Needed because a local function can't be used with the MP module"""
        if self.outdir is None:
            raise ValueError("Please `set_outdir` first")
        fname = Path(fname)
        idarr_outfile = self.outdir / (fname.stem + '.npy')
        ind_offsets_outfile = self.outdir / (fname.stem + '_offsets.npy')
        idarr, ind_offsets = self.encode_sentences_from_fname(fname)
        np.save(idarr_outfile, idarr)
        np.save(ind_offsets_outfile, ind_offsets)
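A brief usage sketch for the tokenizer, assuming previously saved dictionary and phraser files plus the module's own helpers (file2tokens, process_line, is_good_line); the file names are made up:

# load saved artifacts, then encode/decode a sentence
tok = GensimTokenizer.from_file("owt.dict", "owt.phraser")
ids = tok.encode("transformer language models need lots of text")
print(tok.decode(ids))

# or grow the phraser and dictionary incrementally from raw text shards
# tok = GensimTokenizer(Dictionary())
# tok.add_to_phraser_from_fname("shard_000.txt")
# tok.add_document_from_fname("shard_000.txt")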