def train(self, lang):
    """Build and persist a Word Mover's Distance similarity index.

    Reads the raw training sentences, dumps them to a JSON file for later
    inspection, cleans and tokenizes them, loads the word2vec model for
    *lang*, trains a WmdSimilarity index over the tokenized sentences, and
    saves the index to disk. Timing for the WMD step and the whole run is
    printed to stdout.

    Args:
        lang: Language code passed through to the cleaning pipeline and the
            word-vector loader.
    """
    start_ts = time.time()
    raw_sentences = self.read_data()
    # Persist the raw sentences so the exact training input can be audited.
    with open("training_jsons/training_sentences_nb_happybytes_faq.json", "w+") as fs:
        fs.write(json.dumps({"sentences": raw_sentences}, indent=4))
    cleaned = [self.cleaning_pipeline(sentence, lang) for sentence in raw_sentences]
    tokenized = [word_tokenize(sentence) for sentence in cleaned]
    self.load_word2vec(lang)
    wmd_start = time.time()
    similarity_index = WmdSimilarity(tokenized, self.model)
    similarity_index.save("models/fasttext_wmd_nb_happybytes_faq.model")
    # The word vectors are large; release them once the index is saved.
    del self.model
    print("\n wmd training time --- ", time.time() - wmd_start)
    print("\n total execution time --- ", time.time() - start_ts)
    # Tail of process_claims (its `def` begins before this chunk — not visible
    # here): report the document count and return the cleaned corpus alongside
    # the untouched originals.
    print("Total processed documents: %r " % counter)
    return claim_corpus, documents


# --- Script body: build and persist a WMD similarity index over claim text ---

# print('Step 4: Process claim text (lowercase, stop words, stemming, etc)')
start = time()
print('Processing PATI Data...')
# claim_txt_corpus: preprocessed claim texts; original_corpus: raw originals.
claim_txt_corpus, original_corpus = process_claims(df_pati_clm_txt)
print('Took %.2f seconds to load PATI Data.' % (time() - start))

# print('Step 7: Load google vector model to get W2V similarity between words')
print('Loading Google model...')
start = time()
# Fail fast with a download hint if the pretrained vectors are missing.
if not os.path.exists('../data/GoogleNews-vectors-negative300.bin.gz'):
    raise ValueError("You need to download the google news model: https://code.google.com/archive/p/word2vec/")
model = KeyedVectors.load_word2vec_format('../data/GoogleNews-vectors-negative300.bin.gz', binary=True)
print('Took %.2f seconds to load the Google model.' % (time() - start))

# print('Step 8: Now that you have corpus of claims and W2V Google model, build WMD Similarity model')
# num_best caps how many nearest neighbours each similarity query returns.
num_best = 10
start = time()
instance = WmdSimilarity(claim_txt_corpus, model, num_best=num_best)
print('Took %.2f seconds to build WMD Similarity Instance.' % (time() - start))

# print('Step 9: Save the WMD Similarity model and claim list')
instance.save('wmd_instance.model')
# Pickle the original (unprocessed) claims so query hits can be mapped back
# to human-readable text at lookup time.
with open('original_corpus.pkl', 'wb') as f:
    pickle.dump(original_corpus, f)
def trainModelM2(self, sampleUtterances_tokens, outpath):
    """Build the M2 WMD similarity index from tokenized sample utterances
    using the loaded word2vec model, and save it as ``m2.index`` in *outpath*.
    """
    m2_index_path = os.path.join(outpath, "m2.index")
    wmd_index = WmdSimilarity(sampleUtterances_tokens, self.w2vModel)
    wmd_index.save(m2_index_path)
def trainModelM1(self, detector_tokens, outpath):
    """Build the M1 WMD similarity index from tokenized detector texts
    using the loaded word2vec model, and save it as ``m1.index`` in *outpath*.
    """
    m1_index_path = os.path.join(outpath, "m1.index")
    wmd_index = WmdSimilarity(detector_tokens, self.w2vModel)
    wmd_index.save(m1_index_path)