def train(self, lang):
        """Build and persist a WMD similarity index for the FAQ corpus.

        Reads the raw sentences, dumps them to a training JSON for later
        inspection, cleans and tokenizes them, loads the word vectors for
        *lang*, then builds and saves a WmdSimilarity index.

        Args:
            lang: Language code forwarded to the cleaning pipeline and to
                the word-vector loader.
        """
        st = time.time()
        sentences = self.read_data()

        # Persist the raw sentences for debugging/auditing.
        # "w" suffices: the handle is only written, never read back
        # (the original "w+" read mode was unused).
        with open("training_jsons/training_sentences_nb_happybytes_faq.json",
                  "w") as fs:
            json.dump({"sentences": sentences}, fs, indent=4)

        sentences = [self.cleaning_pipeline(sent, lang) for sent in sentences]

        sentences = [word_tokenize(sent) for sent in sentences]

        self.load_word2vec(lang)

        train_time = time.time()
        instance_wmd = WmdSimilarity(sentences, self.model)
        instance_wmd.save("models/fasttext_wmd_nb_happybytes_faq.model")
        # Release the (large) embedding model once the index is saved.
        del self.model
        print("\n wmd training time --- ", time.time() - train_time)
        print("\n total execution time --- ", time.time() - st)
# Example #2
# 0
    print("Total processed documents: %r " % counter)

    return claim_corpus, documents


# Step 4: Process the claim text (lowercase, stop words, stemming, etc).
start = time()
print('Processing PATI Data...')
claim_txt_corpus, original_corpus = process_claims(df_pati_clm_txt)
print('Took %.2f seconds to load PATI Data.' % (time() - start))

# Step 7: Load the pretrained Google News word2vec vectors. WMD needs an
# embedding model to price word-to-word "travel" between documents.
print('Loading Google model...')
start = time()
if not os.path.exists('../data/GoogleNews-vectors-negative300.bin.gz'):
    raise ValueError("You need to download the google news model: https://code.google.com/archive/p/word2vec/")
model = KeyedVectors.load_word2vec_format('../data/GoogleNews-vectors-negative300.bin.gz', binary=True)
print('Took %.2f seconds to load the Google model.' % (time() - start))

# Step 8: Build the WMD similarity index over the processed claim corpus;
# num_best caps how many nearest documents a query returns.
num_best = 10
start = time()
instance = WmdSimilarity(claim_txt_corpus, model, num_best=num_best)
print('Took %.2f seconds to build WMD Similarity Instance.' % (time() - start))

# Step 9: Save the WMD index, plus the unprocessed documents so query hits
# can be mapped back to readable text.
instance.save('wmd_instance.model')

with open('original_corpus.pkl', 'wb') as f:
    pickle.dump(original_corpus, f)
 def trainModelM2(self, sampleUtterances_tokens, outpath):
     """Build a WMD similarity index from the tokenized sample
     utterances and write it to ``m2.index`` under *outpath*."""
     target_path = os.path.join(outpath, "m2.index")
     similarity_index = WmdSimilarity(sampleUtterances_tokens, self.w2vModel)
     similarity_index.save(target_path)
 def trainModelM1(self, detector_tokens, outpath):
     """Build a WMD similarity index from the tokenized detector
     texts and write it to ``m1.index`` under *outpath*."""
     target_path = os.path.join(outpath, "m1.index")
     similarity_index = WmdSimilarity(detector_tokens, self.w2vModel)
     similarity_index.save(target_path)