def train(self, model, model_data):
    model.train(utils.shuffle([x for x in tqdm(model_data)]),
                total_examples=len(model_data), epochs=self.epoch)
    logging.info("Training complete. Saving model")
    # Save under the directory matching the embedding type. The original
    # appended a no-op .format(self.type) to a placeholder-free string.
    if self.type == "word2vec":
        model_path = get_path('models/word2vec') + '/nassai_word2vec.vec'
    else:
        model_path = get_path('models/doc2vec') + '/nassai_doc2vec.vec'
    model.save(model_path)
    return True
def __init__(self, clf, data, **kwargs):
    self.clf = clf
    self.epoch_count = kwargs.get('epoch', 10)
    self.batch = kwargs.get('batch', 10)
    self.dbow = kwargs.get('dbow', 1)
    self.use_glove = kwargs.get('use_glove', True)
    self.data = pandas.read_csv(data)
    # Pick the pretrained doc2vec embedding matching the training scheme.
    self.nass_embedding_path = (get_path('models/doc2vec/nassai_dbow_doc2vec.vec')
                                if self.dbow
                                else get_path('models/doc2vec/nassai_dm_doc2vec.vec'))
    self.max_sequence_length = 300
    self.max_num_words = 50000
    self.embedding_dim = 300
    self.validation_split = 0.2
    self.num_words = None
    self.embedding_matrix = None
    self.result = {"type": "word2vec"}
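# Hypothetical usage of this constructor (the class name is assumed; BernNB
# and the cleaned CSV come from elsewhere in the repo):
#
#     runner = Classifier(BernNB(glove=True, use_tfidf=True),
#                         get_path('data') + '/clean_data.csv',
#                         epoch=10, batch=32, dbow=1)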
def run(model_list, mode, **kwargs):
    results_path = get_path('data') + '/results.csv'
    clean_data_path = get_path('data') + '/clean_data.csv'
    print("TRAINING : {}".format(mode))
    for model in model_list:
        # Each entry is a (name, classifier) tuple; pass the classifier,
        # not the tuple, to train().
        print("Current Model : {}".format(model[0]))
        score, duration = train(clf=model[1], data=clean_data_path,
                                name="{}_{}".format(model[0], mode), **kwargs)
        records = {
            'date': datetime.now(),
            'f1': score,
            'mode': mode,
            'duration': duration,
            'model_name': model[0],
        }
        print("{0} took {1} seconds.".format(model[0], duration))
        # Append one result row per model rather than only the last one.
        with open(results_path, 'a') as f:
            w = csv.DictWriter(f, records.keys())
            w.writerow(records)
def preprocess_data(data_path):
    print("Starting processing ...")
    data = pandas.read_csv(data_path)
    # Drop rows with any missing values before cleaning.
    data = data.dropna()
    data['clean_text'] = data['clean_text'].apply(clean_text)
    saved_data_path = get_path('data') + '/clean_data.csv'
    data[['clean_text', 'bill_class']].to_csv(saved_data_path, index=False)
    print("Preprocessing Complete")
    return data
def get_glove(vocab):
    print("Loading Glove")
    # word -> 300-d vector, restricted to the corpus vocabulary.
    embeddings_matrix = {}
    glove_path = get_path('models/glove/glove.6B.300d.txt')
    with open(glove_path) as f:
        for line in f:
            word, coefs = line.split(maxsplit=1)
            if word in vocab:
                embeddings_matrix[word] = numpy.fromstring(coefs, 'f', sep=' ')
    return embeddings_matrix
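# A minimal sketch of turning the word -> vector dict into the Keras-style
# matrix referenced by self.embedding_matrix (hypothetical glue code; assumes
# a fitted Tokenizer `tok` as built in prepare_data below, dim=300):
#
#     embeddings = get_glove(set(tok.word_index))
#     num_words = min(50000, len(tok.word_index) + 1)
#     matrix = numpy.zeros((num_words, 300))
#     for word, i in tok.word_index.items():
#         if i < num_words and word in embeddings:
#             matrix[i] = embeddings[word]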
def make_glove(deaccent):
    words = WordCorpus(DATAPATH)
    texts = list(words.get_texts(deaccent=deaccent))
    flattened = flatten_data(texts)
    # One sentence per line, as the GloVe training scripts expect.
    with open(get_path("/data/glove_data.txt"), "w+") as fi:
        for entry in flattened:
            if not isinstance(entry, list):
                fi.write("%s\n" % entry)
            else:
                fi.write("\n".join(entry) + "\n")
    return True
def fasttext(self, save=True):
    model = FastText(min_count=1, **self.params)
    logging.info("Building Model")
    model.build_vocab(self.data)
    model.train(utils.shuffle(self.data), total_examples=self.total_examples,
                epochs=self.epoch)
    logging.info("Training complete. Saving model")
    if not save:
        return model
    # Name by the architecture actually used (the original condition was inverted).
    model_name = f"fasttext_{'cbow' if self.cbow else 'skipgram'}_{self.dim}.vec"
    model_path = get_path(f'/models/fasttext/{model_name}')
    return model.save(model_path)
def word2vec(self, save=True):
    model = Word2Vec(min_count=1, **self.params)
    logging.info("Building Model")
    model.build_vocab(sentences=self.data)
    model.train(sentences=self.data, total_examples=self.total_examples,
                epochs=self.epoch)
    logging.info("Training complete. Saving model")
    if not save:
        return model
    model_name = f"word2vec_{'cbow' if self.cbow else 'skipgram'}_{self.dim}.vec"
    model_path = get_path(f'/models/word2vec/{model_name}')
    return model.save(model_path)
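# Reloading the saved vectors later (a sketch; assumes gensim's Word2Vec.load
# and a 300-dim skipgram run, so the path mirrors the save above):
#
#     w2v = Word2Vec.load(get_path('/models/word2vec/word2vec_skipgram_300.vec'))
#     w2v.wv.most_similar("senate")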
def sentence2vec(self, parentmodel=None, save=True):
    # Build the parent word-level model lazily, so only the requested one is
    # trained (the original dict trained every model eagerly).
    model_map = {
        "word2vec": lambda: self.word2vec(save=False),
        "glove": lambda: self.glove(save=False),
    }
    model = model_map.get(parentmodel, lambda: self.fasttext(save=False))()
    sentence_model = SIF(model)
    sentence_model.train(self.data)
    logging.info("Training complete. Saving model")
    if not save:
        return sentence_model
    model_name = f"sentence2vec_{'cbow' if self.cbow else 'skipgram'}_{self.dim}.vec"
    model_path = get_path(f'/models/sentence2vec/{model_name}')
    return sentence_model.save(model_path)
def doc2vec(self):
    # dm=0 trains distributed bag-of-words (DBOW), dm=1 distributed memory (DM);
    # the original passed the dbow flag straight through, inverting the modes.
    model = Doc2Vec(dm=0 if self.dbow else 1, vector_size=self.dim, min_count=3,
                    window=10, hs=self.hs, epochs=self.epoch,
                    workers=max(1, cpu_count() - 1))
    logging.info("Building Model")
    model.build_vocab(self.data)
    model.train(utils.shuffle(self.data), total_examples=self.total_examples,
                epochs=self.epoch)
    logging.info("Training complete. Saving model")
    model_name = f"doc2vec_{'dbow' if self.dbow else 'dm'}_{self.dim}.vec"
    model_path = get_path(f'/models/doc2vec/{model_name}')
    return model.save(model_path)
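# Inferring a vector for an unseen document with the saved model (a sketch;
# assumes gensim's Doc2Vec.load/infer_vector and a dbow run at dim=300):
#
#     d2v = Doc2Vec.load(get_path('/models/doc2vec/doc2vec_dbow_300.vec'))
#     vec = d2v.infer_vector("a bill for an act to amend".split())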
def phrase2vec(self):
    # Detect frequent bigrams and re-tokenise the corpus with them before
    # training word vectors over the merged tokens.
    phrases = Phrases(self.data, min_count=5, threshold=7, progress_per=1000)
    phrase_model = Phraser(phrases)
    training_data = [phrase_model[sentence] for sentence in self.data]
    model = Word2Vec(min_count=1, **self.params)
    logging.info("Building Model")
    model.build_vocab(sentences=training_data)
    model.train(sentences=training_data, total_examples=len(training_data),
                epochs=self.epoch)
    logging.info("Training complete. Saving model")
    model_name = f"phrase2vec_{'cbow' if self.cbow else 'skipgram'}_{self.dim}.vec"
    model_path = get_path(f'/models/word2vec/{model_name}')
    return model.save(model_path)
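# What the Phraser rewrite does to a tokenised sentence (illustrative tokens;
# the actual merges depend on corpus counts clearing min_count/threshold):
#
#     phrase_model[["house", "of", "representatives"]]
#     # -> may yield ["house_of", "representatives"]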
def prepare_data(prep=False, do_decode=False):
    data = pandas.read_csv(get_path('data') + "/clean_data.csv")
    if prep:
        text = data.apply(lambda r: simple_preprocess(r['clean_text'], min_len=3),
                          axis=1)
    else:
        text = data.clean_text.values
    if not do_decode:
        label_encoder = LabelEncoder()
        labels = label_encoder.fit_transform(data.bill_class)
    else:
        labels = data.bill_class
    tok = Tokenizer(num_words=30000)
    tok.fit_on_texts(text)
    # Vocabulary sorted by descending frequency, keeping words of length 2-14.
    word_counts = tok.word_counts
    vocab = [''] + [w for (w, _) in sorted(word_counts.items(),
                                           key=lambda item: -item[1])]
    vocab = [i for i in vocab if 15 > len(i) > 1]
    return text, labels, data['bill_class'].unique(), vocab, tok
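# Example usage (a sketch; pad_sequences is keras.preprocessing.sequence's
# helper, and 300 matches the max_sequence_length used by the deep models):
#
#     text, labels, classes, vocab, tok = prepare_data()
#     X = pad_sequences(tok.texts_to_sequences(text), maxlen=300)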
def nassai_cli(action, batch, epoch, mode, text, data, cbow=True, dbow=False, glove=True):
    base_data_path = data
    clean_data_path = get_path('data') + "/clean_data.csv"
    if action == "preprocess":
        from code import preprocessing
        return preprocessing.preprocess_data(base_data_path)
    elif action == "build_embedding":
        if dbow:
            builder = Embedding(embedding_type="doc2vec", data=clean_data_path,
                                dbow=dbow, epoch=epoch)
            return builder.build()
        builder = Embedding(embedding_type="word2vec", data=clean_data_path,
                            cbow=cbow, epoch=epoch)
        return builder.build()
    elif action == "train":
        # Paths mirror where train() saves each embedding type; the original
        # looked for the word2vec vectors under the doc2vec directory.
        word2vec_embedding = get_path('models') + '/word2vec/nassai_word2vec.vec'
        doc2vec_embedding = get_path('models') + '/doc2vec/nassai_doc2vec.vec'
        if mode == "doc2vec":
            model_list = [
                ("doc2vec_bnb_mean_embedding",
                 BernNB(glove=glove, embedding_path=doc2vec_embedding, tfidf="mean_embedding")),
                ("doc2vec_svm_mean_embedding",
                 SVM(glove=glove, embedding_path=doc2vec_embedding, tfidf="mean_embedding")),
                ("doc2vec_linear_svm_mean_embedding",
                 LinearSVM(glove=False, embedding_path=doc2vec_embedding, tfidf="mean_embedding")),
                ("doc2vec_bnb_tfidf_embedding", BernNB(glove=glove, use_tfidf=True)),
                ("doc2vec_svm_tfidf_embedding", SVM(glove=glove, use_tfidf=True)),
                ("doc2vec_linear_svm_tfidf_embedding", LinearSVM(glove=glove, use_tfidf=True)),
                ("lstm_doc2vec_glove",
                 LSTMClassifier(train_embeddings=False, batch=True, glove=glove,
                                units=256, embedding_path=doc2vec_embedding, layers=4)),
                ("fchollet_cnn_doc2vec_glove",
                 FCholletCNN(train_embeddings=False, batch=True, glove=glove,
                             units=256, embedding_path=doc2vec_embedding)),
                ("bilstm_doc2vec_glove",
                 BLSTM2DCNN(train_embeddings=False, batch=True, glove=glove,
                            units=256, embedding_path=doc2vec_embedding)),
                ("ykimcnn_doc2vec_glove",
                 YKimCNN(train_embeddings=False, batch=True, glove=glove,
                         units=256, embedding_path=doc2vec_embedding)),
            ]
        elif mode == "word2vec":
            model_list = [
                ("bnb_mean_embedding", BernNB(glove=glove, tfidf="mean_embedding")),
                ("svm_mean_embedding", SVM(glove=glove, tfidf="mean_embedding")),
                ("linear_svm_mean_embedding", LinearSVM(glove=glove, tfidf="mean_embedding")),
                ("bnb_tfidf_embedding", BernNB(glove=glove, use_tfidf=True)),
                ("svm_tfidf_embedding", SVM(glove=glove, use_tfidf=True)),
                ("linear_svm_tfidf_embedding", LinearSVM(glove=glove, use_tfidf=True)),
                # ("mlp_mean_embedding", MLP(glove=glove, tfidf="mean_embedding")),
                # ("mlp_tfidf_embedding", MLP(glove=glove, tfidf="tfidf_embedding_vectorizer")),
            ]
        else:
            # Combined run: word2vec-based models followed by doc2vec-based ones.
            # The word2vec-named deep models originally pointed at the doc2vec
            # embedding; they now use the matching word2vec vectors.
            model_list = [
                ("word2vec_bnb_mean_embedding",
                 BernNB(glove=glove, embedding_path=word2vec_embedding, tfidf="mean_embedding")),
                ("word2vec_svm_mean_embedding",
                 SVM(glove=glove, embedding_path=word2vec_embedding, tfidf="mean_embedding")),
                ("word2vec_linear_svm_mean_embedding",
                 LinearSVM(glove=False, embedding_path=word2vec_embedding, tfidf="mean_embedding")),
                ("lstm_word2vec_glove",
                 LSTMClassifier(train_embeddings=False, batch=True, glove=glove,
                                units=256, embedding_path=word2vec_embedding, layers=4)),
                ("fchollet_cnn_word2vec_glove",
                 FCholletCNN(train_embeddings=False, batch=True, glove=glove,
                             units=256, embedding_path=word2vec_embedding)),
                ("bilstm_word2vec_glove",
                 BLSTM2DCNN(train_embeddings=False, batch=True, glove=glove,
                            units=256, embedding_path=word2vec_embedding)),
                ("ykimcnn_word2vec_glove",
                 YKimCNN(train_embeddings=False, batch=True, glove=glove,
                         units=256, embedding_path=word2vec_embedding)),
                ("doc2vec_bnb_mean_embedding",
                 BernNB(glove=glove, embedding_path=doc2vec_embedding, tfidf="mean_embedding")),
                ("doc2vec_svm_mean_embedding",
                 SVM(glove=glove, embedding_path=doc2vec_embedding, tfidf="mean_embedding")),
                ("doc2vec_linear_svm_mean_embedding",
                 LinearSVM(glove=glove, embedding_path=doc2vec_embedding, tfidf="mean_embedding")),
                ("lstm_doc2vec_glove",
                 LSTMClassifier(train_embeddings=False, batch=True, glove=glove,
                                units=256, embedding_path=doc2vec_embedding, layers=4)),
                ("fchollet_cnn_doc2vec_glove",
                 FCholletCNN(train_embeddings=False, batch=True, glove=glove,
                             units=256, embedding_path=doc2vec_embedding)),
                ("bilstm_doc2vec_glove",
                 BLSTM2DCNN(train_embeddings=False, batch=True, glove=glove,
                            units=256, embedding_path=doc2vec_embedding)),
                ("ykimcnn_doc2vec_glove",
                 YKimCNN(train_embeddings=False, batch=True, glove=glove,
                         units=256, embedding_path=doc2vec_embedding)),
                ("doc2vec_bnb_tfidf_embedding", BernNB(glove=glove, use_tfidf=True)),
                ("doc2vec_svm_tfidf_embedding", SVM(glove=glove, use_tfidf=True)),
                ("doc2vec_linear_svm_tfidf_embedding", LinearSVM(glove=True, use_tfidf=True)),
            ]
        return run(model_list, mode=mode, batch=batch, layers=4, dropout_rate=0.25)
    else:
        model = load_model(mode, '')
        pred = model.predict([text])
        click.echo("TEXT : {}".format(text))
        print()
        click.echo("PREDICTION: {}".format(pred))
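# Example invocations (hypothetical entry-point name; assumes this function is
# wired up as a click command with options matching its parameters):
#
#     python main.py --action preprocess --data data/nass_bills.csv
#     python main.py --action build_embedding --epoch 10
#     python main.py --action train --mode word2vec --batch 32 --epoch 10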