Example #1
0
 def train(self, model, model_data):
     """Train ``model`` on ``model_data`` and save it to disk.

     The examples are materialised into a list (wrapped in ``tqdm``),
     passed through ``utils.shuffle`` and handed to ``model.train`` for
     ``self.epoch`` epochs.

     The save location depends on ``self.type``: ``"word2vec"`` is stored at
     ``get_path('models/word2vec') + '/nassai_word2vec.vec'``; any other value
     at ``get_path('models/doc2vec') + '/nassai_doc2vec.vec'``.

     Args:
         model: object exposing ``train(...)`` and ``save(path)``.
         model_data: sized iterable of training examples.

     Returns:
         ``True`` once the model has been saved.
     """
     examples = list(tqdm(model_data))
     model.train(utils.shuffle(examples),
                 total_examples=len(model_data),
                 epochs=self.epoch)
     logging.info("Training complete. Saving model")
     # The file names are fixed: the previous ``.format(self.type)`` calls had
     # no placeholder to fill, so they were no-ops and have been removed.
     if self.type == "word2vec":
         model_path = get_path('models/word2vec') + '/nassai_word2vec.vec'
     else:
         model_path = get_path('models/doc2vec') + '/nassai_doc2vec.vec'
     model.save(model_path)
     return True
Example #2
0
 def __init__(self, clf, data, **kwargs):
     """Store the classifier, load the CSV at ``data`` and set hyper-parameters.

     Args:
         clf: classifier object, stored as ``self.clf``.
         data: path (or anything ``pandas.read_csv`` accepts) of the dataset.
         **kwargs: optional settings, echoed to stdout on construction:
             ``epoch`` (default 10, stored as ``epoch_count``),
             ``batch`` (default 10), ``dbow`` (default 1) and
             ``use_glove`` (default True).
     """
     print(kwargs)
     self.clf = clf

     # Caller-tunable settings with their defaults.
     self.epoch_count = kwargs.get('epoch', 10)
     self.batch = kwargs.get('batch', 10)
     self.dbow = kwargs.get('dbow', 1)
     self.use_glove = kwargs.get('use_glove', True)

     self.data = pandas.read_csv(data)

     # A truthy ``dbow`` selects the dbow embedding file, otherwise the dm one.
     if self.dbow:
         embedding_file = 'models/doc2vec/nassai_dbow_doc2vec.vec'
     else:
         embedding_file = 'models/doc2vec/nassai_dm_doc2vec.vec'
     self.nass_embedding_path = get_path(embedding_file)

     # Fixed configuration values.
     self.max_sequence_length = 300
     self.max_num_words = 50000
     self.embedding_dim = 300
     self.validation_split = 0.2

     # Not set here; initialised to None.
     self.num_words = None
     self.embedding_matrix = None

     self.result = {"type": "word2vec"}
Example #3
0
def run(model_list, mode, **kwargs):
    """Train every entry of ``model_list`` and append one CSV row per model.

    Args:
        model_list: iterable of entries whose first element (``model[0]``) is
            the model name; the whole entry is passed to ``train`` as ``clf``.
        mode: label stored with each result and used to build the run name
            (``"<model name>_<mode>"``).
        **kwargs: forwarded unchanged to ``train``.

    A row (date, f1, mode, duration, model_name) is appended to
    ``get_path('data') + '/results.csv'`` right after each model finishes, so
    results already written are kept if a later model fails. No header row is
    written.
    """
    results_path = get_path('data') + '/results.csv'
    clean_data_path = get_path('data') + '/clean_data.csv'
    print("TRAINING : {}".format(mode))
    for model in model_list:
        print("Current Model : {}".format(model))
        model_name = model[0]
        score, duration = train(clf=model, data=clean_data_path, name="{}_{}".format(model_name, mode), **kwargs)
        # Build a fresh row per model instead of mutating one shared dict.
        record = {
            'date': datetime.now(),
            'f1': score,
            'mode': mode,
            'duration': duration,
            'model_name': model_name,
        }
        print("{0} took {1} seconds.".format(model, duration))
        # newline='' lets the csv module control line endings; without it,
        # platforms that translate "\n" end up with blank lines between rows.
        with open(results_path, 'a', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=list(record))
            writer.writerow(record)
Example #4
0
def preprocess_data(data_path):
    """Load the raw CSV, drop incomplete rows, clean the text and save it.

    Rows containing a missing value in any column are removed, ``clean_text``
    is applied to the ``clean_text`` column, and the ``clean_text`` and
    ``bill_class`` columns are written (without the index) to
    ``get_path('data') + '/clean_data.csv'``.

    Args:
        data_path: location of the raw CSV file.

    Returns:
        The processed DataFrame, including any columns that were not saved.
    """
    print("Starting processing ...")
    frame = pandas.read_csv(data_path)

    # Index labels of every row holding at least one null value.
    incomplete_rows = frame[frame.isnull().any(axis=1)].index
    frame = frame.drop(list(incomplete_rows))

    frame['clean_text'] = frame['clean_text'].apply(clean_text)

    output_path = get_path('data') + '/clean_data.csv'
    frame[['clean_text', 'bill_class']].to_csv(output_path, index=False)
    print("Preprocessing Complete")
    return frame
Example #5
0
def get_glove(vocab):
    """Load the GloVe vectors of the words that appear in ``vocab``.

    Reads ``get_path('models/glove/glove.6B.300d.txt')`` line by line; each
    line is expected to be a word followed by its space-separated
    coefficients.

    Args:
        vocab: collection of words to keep. Anything that is not already a
            set, frozenset or dict is copied into a set once, so the
            per-line membership test does not scan a list.

    Returns:
        dict mapping each kept word to a numpy array (dtype ``'f'``) of its
        coefficients.
    """
    print("Loading Glove")
    if isinstance(vocab, (set, frozenset, dict)):
        wanted = vocab
    else:
        wanted = set(vocab)

    embeddings_matrix = {}
    glove_path = get_path('models/glove/glove.6B.300d.txt')
    # Be explicit about the encoding rather than relying on the platform
    # default, which is not UTF-8 everywhere.
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            parts = line.split(maxsplit=1)
            if len(parts) != 2:
                # Blank line or a word without coefficients: nothing to load.
                continue
            word, coefs = parts
            if word in wanted:
                embeddings_matrix[word] = numpy.fromstring(coefs, 'f', sep=' ')
    return embeddings_matrix
def make_glove(deaccent):
    """Dump the flattened corpus to the GloVe input file, one item per line.

    Builds a ``WordCorpus`` from ``DATAPATH``, collects its texts, flattens
    them with ``flatten_data`` and writes the result to
    ``get_path("/data/glove_data.txt")``.

    Args:
        deaccent: forwarded to ``WordCorpus.get_texts``.

    Returns:
        ``True`` once the file has been written.
    """
    words = WordCorpus(DATAPATH)
    texts = list(words.get_texts(deaccent=deaccent))
    flattened = flatten_data(texts)
    with open(get_path("/data/glove_data.txt"), "w", encoding="utf-8") as fi:
        for entry in flattened:
            if isinstance(entry, list):
                # Terminate every item with a newline. Joining with "\n" left
                # the last item unterminated, so it ran into the next entry.
                fi.writelines("%s\n" % item for item in entry)
            else:
                fi.write("%s\n" % entry)
    return True
Example #7
0
 def fasttext(self, save=True):
     """Build and train a FastText model on ``self.data``.

     Args:
         save: when true, the trained model is saved under
             ``/models/fastext/`` (resolved through ``get_path``) and the
             result of ``model.save`` is returned; when false, the trained
             model itself is returned.
     """
     ft_model = FastText(min_count=1, **self.params)
     logging.info("Building Model")
     ft_model.build_vocab(self.data)
     ft_model.train(
         utils.shuffle(self.data),
         total_examples=self.total_examples,
         epochs=self.epoch,
     )
     logging.info("Training complete. Saving model")
     if not save:
         return ft_model
     # NOTE(review): a truthy ``self.cbow`` produces the "skipgram" label;
     # confirm this matches how ``self.params`` configures the model.
     variant = 'skipgram' if self.cbow else 'cbow'
     model_name = f"fasttext_{variant}_{self.dim}.vec"
     return ft_model.save(get_path(f'/models/fastext/{model_name}'))
Example #8
0
 def word2vec(self, save=True):
     """Build and train a Word2Vec model on ``self.data``.

     Args:
         save: when true, the trained model is saved under
             ``/models/word2vec/`` (resolved through ``get_path``) and the
             result of ``model.save`` is returned; when false, the trained
             model itself is returned.
     """
     w2v_model = Word2Vec(min_count=1, **self.params)
     logging.info("Building Model")
     w2v_model.build_vocab(sentences=self.data)
     w2v_model.train(
         sentences=self.data,
         total_examples=self.total_examples,
         epochs=self.epoch,
     )
     logging.info("Training complete. Saving model")
     if not save:
         return w2v_model
     # NOTE(review): a truthy ``self.cbow`` produces the "skipgram" label;
     # confirm this matches how ``self.params`` configures the model.
     variant = 'skipgram' if self.cbow else 'cbow'
     model_name = f"word2vec_{variant}_{self.dim}.vec"
     return w2v_model.save(get_path(f'/models/word2vec/{model_name}'))
Example #9
0
 def sentence2vec(self, parentmodel=None, save=True):
     """Train a SIF sentence model on top of one word-embedding model.

     Args:
         parentmodel: ``"word2vec"`` or ``"glove"`` selects that embedding;
             any other value (including ``None``) falls back to fasttext.
         save: when true, the sentence model is saved under
             ``/models/sentence2vec/`` (resolved through ``get_path``) and the
             result of ``save`` is returned; when false, the object held in
             ``sentence_model`` is returned.
     """
     # Map names to the *builders* and call only the selected one. The
     # previous version called all three builders up front (two in the dict
     # literal, one as the ``.get`` default), training three models per call.
     builders = {
         "word2vec": self.word2vec,
         "glove": self.glove,
     }
     build_parent = builders.get(parentmodel, self.fasttext)
     model = build_parent(save=False)

     sentence_model = SIF(model)
     # NOTE(review): this rebinds ``sentence_model`` to whatever ``train``
     # returns. If ``SIF.train`` returns training statistics rather than the
     # model, the ``save``/return below operate on the wrong object - confirm
     # against the SIF implementation in use.
     sentence_model = sentence_model.train(self.data)
     logging.info("Training complete. Saving model")
     if save:
         model_name = f"sentence2vec_{'cbow' if not self.cbow else 'skipgram'}_{self.dim}.vec"
         model_path = get_path(f'/models/sentence2vec/{model_name}')
         return sentence_model.save(model_path)
     else:
         return sentence_model
Example #10
0
 def doc2vec(self):
     """Build, train and save a Doc2Vec model on ``self.data``.

     The model is created with ``dm=self.dbow``, ``vector_size=self.dim``,
     ``hs=self.hs`` and ``iter=self.epoch``, using all but one CPU core (at
     least one worker). It is saved under ``/models/doc2vec/`` (resolved
     through ``get_path``).

     Returns:
         Whatever ``model.save`` returns.
     """
     worker_count = max(1, cpu_count() - 1)
     d2v_model = Doc2Vec(
         dm=self.dbow,
         vector_size=self.dim,
         min_count=3,
         window=10,
         hs=self.hs,
         iter=self.epoch,
         workers=worker_count,
     )
     logging.info("Building Model")
     d2v_model.build_vocab(self.data)
     d2v_model.train(
         utils.shuffle(self.data),
         total_examples=self.total_examples,
         epochs=self.epoch,
     )
     logging.info("Training complete. Saving model")
     # The label follows the ``dm`` value passed above: truthy -> "dm".
     variant = 'dm' if self.dbow else 'dbow'
     model_name = f"doc2vec_{variant}_{self.dim}.vec"
     return d2v_model.save(get_path(f'/models/doc2vec/{model_name}'))
Example #11
0
    def phrase2vec(self):
        """Train and save a Word2Vec model on phrase-merged sentences.

        A ``Phrases`` model is fitted on ``self.data``, wrapped in a
        ``Phraser`` and applied to every sentence; Word2Vec is then trained
        on the transformed sentences and saved under ``/models/word2vec/``
        (resolved through ``get_path``).

        Returns:
            Whatever ``model.save`` returns.
        """
        detector = Phrases(
            self.data,
            min_count=5,
            threshold=7,
            progress_per=1000,
        )
        phraser = Phraser(detector)
        phrased_sentences = [phraser[tokens] for tokens in self.data]

        w2v_model = Word2Vec(min_count=1, **self.params)
        logging.info("Building Model")
        w2v_model.build_vocab(sentences=phrased_sentences)
        w2v_model.train(
            sentences=phrased_sentences,
            total_examples=len(phrased_sentences),
            epochs=self.epoch,
        )
        logging.info("Training complete. Saving model")
        # NOTE(review): a truthy ``self.cbow`` produces the "skipgram" label;
        # confirm this matches how ``self.params`` configures the model.
        variant = 'skipgram' if self.cbow else 'cbow'
        model_name = f"phrase2vec_{variant}_{self.dim}.vec"
        return w2v_model.save(get_path(f'/models/word2vec/{model_name}'))
Example #12
0
def prepare_data(prep=False, do_decode=False):
    """Load the cleaned dataset and derive texts, labels and a vocabulary.

    Args:
        prep: when true, each row's ``clean_text`` is tokenised with
            ``simple_preprocess(..., min_len=3)``; otherwise the raw
            ``clean_text`` values are used.
        do_decode: when false, ``bill_class`` is encoded with a
            ``LabelEncoder``; when true, the column is returned as is.

    Returns:
        Tuple ``(text, labels, classes, vocab, tok)`` where ``classes`` is the
        unique ``bill_class`` values, ``vocab`` the tokenizer's words of
        length 2 to 14 in sorted order, and ``tok`` the fitted ``Tokenizer``.
    """
    frame = pandas.read_csv(get_path('data') + "/clean_data.csv")

    if prep:
        text = frame.apply(
            lambda row: simple_preprocess(row['clean_text'], min_len=3),
            axis=1,
        )
    else:
        text = frame.clean_text.values

    labels = frame.bill_class
    if not do_decode:
        labels = LabelEncoder().fit_transform(labels)

    tok = Tokenizer(num_words=30000)
    tok.fit_on_texts(text)

    # Sort key is the negated count (presumably most frequent first; depends
    # on the project's ``star`` helper unpacking each (word, count) pair).
    ranked = sorted(tok.word_counts.items(), key=star(lambda _, c: -c))
    # The length filter also excludes the empty string, so no '' placeholder
    # needs to be prepended before filtering.
    vocab = [word for (word, _) in ranked if 15 > len(word) > 1]

    return text, labels, frame['bill_class'].unique(), vocab, tok
Example #13
0
def nassai_cli(action, batch, epoch, mode, text, data, cbow=True, dbow=False, glove=True):
    """Dispatch one command-line action.

    Args:
        action: ``"preprocess"``, ``"build_embedding"`` or ``"train"``; any
            other value runs a prediction on ``text``.
        batch: forwarded to ``run`` when training.
        epoch: forwarded to ``Embedding`` when building an embedding.
        mode: ``"doc2vec"`` or ``"word2vec"`` selects the matching model
            list when training (any other value trains the combined list);
            for prediction it is passed to ``load_model``.
        text: input text for prediction.
        data: path of the raw dataset used by ``"preprocess"``.
        cbow: forwarded to the word2vec ``Embedding`` builder.
        dbow: when truthy, ``"build_embedding"`` builds doc2vec instead of
            word2vec.
        glove: forwarded to most classifiers' ``glove`` argument.

    Returns:
        The result of the selected action; ``None`` for prediction, which
        only echoes its output.
    """
    base_data_path = data
    clean_data_path = get_path('data') + "/clean_data.csv"

    if action == "preprocess":
        from code import preprocessing
        return preprocessing.preprocess_data(base_data_path)
    elif action == "build_embedding":
        if dbow:
            builder = Embedding(embedding_type="doc2vec", data=clean_data_path, dbow=dbow, epoch=epoch)
            return builder.build()
        builder = Embedding(embedding_type="word2vec", data=clean_data_path, cbow=cbow, epoch=epoch)
        return builder.build()
    elif action == "train":
        # The word2vec model lives in the word2vec directory (that is where
        # the embedding trainer saves it), not next to the doc2vec one.
        word2vec_embedding = get_path('models') + '/word2vec/nassai_word2vec.vec'
        doc2vec_embedding = get_path('models') + '/doc2vec/nassai_doc2vec.vec'
        if mode == "doc2vec":
            model_list = [
                ("doc2vec_bnb_mean_embedding", BernNB(glove=glove, embedding_path=doc2vec_embedding, tfidf="mean_embedding")),
                ("doc2vec_svm_mean_embedding", SVM(glove=glove, embedding_path=doc2vec_embedding, tfidf="mean_embedding")),
                ("doc2vec_linear_svm_mean_embedding", LinearSVM(glove=False, embedding_path=doc2vec_embedding, tfidf="mean_embedding")),

                ("doc2vec_bnb_tfidfemmbedding", BernNB(glove=glove, use_tfidf=True)),
                ("doc2vec_svm_tfidfembedding", (SVM(glove=glove, use_tfidf=True))),
                ("doc2vec_linear_svm_tfidfembedding", LinearSVM(glove=glove, use_tfidf=True)),

                ("lstm_doc2vec_glove", LSTMClassifier(train_embeddings=False, batch=True, glove=glove, units=256, embedding_path=doc2vec_embedding, layers=4)),
                ("fchollet_cnn_doc2vec_glove", FCholletCNN(train_embeddings=False, batch=True, glove=glove, units=256, embedding_path=doc2vec_embedding)),
                ("bilstm_doc2vec_glove", BLSTM2DCNN(train_embeddings=False, batch=True, glove=glove, units=256, embedding_path=doc2vec_embedding)),
                ("ykimcnn_doc2vec_glove", YKimCNN(train_embeddings=False, batch=True, glove=glove, units=256, embedding_path=doc2vec_embedding))

            ]

        elif mode == "word2vec":
            model_list = [
                ("bnb_mean_embedding", BernNB(glove=glove, tfidf="mean_embedding")),
                ("svm_mean_embedding", (SVM(glove=glove, tfidf="mean_embedding"))),
                ("linear_svm_mean_embedding", LinearSVM(glove=glove, tfidf="mean_embedding")),

                ("bnb_tfidfemmbedding", BernNB(glove=glove, use_tfidf=True)),
                ("svm_tfidfembedding", (SVM(glove=glove, use_tfidf=True))),
                ("linear_svm_tfidfembedding", LinearSVM(glove=glove, use_tfidf=True)),
                # ("mlp_mean_embedding", MLP(glove=glove, tfidf="mean_embedding"), 1),
                # ("mlp_tfidfemmbedding", MLP(glove=glove, tfidf="tfidf_embedding_vectorizer"), 1)
            ]
        else:
            # Any other mode trains the word2vec and doc2vec variants together.
            model_list = [
                ("word2vec_bnb_mean_embedding", BernNB(glove=glove, embedding_path=word2vec_embedding, tfidf="mean_embedding")),
                ("word2vec_svm_mean_embedding", SVM(glove=glove, embedding_path=word2vec_embedding, tfidf="mean_embedding")),
                ("word2vec_linear_svm_mean_embedding", LinearSVM(glove=False, embedding_path=word2vec_embedding, tfidf="mean_embedding")),
                # The word2vec neural models must load the word2vec embedding
                # and carry unique names; they previously pointed at the
                # doc2vec file, and the CNN reused the doc2vec entry's name.
                ("lstm_word2vec_glove", LSTMClassifier(train_embeddings=False, batch=True, glove=glove, units=256, embedding_path=word2vec_embedding, layers=4)),
                ("fchollet_cnn_word2vec_glove", FCholletCNN(train_embeddings=False, batch=True, glove=glove, units=256, embedding_path=word2vec_embedding)),
                ("bilstm_word2vec_glove", BLSTM2DCNN(train_embeddings=False, batch=True, glove=glove, units=256, embedding_path=word2vec_embedding)),
                ("ykimcnn_word2vec_glove", YKimCNN(train_embeddings=False, batch=True, glove=glove, units=256, embedding_path=word2vec_embedding)),

                ("doc2vec_bnb_mean_embedding", BernNB(glove=glove, embedding_path=doc2vec_embedding, tfidf="mean_embedding")),
                ("doc2vec_svm_mean_embedding", SVM(glove=glove, embedding_path=doc2vec_embedding, tfidf="mean_embedding")),
                ("doc2vec_linear_svm_mean_embedding", LinearSVM(glove=glove, embedding_path=doc2vec_embedding, tfidf="mean_embedding")),
                ("lstm_doc2vec_glove", LSTMClassifier(train_embeddings=False, batch=True, glove=glove, units=256, embedding_path=doc2vec_embedding, layers=4)),
                ("fchollet_cnn_doc2vec_glove", FCholletCNN(train_embeddings=False, batch=True, glove=glove, units=256, embedding_path=doc2vec_embedding)),
                ("bilstm_doc2vec_glove", BLSTM2DCNN(train_embeddings=False, batch=True, glove=glove, units=256, embedding_path=doc2vec_embedding)),
                ("ykimcnn_doc2vec_glove", YKimCNN(train_embeddings=False, batch=True, glove=glove, units=256, embedding_path=doc2vec_embedding)),

                ("doc2vec_bnb_tfidfemmbedding", BernNB(glove=glove, use_tfidf=True)),
                ("doc2vec_svm_tfidfembedding", (SVM(glove=glove, use_tfidf=True))),
                ("doc2vec_linear_svm_tfidfembedding", LinearSVM(glove=True, use_tfidf=True))

            ]
        return run(model_list, mode=mode, batch=batch, layers=4, dropout_rate=0.25)

    else:
        # Every other action is treated as a prediction request.
        model = load_model(mode, '')
        pred = model.predict([text])
        click.echo("TEXT : {}".format(text))
        click.echo()
        click.echo("PREDICTION: {}".format(pred))