Example #1
def doc_vect(filename):
    documents = []
    from gensim.models.doc2vec import TaggedDocument
    with open(filename, 'r') as fo:
        for line in fo.readlines():
            tokens = line.strip().split('\t')
            if tokens[2] != str(0):
                sentence = TaggedDocument(tokens[1].split(), [tokens[0]])
                documents.append(sentence)
    print(len(documents))
    from gensim.models.doc2vec import Doc2Vec
    import multiprocessing
    cores = multiprocessing.cpu_count()
    simple_models = [
        # PV-DM w/concatenation - window=5 (both sides) approximates paper's 10-word total window size
        Doc2Vec(documents,
                dm=1,
                dm_concat=1,
                size=100,
                window=5,
                negative=5,
                hs=0,
                min_count=1,
                workers=cores),
        # PV-DBOW
        Doc2Vec(documents,
                dm=0,
                size=100,
                negative=5,
                hs=0,
                min_count=1,
                workers=cores),
        # PV-DM w/average
        Doc2Vec(documents,
                dm=1,
                dm_mean=1,
                size=100,
                window=5,
                negative=5,
                hs=0,
                min_count=1,
                workers=cores),
    ]
    # Inspect the models (optional):
    # for model in simple_models[:1]:
    #     print(model)
    #     for label in range(1, 11):
    #         inferred_docvec = model.docvecs[str(label)]
    #         print(label)
    #         print('%s:\n %s' % (model, model.most_similar(str(label))))

    from collections import OrderedDict
    models_by_name = OrderedDict(
        (str(model), model) for model in simple_models)
    from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
    models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec(
        [simple_models[1], simple_models[2]])
    models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec(
        [simple_models[1], simple_models[0]])

    return models_by_name
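
A minimal usage sketch for the dictionary returned above (the file name 'data.tsv' and the tag '1' are illustrative; the file is assumed to follow the id<TAB>text<TAB>label layout parsed at the top of the function):

# Hypothetical input file in the id<TAB>text<TAB>label format expected above.
models_by_name = doc_vect('data.tsv')
for name, model in models_by_name.items():
    # Both Doc2Vec and ConcatenatedDoc2Vec expose .docvecs lookups by tag.
    print(name, model.docvecs['1'][:5])
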
def doc_vect(alldocs):
    print('Doc2Vec: each tag is an ID')
    train_docs = [doc for doc in alldocs if doc.split == 'train']
    test_docs = [doc for doc in alldocs if doc.split == 'test']
    print('%d docs: %d train-sentiment, %d test-sentiment' % (len(alldocs), len(train_docs), len(test_docs)))
    documents = []
    for doc in train_docs:
        sentence = TaggedDocument(doc.words, doc.tags)
        documents.append(sentence)
    print(len(documents))
    cores = multiprocessing.cpu_count()
    simple_models = [
                # PV-DM w/concatenation - window=5 (both sides) approximates paper's 10-word total window size
                Doc2Vec(documents, dm=1, dm_concat=1, size=400, window=5, negative=5, hs=1, sample=1e-3, iter=20, min_count=1, workers=cores),
                # PV-DBOW
                Doc2Vec(documents, dm=0, size=400, window=5, negative=5, hs=1, sample=1e-3, iter=20, min_count=1, workers=cores),
                # PV-DM w/average
                Doc2Vec(documents, dm=1, dm_mean=1, size=400, window=5, negative=5, hs=1, sample=1e-3, iter=20, min_count=1, workers=cores),
                    ]

    models_by_name = OrderedDict((str(model), model) for model in simple_models)
    models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[2]])
    models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[0]])

    for name, model in models_by_name.items():
        print(name)
        train_targets, train_regressors = zip(*[(doc.sentiment, model.docvecs[doc.tags[0]]) for doc in train_docs])
        test_targets, test_regressors = zip(*[(doc.sentiment, model.infer_vector(doc.words)) for doc in test_docs])
        util.logit(train_regressors, train_targets, test_regressors, test_targets)
        util.svm(train_regressors, train_targets, test_regressors, test_targets)
Example #3
def doc_vect(filename):
    """Using ground-truth labels or predicted labels in Word2Vec"""
    df = label_generate(filename)
    names = list(df.columns.values)
    # print names
    textid = names.index('Text')
    predid = names.index('Label')
    documents = []
    from gensim.models.doc2vec import TaggedDocument
    for line in df.itertuples():
        # print line
        # print line[textid+1]
        # print line[predid+1]
        sentence = TaggedDocument(line[textid + 1].split(),
                                  [str(line[predid + 1])])
        documents.append(sentence)
    print(len(documents))
    from gensim.models.doc2vec import Doc2Vec
    import multiprocessing
    cores = multiprocessing.cpu_count()
    simple_models = [
        # PV-DM w/concatenation - window=5 (both sides) approximates paper's 10-word total window size
        Doc2Vec(documents,
                dm=1,
                dm_concat=1,
                size=100,
                window=5,
                negative=5,
                hs=0,
                min_count=1,
                workers=cores),
        # PV-DBOW
        Doc2Vec(documents,
                dm=0,
                size=100,
                negative=5,
                hs=0,
                min_count=1,
                workers=cores),
        # PV-DM w/average
        Doc2Vec(documents,
                dm=1,
                dm_mean=1,
                size=100,
                window=5,
                negative=5,
                hs=0,
                min_count=1,
                workers=cores),
    ]

    from collections import OrderedDict
    models_by_name = OrderedDict(
        (str(model), model) for model in simple_models)
    from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
    models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec(
        [simple_models[1], simple_models[2]])
    models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec(
        [simple_models[1], simple_models[0]])

    import pickle
    pickle.dump(models_by_name, open('data/doc2vec.pick', 'wb'))
Example #4
def vect_all(X_train, y_train, X_test, y_predict):
    documents = []
    from gensim.models.doc2vec import TaggedDocument
    for line in zip(X_train, y_train):
        sentence = TaggedDocument(line[0].split(), [str(line[1])])
        documents.append(sentence)
    for line in zip(X_test, y_predict):
        sentence = TaggedDocument(line[0].split(), [str(line[1])])
        documents.append(sentence)
    print(len(documents))
    from gensim.models.doc2vec import Doc2Vec
    import multiprocessing

    cores = multiprocessing.cpu_count()
    simple_models = [
        # PV-DM w/concatenation - window=5 (both sides) approximates paper's 10-word total window size
        Doc2Vec(documents,
                dm=1,
                dm_concat=1,
                size=100,
                window=5,
                negative=5,
                hs=0,
                min_count=1,
                workers=cores),
        # PV-DBOW
        Doc2Vec(documents,
                dm=0,
                size=100,
                negative=5,
                hs=0,
                min_count=1,
                workers=cores),
        # PV-DM w/average
        Doc2Vec(documents,
                dm=1,
                dm_mean=1,
                size=100,
                window=5,
                negative=5,
                hs=0,
                min_count=1,
                workers=cores),
    ]

    from collections import OrderedDict
    models_by_name = OrderedDict(
        (str(model), model) for model in simple_models)
    from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
    models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec(
        [simple_models[1], simple_models[2]])
    models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec(
        [simple_models[1], simple_models[0]])
    return models_by_name
Example #5
def train_models(docs, passes):
    cores = multiprocessing.cpu_count()
    models = [
        # PV-DM w/concatenation - window=5 (both sides) approximates paper's 10-word total window size
        gensim.models.Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=2, workers=cores),
        # PV-DBOW 
        gensim.models.Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=cores),
        # PV-DM w/average
        gensim.models.Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=2, workers=cores),
        #PV-DM, concatinate
        gensim.models.Doc2Vec(size=100, window=8, min_count=5, workers=4),
    ]

    print('Initializing all models, building vocabularies...')
    '''
    models[0].build_vocab(docs)
    for model in models[1:]:
        model.reset_from(models[0])
    '''
    for model in models:
        print('---', model)
        model.build_vocab(docs)
    
    #models_by_name = OrderedDict((str(model), model) for model in models)
    models_by_name = OrderedDict()
    models_by_name['default'] = models[3]
    models_by_name['dmc'] = models[0]
    models_by_name['dbow'] = models[1]
    models_by_name['dmm'] = models[2]
    models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([models[1], models[2]])
    models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([models[1], models[0]])

    # linearly decrease the learning rate over the passes
    alpha, min_alpha = (0.025, 0.001)
    alpha_delta = (alpha - min_alpha) / passes

    for epoch in range(passes):
        print('====================pass', epoch)
        #shuffle(docs)
        for name, model in models_by_name.items():

            model.alpha, model.min_alpha = alpha, alpha  # set learning rate

            print('---training model', name, 'at alpha', alpha)
            with elapsed_timer() as elapsed:
                model.train(docs)
                print('------finished after %0.1fs'%elapsed())
            if epoch%3==0:
                visualize(model.docvecs, docs, name + '_' + str(epoch))
    
        alpha -= alpha_delta
    def load_model(self, model_dir):
        """Load models."""
        for name in self.name_to_models.keys():
            self.name_to_models[name] = Doc2Vec.load(model_dir + name)
        loaded_model = ConcatenatedDoc2Vec(list(self.name_to_models.values()))
        return loaded_model
Example #7
    def fit(self, documents, y=None):
        if self.model_name:
            print("Loading Book2Vec")
            model_data = os.path.join(self.model_dir,
                                      'model_%s.doc2vec' % self.model_name)
            if self.model_name == 'dbow_dmm' or self.model_name == 'dbow_dmc':
                m1 = os.path.join(
                    self.model_dir,
                    'model_%s.doc2vec' % self.model_name.split('_')[0])
                m2 = os.path.join(
                    self.model_dir,
                    'model_%s.doc2vec' % self.model_name.split('_')[1])
                model1 = Doc2Vec.load(m1)
                model2 = Doc2Vec.load(m2)
                self.model_ = ConcatenatedDoc2Vec([model1, model2])
                self.num_features_ = model1.syn0.shape[1] + model2.syn0.shape[1]
            else:
                self.model_ = Doc2Vec.load(model_data)
                self.num_features_ = self.model_.syn0.shape[1]
            print(self.num_features_)
            print("Done Loading vectors")
        else:
            raise OSError("Model does not exist")

        return self
Example #8
    def fit(self, documents, y=None):
        print("Here===> {}".format(self.model_name))
        if self.model_name:
            print("Loading Vectors")
            model_data = os.path.join(self.model_dir,
                                      "model_%s.doc2vec" % self.model_name)
            if self.model_name == "dbow_dmm" or self.model_name == "dbow_dmc":
                m1 = os.path.join(
                    self.model_dir,
                    "model_%s.doc2vec" % self.model_name.split("_")[0])
                m2 = os.path.join(
                    self.model_dir,
                    "model_%s.doc2vec" % self.model_name.split("_")[1])
                model1 = Doc2Vec.load(m1)
                model2 = Doc2Vec.load(m2)
                self.model_ = ConcatenatedDoc2Vec([model1, model2])
                self.num_features_ = model1.wv.syn0.shape[
                    1] + model2.wv.syn0.shape[1]
            else:
                self.model_ = Doc2Vec.load(model_data)
                self.num_features_ = self.model_.wv.syn0.shape[1]
            print(self.num_features_)
            print("Done Loading vectors")
        else:
            print("Hereeeeee ", self.model_name)
            raise OSError("Model does not exist")

        return self
def infer_embedding(model_no, reviews, reviews_size, concatenate):
    # param model_no: which Doc2Vec model (or a list of two models if concatenate)
    # param reviews: all reviews (training or test)
    # type reviews: list(list(str))
    # type reviews_size: int
    # param concatenate: whether to concatenate two doc2vec models
    if not concatenate:
        model = load_model(model_no)
        print("description of the doc2vec model\t", str(model))
    else:
        model_1 = load_model(model_no[0])
        model_2 = load_model(model_no[1])
        print("description of the 1st doc2vec model\t", str(model_1))
        print("description of the 2nd doc2vec model\t", str(model_2))
        model = ConcatenatedDoc2Vec([model_1, model_2])

    sentiment_review = namedtuple('Sentiment_Review', 'words tags sentiment')
    allreviews = []
    bar = progressbar.ProgressBar()
    for i in bar(range(len(reviews))):
        tags = [i]
        if i < reviews_size:
            allreviews.append(sentiment_review(reviews[i], tags, 0.))
        else:
            allreviews.append(sentiment_review(reviews[i], tags, 1.))
    # infer the doc2vec embeddings with given model
    vectors, labels = vector_4_learning(model, allreviews)
    return vectors, labels
Example #10
def build_doc2vec(all_docs, size_in):
    cores = multiprocessing.cpu_count()
    assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

    models_list = [
        # Doc2Vec(dm = 1, dm_mean= 1, size = 400, window = 10, negative = 11, hs = 0, min_count= 1, workers= cores),
        #Doc2Vec(dm=1, dm_mean=1, size=500, window=10, negative=11, hs=0, min_count=1, workers=cores),
        Doc2Vec(dm=1,
                dm_mean=1,
                size=size_in,
                window=3,
                negative=11,
                hs=0,
                min_count=1,
                workers=cores),
        # Doc2Vec(dm=0, size=400, negative=11, hs=0, min_count=1, workers=cores),
        #Doc2Vec(dm=0, size=500, negative=11, hs=0, min_count=1, workers=cores),
        Doc2Vec(dm=0, size=500, negative=11, hs=0, min_count=1, workers=cores)
    ]

    # classifying_docs should be in console before running this
    models_list[0].build_vocab(all_docs)
    print(models_list[0])
    for model in models_list[1:]:
        model.reset_from(models_list[0])
        print(model)

    models_by_name = OrderedDict((str(model), model) for model in models_list)

    models_by_name['dbow+dmm_500'] = ConcatenatedDoc2Vec(
        [models_list[0], models_list[1]])
    #models_by_name['dbow+dmm_500'] = ConcatenatedDoc2Vec([models_list[1], models_list[4]])
    #models_by_name['dbow+dmm_600'] = ConcatenatedDoc2Vec([models_list[2], models_list[5]])

    return models_by_name
Example #11
    def get_model(self):
        if not os.path.isfile(self.dbow_path) and not os.path.isfile(self.dm_path):
            self.create_models()

        dbow_model = Doc2Vec.load(self.dbow_path)
        dm_model = Doc2Vec.load(self.dm_path)

        return ConcatenatedDoc2Vec([dbow_model, dm_model])
Example #12
def get_text_feature(texts):
    model_dbow = Doc2Vec.load(os.path.dirname(os.path.abspath(__file__))+'/doc2vector_dbow.doc2vec')
    model_dm = Doc2Vec.load(os.path.dirname(os.path.abspath(__file__))+'/doc2vector_dm.doc2vec')
    model = ConcatenatedDoc2Vec([model_dm, model_dbow])
    result = []
    i = 1
    for text in texts:
        if i % 1000 == 0:
            print(i,"texts of",len(texts),"texts have been loaded")
        i += 1
        result.append(model.infer_vector(document=Preprocessing(text),alpha=0.025,min_alpha=0.025,steps=100))
    return torch.tensor(result)
def create_models(corpus, vector_size):
    models = [
        # PV-DM w/ concatenation
        Doc2Vec(dm=1,
                dm_concat=1,
                vector_size=vector_size,
                window=5,
                sample=1e-4,
                negative=5,
                hs=0,
                min_count=3,
                workers=WORKERS),

        # PV-DBOW
        Doc2Vec(dm=0,
                vector_size=vector_size,
                negative=5,
                hs=0,
                sample=1e-4,
                min_count=3,
                workers=WORKERS),

        # PV-DM w/ average
        Doc2Vec(dm=1,
                dm_mean=1,
                vector_size=vector_size,
                window=10,
                sample=1e-4,
                negative=5,
                hs=0,
                min_count=3,
                workers=WORKERS)
    ]

    build_vocabulary(models, corpus)

    models_by_name = OrderedDict((str(model), model) for model in models)
    models_by_name["dbow + dmm"] = ConcatenatedDoc2Vec([models[1], models[2]])
    models_by_name["dbow + dmc"] = ConcatenatedDoc2Vec([models[1], models[0]])
    return models_by_name
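
The models returned by create_models have only had their vocabulary built; a minimal training sketch under the same assumptions (a corpus of TaggedDocument objects and the newer vector_size/epochs Doc2Vec API used in this example):

def train_base_models(models_by_name, corpus, epochs=20):
    # Only the three underlying Doc2Vec models need training; the two
    # ConcatenatedDoc2Vec entries reuse the trained vectors of their parts.
    for model in models_by_name.values():
        if isinstance(model, Doc2Vec):
            model.train(corpus, total_examples=model.corpus_count, epochs=epochs)
    return models_by_name
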
    def __init__(self, model_dm_path, model_dbow_path):
        """
        Load DM and DBOW trained models, create concatenated model
        :param model_dm_path: path to DM model
        :param model_dbow_path: path to DBOW model
        """
        logger.info('Combining DM and DBOW models')
        self.tokenizer = RegexpTokenizer(r'\w+')
        self.model_dm = Doc2Vec.load(model_dm_path)
        self.model_dbow = Doc2Vec.load(model_dbow_path)
        self.model_dbow.delete_temporary_training_data(
            keep_doctags_vectors=True, keep_inference=True)
        self.model_dm.delete_temporary_training_data(keep_doctags_vectors=True,
                                                     keep_inference=True)
        self.concatenated_model = ConcatenatedDoc2Vec(
            [self.model_dbow, self.model_dm])
Example #15
def doc2vec_feats(train, test, dim1, dim2):
    train_corpus = train['Text'].values
    test_corpus = test['Text'].values
    corpus = pd.concat((train, test), axis=0)
    cores = multiprocessing.cpu_count()

    train_doc2vec_dm(corpus['Text'].values, dim1, cores)
    train_doc2vec_dbow(corpus['Text'].values, dim2, cores)

    print('Computing doc2vec(PV-DM + PV-DBOW) feature...')
    model_dm = gensim.models.doc2vec.Doc2Vec.load('data/models/doc2vec/' +
                                                  str(dim1) +
                                                  '_dim/neurips_d2v_dm')
    model_dbow = gensim.models.doc2vec.Doc2Vec.load('data/models/doc2vec/' +
                                                    str(dim2) +
                                                    '_dim/neurips_d2v_dbow')
    model = ConcatenatedDoc2Vec([model_dm, model_dbow])

    cor = list(read_corpus(train_corpus))
    train_feats = np.array(
        list(model.infer_vector(cor[idx].words) for idx in range(len(cor))))
    del cor
    pd.DataFrame(data=train_feats).to_csv('data/features/doc2vec_dm' +
                                          str(dim1) + '_dbow' + str(dim2) +
                                          '_train.csv',
                                          header=False,
                                          index=False)

    cor = list(read_corpus(test_corpus))
    test_feats = np.array(
        list(model.infer_vector(cor[idx].words) for idx in range(len(cor))))
    pd.DataFrame(data=test_feats).to_csv('data/features/doc2vec_dm' +
                                         str(dim1) + '_dbow' + str(dim2) +
                                         '_test.csv',
                                         header=False,
                                         index=False)
Example #16
# Testing F1 score: 0.10642747269276188

print(
    logreg.predict(
        [model_dmm.infer_vector(tokenize_text('fractura proximal'),
                                steps=20)]))
# ['s52.102']

# Free up RAM
model_dbow.delete_temporary_training_data(keep_doctags_vectors=True,
                                          keep_inference=True)
model_dmm.delete_temporary_training_data(keep_doctags_vectors=True,
                                         keep_inference=True)

# Model pairing
new_model = ConcatenatedDoc2Vec([model_dbow,
                                 model_dmm])  # Concatenation of the two models


def get_vectors(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0],
                                 model.infer_vector(doc.words, steps=20))
                                for doc in sents])
    return targets, regressors


# Train the logistic regression
y_train, X_train = get_vectors(new_model, train_tagged)
y_test, X_test = get_vectors(new_model, test_tagged)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
Example #17
def labeldoc_vect(alldocs):
    print('LabelDoc2Vec with line number as ID')
    train_docs = [doc for doc in alldocs if doc.split == 'train']
    test_docs = [doc for doc in alldocs if doc.split == 'test']
    # unlable_docs = [doc for doc in alldocs if doc.split == 'extra']
    print('%d docs: %d train, %d test' %
          (len(alldocs), len(train_docs), len(test_docs)))
    documents = []
    for doc in train_docs:
        slabel = []
        if doc.split == 'train':
            slabel = ['s' + str(doc.label)]
        sentence = LabeledTaggedDocument(doc.words, doc.tags, slabel)
        documents.append(sentence)
    # for doc in unlable_docs:
    #     slable = []
    #     if doc.split == 'train':
    #         slable = ['s'+str(doc.label)]
    #     sentence = LabeledTaggedDocument(doc.words, doc.tags, slable)
    #     documents.append(sentence)
    print(len(documents))
    cores = multiprocessing.cpu_count()
    simple_models = [
        # PV-DM w/concatenation - window=5 (both sides) approximates paper's 10-word total window size
        LabelDoc2Vec(documents,
                     dm=1,
                     dm_concat=1,
                     size=size,
                     window=window,
                     negative=5,
                     hs=1,
                     sample=1e-3,
                     iter=iter,
                     min_count=1,
                     workers=cores),
        # PV-DBOW
        LabelDoc2Vec(documents,
                     dm=0,
                     size=size,
                     window=window,
                     negative=5,
                     hs=1,
                     sample=1e-3,
                     iter=iter,
                     min_count=1,
                     workers=cores),
        # PV-DM w/average
        LabelDoc2Vec(documents,
                     dm=1,
                     dm_mean=1,
                     size=size,
                     window=window,
                     negative=5,
                     hs=1,
                     sample=1e-3,
                     iter=iter,
                     min_count=1,
                     workers=cores),
    ]

    models_by_name = OrderedDict(
        (str(model), model) for model in simple_models)
    models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec(
        [simple_models[1], simple_models[2]])
    models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec(
        [simple_models[1], simple_models[0]])

    for name, model in models_by_name.items():
        print(name)
        train_targets, train_regressors = zip(*[(doc.label,
                                                 model.docvecs[doc.tags[0]])
                                                for doc in train_docs])
        test_targets, test_regressors = zip(
            *[(doc.label, model.infer_vector_label(doc.words))
              for doc in test_docs])
        data_util.logit(train_regressors, train_targets, test_regressors,
                        test_targets)
dm_model_concat = Doc2Vec(dm=1,
                          dm_concat=1,
                          vector_size=50,
                          workers=cores,
                          epochs=100,
                          min_count=2,
                          negative=5,
                          hs=0,
                          sample=0)
dm_model_concat.build_vocab(train_tagged_c.tolist())
dm_model_concat.train(train_tagged_c,
                      total_examples=len(train_tagged_c),
                      epochs=dm_model_concat.epochs)

# mixed_model = ConcatenatedDoc2Vec([skip_gram_model, dm_model])
mixed_model_with_concat = ConcatenatedDoc2Vec(
    [skip_gram_model, dm_model_concat])

doc_vectors = []
for ind_ in order_level_one_month.index:
    doc_vectors.append(mixed_model_with_concat.docvecs[ind_])

doc_vec_df = pd.DataFrame(doc_vectors)

doc_vec_df.columns = [f"custvec_{col}" for col in range(100)]
order_level_one_month = pd.concat([order_level_one_month, doc_vec_df], axis=1)
skip_gram_model.delete_temporary_training_data(False, False)  # free up memory
dm_model_concat.delete_temporary_training_data(False, False)  # free up memory
##########################

skip_gram_model = Doc2Vec(dm=0,
                          vector_size=50,
Example #19
from gensim import utils
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import BayesianRidge
from sklearn.svm import LinearSVR

OPTIMAL_INFER_STEPS_FOR_TEST = 9000
INFER_STEPS = OPTIMAL_INFER_STEPS_FOR_TEST

# Load models
model_dir = "./reviews/models/"
model_dbow = Doc2Vec.load(model_dir + 'model.dbow')
model_dm_mean = Doc2Vec.load(model_dir + 'model.dm_mean')
model_dm_concat = Doc2Vec.load(model_dir + "model.dm_concat")

model = ConcatenatedDoc2Vec([model_dbow, model_dm_concat])
#model = model_dm_mean
#model = model_dm_concat
#model = model_dbow

# Training set
label_to_num = OrderedDict()  # Label to the number of label's sentences
label_to_num[0] = 50000
label_to_num[1] = 50000
label_to_num[2] = 50000
label_to_num[3] = 50000
label_to_train_tag = {
    0: "TRAIN_ONE_STAR_",
    1: "TRAIN_TWO_STAR_",
    2: "TRAIN_THREE_STAR_",
    3: "TRAIN_FOUR_STAR_"
Example #20
              epochs=5,
              batch_size=10,
              verbose=2)

model_dmm.save('/Users/ganfeng/Documents/Data Science/Projects/Natural Language Processing Project/Neural_Nets/model_dmm_01')

prediction_model_dmm = np.argmax(model_dmm.predict(test_vecs_dmm), axis=-1)
np.save('/Users/ganfeng/Documents/Data Science/Projects/Natural Language Processing Project/Neural_Nets/prediction_model_dmm.npy', prediction_model_dmm)

print(accuracy_score(prediction_model_dmm, label_test))
print(classification_report(prediction_model_dmm, label_test))


##########################################
# Concatenate the above two models
model_merge = ConcatenatedDoc2Vec([gensim_model_d2v_dbow_load, gensim_model_d2v_dmm_load])
train_vecs_merge = get_vectors(model_merge, df_review_train_loaded['text'][0:600000], 200)
validation_vecs_merge = get_vectors(model_merge, df_review_train_loaded['text'][600000:800000], 200)
test_vecs_merge = get_vectors(model_merge, df_review_test_loaded['text'], 200)

# Modeling of the neural nets
seed = 100
np.random.seed(seed)
model_merge = Sequential()
model_merge.add(Dense(50, activation='relu', input_dim=200))
model_merge.add(Dense(6, activation='softmax'))
opt = keras.optimizers.Adam(learning_rate=0.0002)
model_merge.compile(optimizer=opt,loss='sparse_categorical_crossentropy',metrics=['accuracy'])
model_merge.fit(train_vecs_merge,
                y_train_final,
                validation_data=(validation_vecs_merge, y_validation_final),
Example #21
def d2v(train_tagged, valid_tagged, d2v_model, d2v_alpha, d2v_min_alpha,
        vector_size, min_count, max_vocab_size):

    model_dm = Doc2Vec(
        dm=1,  # 1: PV-DM. 0: PV-DBOW
        dm_mean=1,  # 0: use the sum of the context word vectors. 1: use the mean
        epochs=20,
        seed=1234,
        workers=1,
        alpha=d2v_alpha,
        min_alpha=d2v_min_alpha,  # initial learning rate; linearly drops to min_alpha as training progresses
        vector_size=vector_size,  # feature vector dim
        min_count=min_count,
        max_vocab_size=max_vocab_size,
        hs=0,  # 1: hierarchical softmax is used
        negative=5
    )  # use negative sampling - how many “noise words” should be drawn

    model_dbow = Doc2Vec(
        dm=0,  # 1: PV-DM. 0: PV-DBOW
        epochs=20,
        seed=1234,
        workers=1,
        alpha=d2v_alpha,
        min_alpha=d2v_min_alpha,  # initial learning rate; linearly drops to min_alpha as training progresses
        vector_size=vector_size,  # feature vector dim
        min_count=min_count,
        max_vocab_size=max_vocab_size,
        hs=0,  # 1: hierarchical softmax is used
        negative=5
    )  # use negative sampling - how many “noise words” should be drawn

    if d2v_model == "dm":
        model = model_dm
        model.build_vocab(train_tagged)
        model.train(train_tagged,
                    total_examples=len(train_tagged.values),
                    epochs=model.epochs)

    elif d2v_model == "dbow":
        model = model_dbow
        model.build_vocab(train_tagged)
        model.train(train_tagged,
                    total_examples=len(train_tagged.values),
                    epochs=model.epochs)

    else:
        # Train DM
        model_dm.build_vocab(train_tagged)
        model_dm.train(train_tagged,
                       total_examples=len(train_tagged.values),
                       epochs=model_dm.epochs)
        # Train DBOW
        model_dbow.build_vocab(train_tagged)
        model_dbow.train(train_tagged,
                         total_examples=len(train_tagged.values),
                         epochs=model_dbow.epochs)
        # Delete temporary training data to free up RAM
        model_dbow.delete_temporary_training_data(keep_doctags_vectors=True,
                                                  keep_inference=True)
        model_dm.delete_temporary_training_data(keep_doctags_vectors=True,
                                                keep_inference=True)
        # Combine DM and DBOW
        model = ConcatenatedDoc2Vec([model_dbow, model_dm])

    train_docs = train_tagged.values
    Y_train, X_train_vec = zip(*[(doc.tags[0],
                                  model.infer_vector(doc.words, steps=20))
                                 for doc in train_docs])

    valid_docs = valid_tagged.values
    Y_valid, X_valid_vec = zip(*[(doc.tags[0],
                                  model.infer_vector(doc.words, steps=20))
                                 for doc in valid_docs])

    return X_train_vec, Y_train, X_valid_vec, Y_valid
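
A brief sketch of how the vectors returned by d2v might feed a downstream classifier, mirroring the logistic-regression pattern used in the surrounding examples (train_tagged and valid_tagged are assumed to be pandas Series of TaggedDocument, as the function itself expects):

from sklearn.linear_model import LogisticRegression

X_train_vec, Y_train, X_valid_vec, Y_valid = d2v(
    train_tagged, valid_tagged, d2v_model='dbow+dm', d2v_alpha=0.025,
    d2v_min_alpha=0.001, vector_size=100, min_count=2, max_vocab_size=None)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_vec, Y_train)
print(clf.score(X_valid_vec, Y_valid))
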
Example #22
simple_models = [
    # PV-DM w/concatenation - window=5 (both sides) approximates paper's 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DBOW
    Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DM w/average
    Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=2, workers=cores),
]

# speed setup by sharing results of 1st model's vocabulary scan
simple_models[0].build_vocab(doc_list)  # PV-DM/concat requires one special NULL word so it serves as template
for model in simple_models[1:]:
    model.reset_from(simple_models[0])
print("got model")
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
# Create dbow+dmm model (concatenation of model 2 and 3)
train_model = ConcatenatedDoc2Vec([simple_models[1], simple_models[2]])

#Load XLNet features
xlnet_train = np.load("bert_features.npy").tolist()
xlnet_features = np.load("bert_test_features.npy").tolist()
alpha, min_alpha, passes = (0.025, 0.001, 20)
alpha_delta = (alpha - min_alpha) / passes
print("begin evaluation")
accs = []
filterwarnings('ignore')
for epoch in range(passes):
    ## Shuffle data
    # random.seed(epoch)
    docs_features = random.sample(list(zip(doc_list, xlnet_train)), len(doc_list))
    random.seed(epoch)
    doc_list = random.sample(doc_list, len(doc_list))
def all_techniques(data1, target, text, test_percent, min_df, max_df,
                   ngram_range1, ngram_range2, vector_size1, window1,
                   negative1, min_count1):

    #Doc2vec parameters: vector_size, window, negative, min_count

    #train - test split
    X_train, X_test, y_train, y_test = data_prep(data1, target, text,
                                                 test_percent)

    #Simple BOW ********************************************************************************************************

    #min_df is used for removing terms that appear too infrequently. For example:
    #min_df = 0.01 means "ignore terms that appear in less than 1% of the documents".
    #min_df = 5 means "ignore terms that appear in less than 5 documents".
    #max_df is used for removing terms that appear too frequently, also known as "corpus-specific stop words". For example:
    #max_df = 0.50 means "ignore terms that appear in more than 50% of the documents".
    #max_df = 25 means "ignore terms that appear in more than 25 documents".

    #WITH PLAIN LYRICS
    TEXT = text

    #tokenization
    vect = CountVectorizer(max_df=max_df, min_df=min_df).fit(X_train[TEXT])

    #transform the documents in the training data to a document-term matrix
    X_train_vectorized = vect.transform(X_train[TEXT])
    X_test_vectorized = vect.transform(X_test[TEXT])

    #models with only lyrics
    output1_bow_plain_lyrics = log_reg_and_svm(X_train_vectorized, y_train,
                                               X_test_vectorized, y_test)

    #Tf-Idf ******************************************************************************************************************

    vect_tf = TfidfVectorizer(min_df=min_df, max_df=max_df).fit(X_train[TEXT])

    #transform
    X_train_vectorized_tf = vect_tf.transform(X_train[TEXT])
    X_test_vectorized_tf = vect_tf.transform(X_test[TEXT])

    #models with only lyrics
    output3_tfidf_plain_lyrics = log_reg_and_svm(X_train_vectorized_tf,
                                                 y_train, X_test_vectorized_tf,
                                                 y_test)

    #N-grams ******************************************************************************************************************

    #document frequency of 5 and extracting 1-grams and 2-grams...
    vect3 = CountVectorizer(min_df=min_df,
                            ngram_range=(ngram_range1,
                                         ngram_range2)).fit(X_train[TEXT])

    X_train_vectorized_ng = vect3.transform(X_train[TEXT])
    X_test_vectorized_ng = vect3.transform(X_test[TEXT])

    #models with only lyrics
    output5_ngram_plain_lyrics = log_reg_and_svm(X_train_vectorized_ng,
                                                 y_train, X_test_vectorized_ng,
                                                 y_test)

    #Doc2Vec ********************************************************************************************************************************************************

    train_doc = pd.DataFrame(pd.concat([X_train[TEXT], y_train], axis=1))
    test_doc = pd.DataFrame(pd.concat([X_test[TEXT], y_test], axis=1))

    train_doc2 = train_doc.apply(lambda x: TaggedDocument(
        words=tokenize_text(x[TEXT]), tags=[x[target]]),
                                 axis=1)
    test_doc2 = test_doc.apply(lambda x: TaggedDocument(
        words=tokenize_text(x[TEXT]), tags=[x[target]]),
                               axis=1)

    #DBOW is the Doc2Vec model analogous to Skip-gram model in Word2Vec.
    #The paragraph vectors are obtained by training a neural network on the task of predicting a probability distribution of words in a paragraph given a randomly-sampled word from the paragraph.
    #We set the minimum word count to 2 in order to discard words with very few occurrences.

    cores = multiprocessing.cpu_count()
    model_dbow = Doc2Vec(dm=0,
                         vector_size=vector_size1,
                         window=window1,
                         negative=negative1,
                         min_count=min_count1,
                         hs=0,
                         workers=cores,
                         epochs=200)
    train_corpus = [x for x in train_doc2.values]
    model_dbow.build_vocab([x for x in train_doc2.values])

    model_dbow.train(train_corpus,
                     total_examples=model_dbow.corpus_count,
                     epochs=model_dbow.epochs)

    y_train_doc, X_train_doc = get_vectors(model_dbow, train_doc2)
    y_test_doc, X_test_doc = get_vectors(model_dbow, test_doc2)

    #models
    output7_doc2vec_dbow_plain_lyrics = log_reg_and_svm(
        X_train_doc, y_train_doc, X_test_doc, y_test_doc)

    #dm = 1 model
    model_dmm = Doc2Vec(dm=1,
                        dm_mean=1,
                        vector_size=vector_size1,
                        window=window1,
                        negative=negative1,
                        min_count=min_count1,
                        workers=cores,
                        epochs=200)
    model_dmm.build_vocab([x for x in train_doc2.values])

    model_dmm.train(train_corpus,
                    total_examples=model_dmm.corpus_count,
                    epochs=model_dmm.epochs)

    y_train_doc, X_train_doc = get_vectors(model_dmm, train_doc2)
    y_test_doc, X_test_doc = get_vectors(model_dmm, test_doc2)

    #models
    output8_doc2vec_dm_plain_lyrics = log_reg_and_svm(X_train_doc, y_train_doc,
                                                      X_test_doc, y_test_doc)

    #Mix of dbow and dmm
    model_dbow.delete_temporary_training_data(keep_doctags_vectors=True,
                                              keep_inference=True)
    model_dmm.delete_temporary_training_data(keep_doctags_vectors=True,
                                             keep_inference=True)

    new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])

    y_train_doc, X_train_doc = get_vectors(new_model, train_doc2)
    y_test_doc, X_test_doc = get_vectors(new_model, test_doc2)

    #models
    output9_doc2vec_dbowdb_plain_lyrics = log_reg_and_svm(
        X_train_doc, y_train_doc, X_test_doc, y_test_doc)

    xxd11 = pd.DataFrame(output1_bow_plain_lyrics, index=['bow_plain_lyrics'])
    xxd11 = xxd11.append(
        pd.DataFrame(output3_tfidf_plain_lyrics, index=['tfidf_plain_lyrics']))
    xxd11 = xxd11.append(
        pd.DataFrame(output5_ngram_plain_lyrics, index=['ngram_plain_lyrics']))
    xxd11 = xxd11.append(
        pd.DataFrame(output7_doc2vec_dbow_plain_lyrics,
                     index=['doc2vec_dbow_plain_lyrics']))
    xxd11 = xxd11.append(
        pd.DataFrame(output8_doc2vec_dm_plain_lyrics,
                     index=['doc2vec_dm_plain_lyrics']))
    xxd11 = xxd11.append(
        pd.DataFrame(output9_doc2vec_dbowdb_plain_lyrics,
                     index=['doc2vec_dbowdb_plain_lyrics']))

    return xxd11
def main():
    parser = argparse.ArgumentParser(description="")

    # Add options
    parser.add_argument("-v",
                        "--verbosity",
                        action="count",
                        default=0,
                        help="increase output verbosity")

    # Add arguments

    parser.add_argument("input_file", help="The input file to be projected")
    # parser.add_argument("speech_feats_file", help="The input file to be projected")
    # parser.add_argument("out_path_file", help="The input file to be projected")
    args = parser.parse_args()
    transcription_data_file = args.input_file
    df_ = pd.read_csv(transcription_data_file, sep='|')
    df_.columns = ['utterance', 'text']

    df_.index = range(df_.shape[0])

    print(df_.head())

    # df_['text']=df_['text'].apply(nltk.word_tokenize)
    print(df_.head())
    train_tagged = df_.apply(lambda r: TaggedDocument(
        words=tokenize_text(r['text']), tags=[r.utterance]),
                             axis=1)

    # # print(X_clean.shape)

    model_dbow = Doc2Vec(dm=0,
                         vector_size=300,
                         negative=5,
                         hs=0,
                         min_count=2,
                         sample=0,
                         workers=cores)
    model_dbow.build_vocab(train_tagged)

    # %%time
    for epoch in range(30):
        model_dbow.train(utils.shuffle(train_tagged),
                         total_examples=len(train_tagged.values),
                         epochs=1)
        model_dbow.alpha -= 0.002
        model_dbow.min_alpha = model_dbow.alpha

    n_dim = 300

    model_dmm = Doc2Vec(dm=1,
                        dm_mean=1,
                        vector_size=300,
                        window=10,
                        negative=5,
                        min_count=1,
                        workers=cores,
                        alpha=0.065,
                        min_alpha=0.065)
    model_dmm.build_vocab(train_tagged)

    # %%time
    for epoch in range(30):
        model_dmm.train(utils.shuffle(train_tagged),
                        total_examples=len(train_tagged.values),
                        epochs=1)
        model_dmm.alpha -= 0.002
        model_dmm.min_alpha = model_dmm.alpha

    model_dbow.delete_temporary_training_data(keep_doctags_vectors=True,
                                              keep_inference=True)
    model_dmm.delete_temporary_training_data(keep_doctags_vectors=True,
                                             keep_inference=True)

    from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
    new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])
    #Get training set vectors from our models
    x_doc2vec = OrderedDict()
    for utt, text in zip(df_['utterance'].to_list(), df_['text'].to_list()):
        x_doc2vec[utt] = model_dmm.infer_vector(tokenize_text(text))

    df_doc2vec = pd.DataFrame(x_doc2vec).T
    df_doc2vec.columns = [
        'doc2vec_{}'.format(str(i).zfill(3)) for i in range(n_dim)
    ]
    df_doc2vec['utterance'] = df_doc2vec.index
    df_doc2vec.to_csv('output_doc2vec_features.csv', index=False)
    from gensim.test.utils import get_tmpfile
    fname = get_tmpfile("my_doc2vec_model")
    model_dmm.save(fname)
    model_dmm = Doc2Vec.load(
        fname)  # you can continue training with the loaded model!
y_train, X_train = vec_for_learning(model_dmm, train_tagged)
y_test, X_test = vec_for_learning(model_dmm, test_tagged)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(
    f1_score(y_test, y_pred, average='weighted')))

model_dbow.delete_temporary_training_data(keep_doctags_vectors=True,
                                          keep_inference=True)
model_dmm.delete_temporary_training_data(keep_doctags_vectors=True,
                                         keep_inference=True)

from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])


def get_vectors(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0],
                                 model.infer_vector(doc.words, steps=20))
                                for doc in sents])
    return targets, regressors


y_train, X_train = get_vectors(new_model, train_tagged)
y_test, X_test = get_vectors(new_model, test_tagged)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
Example #26
def label_vect(alldocs):
    print('Label2Vec with pre-classification')
    train_docs = [doc for doc in alldocs if doc.split == 'train']
    test_docs = [doc for doc in alldocs if doc.split == 'test']
    non_docs = [doc for doc in alldocs if doc.split == 'extra']
    print('%d docs: %d train-sentiment, %d test-sentiment' %
          (len(alldocs), len(train_docs), len(test_docs)))
    ylin = pre_class(train_docs, test_docs, non_docs)
    documents = []
    for doc in train_docs:
        sentence = TaggedDocument(doc.words, [str(doc.sentiment)])
        documents.append(sentence)
    i = 0
    for doc in test_docs + non_docs:
        sentence = TaggedDocument(doc.words, [str(ylin[i])])
        documents.append(sentence)
        i += 1
    print(len(documents))
    cores = multiprocessing.cpu_count()
    simple_models = [
        # PV-DM w/concatenation - window=5 (both sides) approximates paper's 10-word total window size
        Doc2Vec(documents,
                dm=1,
                dm_concat=1,
                size=100,
                window=10,
                negative=5,
                hs=1,
                sample=1e-3,
                min_count=1,
                workers=cores),
        # PV-DBOW
        Doc2Vec(documents,
                dm=0,
                size=100,
                window=10,
                negative=5,
                hs=1,
                sample=1e-3,
                min_count=1,
                workers=cores),
        # PV-DM w/average
        Doc2Vec(documents,
                dm=1,
                dm_mean=1,
                size=100,
                window=10,
                negative=5,
                hs=1,
                sample=1e-3,
                min_count=1,
                workers=cores),
    ]

    models_by_name = OrderedDict(
        (str(model), model) for model in simple_models)
    models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec(
        [simple_models[1], simple_models[2]])
    models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec(
        [simple_models[1], simple_models[0]])

    for name, model in models_by_name.items():
        print(name)
        train_targets, train_regressors = zip(*[(doc.sentiment,
                                                 model.infer_vector(doc.words))
                                                for doc in train_docs])
        test_targets, test_regressors = zip(*[(doc.sentiment,
                                               model.infer_vector(doc.words))
                                              for doc in test_docs])
        util.logit(train_regressors, train_targets, test_regressors,
                   test_targets)
Example #27
y_test, X_test = vector_for_learning(dmm_model, test_tagged)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

logreg.score(X_train, y_train)
logreg.score(X_test, y_test)
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))

wdv_model.delete_temporary_training_data(keep_doctags_vectors=True,
                                         keep_inference=True)
dmm_model.delete_temporary_training_data(keep_doctags_vectors=True,
                                         keep_inference=True)

# Concatenated Model (Doc2Vec)

new_model = ConcatenatedDoc2Vec([wdv_model, dmm_model])


def get_vectors(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0],
                                 model.infer_vector(doc.words, steps=20))
                                for doc in sents])
    return targets, regressors


y_train, X_train = get_vectors(new_model, train_tagged)
y_test, X_test = get_vectors(new_model, test_tagged)

from imblearn.over_sampling import SMOTE
Example #28
def concatenate_models(model1, model2):
    model = ConcatenatedDoc2Vec([model1, model2])
    return model
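
A brief usage note for the helper above: ConcatenatedDoc2Vec simply joins the outputs of its sub-models, so an inferred vector has the combined dimensionality of the two parts (the model names below are illustrative):

combined = concatenate_models(model_dbow, model_dm)
vec = combined.infer_vector(['an', 'example', 'sentence'])
# len(vec) == model_dbow.vector_size + model_dm.vector_size
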
Example #29
    model.reset_from(simple_models[0])
    print(model)

#simple_models[0].intersect_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
#simple_models[1].intersect_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
#simple_models[2].intersect_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

models_by_name = OrderedDict((str(model), model) for model in simple_models)

#for model in simple_models:
#    model.syn0_lockf[:] = 1

#########################################################################################################

from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec(
    [simple_models[1], simple_models[2]])
models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec(
    [simple_models[1], simple_models[0]])

##########################################################################################################
import numpy as np
from random import sample


def logistic_predictor_from_data(train_targets, train_regressors):

    classifier = LogisticRegression()
    classifier.fit(np.asarray(train_regressors), np.asarray(train_targets))
    return classifier

Example #30
    Doc2Vec(dm=1, dm_mean=1, vector_size=200, window=8, min_count=10, epochs=50),
]

## Concatenated Model

## Train both PV-DBOW and PV-DM and combine the two

documents = train_corpus
models[0].build_vocab(documents)
models[1].reset_from(models[0])

for model in models:
    model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
new_model = ConcatenatedDoc2Vec((models[0], models[1]))

inferred_vector = model.infer_vector(train_corpus[0].words)
sims = model.docvecs.most_similar([inferred_vector])
print(sims)

:::{note}
A thread on how to use `most_similar()` with `ConcatenatedDoc2Vec`: [link](https://stackoverflow.com/questions/54186233/doc2vec-infer-most-similar-vector-from-concatenateddocvecs)
:::

# model 1
inferred_vector = new_model.models[0].infer_vector(train_corpus[0].words)
sims2 = new_model.models[0].docvecs.most_similar([inferred_vector])
print(sims2)
# model 2
inferred_vector = new_model.models[1].infer_vector(train_corpus[0].words)