def test_build(Xtrain, ytrain, Xtest, ytest):
    '''
    Load the three varieties of Doc2Vec models that were previously saved.
    Build a random forest model for each Doc2Vec model. Test each random
    forest model with the same test data, and write the results to a CSV
    file for each Doc2Vec model.
    '''
    print "Loading the model..."
    models = [Doc2Vec.load("Doc2Vec_dbow_d300_n5_t4"), \
    Doc2Vec.load("Doc2Vec_dm-c_d300_n5_w5_t4"),  \
    Doc2Vec.load("Doc2Vec_dm-m_d300_n5_w10_t4")]
    filenames = ['Doc2Vec_dbow.csv', 'Doc2Vec_dm-c.csv', 'Doc2Vec_dm-m.csv']
    forests = []
    for model in models:
        forests.append(build_forest(model, Xtrain, ytrain))
    for i in xrange(3):
        model = models[i]
        forest = forests[i]
        filename = filenames[i]
        features = []
        print "Creating feature list for test data..."
        for id in Xtest['id']:
            # remove the extra quotes around the id
            features.append(model.docvecs[id[1:-1]])
        print "Predicting test sentiment..."
        use_forest(forest, features, ytest, filename)
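
# A minimal sketch of the helpers assumed by test_build() above. build_forest and
# use_forest are only called there, not defined, so the bodies below are an assumption
# (scikit-learn RandomForestClassifier plus a pandas CSV dump), not the original code.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

def build_forest(model, Xtrain, ytrain):
    # one Doc2Vec document vector per training row (ids carry extra quotes, as above)
    features = [model.docvecs[doc_id[1:-1]] for doc_id in Xtrain['id']]
    forest = RandomForestClassifier(n_estimators=100)
    forest.fit(features, ytrain)
    return forest

def use_forest(forest, features, ytest, filename):
    # predict on the held-out vectors and persist predictions next to the true labels
    predictions = forest.predict(features)
    pd.DataFrame({'actual': ytest, 'predicted': predictions}).to_csv(filename, index=False)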
 def __init__(self, note_type, model_file, max_notes, dbow_file=None):
     self.note_type = note_type
     self.max_notes = max_notes
     self.model = Doc2Vec.load(model_file)
     if dbow_file:
         self.dbow = Doc2Vec.load(dbow_file)
     else:
         self.dbow = None
def load_model():
    '''
        Loading and Building Train and Test Data
    '''
    # loading labels
    labels = pickle.load(open('labels.p', 'rb'))

    # Using LabelEncoder to convert string to numerical value.
    label_encoder = preprocessing.LabelEncoder()
    transformed_labels = label_encoder.fit_transform(labels)

    transformed_labels = np.array(transformed_labels)

    transformed_labels = label_binarize(transformed_labels,
                                        np.unique(transformed_labels))

    print('Found %d Labels' % len(label_encoder.classes_))
    print('Labels:', label_encoder.classes_)

    # initialising feature array
    cow_arrays = np.zeros((247543, 300))

    # load the previously trained Distributed Memory (DM) model
    model = Doc2Vec.load('./acm_cow.d2v')

    # updating training arrays
    for i in range(247543):
        prefix_train_pos = "SET_" + str(i)
        cow_arrays[i] = model.docvecs[prefix_train_pos]

    train_arrays_cow, test_arrays_cow, train_labels_cow, test_labels_cow = \
        train_test_split(cow_arrays, transformed_labels,
                         test_size=0.1, random_state=42)

    # initialising feature array
    skip_arrays = np.zeros((247543, 300))

    # load the previously trained Distributed Bag of Words (DBOW) model
    model = Doc2Vec.load('./acm_skip.d2v')

    # updating training arrays
    for i in range(247543):
        prefix_train_pos = "SET_" + str(i)
        skip_arrays[i] = model.docvecs[prefix_train_pos]

    train_arrays_skip, test_arrays_skip, train_labels_skip, test_labels_skip = \
        train_test_split(skip_arrays, transformed_labels,
                         test_size=0.1, random_state=42)

    to_return = (train_arrays_cow, train_labels_cow,
                 test_arrays_cow, test_labels_cow,
                 train_arrays_skip, train_labels_skip,
                 test_arrays_skip, test_labels_skip)

    return to_return
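
# A minimal usage sketch (an addition, not from the original code): fit a simple
# classifier on the DM ("cow") split returned by load_model(). LogisticRegression is
# an arbitrary choice, and argmax() is only needed because the labels were one-hot
# binarized above (assumes more than two classes).
from sklearn.linear_model import LogisticRegression

(train_x_cow, train_y_cow, test_x_cow, test_y_cow,
 train_x_skip, train_y_skip, test_x_skip, test_y_skip) = load_model()

clf = LogisticRegression(max_iter=1000)
clf.fit(train_x_cow, train_y_cow.argmax(axis=1))
print('DM split accuracy: %.3f' % clf.score(test_x_cow, test_y_cow.argmax(axis=1)))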
Example #4
def load_questions(modelname,f_name,mapname,a_modelname):
    model = Doc2Vec.load(modelname)
    a_model = Doc2Vec.load(a_modelname)
    qids = list(enumerate([int(q) for q in open(f_name)]))
    rev_qids = [(item,index) for index,item in qids]
    qid_dict = dict(rev_qids)
    Q = []
    doc_dict = load_doc_hashes(mapname)
    for fname in os.listdir("questions"):
        Q.append(load_question(fname,model.docvecs.doctag_syn0,qid_dict,doc_dict,a_model))
    return Q
def main():
    model = Doc2Vec.load('400_pvdm_doc2vec.d2v')
    model_dbow = Doc2Vec.load('400_pvdbow_doc2vec.d2v')
    # note: mislabeled by mistake; the 'pvdm' file is actually PV-DBOW
    path = 'datasets/'

    files = [f for f in listdir(path) if isfile(join(path,f))]
    files.pop(0)

    data_loader = DataLoader(path)

    domains = data_loader.csv_files


    names = {1: 'title', 4: 'abstract', 5: 'mesh', 'y': 6}

    domain_features = data_loader.get_feature_matrix(names)

    #get size
    n_total_documents = 0

    for domain in domain_features:
        n_total_documents+=len(domain[0])

    all_features = numpy.zeros(shape=(n_total_documents, 800))
    all_labels = numpy.asarray([])
    i = 0

    for domain in domain_features:
        features, labels = domain
        all_labels = numpy.hstack((all_labels, labels))
        for feature_vector in features:
            preprocessed_line = list(preprocess(feature_vector))
            all_features[i, 0:400] = numpy.float_(model.infer_vector(preprocessed_line))
            all_features[i, 400:] = numpy.float_(model_dbow.infer_vector(preprocessed_line))
            i+=1
    all_labels = numpy.asarray(all_labels)
    all_labels[all_labels == -1] = 0
    all_labels = numpy.intc(all_labels)
    train, test = data_loader.create_random_samples(all_features, all_labels)
    train_x, train_y = train
    test_x, test_y = test

    classifier = NeuralNet(n_hidden_units=[200], output_size=2, batch_size=20, n_epochs=200, dropout=True,
                                   activation_function='relu', learning_rate=.3, momentum=True, momentum_term=.5)

    classifier.train(train_x, train_y)
    classifier.test(test_x, test_y)
Example #6
def do():
    global shouldStemData
    global shouldSaveModel
    from os.path import isfile
    from gensim.models import Doc2Vec
    from sys import argv

    if not isfile(modelname):# or (len(argv) > 1 and argv[1] == '--update'):
        parsed = parseData(trainData)
        print 'Begin stemming data'
        parsed = stemData(parsed[:10000])
        if False:
            try:
                print 'Write stemmed data'
                f = open('stemmed_data.csv', 'w')
                f.write('\n'.join(map(lambda x: ' '.join(x), parsed)))
            except Exception:
                print 'Failed to write'
            finally:
                try:
                    f.close()
                except Exception:
                    print ''

        print 'Begin training'
        if False:#len(argv) > 1 and argv[1] == '--update':
            print 'Update model'
            model = Doc2Vec.load(modelname)
            model.train(documents=parsed)
        else:
            model = Doc2Vec(documents=parsed)#, size=100, workers=4, window=5, min_count=5)
        
        if shouldSaveModel:
            print 'Save model'
            model.save(modelname)

    else:
        stemData([])
        model = Doc2Vec.load(modelname)

    print 'Get results'
    t = ''
    try:
        t = getResults(model)
    except Exception:
        for x in model.most_similar(happy):
            print x[0].encode('utf8')
    open('res.txt', 'w').write(t.encode('utf8'))
def transform_input(vectorsize):
    # load the pre-built model saved as amzn.d2v and write its vectors into arrays
    # that can be fed to the scikit-learn algorithms
    print('Loading Doc2Vec model...')
    try:
        model = Doc2Vec.load('./amzn.d2v')
    except Exception as exception:
        print('No existing model found. Starting to create a model...')
        train_size = 50000
        d2v_source(train_size)
        model = create_doc2vec_model(vectorsize)

    # load or generate train and test data
    try:
        with open('train.txt') as f:
            train_raw = np.asarray([line.rstrip('\n') for line in f])
        with open('test.txt') as f:
            test_raw = np.asarray([line.rstrip('\n') for line in f])
        with open('train_target.txt') as f:
            target = np.asarray([int(line.rstrip('\n')) for line in f])
        with open('test_target.txt') as f:
            target_test = np.asarray([int(line.rstrip('\n')) for line in f])
    
    except Exception as exception:
        print('No train data found. Generating new train and test files....')
        train_size = 50000
        test_size = 20000
        review_lines(train_size,test_size)
        with open('train.txt') as f:
            train_raw = np.asarray([line.rstrip('\n') for line in f])
        with open('test.txt') as f:
            test_raw = np.asarray([line.rstrip('\n') for line in f])
        with open('train_target.txt') as f:
            target = np.asarray([int(line.rstrip('\n')) for line in f])

        with open('test_target.txt') as f:
            target_test = np.asarray([int(line.rstrip('\n')) for line in f])

    # infer vectors for the sentences of the train and test sets
    # I do this by creating a list of strings out of the document and then converting that into a vector
    # this takes forever...so for further use, I will only do this for new train and test sets and save the vectors
    try:
         train_arrays = np.loadtxt('train_vectors.txt')
         test_arrays = np.loadtxt('test_vectors.txt')
    except Exception as exception:
    
        train_arrays = np.zeros((target.shape[0],vectorsize))
        test_arrays = np.zeros((target_test.shape[0],vectorsize))

        print('Vectorizing the train and test data...')

        for i in range(target.shape[0]):
            train_arrays[i,:] = model.infer_vector(train_raw[i].split())

        for i in range(target_test.shape[0]):
            test_arrays[i,:] = model.infer_vector(test_raw[i].split())

        np.savetxt('train_vectors.txt',train_arrays)
        np.savetxt('test_vectors.txt',test_arrays)

    return train_arrays, target, test_arrays, target_test
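
# A minimal usage sketch (an addition, not part of the original function): feed the
# cached Doc2Vec vectors returned by transform_input() into a scikit-learn classifier.
# The vector size of 400 is an arbitrary example value.
from sklearn.linear_model import LogisticRegression

train_arrays, target, test_arrays, target_test = transform_input(400)
clf = LogisticRegression(max_iter=1000)
clf.fit(train_arrays, target)
print('Test accuracy:', clf.score(test_arrays, target_test))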
Example #8
def load_or_train(sentences=None,dim=83,epochs=10):
      # Doc2Vec params
      # --------------
      # min_count: keep only words appearing at least this many times
      # window: size of the skip-gram context window
      # size: vector embedding size
      # sample: higher-frequency words are downsampled with this threshold
      # negative: number of noise words used for negative sampling
      # workers: number of worker threads for parallel training
      try:
          print "> Loading model.."
          model = Doc2Vec.load("doc2vec.model")
      except IOError:
          print "> No pretrained model found or loading failed."
          model = Doc2Vec(min_count=1, size=dim, window=10, negative=5, sample=1e-4, workers=7)
          if not sentences:
              print "> No labeled sentences provided. Building them now."
              sentences = labeled_sentences()
 Building vocabulary">
          print "> Building vocabulary.. (this may take a while)"
          train_sentences, test_sentences = sentences.to_array()
          model.build_vocab(train_sentences+test_sentences)
 Training Doc2Vec">
          print "> Training Doc2Vec.. (this may take a while)"
          for i in range(epochs):
              print "--> Epoch %d"%i
              model.train(sentences.permutate())
          model.train_size = sentences.train_size
          model.test_size = sentences.test_size
          model.test_sentences = test_sentences
          model.save('./doc2vec.model')
      return model
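
# Note: the snippet above targets an older gensim API (size=, manual train() loop).
# A rough gensim 4.x equivalent of the training branch is sketched below as an
# approximation, not a drop-in replacement: 'size' became 'vector_size', and train()
# now takes total_examples and epochs explicitly and handles alpha decay itself.
def load_or_train_gensim4(train_sentences, test_sentences, dim=83, epochs=10):
    model = Doc2Vec(min_count=1, vector_size=dim, window=10, negative=5,
                    sample=1e-4, workers=7)
    corpus = train_sentences + test_sentences
    model.build_vocab(corpus)
    model.train(corpus, total_examples=model.corpus_count, epochs=epochs)
    model.save('./doc2vec.model')
    return model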
def get_model():
  try:
    model = Doc2Vec.load(DOC2VEC_MODEL)
    return model
  except:
    print "Model couldn't be loaded"
    return None
def instance_generator(reviews_path, model_path):
    print "Loading model"
    model = Doc2Vec.load(model_path)
    print "Model loaded"
    with gzip.open(reviews_path, 'rt') as file:
        for index, line in enumerate(file):
            review = json.loads(line)
            yield model.infer_vector(review['reviewText'].split()), review['overall']
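
# A minimal usage sketch (file names are hypothetical): materialize the streamed
# (vector, rating) pairs from instance_generator() and fit a classifier. Only sensible
# when the review set is small enough to hold in memory.
import numpy as np
from sklearn.linear_model import LogisticRegression

pairs = list(instance_generator('reviews.json.gz', 'reviews.d2v'))
X = np.array([vec for vec, rating in pairs])
y = np.array([int(rating) for vec, rating in pairs])
clf = LogisticRegression(max_iter=1000).fit(X, y)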
Example #11
def load_embeddings(arg=None):
    if arg == 'zh_tw':  # dim = 400
        model = gensim.models.Word2Vec.load_word2vec_format(get_file_path('cn_word2vec'), binary=False)
    elif arg == 'CVAT':  # dim = 50
        model = gensim.models.Word2Vec.load(get_file_path('wordvecs_CVAT'))
    elif arg == 'IMDb':  # dim = 100
        model = Doc2Vec.load(get_file_path('test_doc2vec_model'))
    elif arg == 'CVAT_docvecs':  # dim = 50
        model = Doc2Vec.load(get_file_path('docvecs_CVAT'))
    elif arg == 'google_news':
        model = gensim.models.Word2Vec.load_word2vec_format(get_file_path('google_news'), binary=True)
    elif arg == 'vader':
        model = gensim.models.Word2Vec.load('./data/vader_wordvecs.w2v')
    else:
        raise Exception('Wrong Argument.')
    print('Load Model Complete.')
    return model
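
# Compatibility note (an addition, not from the original code): gensim 4.x removed
# Word2Vec.load_word2vec_format; word2vec-format files such as the Google News vectors
# are now loaded through KeyedVectors, roughly as sketched here (get_file_path is the
# helper used in the snippet above).
from gensim.models import KeyedVectors

google_news_vectors = KeyedVectors.load_word2vec_format(get_file_path('google_news'),
                                                        binary=True)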
 def __init__(self, filename=None, min_count=1, alpha_initial=0.002,
              alpha_start=0.0005, alpha_end=0.0002, min_iters=10,
              monitor=None):
     Doc2Vec.__init__(self)
     if filename is not None:
         self.load_from_pickle(filename)
     self.checkpoint = {}
     self.filename = filename
     self.min_count = min_count
     self.alpha_initial = alpha_initial
     self.alpha_start = alpha_start
     self.alpha_end = alpha_end
     self.min_iters = min_iters
     if monitor is None:
         monitor = lambda *x: None
     self.monitor = monitor
     assert 'train_lbls' in dir(self)
def puebaSimpleCosenos():
	model = Doc2Vec.load('./imdb_dm.d2v')

	source = 'data/trainneg.txt'
	generador = GeneraVectores(model)
	vecs = generador.getVecsFromFile(source)

	print "coseno primer vector, trainneg"
	print dot(matutils.unitvec(vecs[0]), matutils.unitvec(model.docvecs["TRAIN_NEG_0"]))
Example #14
def load_model(language, models_path, models):
    if check_lang:
        path = models_path.format(language) + models[language]
        print path
        model = Doc2Vec.load(path)
        assert model.docvecs.count > 0
        return model
    else:
        return None
def do_doc2vec(label_tweet, text_tweet):

    # Processing: run Doc2Vec on the set of labeled
    # tweets passed as parameters.

    # Returns: the matrix of row vectors associated with each
    # tweet.
    
    print("-> Doc2Vec...")
    
    documents = [TaggedDocument(words = text.split(),
                             tags = [label]) for (label, text) in zip(label_tweet, text_tweet)]

    model = None

    filename_cache = ('model_nbdocs_' + str(args.amount) +
                          '_dim_' + str(args.dim) +
                          '.doc2vec')
    
    if not os.path.exists(filename_cache):
    
        model = Doc2Vec(documents, size = args.dim,
                    min_count = 1, workers = 4)
    
        model.save(filename_cache)
        
    else:
        model = Doc2Vec.load(filename_cache)
    
    data = None
    
    if args.coeff != 1:
        print("    pondération des #tags : " + str(args.coeff))
    
    if args.tfidf:
        print("    tfidf...")
        data = do_tfidf(text_tweet, model)
    elif args.mean:
        print("    mean...")
        data = do_mean(text_tweet, model, True)
    else:
        print("    sum...")
        data = do_mean(text_tweet, model)
    
    
    print("    ok!")
    
    # gather the label of each tweet
    # together with the corresponding vectors
    
    data = pd.DataFrame(data)
    
    final_data = pd.DataFrame({'id' : label_tweet})
    final_data = pd.concat([final_data, data], axis = 1)
    
    return final_data
    def __init__(self, test_mod = False):

        self.test_mod = test_mod  # daemon operating mode
        self.model = Doc2Vec.load('./document2vector_X.d2v')
        self.urls = [
            # ('^$', self.index),
            # ('^upload_tra$', self.upload_tra),
            # ('^upload_tst$', self.upload_tst),
            # ('^reset_model$', self.reset_model)
        ]
def load_embeddings(arg=None):
    if arg == 'zh_tw':  # dim = 400
        model = gensim.models.Word2Vec.load_word2vec_format(None, binary=False)
    elif arg == 'CVAT':  # dim = 50
        model = gensim.models.Word2Vec.load(None)
    elif arg == 'twitter':  # dim = 50
        model = Doc2Vec.load('./data/acc/docvecs_twitter.d2v')
    else:
        raise Exception('Wrong Argument.')
    print('Load Model Complete.')
    return model
def load_vecs():
    model = Doc2Vec.load('./dbowtweets.d2v')
    train_arrays = numpy.zeros((14640, 100))
    label = ""
    count = 0

    for i in range(1, 14641):
        label = 'SENT_' + str(i)
        train_arrays[count] = model.docvecs[label]
        count += 1

    return train_arrays
Example #19
def predict():
    train = get_reviews('data/imdb/train_data.csv')
    test = get_reviews('data/imdb/test_data.csv')

    model = Doc2Vec.load(model_name)

    train_features = get_features(train, model)
    train_labels = train['sentiment'].as_matrix().reshape((len(train), 1))
    test_features = get_features(test, model)
    test_labels = test['sentiment'].as_matrix().reshape((len(test), 1))

    neural_network(train_features, train_labels, test_features, test_labels)
Example #20
def test_doc2vec_inference_saveload():
    tagged_docs = [TaggedDocument(simple_preprocess(doc), [i])
                   for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, epochs=1, min_count=1, vector_size=10)
    model.save(TEST_FILE)
    del model
    model = Doc2Vec.load(TEST_FILE)
    os.remove(TEST_FILE)
    d2v = Doc2VecInference(model, DEFAULT_ANALYZER)
    match_op = Matching()
    retrieval = Retrieval(d2v, matching=match_op).fit(documents)
    result = retrieval.query("scientists")
    assert result[0] == 1
Example #21
    def _set_params(self, params):
        self.user_factors = params['P']
        self.item_factors = params['Q']
        self.item_bias = params['b_i']
        self.nn_w1 = params['nn_w1']
        self.nn_w2 = params['nn_w2']
        self.global_bias = params['avg_train_rating'] if 'avg_train_rating' in params else None

        if self.movie_to_imdb is None and 'movie_to_imdb' in params:
            self.movie_to_imdb = params['movie_to_imdb']

        if self.d2v_model is None and 'd2v_model' in self.config:
            self.d2v_model = Doc2Vec.load(self.config['d2v_model'])
def load_and_emit_vectors(filename):
    model = Doc2Vec.load('../project_snapshot/imdb.d2v')
    dataset = pickle.load(open('gensim_data.frmt'))
    vecs = []
    i = 0
    with open(filename, 'rb') as csvfile:
        spamreader = csv.reader(csvfile, delimiter = '\t')
        for row in spamreader:
            i += 1
            val = tokenize(row[3].lower().translate(None, string.punctuation))
            x = model.infer_vector(val)
            print i
            vecs.append(x)
    pickle.dump(np.array(vecs), open('doc2vec_features', 'w'))
Example #23
    def _set_params(self, params):
        self.user_factors = params['P']
        self.item_factors = params['Q']
        self.item_bias = params['b_i']
        self.global_bias = params['avg_train_rating'] if 'avg_train_rating' in params else None

        if self.movie_to_imdb is None and 'movie_to_imdb' in params:
            self.movie_to_imdb = params['movie_to_imdb']

        if self.user_pref_model is None and 'user_pref_nn_params' in params:
            self.user_pref_model = UserPrefModel(self.config)
            self.user_pref_model.set_params(params['user_pref_nn_params'])
        if self.d2v_model is None and 'd2v_model' in self.config:
            self.d2v_model = Doc2Vec.load(self.config['d2v_model'])
 def load_from_pickle(self, filename):
     """
     This loads a pretrained Word2Vec file into this Doc2Vec class.
     """
     model_w2v = Doc2Vec.load(filename)
     for attr in dir(model_w2v):
         if attr == '__dict__':
             continue
         # Skip methods that we already have in this class
         if attr in dir(self) and callable(getattr(model_w2v, attr)):
             continue
         try:
             setattr(self, attr, getattr(model_w2v, attr))
         except AttributeError:
             continue
Example #25
	def nearest_neighbour(self, fname):
		"""
		Finds the "n_no" of nearest neighbours for each Query Question and writes it
		in a file "fname" given as a parameter.
		"""
		qout = open(fname, "w")
		model = Doc2Vec.load('my_model.doc2vec')
		for i in range(self.train_size+1, self.q_total):
			j = 0
			qout.write(self.q_actual[i])
			for items in model.docvecs.most_similar(i):
				qout.write("NN %d (%s) --- " % (j+1, items[1]) + self.q_actual[items[0]])
				j = j + 1
		print "Written Successfully in file " + fname + " !!!"
		qout.close()
 def load_from_w2v(self, filename):
     """
     This loads a pretrained Word2Vec file into this Doc2Vec class.
     """
     model_w2v = Doc2Vec.load_word2vec_format(filename, binary=False)
     self._vocab_from = Word2Vec._vocab_from
     self._prepare_sentences = model_w2v._prepare_sentences
     for attr in dir(model_w2v):
         if attr == '__dict__':
             continue
         if attr in dir(self) and callable(getattr(model_w2v, attr)):
             continue
         try:
             setattr(self, attr, getattr(model_w2v, attr))
         except AttributeError:
             continue
Example #27
def create_si_user_model(config, ratings):
    if 'si_user_d2v_model' in config:
        d2v_model = Doc2Vec.load(config['si_user_d2v_model'])
        feature_vec_dict = UserIdToDocVec(d2v_model.docvecs, ratings)
    elif 'si_user_vector_dict' in config:
        feature_vec_dict = dd.io.load(config['si_user_vector_dict'])

    si_user_nn = list(config['si_user_nn_hidden'])
    si_user_nn.insert(0, config['nb_latent_f'])
    si_user_nn.append(int(feature_vec_dict[config['si_user_valid_id']].shape[0]))
    config['si_user_nn'] = si_user_nn
    si_user_model = NNSideInfoModel(config['si_user_nn'],
                                    config['si_user_reg_lambda'],
                                    config['si_user_cosine_lambda'],
                                    feature_vec_dict)
    return si_user_model, config
Example #28
def train_mfnn(config):
    ratings = pd.read_csv(config['ratings_path'])

    config['nb_users'] = len(ratings['user_id'].unique())
    config['nb_movies'] = len(ratings['movie_id'].unique())

    train = pd.read_csv(config['train_path'])
    test = pd.read_csv(config['test_path'])
    val = None
    if config['val']:
        val = pd.read_csv(config['val_path'])

    zero_sampler = None
    if 'zero_sample_factor' in config:
        config['zero_samples_total'] = len(train) * config['zero_sample_factor']
        zero_sampler = ZeroSampler(ratings)

    if config['binarize']:
        train = binarize_ratings(train, pos=config['binarize_pos'], neg=config['binarize_neg'],
                                 threshold=config['binarize_threshold'])
        test = binarize_ratings(test, pos=config['binarize_pos'], neg=config['binarize_neg'],
                                threshold=config['binarize_threshold'])
        if val is not None:
            val = binarize_ratings(val, pos=config['binarize_pos'], neg=config['binarize_neg'],
                                   threshold=config['binarize_threshold'])

    d2v_model = Doc2Vec.load(config['d2v_model'])
    config['nb_d2v_features'] = int(d2v_model.docvecs['107290.txt'].shape[0])

    if config['verbose'] > 0:
        print "experiment: ", config['experiment_name']
        print config

    users, items = create_lookup_tables(ratings)
    movie_to_imdb_dict = movie_to_imdb(ratings)

    if 'theano' in config and config['theano']:
        model = MFNNModel(users, items, config, movie_to_imdb_dict)
        model.user_pref_model = UserPrefModel(config)
    else:
        model = MFNNModelNumpy(users, items, config, movie_to_imdb_dict)

    model.d2v_model = d2v_model
    loss_history = model.fit(train, val=val, test=test, zero_sampler=zero_sampler)

    return model, config, loss_history
Example #29
    def make_model(self, fname):
        if os.path.isfile(fname):
            with Timer("Load model from a file", self.logger):
                self.model = Doc2Vec.load('./imdb.d2v')
                self.dim = self.model.vector_size

        else:
            with Timer("build model from documents", self.logger):
                sentences = LabeledLineSentence(self.vocab_sources)
                model = Doc2Vec(min_count=1, window=10, size=self.dim, sample=1e-4, negative=5, workers=7)
                model.build_vocab(sentences.to_array())

                for epoch in range(50):
                    self.logger.info('Epoch %d' % epoch)
                    model.train(sentences.sentences_perm())

                model.save(fname)
                self.model = model
Example #30
def get_features_by_doc2vec():
    global max_features
    x_train, x_test, y_train, y_test=load_all_files()

    x_train=cleanText(x_train)
    x_test=cleanText(x_test)

    x_train = labelizeReviews(x_train, 'TRAIN')
    x_test = labelizeReviews(x_test, 'TEST')

    x=x_train+x_test
    cores=multiprocessing.cpu_count()
    #models = [
        # PV-DBOW
    #    Doc2Vec(dm=0, dbow_words=1, size=200, window=8, min_count=19, iter=10, workers=cores),
        # PV-DM w/average
    #    Doc2Vec(dm=1, dm_mean=1, size=200, window=8, min_count=19, iter=10, workers=cores),
    #]
    if os.path.exists(doc2ver_bin):
        print "Find cache file %s" % doc2ver_bin
        model=Doc2Vec.load(doc2ver_bin)
    else:
        model=Doc2Vec(dm=0, size=max_features, negative=5, hs=0, min_count=2, workers=cores,iter=60)


        #for model in models:
        #    model.build_vocab(x)
        model.build_vocab(x)

        #models[1].reset_from(models[0])

        #for model in models:
        #    model.train(x, total_examples=model.corpus_count, epochs=model.iter)
        #models[0].train(x, total_examples=model.corpus_count, epochs=model.iter)
        model.train(x, total_examples=model.corpus_count, epochs=model.iter)
        model.save(doc2ver_bin)

    #x_test=getVecs(models[0],x_test,max_features)
    #x_train=getVecs(models[0],x_train,max_features)
    x_test=getVecs(model,x_test,max_features)
    x_train=getVecs(model,x_train,max_features)

    return x_train, x_test, y_train, y_test
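
# A hedged sketch of the two-model variant that the commented-out code above points at:
# train PV-DBOW and PV-DM models on the same corpus and concatenate their vectors per
# document. load_all_files, cleanText, labelizeReviews, getVecs and max_features come
# from the surrounding script; everything else here is an assumption.
def get_features_by_doc2vec_concat():
    import numpy as np
    x_train, x_test, y_train, y_test = load_all_files()
    x_train = labelizeReviews(cleanText(x_train), 'TRAIN')
    x_test = labelizeReviews(cleanText(x_test), 'TEST')
    x = x_train + x_test
    cores = multiprocessing.cpu_count()
    models = [
        # PV-DBOW (also trains word vectors via dbow_words=1)
        Doc2Vec(dm=0, dbow_words=1, size=max_features, negative=5, hs=0,
                min_count=2, workers=cores, iter=10),
        # PV-DM with mean of context vectors
        Doc2Vec(dm=1, dm_mean=1, size=max_features, negative=5, hs=0,
                min_count=2, workers=cores, iter=10),
    ]
    models[0].build_vocab(x)
    models[1].reset_from(models[0])
    for model in models:
        model.train(x, total_examples=len(x), epochs=model.iter)
    x_train_vecs = np.concatenate([getVecs(m, x_train, max_features) for m in models], axis=1)
    x_test_vecs = np.concatenate([getVecs(m, x_test, max_features) for m in models], axis=1)
    return x_train_vecs, x_test_vecs, y_train, y_test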
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from gensim import utils
from time import time

# assumptions: window is 5 words left and right, eliminate words that don't occur in
# more than 10 docs, use 4 workers for a quad-core machine. size is the size of the vector.
# negative=5 implies negative sampling and makes doc2vec faster to train
#model = Doc2Vec(sentence, size=100, window=5, workers=4, min_count=5)

size = 600  #change to 100 and 300 to generate vector with those dimensions

#instantiate our model
model_dm = Doc2Vec(min_count=10,
                   window=5,
                   size=size,
                   sample=1e-3,
                   negative=5,
                   workers=4)

#build vocab over all reviews
model_dm.build_vocab(sentence)

#We pass through the data set multiple times, shuffling the training reviews each time to improve accuracy.
Idx = list(range(len(sentence)))

t0 = time()
for epoch in range(5):
    random.shuffle(Idx)
    perm_sentences = [sentence[i] for i in Idx]
    model_dm.train(perm_sentences)
    print(epoch)
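
# Note (an addition, not part of the original script): with gensim 4.x the manual
# shuffle-and-retrain loop above is usually replaced by a single train() call, which
# iterates over the epochs and decays the learning rate internally; roughly:
model_dm_v4 = Doc2Vec(min_count=10, window=5, vector_size=size, sample=1e-3,
                      negative=5, workers=4)
model_dm_v4.build_vocab(sentence)
model_dm_v4.train(sentence, total_examples=model_dm_v4.corpus_count, epochs=5)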
Example #32
import re
import csv
import nltk as nltk
from nltk.util import ngrams
#from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from contractions import CONTRACTION_MAP
from gensim.models.word2vec import Word2Vec
from gensim.models import Doc2Vec
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import itertools
import pickle
import creds

modelw1v = Word2Vec.load(creds.w2vpath)
model = Doc2Vec.load(creds.d2vpath)
kmeans_model = pickle.load(open(creds.kmodel, 'rb'))

stopwords = [
    "youour", "got", "tho", "im", "u", "ur", 'i', 'me', 'my', 'myself', 'we',
    'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd",
    'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
    'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who',
    'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are',
    'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do',
    'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
    'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with',
    'about', 'against', 'between', 'into', 'through', 'during', 'before',
    'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on',
    'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
Example #33
for i, sent in enumerate(text1):
    text2.append([sent, ['label' + str(i)]])
text3 = []
for sent, label in text2:
    text3.append(TaggedDocument(sent, label))
del text2


def sentences_perm(sentences):
    shuffle(sentences)
    return sentences


model = Doc2Vec(min_count=10,
                window=5,
                vector_size=100,
                sample=1e-4,
                negative=5,
                workers=8)
model.build_vocab(text3)
token_count = sum([len(sentence) for sentence in text3])
model.train(text3, total_examples=len(text3), epochs=10)  # total_examples is the document count, not the token count

model.save('yelp.d2v')
train_arrays = np.zeros((10000, 100))
train_labels = np.zeros(10000)
train_labels = np.array(star)
for i in range(10000):
    train_arrays[i] = model['label' + str(i)]
X_train, X_test, y_train, y_test = train_test_split(train_arrays,
                                                    train_labels,
                                                    test_size=0.2,
Example #34
def load_model(filepath):
    model = Doc2Vec.load(filepath)
    return model
def train(doc_embedding_size, negative_sample_size, epochs):

    print('doc_embedding_size:', doc_embedding_size)
    print('negative_sample_size:', negative_sample_size)
    print('epochs:', epochs)

    logging.getLogger().setLevel(logging.DEBUG)

    all_docs = utils.load('all_docs')
    alldocs = []
    corpus_size = len(all_docs)

    GoogleJobSkillDocument = namedtuple('GoogleJobSkillDocument', 'words tags')

    for i in range(corpus_size):
        words = all_docs[i].title_words
        tags = [i]
        alldocs.append(GoogleJobSkillDocument(words, tags))
    for i in range(corpus_size):
        words = all_docs[i].detail_words
        tags = [i + corpus_size]
        alldocs.append(GoogleJobSkillDocument(words, tags))

    print('docs size:', len(alldocs))

    doc_list = alldocs[:]
    shuffle(doc_list)

    cores = multiprocessing.cpu_count()
    assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

    model = Doc2Vec(dm=0,
                    vector_size=doc_embedding_size,
                    negative=negative_sample_size,
                    hs=0,
                    min_count=2,
                    sample=0,
                    epochs=epochs,
                    workers=cores)

    # Build corpus
    model.build_vocab(alldocs)
    print("%s vocabulary scanned & state initialized" % model)
    print("vocab size:", len(model.wv.vocab))
    print("docvecs size:", len(model.docvecs))

    # Train
    print("Training %s" % model)
    model.train(doc_list, total_examples=len(doc_list), epochs=model.epochs)

    # Save
    title_emb, detail_emb = utils.split_embeddings(model.docvecs, 2)
    doc_emb = utils.concat_embeddings([title_emb, detail_emb])

    title_emb = utils.normalize_embeddings(title_emb)
    detail_emb = utils.normalize_embeddings(detail_emb)
    doc_emb = utils.normalize_embeddings(doc_emb)

    utils.save_doc_embeddings(title_emb,
                              'gensim_dbow_title',
                              negative_size=negative_sample_size)
    utils.save_doc_embeddings(detail_emb,
                              'gensim_dbow_detail',
                              negative_size=negative_sample_size)
    utils.save_doc_embeddings(doc_emb,
                              'gensim_dbow',
                              negative_size=negative_sample_size)
Example #36
from gensim.models import doc2vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
# morphological analysis
import jpype
from konlpy.tag import Kkma

# Read the model from a file; create it if it does not exist.
d2v_faqs = Doc2Vec.load('d2v_faqs.model')
faqs = pd.read_csv('faq.csv')

kkma = Kkma()
filter_kkma = [
    'NNG',  # common noun
    'NNP',  # proper noun
    'OL',  # foreign word
]


def tokenize_kkma(doc):
    jpype.attachThreadToJVM()
    token_doc = ['/'.join(word) for word in kkma.pos(doc)]
    return token_doc


def tokenize_kkma_noun(doc):
    jpype.attachThreadToJVM()
    texte2 = texte2.replace("\n", ' ')
    texte2 = texte2.replace("/", ' ')
    texte2 = texte2.replace("!", ' ')
    texte2 = texte2.replace("?", ' ')
    texte2 = texte2.replace("\"", ' ')
    texte2 = texte2.replace("'", ' ')
    texte2 = texte2.replace("\#", ' ')
    texte2 = texte2.split()
    texte2 = [
        token for token in texte2
        if len(token) and token.lower() not in stopwords
    ]
    return ' '.join(texte2)


model = Doc2Vec.load('tweetmodel.model')


def norme(vec):
    return np.sqrt(np.sum(vec * vec))


def prediction(vec1, vec2):
    val = np.sum((vec1 * vec2))
    val /= (norme(vec1) * norme(vec2))
    if val > 1:
        return np.arccos(
            1
        )  # due to floating-point rounding, the product can come out slightly greater than 1
    else:
        return np.arccos(val)
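
# A small usage sketch (the example sentences are made up): compare two texts by the
# angular distance between their inferred Doc2Vec vectors using prediction() above.
vec_a = model.infer_vector("great movie with a strong cast".split())
vec_b = model.infer_vector("boring film and weak acting".split())
print(prediction(vec_a, vec_b))  # angle in radians; smaller means more similar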
Example #38
    lambda x: cleaner.replace_null_with_empty_string(x))
data['readable_text'] = data['value'].apply(
    lambda x: cleaner.get_readable_text(x))
data['processed_value'] = data['value'].apply(
    lambda x: cleaner.clean_html_and_extract_text(x))

documents = data['processed_value'].tolist()

labeledDocs = []

for i, document in enumerate(documents):
    labeledDocs.append(LabeledSentence(document.split(), "label_" + str(i)))

model = Doc2Vec(min_count=1,
                window=10,
                size=100,
                sample=1e-4,
                negative=5,
                workers=8)
model.build_vocab(labeledDocs)

import random

for epoch in range(10):
    random.shuffle(labeledDocs)
    model.train(labeledDocs,
                total_examples=model.corpus_count,
                epochs=model.iter)

model.save('accounting.d2v')

model = Doc2Vec.load('accounting.d2v')
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from keras.models import model_from_json
import numpy
import os
# random shuffle
from random import shuffle


# classifier
from sklearn.linear_model import LogisticRegression

import logging
import sys

model = Doc2Vec.load('./imdb.d2v')
train_arrays = numpy.zeros((4000, 100))
train_labels = numpy.zeros(4000)

for i in range(2000):
    prefix_train_pos = 'TRAIN_POS_' + str(i)
    prefix_train_neg = 'TRAIN_NEG_' + str(i)
    train_arrays[i] = model.docvecs[prefix_train_pos]
    train_arrays[2000 + i] = model.docvecs[prefix_train_neg]
    train_labels[i] = 1
    train_labels[2000 + i] = 0
print len(train_arrays[0])
#print train_labels

test_arrays = numpy.zeros((798, 100))
test_labels = numpy.zeros(798)
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])

    def to_array(self):
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append(LabeledSentence(
                        utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
        return self.sentences

    def sentences_perm(self):
        shuffle(self.sentences)
        return self.sentences

sources = {'/home/jason/Desktop/word2vec-sentiments/test-neg.txt': 'TEST_NEG',
           '/home/jason/Desktop/word2vec-sentiments/test-pos.txt': 'TEST_POS',
           '/home/jason/Desktop/word2vec-sentiments/train-neg.txt': 'TRAIN_NEG',
           '/home/jason/Desktop/word2vec-sentiments/train-pos.txt': 'TRAIN_POS',
           '/home/jason/Desktop/word2vec-sentiments/train-unsup.txt': 'TRAIN_UNS'}

sentences = LabeledLineSentence(sources)

model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=7)

model.build_vocab(sentences.to_array())

for epoch in range(50):
    logger.info('Epoch %d' % epoch)
    model.train(sentences.sentences_perm())

model.save('./imdb.d2v')

def labelize_data(comments, label):
    result = []
    prefix = label
    for j, t in zip(comments.index, comments):
        result.append(LabeledSentence(t.split(), [prefix + '_%s' % j]))
    return result

all_comments = df['comment']
all_comments_wv = labelize_data(all_comments, 'all')

print(all_comments_wv)

cores = multiprocessing.cpu_count()
model_ug_dbow = Doc2Vec(dm=0, size=100, negative=5, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_ug_dbow.build_vocab([x for x in tqdm(all_comments_wv)])

for epoch in range(30):
    model_ug_dbow.train(utils.shuffle([x for x in tqdm(all_comments_wv)]), total_examples=len(all_comments_wv), epochs=1)
    model_ug_dbow.alpha -= 0.002
    model_ug_dbow.min_alpha = model_ug_dbow.alpha


def get_vectors(model, corpus, size):
    vecs = np.zeros((len(corpus), size))
    n = 0
    for i in corpus.index:
        prefix = 'all_' + str(i)
        vecs[n] = model.docvecs[prefix]
        n += 1
Example #42
import gensim
import numpy as np
from gensim.models import Doc2Vec
import time
import tqdm
import os
import re

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier

print("Loading model of DM para2vec:")
model = Doc2Vec.load('trained_models/para_vectors_dm.d2v')

# Construct the training arrays and labels
train_arrays = np.zeros((25000, 300))
train_labels = np.zeros(25000)
print("Creating training arrays and labels:")
for i in tqdm.trange(12500):
    train_arrays[i] = model['train_pos' + str(i)]
    train_labels[i] = 1
    train_arrays[i + 12500] = model['train_neg' + str(i + 12500)]
    train_labels[i + 12500] = 0

# Construct the test arrays and labels
test_arrays = np.zeros((25000, 300))
test_labels = np.zeros(25000)
print("Creating test arrays and labels:")
Example #43
	def sentences_perm(self):
		shuffle(self.sentences)
		return self.sentences

def trainDoc2Vec():
	pass

if __name__ == '__main__':

	sentences = LabeledLineSentence()
	
	logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

	try:
		print "Trying to load model"
		model = Doc2Vec.load("../models/test_perm_10_epoch.d2v")
	except Exception, e:

		print "Model not found, constructin model with size 300, window 30, alpha 0.025, 5 iterations"
		model = Doc2Vec(min_count=3, window=30, size=300, sample=1e-4, negative=5, alpha=0.025, min_alpha=0.025, workers=4)  # use fixed learning rate
		model.build_vocab(sentences.to_array())

		# model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=4)

		# model.build_vocab(sentences.to_array())

		for epoch in range(5):
			model.train(sentences.sentences_perm())

			# model.train(sentences)
			# model.alpha -= 0.002  # decrease the learning rate
from gensim import corpora

import getDictinary


class genSentence:
    myDict = corpora.dictionary.Dictionary.load_from_text(u"字典.txt")

    def __iter__(self):
        dictSet = set(genSentence.myDict.values())
        for tName in getDictinary.dirIterator_tuple():
            wordsList_s = " ".join(
                codecs.open(tName[0], 'rb', 'utf-8').readlines()).split()
            wordsList_st = [item for item in wordsList_s if item in dictSet]
            documents = TaggedDocument(wordsList_st, [])
            yield documents
            #yield wordsList_st


if __name__ == "__main__":
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    doc2vecModel = Doc2Vec(genSentence(),
                           size=100,
                           window=8,
                           min_count=5,
                           workers=4)

    doc2vecModel.save('doc2vec.model')
Example #45
import numpy as np
from gensim.models import Doc2Vec
from utils import load_config, load_lab_seq_sp, embedding_to_file, labels_to_file

config = load_config("config.json")

dim = config["embedding_dimensions"]
model_sym_path = config["model_sym_path"]
model_SP_path = config["model_sp_path"]
training_data_file = config["training_data"]
embeddings_data_path = config["trained_embeddings_path"]
labels_data_path = config["labels"]

model_sym = Doc2Vec.load(model_sym_path)
model_SP = Doc2Vec.load(model_SP_path)

input_trajectories, input_sp, labels = load_lab_seq_sp(training_data_file)

labels_dict = dict.fromkeys(labels)
num_trajs = len(list(labels_dict))
print("Found {} unique user trajectories".format(num_trajs))

sum_vector = np.zeros(dim, dtype=np.float64)
index = 0
export_labels = []
total_labels = len(labels)

for label in labels:
    if index % 500 == 0:
        print("Evaluating traj {}/{} of user {}".format(index, total_labels, label))
Example #46
 def sent2vec(self):
     self.d2v = Doc2Vec(self.sents,
                        size=self.sent_n,
                        window=8,
                        min_count=5,
                        workers=4)
# classifier
from sklearn.linear_model import LogisticRegression
from gensim.models import Doc2Vec
import numpy
from GeneraVectores import GeneraVectores
from sklearn import svm
from NNet import NeuralNet

if __name__ == '__main__':
    model = Doc2Vec.load('./imdb_dbow.d2v')

    #print model["TRAIN_POS_8029"]
    #exit()
    dim = 100
    train_arrays = numpy.zeros((25000, dim))
    train_labels = numpy.zeros(25000)

    generador = GeneraVectores(model)
    Pos = generador.getVecsFromFile("data/trainpos.txt")
    print "generados vectores Pos"
    Neg = generador.getVecsFromFile("data/trainneg.txt")
    print "generados vectores Neg"

    for i in range(12500):
        train_arrays[i] = Pos[i]
        train_arrays[12500 + i] = Neg[i]
        train_labels[i] = 1
        train_labels[12500 + i] = 0

    test_arrays = numpy.zeros((25000, dim))
    test_labels = numpy.zeros(25000)
Example #48
# distinct file?

from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

simple_models = [
    # PV-DM w/concatenation - window=5 (both sides) approximates paper's 10-word total window size
    Doc2Vec(dm=1,
            dm_concat=1,
            size=100,
            window=5,
            negative=5,
            hs=0,
            min_count=2,
            workers=cores),

    # PV-DBOW
    Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=cores),

    # PV-DBOW w/ word training
    Doc2Vec(dm=0,
            dbow_words=1,
            size=100,
            negative=5,
            hs=0,
            min_count=2,
            workers=cores),
Example #49
def get_trained_model(sentences, dimension=300):
    model = Doc2Vec(documents=sentences, size=dimension)
    return model
Example #50
# load vectorizer
with open(f'{main_path}/data/svc_model/svc_tf_idf_vectorizer.pk',
          'rb') as filename:
    vectorizer = pickle.load(filename)
print(vectorizer)

# load model
with open(f'{main_path}/data/svc_model/svc_model_tf_idf.joblib.pkl',
          'rb') as filename:
    model = joblib.load(filename)
print(model)

# load doc2vec model
fname = get_tmpfile(
    f'{main_path}/data/doc2vec_model/doc2vec_model_final_new.mdl')
doc2vec_model = Doc2Vec.load(fname)
print(doc2vec_model)

with open(f'{main_path}/data/doc2vec_model/logistic_reg_doc2vec.joblib.pkl',
          'rb') as filename:
    doc2vec_log_reg = joblib.load(filename)
print(doc2vec_log_reg)

# Vader instance
sid = SentimentIntensityAnalyzer()
# https://github.com/fnielsen/afinn
afinn = Afinn()


def intention_finder(text):
    '''intentions_list = []
_window = 2
_min_count = 5
_epochs = 10

args = sys.argv
sys.stdout.writelines("# of args: " + str(len(args)))

for arg in args:
    print(arg)

if len(args) >= 2:
    path_to_model = args[1]
    if os.path.isfile(args[1]):
        sys.stdout.writelines("valid file found!")
    else:
        sys.stdout.writelines(args[1] + " is not a file!")

if len(args) == 5:
    _window = args[2]
    _min_count = args[3]
    _epochs = args[4]

model = Doc2Vec.load(path_to_model, mmap=None)
model._clear_post_train()

model.train(documents=model.docvecs,
            total_examples=model.corpus_count,
            epochs=int(_epochs))

model.save(path_to_model)
print("Trained model and saved to " + path_to_model)
Example #52
            userprofile = ' '.join(neg_list[k].return_value_work_exp())
            neg_profile_list.append(userprofile)

        pos_label_list = ['pos_profile_'+ str(k) for k in range(len(pos_profile_list))]
        neg_label_list = ['neg_profile_'+ str(k) for k in range(len(neg_profile_list))]

        # documents = labeled_data_sentence.LabeledLineSentence(pos_profile_list,pos_label_list,neg_profile_list,neg_label_list)
        # model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=8)
        # model.build_vocab(documents.to_array())
        # for epoch in range(20):
        #     model.train(documents.sentences_perm(), total_examples=model.corpus_count, epochs=model.iter)
        #     # model.train(documents.sentences_perm())
        # filename = '/Users/pengyuzhou/Downloads/word_embedding_result/job_title_'+globalparameter.jobtitle_list[i]+'.d2v'
        # model.save(filename)
        # filename = '/Users/pengyuzhou/Downloads/glove.6B/glove.6B.100d.txt.word2vec'
        model = Doc2Vec.load(filename)

        pos_vector_list = []
        neg_vector_list = []
        for x in range(len(pos_list)):
            pos_vector_list.append(model.infer_vector(pos_profile_list[x]))
        for x in range(len(neg_list)):
            neg_vector_list.append(model.infer_vector(neg_profile_list[x]))

        # model = Doc2Vec.load('/Users/pengyuzhou/Downloads/word_embedding_result/job_title_'+globalparameter.jobtitle_list[i]+'.d2v')

        test2 = model.most_similar('software')

        train_arrays = numpy.zeros((500,100))
        train_labels = numpy.zeros(500)
Example #53
def main(corpora, p2v_dir, p2v_file, diag_dir, epoch):

    SentimentDocument = namedtuple('SentimentDocument',
                                   'words tags split sentiment')

    if ('IMDB' in corpora):
        alldocs = []  # will hold all docs in original order
        with open('alldata-id.txt', encoding='utf-8') as alldata:
            for line_no, line in enumerate(alldata):
                tokens = gensim.utils.to_unicode(line).split()
                words = tokens[1:]
                tags = [
                    line_no
                ]  # `tags = [tokens[0]]` would also work at extra memory cost
                split = ['train', 'test', 'extra', 'extra'
                         ][line_no // 25000]  # 25k train, 25k test, 25k extra
                sentiment = [
                    1.0, 0.0, 1.0, 0.0, None, None, None, None
                ][line_no // 12500]  # [12.5K pos, 12.5K neg]*2 then unknown
                alldocs.append(SentimentDocument(words, tags, split,
                                                 sentiment))
        train_docs = [
            ' '.join(doc.words) for doc in alldocs if doc.split == 'train'
        ]
        test_docs = [
            ' '.join(doc.words) for doc in alldocs if doc.split == 'test'
        ]

    elif ('20ng' in corpora):
        train_docs = newsgroups_train.data
        test_docs = newsgroups_test.data

    for column in parameters:
        i = p2v_file.find(column)
        if (i != -1):
            value = p2v_file[i:].split()[1]
            df.set_value(epoch, column, value)
        else:
            df.set_value(epoch, column, default_parameters[column])

    p2v_model = Doc2Vec.load(p2v_dir + p2v_file)
    f = open(p2v_dir + p2v_file + 'test', 'rb')
    p = pickle.load(f)
    if ('IMDB' in corpora):
        dev = 50
        p2v_DocumentVectors0 = np.array([
            p2v_model.docvecs['SENT_' + str(i)]
            for i in range(12000, 12500 - dev)
        ] + [
            p2v_model.docvecs['SENT_' + str(i)]
            for i in range(12500 + dev, 13000)
        ])
        y_1 = [1] * (500 - dev)
        y_0 = [0] * (500 - dev)
        train_labels = y_1 + y_0

        test_labels = [1] * dev + [0] * dev
    else:
        p2v_DocumentVectors0 = np.array([
            p2v_model.docvecs[tag] for tag in p2v_model.docvecs.doctags
            if 'train' in tag
        ])
        test_labels = [p[i][1][0].split()[2] for i in p]
        train_labels = [
            tag.split()[2] for tag in p2v_model.docvecs.doctags
            if 'train' in tag
        ]

    p2v_DocumentVectors1 = np.concatenate([p[i][0].reshape(1, -1) for i in p])

    for classifier in classifiers:
        accuracy, best = Classification(classifier, p2v_DocumentVectors0,
                                        train_labels, p2v_DocumentVectors1,
                                        test_labels)
        #write it all into DataFrame
        df.set_value(epoch, classifier, accuracy)
        df.set_value(epoch, 'best_parameters' + classifier, best)
        df.set_value(epoch, 'epoch', epoch)
        df.to_csv(diag_dir + "Res_PV_IMDB.csv")
        print(accuracy)

class EpochLogger(CallbackAny2Vec):
    '''Callback to log information about training'''
    def __init__(self):
        self.epoch = 0

    def on_epoch_begin(self, model):
        print("Epoch #{} start".format(self.epoch))

    def on_epoch_end(self, model):
        print("Epoch #{} end".format(self.epoch))
        self.epoch += 1
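
# A minimal usage sketch (toy corpus, an assumption): EpochLogger instances are passed
# to train() through the callbacks argument so gensim reports epoch boundaries.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

_toy_corpus = [TaggedDocument(words=['hello', 'world'], tags=[0]),
               TaggedDocument(words=['goodbye', 'cruel', 'world'], tags=[1])]
_toy_model = Doc2Vec(vector_size=10, min_count=1, epochs=3)
_toy_model.build_vocab(_toy_corpus)
_toy_model.train(_toy_corpus, total_examples=_toy_model.corpus_count,
                 epochs=_toy_model.epochs, callbacks=[EpochLogger()])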


d2v_model = Doc2Vec.load("doc2vec_models/doc2vec_07022020_105155.model")

original_vecs = sk_pre.StandardScaler().fit_transform(
    d2v_model.docvecs.vectors_docs)
pca = PCA(n_components=100).fit(original_vecs)

sum_pca_variance = 0.0
pca_variance_threshold = 0
for variance in pca.explained_variance_ratio_:
    sum_pca_variance += variance

    if sum_pca_variance >= 0.90:
        pca_variance_threshold = pca.explained_variance_ratio_.tolist().index(
            variance)
        break
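
# Note (an addition, not part of the original code): scikit-learn can pick the number
# of components for a target explained-variance ratio directly, which replaces the
# manual accumulation loop above.
pca_90 = PCA(n_components=0.90).fit(original_vecs)
print('components kept for 90% variance:', pca_90.n_components_)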
Example #55
# use fit after running LSTM on all inputs.

from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.layers.recurrent import LSTM

from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedLineDocument

print('Loading original doc2vec model...')
doc2vec_model = model = Doc2Vec.load(
    '../doc2vec/small_wiki_subset.en.doc2vec.model')

print('Build sequential model fed by LSTM...')
in_out_neurons = 2
hidden_neurons = 300

model = Sequential()
model.add(LSTM(in_out_neurons, hidden_neurons, return_sequences=False))
model.add(Dense(hidden_neurons, in_out_neurons))
model.add(Activation("linear"))
model.compile(loss="mean_squared_error", optimizer="adam")

print('model compiled!')

print('Training Sequential/LSTM model...')
model.fit(
    doc2vec_model.docvecs,
    None,
    batch_size=batch_size,
Example #56
                           axis=0).reset_index()

# transform all data to labelled doc
pre_char = 100
post_char = 300
lbldoc_all, drop_ind_all = table2lbldoc(df_all_doc2vec, pre_char, post_char)

# for reshuffling
doc_list = lbldoc_all[:]

# generate training drop list and testing drop list from drop_ind_all
drop_ind_train2 = [e for e in drop_ind_all if e < train2.shape[0]]
drop_ind_test2 = [e for e in drop_ind_all if e > train2.shape[0] - 1]

# build model and vocabulary
model = Doc2Vec(dm=1, size=100, window=5, negative=5, hs=0, min_count=2)
model.build_vocab(lbldoc_all)

# train model: decrease learning rate and shuffling
loop = 30
for epoch in range(loop):
    #print(epoch, model.corpus_count, model.iter)
    shuffle(doc_list)  # Shuffling gets better results
    model.train(lbldoc_all,
                total_examples=model.corpus_count,
                epochs=model.iter)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay

model.save('models/model.doc2vec')
Example #57
import numpy as np
import math, sys, gzip, collections, gensim.models.doc2vec
from gensim.models import Doc2Vec
from collections import OrderedDict, namedtuple
import random, unicodedata, re
import datetime

f = '/da4_data/play/api/doc2vecR.200.30.20.5.1518784533.eA.trained'
mod = Doc2Vec.load(f)


def dist(av, bv):
    return (sum(av * bv) / math.sqrt(sum(av * av) * sum(bv * bv)))


#experts in JS
na = 0
f = open("exA.csv", "rb")
for line in f:
    h, m, nc, s, a = line.rstrip().decode('ascii', 'ignore').split(';')
    if a in mod.docvecs:
        av = mod.docvecs[a]
        mv = mod.wv.get_vector(m)
        st = m + ';' + nc + ';' + s + ';' + str(dist(av, mv))
        print(st)
        na += 1
    else:
        sys.stderr.write(a + '\n')

sys.stderr.write(str(na) + '\n')
#print (m +';'+str(dist(av,mv))+';'+ str(mod.wv.most_similar([av])))
Example #58
#!/usr/bin/env python
# -*- coding=utf-8 -*-
import os
import json
import argparse
import numpy as np

from collections import defaultdict
from gensim.models import Doc2Vec
from keras.models import load_model
""" 載入前面訓練好的 models """
d2v = Doc2Vec.load('model/doc2vec_model.d2v')

print 'Loading classifier model ...'
classifier = load_model('model/classifier_model.h5')
print 'Load classifier model success.'

parser = argparse.ArgumentParser(description='doc2vec nn classifier')
parser.add_argument('--filename',
                    dest='filename',
                    default='',
                    help='new movie danmaku data to highlight')
parser.add_argument('--preprocess_script_dirname',
                    dest='preprocess_script_dirname',
                    default='./preprocess-script/',
                    help='path to the folder containing the preprocessing script (preprocess.js)')
parser.add_argument('--processed_data_dirname',
                    dest='processed_data_dirname',
                    default='./processed-data/',
                    help='path to the folder for the preprocessed danmaku data')
Example #59
def predictPE(inputDataLabel, model_type):
    '''predictPE is a massive wrapper to run all iterations of training and testing using some input label.
    (eg, impression or report). A dictionary structure of confusion matrices is returned, include a summed
    and normalized version to represent the success of the entire batch. Note that this function is dependent
    on some of the global variables defined above (not proper, but will work for this batch script :))
    :param model_type: right now I just tried "logistic_regression"
    :param inputDataLabel: should be one of "impression" or "rad_report"
    '''
    confusions = dict()
    count = 1
    for remove_stop_words in [True, False]:
        for remove_non_english_chars in [True]:

            # For each set of params, we can take sum across batches of training and testing
            batchuid = "batch-%s-%s" % (int(remove_stop_words),
                                        int(remove_non_english_chars))
            print("Starting batch %s" % (batchuid))
            batchconfusions = dict()
            summed_confusion = pandas.DataFrame(0,
                                                columns=list(lookup.keys()),
                                                index=list(lookup.keys()))

            for holdout in batches:

                # Separate training and test data
                # Question - is there any reason to split via batches? Bias in this?
                train_set = [x for x in batches if x != holdout]
                test_impression = data[inputDataLabel][data.batch == holdout]
                test_labels = pandas.DataFrame(
                    data['disease_state_label'][data.batch == holdout])
                test_ids = data['order_deid'][data.batch == holdout]
                train_impression = data[inputDataLabel][data.batch.isin(
                    train_set)]
                train_labels = pandas.DataFrame(
                    data['disease_state_label'][data.batch.isin(train_set)])
                train_ids = data['order_deid'][data.batch.isin(train_set)]

                train_labels["CLASS"] = "TRAIN"
                test_labels["CLASS"] = "TEST"
                allIds = train_ids.append(test_ids).tolist()
                allLabels = train_labels.append(test_labels)
                allLabels.index = allIds

                # Compile them together
                allImpression = train_impression.tolist(
                ) + test_impression.tolist()
                # sanity check
                assert (len(allIds) == len(allImpression) == len(allLabels))

                # Make some strings for pretty printing of train/test batch
                training_ids = "|".join([str(int(x)) for x in train_set])
                testing_id = "%s" % (int(holdout))

                # Let's have a unique id so we can merge with whole report data later, eg 'holdout(2)-train(3|4)-stopw(1)-nonengrem(1)'
                uid = "holdout(%s)-train(%s)-rmstopw(%s)-rmnoneng(%s)" % (
                    testing_id,  # holdout id
                    training_ids,  # training ids joined with |
                    int(remove_stop_words),  # 0/1
                    int(remove_non_english_chars))  # 0/1

                print(
                    "RUNNING ANALYSIS %s:\n\ntrain(%s)\ntest(%s)\nrmstopw(%s)\nrmnoneng(%s)"
                    % (count, training_ids, testing_id, remove_stop_words,
                       remove_non_english_chars))

                # Do the training
                words_list = TrainSentences(
                    text_list=allImpression,
                    remove_non_english_chars=remove_non_english_chars,
                    remove_stop_words=remove_stop_words)

                labeledDocs = LabeledLineSentence(words_list=words_list,
                                                  labels_df=allLabels)

                # Build the vocabulary
                model = Doc2Vec(size=size,
                                window=window,
                                min_count=min_count,
                                workers=workers,
                                alpha=alpha,
                                min_alpha=min_alpha)  # use fixed learning rate

                # train_words=True
                # train_lbls=True

                # Build the vocabularity and fine tune the alpha (manually control learning rate over 10 epochs)
                model.build_vocab(labeledDocs)
                for it in range(iters):
                    print("Training iteration %s" % (it))
                    model.train(labeledDocs)
                    model.alpha -= 0.002  # decrease the learning rate
                    model.min_alpha = model.alpha  # fix the learning rate, no decay
                    model.train(labeledDocs)

                # This was done manually during testing, for impressions
                #model.save('data/model.doc2vec')

                # Now let's create an object with data frames with training and testing data and labels
                vecs = get_vectors(model=model,
                                   words_list=words_list,
                                   labels=allLabels)
                # df['all']   ....
                # df['train'] ....
                # df['test']  .... ['labels'] <-- df with columns disease_state_label,CLASS, and index as patid
                #                  ['vectors'] <-- index is also patid

                if model_type == "logistic_regression":
                    confusion = predict_logisticRegression(train=vecs['train'],
                                                           test=vecs['test'])
                count += 1
                batchconfusions[uid] = confusion
                summed_confusion += confusion

            # When we finish a set of holdout/training (a batch), add summed and normalized version
            batchconfusions['sum-%s' % (batchuid)] = summed_confusion
            total_confusions = summed_confusion.sum().sum()
            batchconfusions['norm-%s' %
                            (batchuid)] = summed_confusion / total_confusions
            confusions[batchuid] = batchconfusions

    return confusions
import param
import util


############################ define the evaluation function ############################
def micro_avg_f1(y_true, y_pred):
    return f1_score(y_true, y_pred, average='micro')


############################ load the data ############################
df_all = pd.read_csv(param.data_path + '/output/corpus/all_data.csv',
                     encoding='utf8',
                     nrows=param.train_num)
df_all['penalty'] = df_all['penalty'] - 1

model = Doc2Vec.load(param.data_path + '/output/model/dm_d2v_12w.model')
x_sp = np.array([model.docvecs[i] for i in range(param.train_num)])

############################ dmd2v stack ############################
np.random.seed(param.seed)  # fix the random seed for reproducibility
df_stack = pd.DataFrame(index=range(len(df_all)))
tr_num = param.cv_train_num
num_class = len(pd.value_counts(df_all['penalty']))
n = 5

x = x_sp[:tr_num]
y = df_all['penalty'][:tr_num]
x_te = x_sp[tr_num:]
y_te = df_all['penalty'][tr_num:]

feat = 'dmd2v'