def test_build(Xtrain, ytrain, Xtest, ytest):
    '''
    Load the three varieties of Doc2Vec models that were previously saved.
    Build a random forest model for each Doc2Vec model. Test each random
    forest model with the same test data, and write the results to a CSV
    file for each Doc2Vec model.
    '''
    print "Loading the model..."
    models = [Doc2Vec.load("Doc2Vec_dbow_d300_n5_t4"),
              Doc2Vec.load("Doc2Vec_dm-c_d300_n5_w5_t4"),
              Doc2Vec.load("Doc2Vec_dm-m_d300_n5_w10_t4")]
    filenames = ['Doc2Vec_dbow.csv', 'Doc2Vec_dm-c.csv', 'Doc2Vec_dm-m.csv']

    forests = []
    for model in models:
        forests.append(build_forest(model, Xtrain, ytrain))

    for i in xrange(3):
        model = models[i]
        forest = forests[i]
        filename = filenames[i]

        features = []
        print "Creating feature list for test data..."
        for id in Xtest['id']:
            # remove the extra quotes around the id
            features.append(model.docvecs[id[1:-1]])

        print "Predicting test sentiment..."
        use_forest(forest, features, ytest, filename)
def __init__(self, note_type, model_file, max_notes, dbow_file=None):
    self.note_type = note_type
    self.max_notes = max_notes
    self.model = Doc2Vec.load(model_file)
    if dbow_file:
        self.dbow = Doc2Vec.load(dbow_file)
    else:
        self.dbow = None
def load_model():
    ''' Loading and Building Train and Test Data '''
    # loading labels
    labels = pickle.load(open('labels.p', 'rb'))

    # Using LabelEncoder to convert string to numerical value.
    label_encoder = preprocessing.LabelEncoder()
    transformed_labels = label_encoder.fit_transform(labels)
    transformed_labels = np.array(transformed_labels)
    transformed_labels = label_binarize(transformed_labels,
                                        np.unique(transformed_labels))
    print('Found %d Labels' % len(label_encoder.classes_))
    print('Labels:', label_encoder.classes_)

    # initialising feature array
    cow_arrays = np.zeros((247543, 300))
    # learning model: Distributed Memory model
    model = Doc2Vec.load('./acm_cow.d2v')
    # updating training arrays
    for i in range(247543):
        prefix_train_pos = "SET_" + str(i)
        cow_arrays[i] = model.docvecs[prefix_train_pos]

    train_arrays_cow, test_arrays_cow, train_labels_cow, test_labels_cow = \
        train_test_split(cow_arrays, transformed_labels,
                         test_size=0.1, random_state=42)

    # initialising feature array
    skip_arrays = np.zeros((247543, 300))
    # learning model: Distributed Bag of Words model
    model = Doc2Vec.load('./acm_skip.d2v')
    # updating training arrays
    for i in range(247543):
        prefix_train_pos = "SET_" + str(i)
        skip_arrays[i] = model.docvecs[prefix_train_pos]

    train_arrays_skip, test_arrays_skip, train_labels_skip, test_labels_skip = \
        train_test_split(skip_arrays, transformed_labels,
                         test_size=0.1, random_state=42)

    to_return = (train_arrays_cow, train_labels_cow, test_arrays_cow, test_labels_cow,
                 train_arrays_skip, train_labels_skip, test_arrays_skip, test_labels_skip)
    return to_return
def load_questions(modelname, f_name, mapname, a_modelname):
    model = Doc2Vec.load(modelname)
    a_model = Doc2Vec.load(a_modelname)
    qids = list(enumerate([int(q) for q in open(f_name)]))
    rev_qids = [(item, index) for index, item in qids]
    qid_dict = dict(rev_qids)
    Q = []
    doc_dict = load_doc_hashes(mapname)
    for fname in os.listdir("questions"):
        Q.append(load_question(fname, model.docvecs.doctag_syn0,
                               qid_dict, doc_dict, a_model))
    return Q
def main():
    model = Doc2Vec.load('400_pvdm_doc2vec.d2v')
    model_dbow = Doc2Vec.load('400_pvdbow_doc2vec.d2v')  # mistake: pvdm is actually pv-dbow

    path = 'datasets/'
    files = [f for f in listdir(path) if isfile(join(path, f))]
    files.pop(0)
    data_loader = DataLoader(path)
    domains = data_loader.csv_files
    names = {1: 'title', 4: 'abstract', 5: 'mesh', 'y': 6}
    domain_features = data_loader.get_feature_matrix(names)

    # get size
    n_total_documents = 0
    for domain in domain_features:
        n_total_documents += len(domain[0])

    all_features = numpy.zeros(shape=(n_total_documents, 800))
    all_labels = numpy.asarray([])
    i = 0
    for domain in domain_features:
        features, labels = domain
        all_labels = numpy.hstack((all_labels, labels))
        for feature_vector in features:
            preprocessed_line = list(preprocess(feature_vector))
            all_features[i, 0:400] = numpy.float_(model.infer_vector(preprocessed_line))
            all_features[i, 400:] = numpy.float_(model_dbow.infer_vector(preprocessed_line))
            i += 1

    all_labels = numpy.asarray(all_labels)
    all_labels[all_labels == -1] = 0
    all_labels = numpy.intc(all_labels)

    train, test = data_loader.create_random_samples(all_features, all_labels)
    train_x, train_y = train
    test_x, test_y = test

    classifier = NeuralNet(n_hidden_units=[200], output_size=2, batch_size=20,
                           n_epochs=200, dropout=True, activation_function='relu',
                           learning_rate=.3, momentum=True, momentum_term=.5)
    classifier.train(train_x, train_y)
    classifier.test(test_x, test_y)
def do():
    global shouldStemData
    global shouldSaveModel
    from os.path import isfile
    from gensim.models import Doc2Vec
    from sys import argv

    if not isfile(modelname):  # or (len(argv) > 1 and argv[1] == '--update'):
        parsed = parseData(trainData)
        print 'Begin stemming data'
        parsed = stemData(parsed[:10000])
        if False:
            try:
                print 'Write stemmed data'
                f = open('stemmed_data.csv', 'w')
                f.write('\n'.join(map(lambda x: ' '.join(x), parsed)))
            except Exception:
                print 'Failed to write'
            finally:
                try:
                    f.close()
                except Exception:
                    print ''
        print 'Begin training'
        if False:  # len(argv) > 1 and argv[1] == '--update':
            print 'Update model'
            model = Doc2Vec.load(modelname)
            model.train(documents=parsed)
        else:
            model = Doc2Vec(documents=parsed)  # , size=100, workers=4, window=5, min_count=5)
        if shouldSaveModel:
            print 'Save model'
            model.save(modelname)
    else:
        stemData([])
        model = Doc2Vec.load(modelname)

    print 'Get results'
    t = ''
    try:
        t = getResults(model)
    except Exception:
        for x in model.most_similar(happy):
            print x[0].encode('utf8')
    open('res.txt', 'w').write(t.encode('utf8'))
def transform_input(vectorsize):
    # Load the premade model saved as amzn.d2v and write its vectors into
    # arrays that can be fed into the scikit-learn algorithms.
    print('Loading Doc2Vec model...')
    try:
        model = Doc2Vec.load('./amzn.d2v')
    except Exception as exception:
        print('No existing model found. Starting to create a model...')
        train_size = 50000
        d2v_source(train_size)
        model = create_doc2vec_model(vectorsize)

    # load or generate train and test data
    try:
        with open('train.txt') as f:
            train_raw = np.asarray([line.rstrip('\n') for line in f])
        with open('test.txt') as f:
            test_raw = np.asarray([line.rstrip('\n') for line in f])
        with open('train_target.txt') as f:
            target = np.asarray([int(line.rstrip('\n')) for line in f])
        with open('test_target.txt') as f:
            target_test = np.asarray([int(line.rstrip('\n')) for line in f])
    except Exception as exception:
        print('No train data found. Generating new train and test files....')
        train_size = 50000
        test_size = 20000
        review_lines(train_size, test_size)
        with open('train.txt') as f:
            train_raw = np.asarray([line.rstrip('\n') for line in f])
        with open('test.txt') as f:
            test_raw = np.asarray([line.rstrip('\n') for line in f])
        with open('train_target.txt') as f:
            target = np.asarray([int(line.rstrip('\n')) for line in f])
        with open('test_target.txt') as f:
            target_test = np.asarray([int(line.rstrip('\n')) for line in f])

    # Infer vectors for the sentences of the train and test sets by splitting
    # each document into tokens and converting them into a vector.
    # This takes a long time, so the vectors are only computed for new train
    # and test sets and saved for further use.
    try:
        train_arrays = np.loadtxt('train_vectors.txt')
        test_arrays = np.loadtxt('test_vectors.txt')
    except Exception as exception:
        train_arrays = np.zeros((target.shape[0], vectorsize))
        test_arrays = np.zeros((target_test.shape[0], vectorsize))
        print('Vectorizing the train and test data...')
        for i in range(target.shape[0]):
            train_arrays[i, :] = model.infer_vector(train_raw[i].split())
        for i in range(target_test.shape[0]):
            test_arrays[i, :] = model.infer_vector(test_raw[i].split())
        np.savetxt('train_vectors.txt', train_arrays)
        np.savetxt('test_vectors.txt', test_arrays)

    return train_arrays, target, test_arrays, target_test
def load_or_train(sentences=None, dim=83, epochs=10):
    # Doc2Vec params
    # --------------
    # min_count: ignore words that appear fewer than this many times
    # window: size of the skip-gram context window
    # size: vector embedding size
    # sample: higher-frequency words are downsampled with this threshold
    # negative: number of noise words drawn (negative sampling)
    # workers: parallel processing factor
    try:
        print "> Loading model.."
        model = Doc2Vec.load("doc2vec.model")
    except IOError:
        print "> No pretrained model found or loading failed."
        model = Doc2Vec(min_count=1, size=dim, window=10, negative=5,
                        sample=1e-4, workers=7)
        if not sentences:
            print "> No labeled sentences provided. Building them now."
            sentences = labeled_sentences()
        print "> Building vocabulary.. (this may take awhile)"
        train_sentences, test_sentences = sentences.to_array()
        model.build_vocab(train_sentences + test_sentences)
        print "> Training Doc2Vec.. (this may take awhile)"
        for i in range(epochs):
            print "--> Epoch %d" % i
            model.train(sentences.permutate())
        model.train_size = sentences.train_size
        model.test_size = sentences.test_size
        model.test_sentences = test_sentences
        model.save('./doc2vec.model')
    return model
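# Hedged usage sketch (not from the original file): load_or_train() is assumed to
# return a gensim Doc2Vec model carrying the extra train_size/test_size attributes
# set above. A caller might look up a trained document vector by offset or infer
# one for unseen tokens; names and token lists here are illustrative only.
model = load_or_train(dim=83, epochs=10)
print model.docvecs[0]                          # vector of the first tagged sentence
print model.infer_vector(["new", "document"])   # vector inferred for unseen tokens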
def get_model():
    try:
        model = Doc2Vec.load(DOC2VEC_MODEL)
        return model
    except:
        print "Model couldn't be loaded"
        return None
def instance_generator(reviews_path, model_path):
    print "Loading model"
    model = Doc2Vec.load(model_path)
    print "Model loaded"
    with gzip.open(reviews_path, 'rt') as file:
        for index, line in enumerate(file):
            review = json.loads(line)
            yield model.infer_vector(review['reviewText'].split()), review['overall']
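# Hedged usage sketch (not from the original file): the generator yields
# (vector, rating) pairs, so a caller might materialize them into numpy arrays
# for scikit-learn. The file paths below are placeholders.
import numpy as np

X, y = [], []
for vec, rating in instance_generator('reviews.json.gz', 'reviews.d2v'):
    X.append(vec)
    y.append(rating)
X = np.array(X)
y = np.array(y)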
def load_embeddings(arg=None):
    if arg == 'zh_tw':  # dim = 400
        model = gensim.models.Word2Vec.load_word2vec_format(get_file_path('cn_word2vec'), binary=False)
    elif arg == 'CVAT':  # dim = 50
        model = gensim.models.Word2Vec.load(get_file_path('wordvecs_CVAT'))
    elif arg == 'IMDb':  # dim = 100
        model = Doc2Vec.load(get_file_path('test_doc2vec_model'))
    elif arg == 'CVAT_docvecs':  # dim = 50
        model = Doc2Vec.load(get_file_path('docvecs_CVAT'))
    elif arg == 'google_news':
        model = gensim.models.Word2Vec.load_word2vec_format(get_file_path('google_news'), binary=True)
    elif arg == 'vader':
        model = gensim.models.Word2Vec.load('./data/vader_wordvecs.w2v')
    else:
        raise Exception('Wrong Argument.')
    print('Load Model Complete.')
    return model
def __init__(self, filename=None, min_count=1, alpha_initial=0.002,
             alpha_start=0.0005, alpha_end=0.0002, min_iters=10, monitor=None):
    Doc2Vec.__init__(self)
    if filename is not None:
        self.load_from_pickle(filename)
    self.checkpoint = {}
    self.filename = filename
    self.min_count = min_count
    self.alpha_initial = alpha_initial
    self.alpha_start = alpha_start
    self.alpha_end = alpha_end
    self.min_iters = min_iters
    if monitor is None:
        monitor = lambda *x: None
    self.monitor = monitor
    assert 'train_lbls' in dir(self)
def puebaSimpleCosenos():
    model = Doc2Vec.load('./imdb_dm.d2v')
    source = 'data/trainneg.txt'
    generador = GeneraVectores(model)
    vecs = generador.getVecsFromFile(source)
    print "cosine of the first vector, trainneg"
    print dot(matutils.unitvec(vecs[0]),
              matutils.unitvec(model.docvecs["TRAIN_NEG_0"]))
def load_model(language, models_path, models):
    if check_lang:
        path = models_path.format(language) + models[language]
        print path
        model = Doc2Vec.load(path)
        assert model.docvecs.count > 0
        return model
    else:
        return None
def do_doc2vec(label_tweet, text_tweet):
    # Processing: runs Doc2Vec on all of the labelled tweets passed as parameters.
    # Returns: the matrix of row vectors associated with each tweet.
    print("-> Doc2Vec...")
    documents = [TaggedDocument(words=text.split(), tags=[label])
                 for (label, text) in zip(label_tweet, text_tweet)]
    model = None
    filename_cache = ('model_nbdocs_' + str(args.amount) +
                      '_dim_' + str(args.dim) + '.doc2vec')
    if not os.path.exists(filename_cache):
        model = Doc2Vec(documents, size=args.dim, min_count=1, workers=4)
        model.save(filename_cache)
    else:
        model = Doc2Vec.load(filename_cache)

    data = None
    if args.coeff != 1:
        print("    #tag weighting: " + str(args.coeff))
    if args.tfidf:
        print("    tfidf...")
        data = do_tfidf(text_tweet, model)
    elif args.mean:
        print("    mean...")
        data = do_mean(text_tweet, model, True)
    else:
        print("    sum...")
        data = do_mean(text_tweet, model)
    print("    ok!")

    # gather the labels of each tweet together with the corresponding vectors
    data = pd.DataFrame(data)
    final_data = pd.DataFrame({'id': label_tweet})
    final_data = pd.concat([final_data, data], axis=1)
    return final_data
def __init__(self, test_mod=False):
    self.test_mod = test_mod  # daemon operating mode
    self.model = Doc2Vec.load('./document2vector_X.d2v')
    self.urls = [
        # ('^$', self.index),
        # ('^upload_tra$', self.upload_tra),
        # ('^upload_tst$', self.upload_tst),
        # ('^reset_model$', self.reset_model)
    ]
def load_embeddings(arg=None):
    if arg == 'zh_tw':  # dim = 400
        model = gensim.models.Word2Vec.load_word2vec_format(None, binary=False)
    elif arg == 'CVAT':  # dim = 50
        model = gensim.models.Word2Vec.load(None)
    elif arg == 'twitter':  # dim = 50
        model = Doc2Vec.load('./data/acc/docvecs_twitter.d2v')
    else:
        raise Exception('Wrong Argument.')
    print('Load Model Complete.')
    return model
def load_vecs():
    model = Doc2Vec.load('./dbowtweets.d2v')
    train_arrays = numpy.zeros((14640, 100))
    label = ""
    count = 0
    for i in range(1, 14641):
        label = 'SENT_' + str(i)
        train_arrays[count] = model.docvecs[label]
        count += 1
    return train_arrays
def predict():
    train = get_reviews('data/imdb/train_data.csv')
    test = get_reviews('data/imdb/test_data.csv')

    model = Doc2Vec.load(model_name)
    train_features = get_features(train, model)
    train_labels = train['sentiment'].as_matrix().reshape((len(train), 1))
    test_features = get_features(test, model)
    test_labels = test['sentiment'].as_matrix().reshape((len(test), 1))

    neural_network(train_features, train_labels, test_features, test_labels)
def test_doc2vec_inference_saveload():
    tagged_docs = [TaggedDocument(simple_preprocess(doc), [i])
                   for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, epochs=1, min_count=1, vector_size=10)
    model.save(TEST_FILE)
    del model
    model = Doc2Vec.load(TEST_FILE)
    os.remove(TEST_FILE)
    d2v = Doc2VecInference(model, DEFAULT_ANALYZER)
    match_op = Matching()
    retrieval = Retrieval(d2v, matching=match_op).fit(documents)
    result = retrieval.query("scientists")
    assert result[0] == 1
def _set_params(self, params):
    self.user_factors = params['P']
    self.item_factors = params['Q']
    self.item_bias = params['b_i']
    self.nn_w1 = params['nn_w1']
    self.nn_w2 = params['nn_w2']
    self.global_bias = params['avg_train_rating'] if 'avg_train_rating' in params else None
    if self.movie_to_imdb is None and 'movie_to_imdb' in params:
        self.movie_to_imdb = params['movie_to_imdb']
    if self.d2v_model is None and 'd2v_model' in self.config:
        self.d2v_model = Doc2Vec.load(self.config['d2v_model'])
def load_and_emit_vectors(filename):
    model = Doc2Vec.load('../project_snapshot/imdb.d2v')
    dataset = pickle.load(open('gensim_data.frmt'))
    vecs = []
    i = 0
    with open(filename, 'rb') as csvfile:
        spamreader = csv.reader(csvfile, delimiter='\t')
        for row in spamreader:
            i += 1
            val = tokenize(row[3].lower().translate(None, string.punctuation))
            x = model.infer_vector(val)
            print i
            vecs.append(x)
    pickle.dump(np.array(vecs), open('doc2vec_features', 'w'))
def _set_params(self, params):
    self.user_factors = params['P']
    self.item_factors = params['Q']
    self.item_bias = params['b_i']
    self.global_bias = params['avg_train_rating'] if 'avg_train_rating' in params else None
    if self.movie_to_imdb is None and 'movie_to_imdb' in params:
        self.movie_to_imdb = params['movie_to_imdb']
    if self.user_pref_model is None and 'user_pref_nn_params' in params:
        self.user_pref_model = UserPrefModel(self.config)
        self.user_pref_model.set_params(params['user_pref_nn_params'])
    if self.d2v_model is None and 'd2v_model' in self.config:
        self.d2v_model = Doc2Vec.load(self.config['d2v_model'])
def load_from_pickle(self, filename):
    """
    This loads a pretrained Doc2Vec model into this Doc2Vec class.
    """
    model_w2v = Doc2Vec.load(filename)
    for attr in dir(model_w2v):
        if attr == '__dict__':
            continue
        # Skip methods that we already have in this class
        if attr in dir(self) and callable(getattr(model_w2v, attr)):
            continue
        try:
            setattr(self, attr, getattr(model_w2v, attr))
        except AttributeError:
            continue
def nearest_neighbour(self, fname):
    """
    Finds the "n_no" nearest neighbours for each query question and writes
    them to the file "fname" given as a parameter.
    """
    qout = open(fname, "w")
    model = Doc2Vec.load('my_model.doc2vec')
    for i in range(self.train_size + 1, self.q_total):
        j = 0
        qout.write(self.q_actual[i])
        for items in model.docvecs.most_similar(i):
            qout.write("NN %d (%s) --- " % (j + 1, items[1]) + self.q_actual[items[0]])
            j = j + 1
    print "Written Successfully in file " + fname + " !!!"
    qout.close()
def load_from_w2v(self, filename):
    """
    This loads a pretrained Word2Vec file into this Doc2Vec class.
    """
    model_w2v = Doc2Vec.load_word2vec_format(filename, binary=False)
    self._vocab_from = Word2Vec._vocab_from
    self._prepare_sentences = model_w2v._prepare_sentences
    for attr in dir(model_w2v):
        if attr == '__dict__':
            continue
        if attr in dir(self) and callable(getattr(model_w2v, attr)):
            continue
        try:
            setattr(self, attr, getattr(model_w2v, attr))
        except AttributeError:
            continue
def create_si_user_model(config, ratings):
    if 'si_user_d2v_model' in config:
        d2v_model = Doc2Vec.load(config['si_user_d2v_model'])
        feature_vec_dict = UserIdToDocVec(d2v_model.docvecs, ratings)
    elif 'si_user_vector_dict' in config:
        feature_vec_dict = dd.io.load(config['si_user_vector_dict'])

    si_user_nn = list(config['si_user_nn_hidden'])
    si_user_nn.insert(0, config['nb_latent_f'])
    si_user_nn.append(int(feature_vec_dict[config['si_user_valid_id']].shape[0]))
    config['si_user_nn'] = si_user_nn

    si_user_model = NNSideInfoModel(config['si_user_nn'],
                                    config['si_user_reg_lambda'],
                                    config['si_user_cosine_lambda'],
                                    feature_vec_dict)
    return si_user_model, config
def train_mfnn(config):
    ratings = pd.read_csv(config['ratings_path'])
    config['nb_users'] = len(ratings['user_id'].unique())
    config['nb_movies'] = len(ratings['movie_id'].unique())

    train = pd.read_csv(config['train_path'])
    test = pd.read_csv(config['test_path'])
    val = None
    if config['val']:
        val = pd.read_csv(config['val_path'])

    zero_sampler = None
    if 'zero_sample_factor' in config:
        config['zero_samples_total'] = len(train) * config['zero_sample_factor']
        zero_sampler = ZeroSampler(ratings)

    if config['binarize']:
        train = binarize_ratings(train, pos=config['binarize_pos'], neg=config['binarize_neg'],
                                 threshold=config['binarize_threshold'])
        test = binarize_ratings(test, pos=config['binarize_pos'], neg=config['binarize_neg'],
                                threshold=config['binarize_threshold'])
        if val is not None:
            val = binarize_ratings(val, pos=config['binarize_pos'], neg=config['binarize_neg'],
                                   threshold=config['binarize_threshold'])

    d2v_model = Doc2Vec.load(config['d2v_model'])
    config['nb_d2v_features'] = int(d2v_model.docvecs['107290.txt'].shape[0])

    if config['verbose'] > 0:
        print "experiment: ", config['experiment_name']
        print config

    users, items = create_lookup_tables(ratings)
    movie_to_imdb_dict = movie_to_imdb(ratings)

    if 'theano' in config and config['theano']:
        model = MFNNModel(users, items, config, movie_to_imdb_dict)
        model.user_pref_model = UserPrefModel(config)
    else:
        model = MFNNModelNumpy(users, items, config, movie_to_imdb_dict)

    model.d2v_model = d2v_model
    loss_history = model.fit(train, val=val, test=test, zero_sampler=zero_sampler)
    return model, config, loss_history
def make_model(self, fname):
    if os.path.isfile(fname):
        with Timer("Load model from a file", self.logger):
            self.model = Doc2Vec.load('./imdb.d2v')
            self.dim = self.model.vector_size
    else:
        with Timer("build model from documents", self.logger):
            sentences = LabeledLineSentence(self.vocab_sources)
            model = Doc2Vec(min_count=1, window=10, size=self.dim,
                            sample=1e-4, negative=5, workers=7)
            model.build_vocab(sentences.to_array())
            for epoch in range(50):
                self.logger.info('Epoch %d' % epoch)
                model.train(sentences.sentences_perm())
            model.save(fname)
            self.model = model
def get_features_by_doc2vec():
    global max_features
    x_train, x_test, y_train, y_test = load_all_files()

    x_train = cleanText(x_train)
    x_test = cleanText(x_test)

    x_train = labelizeReviews(x_train, 'TRAIN')
    x_test = labelizeReviews(x_test, 'TEST')

    x = x_train + x_test
    cores = multiprocessing.cpu_count()

    #models = [
    #    # PV-DBOW
    #    Doc2Vec(dm=0, dbow_words=1, size=200, window=8, min_count=19, iter=10, workers=cores),
    #    # PV-DM w/average
    #    Doc2Vec(dm=1, dm_mean=1, size=200, window=8, min_count=19, iter=10, workers=cores),
    #]

    if os.path.exists(doc2ver_bin):
        print "Find cache file %s" % doc2ver_bin
        model = Doc2Vec.load(doc2ver_bin)
    else:
        model = Doc2Vec(dm=0, size=max_features, negative=5, hs=0,
                        min_count=2, workers=cores, iter=60)
        #for model in models:
        #    model.build_vocab(x)
        model.build_vocab(x)
        #models[1].reset_from(models[0])
        #for model in models:
        #    model.train(x, total_examples=model.corpus_count, epochs=model.iter)
        #models[0].train(x, total_examples=model.corpus_count, epochs=model.iter)
        model.train(x, total_examples=model.corpus_count, epochs=model.iter)
        model.save(doc2ver_bin)

    #x_test=getVecs(models[0], x_test, max_features)
    #x_train=getVecs(models[0], x_train, max_features)
    x_test = getVecs(model, x_test, max_features)
    x_train = getVecs(model, x_train, max_features)

    return x_train, x_test, y_train, y_test
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from gensim import utils
from time import time
import random

# Assumptions: the window is 5 words left and right, words that don't occur in
# more than 10 docs are eliminated, and 4 workers are used for a quad-core
# machine. `size` is the size of the vector; negative=5 enables negative
# sampling, which makes Doc2Vec faster to train.
#model = Doc2Vec(sentence, size=100, window=5, workers=4, min_count=5)

size = 600  # change to 100 or 300 to generate vectors with those dimensions

# instantiate our model
model_dm = Doc2Vec(min_count=10, window=5, size=size, sample=1e-3,
                   negative=5, workers=4)

# build vocab over all reviews
model_dm.build_vocab(sentence)

# We pass through the data set multiple times, shuffling the training reviews
# each time to improve accuracy.
Idx = list(range(len(sentence)))
t0 = time()
for epoch in range(5):
    random.shuffle(Idx)
    perm_sentences = [sentence[i] for i in Idx]
    model_dm.train(perm_sentences)
    print(epoch)
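# Hedged sketch (not part of the script above): once model_dm is trained, the
# per-document vectors can be read back by tag, or inferred for unseen text.
# `sentence` is assumed to hold tagged documents whose tags look like
# 'SENT_0', 'SENT_1', ...; adjust to the real tag scheme.
first_vec = model_dm.docvecs['SENT_0']                # vector for the first tagged review
new_vec = model_dm.infer_vector(['great', 'movie'])   # vector for unseen tokens
print(first_vec.shape, new_vec.shape)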
import re
import csv
import nltk as nltk
from nltk.util import ngrams
#from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from contractions import CONTRACTION_MAP
from gensim.models.word2vec import Word2Vec
from gensim.models import Doc2Vec
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import itertools
import pickle
import creds

modelw1v = Word2Vec.load(creds.w2vpath)
model = Doc2Vec.load(creds.d2vpath)
kmeans_model = pickle.load(open(creds.kmodel, 'rb'))

stopwords = [
    "youour", "got", "tho", "im", "u", "ur",
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', "you're", "you've", "you'll", "you'd", 'your', 'yours',
    'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
    'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves',
    'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
    'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
    'through', 'during', 'before', 'after', 'above', 'below',
    'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
    'again', 'further', 'then', 'once', 'here',
for i, sent in enumerate(text1):
    text2.append([sent, ['label' + str(i)]])

text3 = []
for sent, label in text2:
    text3.append(TaggedDocument(sent, label))
del text2


def sentences_perm(sentences):
    shuffle(sentences)
    return sentences


model = Doc2Vec(min_count=10, window=5, vector_size=100, sample=1e-4,
                negative=5, workers=8)
model.build_vocab(text3)

token_count = sum([len(sentence) for sentence in text3])
model.train(text3, total_examples=token_count, epochs=10)
model.save('yelp.d2v')

train_arrays = np.zeros((10000, 100))
train_labels = np.zeros(10000)
train_labels = np.array(star)

for i in range(10000):
    # look up each document vector by its tag
    train_arrays[i] = model.docvecs['label' + str(i)]

X_train, X_test, y_train, y_test = train_test_split(train_arrays, train_labels,
                                                     test_size=0.2,
def load_model(filepath):
    model = Doc2Vec.load(filepath)
    return model
def train(doc_embedding_size, negative_sample_size, epochs):
    print('doc_embedding_size:', doc_embedding_size)
    print('negative_sample_size:', negative_sample_size)
    print('epochs:', epochs)

    logging.getLogger().setLevel(logging.DEBUG)

    all_docs = utils.load('all_docs')
    alldocs = []
    corpus_size = len(all_docs)
    GoogleJobSkillDocument = namedtuple('GoogleJobSkillDocument', 'words tags')
    for i in range(corpus_size):
        words = all_docs[i].title_words
        tags = [i]
        alldocs.append(GoogleJobSkillDocument(words, tags))
    for i in range(corpus_size):
        words = all_docs[i].detail_words
        tags = [i + corpus_size]
        alldocs.append(GoogleJobSkillDocument(words, tags))
    print('docs size:', len(alldocs))

    doc_list = alldocs[:]
    shuffle(doc_list)

    cores = multiprocessing.cpu_count()
    assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

    model = Doc2Vec(dm=0, vector_size=doc_embedding_size, negative=negative_sample_size,
                    hs=0, min_count=2, sample=0, epochs=epochs, workers=cores)

    # Build corpus
    model.build_vocab(alldocs)
    print("%s vocabulary scanned & state initialized" % model)
    print("vocab size:", len(model.wv.vocab))
    print("docvecs size:", len(model.docvecs))

    # Train
    print("Training %s" % model)
    model.train(doc_list, total_examples=len(doc_list), epochs=model.epochs)

    # Save
    title_emb, detail_emb = utils.split_embeddings(model.docvecs, 2)
    doc_emb = utils.concat_embeddings([title_emb, detail_emb])
    title_emb = utils.normalize_embeddings(title_emb)
    detail_emb = utils.normalize_embeddings(detail_emb)
    doc_emb = utils.normalize_embeddings(doc_emb)
    utils.save_doc_embeddings(title_emb, 'gensim_dbow_title', negative_size=negative_sample_size)
    utils.save_doc_embeddings(detail_emb, 'gensim_dbow_detail', negative_size=negative_sample_size)
    utils.save_doc_embeddings(doc_emb, 'gensim_dbow', negative_size=negative_sample_size)
from gensim.models import doc2vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk

# morphological analysis
import jpype
from konlpy.tag import Kkma

# Load the model from a file; it is created elsewhere if it does not exist.
d2v_faqs = Doc2Vec.load('d2v_faqs.model')

faqs = pd.read_csv('faq.csv')

kkma = Kkma()
filter_kkma = [
    'NNG',  # common noun
    'NNP',  # proper noun
    'OL',   # foreign word
]


def tokenize_kkma(doc):
    jpype.attachThreadToJVM()
    token_doc = ['/'.join(word) for word in kkma.pos(doc)]
    return token_doc


def tokenize_kkma_noun(doc):
    jpype.attachThreadToJVM()
    texte2 = texte2.replace("\n", ' ')
    texte2 = texte2.replace("/", ' ')
    texte2 = texte2.replace("!", ' ')
    texte2 = texte2.replace("?", ' ')
    texte2 = texte2.replace("\"", ' ')
    texte2 = texte2.replace("'", ' ')
    texte2 = texte2.replace("\#", ' ')
    texte2 = texte2.split()
    texte2 = [token for token in texte2
              if len(token) and token.lower() not in stopwords]
    return ' '.join(texte2)


model = Doc2Vec.load('tweetmodel.model')


def norme(vec):
    return np.sqrt(np.sum(vec * vec))


def prediction(vec1, vec2):
    val = np.sum(vec1 * vec2)
    val /= (norme(vec1) * norme(vec2))
    if val > 1:
        # floating-point rounding can push the result slightly above 1
        return np.arccos(1)
    else:
        return np.arccos(val)
    lambda x: cleaner.replace_null_with_empty_string(x))
data['readable_text'] = data['value'].apply(
    lambda x: cleaner.get_readable_text(x))
data['processed_value'] = data['value'].apply(
    lambda x: cleaner.clean_html_and_extract_text(x))

documents = data['processed_value'].tolist()

labeledDocs = []
for i, document in enumerate(documents):
    # tags are expected as a list, not a bare string
    labeledDocs.append(LabeledSentence(document.split(), ["label_" + str(i)]))

model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=8)
model.build_vocab(labeledDocs)

import random
for epoch in range(10):
    random.shuffle(labeledDocs)
    model.train(labeledDocs, total_examples=model.corpus_count, epochs=model.iter)

model.save('accounting.d2v')
model = Doc2Vec.load('accounting.d2v')
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from keras.models import model_from_json
import numpy
import os
# random shuffle
from random import shuffle
# classifier
from sklearn.linear_model import LogisticRegression
import logging
import sys

model = Doc2Vec.load('./imdb.d2v')

train_arrays = numpy.zeros((4000, 100))
train_labels = numpy.zeros(4000)

for i in range(2000):
    prefix_train_pos = 'TRAIN_POS_' + str(i)
    prefix_train_neg = 'TRAIN_NEG_' + str(i)
    train_arrays[i] = model.docvecs[prefix_train_pos]
    train_arrays[2000 + i] = model.docvecs[prefix_train_neg]
    train_labels[i] = 1
    train_labels[2000 + i] = 0

print len(train_arrays[0])
#print train_labels

test_arrays = numpy.zeros((798, 100))
test_labels = numpy.zeros(798)
        with utils.smart_open(source) as fin:
            for item_no, line in enumerate(fin):
                yield LabeledSentence(utils.to_unicode(line).split(),
                                      [prefix + '_%s' % item_no])

    def to_array(self):
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append(LabeledSentence(
                        utils.to_unicode(line).split(),
                        [prefix + '_%s' % item_no]))
        return self.sentences

    def sentences_perm(self):
        shuffle(self.sentences)
        return self.sentences


sources = {'/home/jason/Desktop/word2vec-sentiments/test-neg.txt': 'TEST_NEG',
           '/home/jason/Desktop/word2vec-sentiments/test-pos.txt': 'TEST_POS',
           '/home/jason/Desktop/word2vec-sentiments/train-neg.txt': 'TRAIN_NEG',
           '/home/jason/Desktop/word2vec-sentiments/train-pos.txt': 'TRAIN_POS',
           '/home/jason/Desktop/word2vec-sentiments/train-unsup.txt': 'TRAIN_UNS'}

sentences = LabeledLineSentence(sources)

model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=7)
model.build_vocab(sentences.to_array())

for epoch in range(50):
    logger.info('Epoch %d' % epoch)
    model.train(sentences.sentences_perm())

model.save('./imdb.d2v')
def labelize_data(comments, label):
    result = []
    prefix = label
    for j, t in zip(comments.index, comments):
        result.append(LabeledSentence(t.split(), [prefix + '_%s' % j]))
    return result


all_comments = df['comment']
all_comments_wv = labelize_data(all_comments, 'all')
print(all_comments_wv)

cores = multiprocessing.cpu_count()
model_ug_dbow = Doc2Vec(dm=0, size=100, negative=5, min_count=2,
                        workers=cores, alpha=0.065, min_alpha=0.065)
model_ug_dbow.build_vocab([x for x in tqdm(all_comments_wv)])

for epoch in range(30):
    model_ug_dbow.train(utils.shuffle([x for x in tqdm(all_comments_wv)]),
                        total_examples=len(all_comments_wv), epochs=1)
    model_ug_dbow.alpha -= 0.002
    model_ug_dbow.min_alpha = model_ug_dbow.alpha


def get_vectors(model, corpus, size):
    vecs = np.zeros((len(corpus), size))
    n = 0
    for i in corpus.index:
        prefix = 'all_' + str(i)
        vecs[n] = model.docvecs[prefix]
        n += 1
import gensim
import numpy as np
from gensim.models import Doc2Vec
import time
import tqdm
import os
import re
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier

print("Loading model of DM para2vec:")
model = Doc2Vec.load('trained_models/para_vectors_dm.d2v')

# Construct the training arrays and labels
train_arrays = np.zeros((25000, 300))
train_labels = np.zeros(25000)

print("Creating training arrays and labels:")
for i in tqdm.trange(12500):
    train_arrays[i] = model['train_pos' + str(i)]
    train_labels[i] = 1
    train_arrays[i + 12500] = model['train_neg' + str(i + 12500)]
    train_labels[i + 12500] = 0

# Construct the test arrays and labels
test_arrays = np.zeros((25000, 300))
test_labels = np.zeros(25000)

print("Creating test arrays and labels:")
    def sentences_perm(self):
        shuffle(self.sentences)
        return self.sentences


def trainDoc2Vec():
    pass


if __name__ == '__main__':
    sentences = LabeledLineSentence()
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    try:
        print "Trying to load model"
        model = Doc2Vec.load("../models/test_perm_10_epoch.d2v")
    except Exception, e:
        print "Model not found, constructing model with size 300, window 30, alpha 0.025, 5 iterations"
        model = Doc2Vec(min_count=3, window=30, size=300, sample=1e-4, negative=5,
                        alpha=0.025, min_alpha=0.025, workers=4)  # use fixed learning rate
        model.build_vocab(sentences.to_array())
        # model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=4)
        # model.build_vocab(sentences.to_array())
        for epoch in range(5):
            model.train(sentences.sentences_perm())
            # model.train(sentences)
            # model.alpha -= 0.002  # decrease the learning rate
from gensim import corpora
import getDictinary


class genSentence:
    myDict = corpora.dictionary.Dictionary.load_from_text(u"字典.txt")

    def __iter__(self):
        dictSet = set(genSentence.myDict.values())
        for tName in getDictinary.dirIterator_tuple():
            wordsList_s = " ".join(
                codecs.open(tName[0], 'rb', 'utf-8').readlines()).split()
            wordsList_st = [item for item in wordsList_s if item in dictSet]
            documents = TaggedDocument(wordsList_st, [])
            yield documents
            #yield wordsList_st


if __name__ == "__main__":
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    doc2vecModel = Doc2Vec(genSentence(), size=100, window=8, min_count=5, workers=4)
    doc2vecModel.save('doc2vec.model')
import numpy as np
from gensim.models import Doc2Vec
from utils import load_config, load_lab_seq_sp, embedding_to_file, labels_to_file

config = load_config("config.json")
dim = config["embedding_dimensions"]
model_sym_path = config["model_sym_path"]
model_SP_path = config["model_sp_path"]
training_data_file = config["training_data"]
embeddings_data_path = config["trained_embeddings_path"]
labels_data_path = config["labels"]

model_sym = Doc2Vec.load(model_sym_path)
model_SP = Doc2Vec.load(model_SP_path)

input_trajectories, input_sp, labels = load_lab_seq_sp(training_data_file)

labels_dict = dict.fromkeys(labels)
num_trajs = len(list(labels_dict))
print("Found {} unique user trajectories".format(num_trajs))

sum_vector = np.zeros(dim, dtype=np.float64)
index = 0
export_labels = []
total_labels = len(labels)

for label in labels:
    if index % 500 == 0:
        print("Evaluating traj {}/{} of user {}".format(index, total_labels, label))
def sent2vec(self):
    self.d2v = Doc2Vec(self.sents, size=self.sent_n, window=8,
                       min_count=5, workers=4)
# classifier
from sklearn.linear_model import LogisticRegression
from gensim.models import Doc2Vec
import numpy
from GeneraVectores import GeneraVectores
from sklearn import svm
from NNet import NeuralNet

if __name__ == '__main__':
    model = Doc2Vec.load('./imdb_dbow.d2v')
    #print model["TRAIN_POS_8029"]
    #exit()
    dim = 100
    train_arrays = numpy.zeros((25000, dim))
    train_labels = numpy.zeros(25000)

    generador = GeneraVectores(model)
    Pos = generador.getVecsFromFile("data/trainpos.txt")
    print "Pos vectors generated"
    Neg = generador.getVecsFromFile("data/trainneg.txt")
    print "Neg vectors generated"

    for i in range(12500):
        train_arrays[i] = Pos[i]
        train_arrays[12500 + i] = Neg[i]
        train_labels[i] = 1
        train_labels[12500 + i] = 0

    test_arrays = numpy.zeros((25000, dim))
    test_labels = numpy.zeros(25000)
# distinct file?
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

simple_models = [
    # PV-DM w/concatenation - window=5 (both sides) approximates paper's 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DBOW
    Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DBOW w/ word training
    Doc2Vec(dm=0, dbow_words=1, size=100, negative=5, hs=0, min_count=2, workers=cores),
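# Hedged sketch (not part of the snippet above): with a list of models like this,
# one common pattern from the gensim doc2vec-IMDB tutorial (for the pre-4.0 API
# used here) is to scan the vocabulary once and share it across the variants.
# `train_corpus` is an assumed list of TaggedDocument objects; adjust names and
# epoch counts to the real setup.
simple_models[0].build_vocab(train_corpus)      # scan the vocabulary with the first model
for m in simple_models[1:]:
    m.reset_from(simple_models[0])              # reuse the same vocabulary
for m in simple_models:
    m.train(train_corpus, total_examples=len(train_corpus), epochs=10)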
def get_trained_model(sentences, dimension=300):
    model = Doc2Vec(documents=sentences, size=dimension)
    return model
# load vectorizer
with open(f'{main_path}/data/svc_model/svc_tf_idf_vectorizer.pk', 'rb') as filename:
    vectorizer = pickle.load(filename)
print(vectorizer)

# load model
with open(f'{main_path}/data/svc_model/svc_model_tf_idf.joblib.pkl', 'rb') as filename:
    model = joblib.load(filename)
print(model)

# load doc2vec model
fname = get_tmpfile(
    f'{main_path}/data/doc2vec_model/doc2vec_model_final_new.mdl')
doc2vec_model = Doc2Vec.load(fname)
print(doc2vec_model)

with open(f'{main_path}/data/doc2vec_model/logistic_reg_doc2vec.joblib.pkl', 'rb') as filename:
    doc2vec_log_reg = joblib.load(filename)
print(doc2vec_log_reg)

# Vader instance
sid = SentimentIntensityAnalyzer()

# https://github.com/fnielsen/afinn
afinn = Afinn()


def intention_finder(text):
    '''intentions_list = []
_window = 2
_min_count = 5
_epochs = 10

args = sys.argv
sys.stdout.writelines("# of args: " + str(len(args)))
for arg in args:
    print(arg)

if len(args) >= 2:
    path_to_model = args[1]
    if os.path.isfile(args[1]):
        sys.stdout.writelines("valid file found!")
    else:
        sys.stdout.writelines(args[1] + " is not a file!")

if len(args) == 5:
    _window = args[2]
    _min_count = args[3]
    _epochs = args[4]

model = Doc2Vec.load(path_to_model, mmap=None)
model._clear_post_train()
model.train(documents=model.docvecs, total_examples=model.corpus_count, epochs=int(_epochs))
model.save(path_to_model)
print("Trained model and saved to " + path_to_model)
        userprofile = ' '.join(neg_list[k].return_value_work_exp())
        neg_profile_list.append(userprofile)

    pos_label_list = ['pos_profile_' + str(k) for k in range(len(pos_profile_list))]
    neg_label_list = ['neg_profile_' + str(k) for k in range(len(pos_profile_list))]

    # documents = labeled_data_sentence.LabeledLineSentence(pos_profile_list, pos_label_list, neg_profile_list, neg_label_list)
    # model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=8)
    # model.build_vocab(documents.to_array())
    # for epoch in range(20):
    #     model.train(documents.sentences_perm(), total_examples=model.corpus_count, epochs=model.iter)
    #     # model.train(documents.sentences_perm())
    # filename = '/Users/pengyuzhou/Downloads/word_embedding_result/job_title_' + globalparameter.jobtitle_list[i] + '.d2v'
    # model.save(filename)
    # filename = '/Users/pengyuzhou/Downloads/glove.6B/glove.6B.100d.txt.word2vec'
    model = Doc2Vec.load(filename)

    pos_vector_list = []
    neg_vector_list = []
    for x in range(len(pos_list)):
        pos_vector_list.append(model.infer_vector(pos_profile_list[x]))
    for x in range(len(neg_list)):
        neg_vector_list.append(model.infer_vector(neg_profile_list[x]))

    # model = Doc2Vec.load('/Users/pengyuzhou/Downloads/word_embedding_result/job_title_' + globalparameter.jobtitle_list[i] + '.d2v')
    test2 = model.most_similar('software')

    train_arrays = numpy.zeros((500, 100))
    train_labels = numpy.zeros(500)
def main(corpora, p2v_dir, p2v_file, diag_dir, epoch):
    SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')

    if 'IMDB' in corpora:
        alldocs = []  # will hold all docs in original order
        with open('alldata-id.txt', encoding='utf-8') as alldata:
            for line_no, line in enumerate(alldata):
                tokens = gensim.utils.to_unicode(line).split()
                words = tokens[1:]
                tags = [line_no]  # `tags = [tokens[0]]` would also work at extra memory cost
                split = ['train', 'test', 'extra', 'extra'][line_no // 25000]  # 25k train, 25k test, 25k extra
                sentiment = [1.0, 0.0, 1.0, 0.0, None, None, None, None][line_no // 12500]  # [12.5K pos, 12.5K neg]*2 then unknown
                alldocs.append(SentimentDocument(words, tags, split, sentiment))
        train_docs = [' '.join(doc.words) for doc in alldocs if doc.split == 'train']
        test_docs = [' '.join(doc.words) for doc in alldocs if doc.split == 'test']
    elif '20ng' in corpora:
        train_docs = newsgroups_train.data
        test_docs = newsgroups_test.data

    for column in parameters:
        i = p2v_file.find(column)
        if i != -1:
            value = p2v_file[i:].split()[1]
            df.set_value(epoch, column, value)
        else:
            df.set_value(epoch, column, default_parameters[column])

    p2v_model = Doc2Vec.load(p2v_dir + p2v_file)
    f = open(p2v_dir + p2v_file + 'test', 'rb')
    p = pickle.load(f)

    if 'IMDB' in corpora:
        dev = 50
        p2v_DocumentVectors0 = np.array(
            [p2v_model.docvecs['SENT_' + str(i)] for i in range(12000, 12500 - dev)] +
            [p2v_model.docvecs['SENT_' + str(i)] for i in range(12500 + dev, 13000)])
        y_1 = [1] * (500 - dev)
        y_0 = [0] * (500 - dev)
        train_labels = y_1 + y_0
        test_labels = [1] * dev + [0] * dev
    else:
        p2v_DocumentVectors0 = np.array(
            [p2v_model.docvecs[tag] for tag in p2v_model.docvecs.doctags if 'train' in tag])
        test_labels = [p[i][1][0].split()[2] for i in p]
        train_labels = [tag.split()[2] for tag in model_d2v.docvecs.doctags if 'train' in tag]

    p2v_DocumentVectors1 = np.concatenate([p[i][0].reshape(1, -1) for i in p])

    for classifier in classifiers:
        accuracy, best = Classification(classifier, p2v_DocumentVectors0, train_labels,
                                        p2v_DocumentVectors1, test_labels)
        # write it all into the DataFrame
        df.set_value(epoch, classifier, accuracy)
        df.set_value(epoch, 'best_parameters' + classifier, best)
        df.set_value(epoch, 'epoch', epoch)
        df.to_csv(diag_dir + "Res_PV_IMDB.csv")
        print(accuracy)
class EpochLogger(CallbackAny2Vec):
    '''Callback to log information about training'''

    def __init__(self):
        self.epoch = 0

    def on_epoch_begin(self, model):
        print("Epoch #{} start".format(self.epoch))

    def on_epoch_end(self, model):
        print("Epoch #{} end".format(self.epoch))
        self.epoch += 1


d2v_model = Doc2Vec.load("doc2vec_models/doc2vec_07022020_105155.model")

original_vecs = sk_pre.StandardScaler().fit_transform(
    d2v_model.docvecs.vectors_docs)

pca = PCA(n_components=100).fit(original_vecs)
sum_pca_variance = 0.0
pca_variance_threshold = 0
for variance in pca.explained_variance_ratio_:
    sum_pca_variance += variance
    if sum_pca_variance >= 0.90:
        pca_variance_threshold = pca.explained_variance_ratio_.tolist().index(variance)
        break
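# Hedged sketch (not in the excerpt above): EpochLogger only takes effect when a
# model is actually trained, where it can be passed through the `callbacks`
# argument of gensim's train(). `tagged_corpus` is an assumed list of
# TaggedDocument objects; sizes and epoch counts are illustrative.
new_model = Doc2Vec(vector_size=100, min_count=2, epochs=5)
new_model.build_vocab(tagged_corpus)
new_model.train(tagged_corpus, total_examples=new_model.corpus_count,
                epochs=new_model.epochs, callbacks=[EpochLogger()])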
# use fit after running LSTM on all inputs.
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.layers.recurrent import LSTM
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedLineDocument

print('Loading original doc2vec model...')
doc2vec_model = model = Doc2Vec.load(
    '../doc2vec/small_wiki_subset.en.doc2vec.model')

print('Build sequential model fed by LSTM...')
in_out_neurons = 2
hidden_neurons = 300

model = Sequential()
model.add(LSTM(in_out_neurons, hidden_neurons, return_sequences=False))
model.add(Dense(hidden_neurons, in_out_neurons))
model.add(Activation("linear"))
model.compile(loss="mean_squared_error", optimizer="adam")
print('model compiled!')

print('Training Sequential/LSTM model...')
model.fit(
    doc2vec_model.docvecs,
    None,
    batch_size=batch_size,
    axis=0).reset_index()

# transform all data to labelled docs
pre_char = 100
post_char = 300
lbldoc_all, drop_ind_all = table2lbldoc(df_all_doc2vec, pre_char, post_char)

# for reshuffling
doc_list = lbldoc_all[:]

# generate training drop list and testing drop list from drop_ind_all
drop_ind_train2 = [e for e in drop_ind_all if e < train2.shape[0]]
drop_ind_test2 = [e for e in drop_ind_all if e > train2.shape[0] - 1]

# build model and vocabulary
model = Doc2Vec(dm=1, size=100, window=5, negative=5, hs=0, min_count=2)
model.build_vocab(lbldoc_all)

# train model: decrease learning rate and shuffle each epoch
loop = 30
for epoch in range(loop):
    #print(epoch, model.corpus_count, model.iter)
    shuffle(doc_list)  # Shuffling gets better results
    model.train(lbldoc_all, total_examples=model.corpus_count, epochs=model.iter)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay

model.save('models/model.doc2vec')
import numpy as np
import math, sys, gzip, collections, gensim.models.doc2vec
from gensim.models import Doc2Vec
from collections import OrderedDict, namedtuple
import random, unicodedata, re
import datetime

f = '/da4_data/play/api/doc2vecR.200.30.20.5.1518784533.eA.trained'
mod = Doc2Vec.load(f)


def dist(av, bv):
    return sum(av * bv) / math.sqrt(sum(av * av) * sum(bv * bv))


# experts in JS
na = 0
f = open("exA.csv", "rb")
for line in f:
    h, m, nc, s, a = line.rstrip().decode('ascii', 'ignore').split(';')
    if a in mod.docvecs:
        av = mod.docvecs[a]
        mv = mod.wv.get_vector(m)
        st = m + ';' + nc + ';' + s + ';' + str(dist(av, mv))
        print(st)
        na += 1
    else:
        sys.stderr.write(a + '\n')
sys.stderr.write(str(na) + '\n')
#print(m + ';' + str(dist(av, mv)) + ';' + str(mod.wv.most_similar([av])))
#!/usr/bin/env python
# -*- coding=utf-8 -*-
import os
import json
import argparse
import numpy as np
from collections import defaultdict
from gensim.models import Doc2Vec
from keras.models import load_model

""" Load the models trained earlier """
d2v = Doc2Vec.load('model/doc2vec_model.d2v')

print 'Loading classifier model ...'
classifier = load_model('model/classifier_model.h5')
print 'Load classifier model success.'

parser = argparse.ArgumentParser(description='doc2vec nn classifier')
parser.add_argument('--filename', dest='filename', default='',
                    help='new movie danmaku (bullet-chat) data to highlight')
parser.add_argument('--preprocess_script_dirname', dest='preprocess_script_dirname',
                    default='./preprocess-script/',
                    help='path to the folder holding the preprocessing script (preprocess.js)')
parser.add_argument('--processed_data_dirname', dest='processed_data_dirname',
                    default='./processed-data/',
                    help='path to the folder holding the preprocessed danmaku data')
def predictPE(inputDataLabel, model_type):
    '''predictPE is a massive wrapper to run all iterations of training and
    testing using some input label (eg, impression or report). A dictionary
    structure of confusion matrices is returned, including a summed and
    normalized version to represent the success of the entire batch. Note that
    this function depends on some of the global variables defined above (not
    proper, but it will work for this batch script :))
    :param model_type: right now I just tried "logistic_regression"
    :param inputDataLabel: should be one of "impression" or "rad_report"
    '''
    confusions = dict()
    count = 1
    for remove_stop_words in [True, False]:
        for remove_non_english_chars in [True]:
            # For each set of params, we can take the sum across batches of training and testing
            batchuid = "batch-%s-%s" % (int(remove_stop_words), int(remove_non_english_chars))
            print("Starting batch %s" % (batchuid))
            batchconfusions = dict()
            summed_confusion = pandas.DataFrame(0, columns=list(lookup.keys()),
                                                index=list(lookup.keys()))
            for holdout in batches:
                # Separate training and test data
                # Question - is there any reason to split via batches? Bias in this?
                train_set = [x for x in batches if x != holdout]
                test_impression = data[inputDataLabel][data.batch == holdout]
                test_labels = pandas.DataFrame(data['disease_state_label'][data.batch == holdout])
                test_ids = data['order_deid'][data.batch == holdout]
                train_impression = data[inputDataLabel][data.batch.isin(train_set)]
                train_labels = pandas.DataFrame(data['disease_state_label'][data.batch.isin(train_set)])
                train_ids = data['order_deid'][data.batch.isin(train_set)]
                train_labels["CLASS"] = "TRAIN"
                test_labels["CLASS"] = "TEST"

                allIds = train_ids.append(test_ids).tolist()
                allLabels = train_labels.append(test_labels)
                allLabels.index = allIds

                # Compile them together
                allImpression = train_impression.tolist() + test_impression.tolist()

                # sanity check
                assert (len(allIds) == len(allImpression) == len(allLabels))

                # Make some strings for pretty printing of train/test batch
                training_ids = "|".join([str(int(x)) for x in train_set])
                testing_id = "%s" % (int(holdout))

                # Let's have a unique id so we can merge with whole report data later,
                # eg 'holdout(2)-train(3|4)-stopw(1)-nonengrem(1)'
                uid = "holdout(%s)-train(%s)-rmstopw(%s)-rmnoneng(%s)" % (
                    testing_id,                      # holdout id
                    training_ids,                    # training ids joined with |
                    int(remove_stop_words),          # 0/1
                    int(remove_non_english_chars))   # 0/1

                print("RUNNING ANALYSIS %s:\n\ntrain(%s)\ntest(%s)\nrmstopw(%s)\nrmnoneng(%s)"
                      % (count, training_ids, testing_id, remove_stop_words,
                         remove_non_english_chars))

                # Do the training
                words_list = TrainSentences(text_list=allImpression,
                                            remove_non_english_chars=remove_non_english_chars,
                                            remove_stop_words=remove_stop_words)
                labeledDocs = LabeledLineSentence(words_list=words_list, labels_df=allLabels)

                # Build the model
                model = Doc2Vec(size=size, window=window, min_count=min_count,
                                workers=workers, alpha=alpha, min_alpha=min_alpha)  # use fixed learning rate
                # train_words=True
                # train_lbls=True

                # Build the vocabulary and fine tune the alpha (manually control
                # the learning rate over 10 epochs)
                model.build_vocab(labeledDocs)
                for it in range(iters):
                    print("Training iteration %s" % (it))
                    model.train(labeledDocs)
                    model.alpha -= 0.002  # decrease the learning rate
                    model.min_alpha = model.alpha  # fix the learning rate, no decay
                model.train(labeledDocs)

                # This was done manually during testing, for impressions
                # model.save('data/model.doc2vec')

                # Now let's create an object with data frames with training and testing data and labels
                vecs = get_vectors(model=model, words_list=words_list, labels=allLabels)
                # df['all'] ....
                # df['train'] ....
                # df['test'] .... ['labels'] <-- df with columns disease_state_label, CLASS, and index as patid
                #                 ['vectors'] <-- index is also patid

                if model_type == "logistic_regression":
                    confusion = predict_logisticRegression(train=vecs['train'], test=vecs['test'])

                count += 1
                batchconfusions[uid] = confusion
                summed_confusion += confusion

            # When we finish a set of holdout/training (a batch), add summed and normalized versions
            batchconfusions['sum-%s' % (batchuid)] = summed_confusion
            total_confusions = summed_confusion.sum().sum()
            batchconfusions['norm-%s' % (batchuid)] = summed_confusion / total_confusions
            confusions[batchuid] = batchconfusions

    return confusions
import param
import util

############################ define the evaluation function ############################
def micro_avg_f1(y_true, y_pred):
    return f1_score(y_true, y_pred, average='micro')

############################ load the data ############################
df_all = pd.read_csv(param.data_path + '/output/corpus/all_data.csv',
                     encoding='utf8', nrows=param.train_num)
df_all['penalty'] = df_all['penalty'] - 1

model = Doc2Vec.load(param.data_path + '/output/model/dm_d2v_12w.model')
x_sp = np.array([model.docvecs[i] for i in range(param.train_num)])

############################ dmd2v stack ############################
np.random.seed(param.seed)  # fix the seed for reproducibility

df_stack = pd.DataFrame(index=range(len(df_all)))
tr_num = param.cv_train_num
num_class = len(pd.value_counts(df_all['penalty']))
n = 5

x = x_sp[:tr_num]
y = df_all['penalty'][:tr_num]
x_te = x_sp[tr_num:]
y_te = df_all['penalty'][tr_num:]
feat = 'dmd2v'