# Plain list copy of the corpus.
News_docs = list(alldocs)
#print(News_docs[0])
print('%d News_docs' % (len(alldocs)))

# Shuffled copy of the corpus; the vocabulary scan below still reads `alldocs`.
from random import shuffle
doc_list = alldocs[:]
shuffle(doc_list)

# 1. create the Doc2Vec models
# you need to adjust the hyper-parameters, e.g. size and iter
cores = multiprocessing.cpu_count()
# Hyper-parameters shared by all three configurations.
_shared_params = dict(vector_size=100, negative=5, hs=0, min_count=2,
                      sample=0, epochs=20, workers=cores)
simple_models = [
    # PV-DBOW plain
    Doc2Vec(dm=0, **_shared_params),
    # PV-DM w/ default averaging; a higher starting alpha may improve
    # CBOW/PV-DM modes
    Doc2Vec(dm=1, window=10, alpha=0.05, comment='alpha=0.05',
            **_shared_params),
    # PV-DM w/ concatenation - big, slow, experimental mode
    # window=5 (both sides) approximates paper's apparent 10-word total
    # window size
    Doc2Vec(dm=1, dm_concat=1, window=5, **_shared_params),
]

# 2. build vocabulary
for model in simple_models:
    model.build_vocab(alldocs)
    print("%s vocabulary scanned & state initialized" % model)

# Models keyed by their string representation, in declaration order.
models_by_name = OrderedDict(zip(map(str, simple_models), simple_models))
def generate_word_embeddings(papers):
    """Train Doc2Vec on paper abstracts and print a citation-ranking score.

    Reads one JSON document per line from ``DATASET_INPUT_FILE_PATH + papers``,
    sorts the papers by ``year``, splits them with ``split_data`` and trains a
    Doc2Vec model on the training abstracts. The first ten evaluation titles
    are then ranked against every training title by ``model.wmdistance``; for
    each evaluation paper that cites at least one training paper, the
    reciprocal rank of the best-ranked cited paper is recorded.

    Prints the number of evaluation papers with a matching citation, the
    individual reciprocal ranks, the best rank seen and the mean reciprocal
    rank (0.0 when nothing matched).

    Args:
        papers: File name, appended to ``DATASET_INPUT_FILE_PATH``.
    """
    # Kept for compatibility: the last computed distance stays visible at
    # module level, as before.
    global document_similarity

    with open(DATASET_INPUT_FILE_PATH + papers, 'rb') as f:
        lines = [json.loads(line) for line in tqdm(f, desc='Read papers')]
    lines.sort(key=lambda x: x['year'])

    ids = extract_keys(lines, 'id')
    titles = extract_keys(lines, 'title')
    abstracts = extract_keys(lines, 'paperAbstract')
    out_citations = extract_keys(lines, 'outCitations')

    # TODO: DO NOT HARDCODE THIS
    is_test = False
    train_ids, eval_ids = split_data(ids, 0.8, 0.9, is_test)
    train_abstracts, eval_abstracts = split_data(abstracts, 0.8, 0.9, is_test)
    train_titles, eval_titles = split_data(titles, 0.8, 0.9, is_test)
    train_out_citations, eval_out_citations = split_data(
        out_citations, 0.8, 0.9, is_test)

    # dictionary = unk_train(train_abstracts)
    train_docs = create_tagged_doc(train_abstracts)
    model = Doc2Vec(workers=11, min_count=5, window=10, size=100,
                    alpha=0.025, iter=20)
    model.build_vocab(train_docs)
    model.train(train_docs, epochs=model.iter,
                total_examples=model.corpus_count)

    eval_score = []
    # Counts evaluation papers with at least one cited training paper.
    # Starts at 0: the previous start value of 1 inflated the printed count
    # and deflated the mean below.
    matching_citation_count = 0
    min_rank = float("inf")

    # TODO: changed eval_abstracts -> eval_titles
    for i, eval_abstract in tqdm(
            list(enumerate(eval_titles[:10])),
            desc='Generating rankings for evaluation set'):
        rankings = []
        eval_split = eval_abstract.lower().split()
        if eval_split:
            # TODO: changed train_abstracts -> train_titles
            for j, train_abstract in tqdm(
                    list(enumerate(train_titles)),
                    desc='Iterating through train titles'):
                train_split = train_abstract.lower().split()
                if train_split:
                    document_similarity = model.wmdistance(
                        train_split, eval_split)
                    rankings.append((document_similarity, j))
        rankings.sort(key=lambda x: x[0])

        # A set makes the membership test below O(1) per candidate.
        cited = set(eval_out_citations[i])
        if not cited:
            continue

        # gets the rankings of the training papers in the correct order
        ranking_ids = get_from_rankings(rankings, train_ids)
        # 1-based position of the best-ranked training paper that is cited.
        rank = next(
            (position
             for position, paper_id in enumerate(ranking_ids, start=1)
             if paper_id in cited),
            None)
        if rank is None:
            continue

        matching_citation_count += 1
        min_rank = min(min_rank, rank)
        eval_score.append(1.0 / rank)
        print("\nEval Score for iteration " + str(i) + ": " +
              str(1.0 / rank) + "\n")

    print("matching citation count = " + str(matching_citation_count))
    print(eval_score)
    print("min rank = " + str(min_rank))
    # Mean reciprocal rank over the papers that had a match.
    if matching_citation_count:
        print(sum(eval_score) / matching_citation_count)
    else:
        print(0.0)
'TRAIN_NEG': 'train-neg.txt', 'TRAIN_POS': 'train-pos.txt', 'TRAIN_UNS': 'train-unsup.txt' } documents = TaggedLineDocs(sources) log.info('loaded %i documents', len(documents.sentences)) epochs = 30 vec_size = 100 log.info('Initializing D2V model') model = Doc2Vec(min_count=3, window=10, size=vec_size, sample=1e-4, negative=5, workers=4, dm=0, seed=seed, iter=epochs) model.build_vocab(documents) log.info('Training D2V Epochs %i', epochs) model.train(documents, total_examples=model.corpus_count, epochs=model.iter) log.info('Model Save') model.save('./imdb.d2v') # log.info('Load Pre-trained D2V Model') # model = Doc2Vec.load('./imdb.d2v')
# Tag every row with its score. gensim expects `tags` to be a list of tags,
# so the score is wrapped in a one-element list.
train_tagged = train.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['text']), tags=[r.score]),
    axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['text']), tags=[r.score]),
    axis=1)
print("1")

# Building vocabulary
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2,
                     sample=0, workers=4, epochs=30)
train_documents = [x for x in tqdm(train_tagged.values)]
model_dbow.build_vocab(train_documents)
print("2")

# Train the model.
# A single train() call lets gensim decay the learning rate from `alpha` to
# `min_alpha` across all 30 epochs. The previous hand-written loop subtracted
# 0.002 from the default alpha of 0.025 on every pass, which made the
# learning rate negative from the 14th pass onwards.
model_dbow.train(utils.shuffle(train_documents),
                 total_examples=len(train_tagged.values),
                 epochs=model_dbow.epochs)
# tagged_documents_iterator = LabeledLineSentence( documents_tokenized, range(len(documents_tokenized))) # list(tagged_documents_iterator) # model = Doc2Vec(size=500, window=10, workers=11, alpha=0.025, min_alpha=0.025, iter=10, min_count=1) # model.build_vocab(tagged_documents_iterator) # In[12]: model.wv # In[13]: model.train(tagged_documents_iterator,
def run_doc2vec(train_docs, dev_docs, test_docs, dm, size, window, alpha,
                negative, sample, cores, min_count, passes, output,
                diagnostics=False):
    """Train a Doc2Vec model pass by pass, infer test vectors and save both.

    The model is trained for ``passes`` single-epoch calls on a freshly
    shuffled copy of ``train_docs``. The first five passes use a fixed
    learning rate of 0.2, later passes use ``alpha``. Afterwards a vector is
    inferred for every test document, ``cost`` is evaluated on them, the model
    is saved to ``output`` and the inferred vectors are pickled to
    ``output + 'test'``.

    With ``diagnostics=True`` the function additionally creates a
    ``diagnostics...`` and a ``temp...`` folder derived from ``output``,
    writes the hyper-parameters to ``pars.csv``, calls ``diagnose`` before the
    first and after every pass, and pickles the ``dev``/``train`` cost arrays
    (the ``.npy`` file names notwithstanding, these are pickle files).

    Args:
        train_docs: Sequence of tagged training documents (``.words`` is read).
        dev_docs: Development documents, forwarded to ``diagnose``.
        test_docs: Documents with ``.words`` and ``.tags`` to infer vectors for.
        dm, size, window, alpha, negative, sample, min_count: Doc2Vec
            hyper-parameters, passed through unchanged.
        cores: Number of worker threads.
        passes: Number of training passes; also used as inference ``steps``.
        output: Path the model is saved to.
        diagnostics: Enable the diagnostic output described above.
    """
    assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"
    model = Doc2Vec(dm=dm, size=size, window=window, alpha=alpha,
                    min_alpha=alpha, negative=negative, sample=sample,
                    workers=cores, min_count=min_count, iter=1)
    model.build_vocab(train_docs)
    # Shuffle a copy so the caller's list keeps its order.
    train_shuffled = list(train_docs)
    whole_duration = 0

    # Needed by the inference step below whether or not diagnostics are on.
    infer_vecs = np.zeros((len(test_docs), size))
    test_vectors = dict()

    if diagnostics:
        dev_vecs = np.zeros((len(dev_docs), size))
        dev_vectors = dict()
        neighb_num = 10
        # Word frequencies over the whole training corpus.
        counter = collections.Counter(
            word for doc in train_docs for word in doc.words)

        if not os.path.exists('diagnostics/'):
            os.mkdir('diagnostics/')
        i = output.find('/')
        diag_folder = 'diagnostics' + output[i:].replace(' ', '_').replace(
            '-', '').replace('.txt', '') + '/'
        if not os.path.exists(diag_folder):
            os.mkdir(diag_folder)
        tmp_dir = 'temp' + output[:i].replace('.txt', '') + '/'
        if not os.path.exists(tmp_dir):
            os.mkdir(tmp_dir)

        # Record the hyper-parameters of this run next to the diagnostics.
        par_cols = [
            'dm', 'size', 'window', 'alpha', 'negative', 'sample',
            'min_count', 'epochs', 'cores'
        ]
        par_use = [
            dm, size, window, alpha, negative, sample, min_count, passes,
            cores
        ]
        par_df = pd.DataFrame(columns=par_cols)
        for par_c, par_u in zip(par_cols, par_use):
            par_df.loc[0, par_c] = par_u
        par_df.to_csv(diag_folder + 'pars.csv')

        df = pd.DataFrame(columns=['neighbours'])
        p_ids = np.linspace(0, len(train_docs) - 1, num=5)
        dev = np.zeros(passes + 1)
        train = np.zeros(passes + 1)
        # Evenly spaced sample of training documents used for the train cost.
        train_N = 30
        train_ids_for_cost = np.linspace(0, len(train_docs) - 1, num=train_N)
        train_for_cost = [train_docs[int(i)] for i in train_ids_for_cost]

    print("START %s" % datetime.datetime.now())
    with elapsed_timer() as elapsed:
        for epoch in range(passes):
            if diagnostics and epoch == 0:
                # Baseline diagnostics before any training.
                diagnose(diag_folder, model, counter, p_ids, neighb_num, df,
                         dev, train, epoch, alpha, passes, train_for_cost,
                         train_N, dev_docs, dev_vectors, dev_vecs, output)
            shuffle(train_shuffled)
            # Fixed high learning rate for the first five passes.
            if epoch + 1 < 6:
                model.alpha = 0.2
            else:
                model.alpha = alpha
            model.min_alpha = model.alpha
            model.train(train_shuffled, total_examples=len(train_docs),
                        epochs=1)
            print('epoch %d' % (epoch + 1))
            #N = 1000
            if diagnostics:
                diagnose(diag_folder, model, counter, p_ids, neighb_num, df,
                         dev, train, epoch + 1, alpha, passes,
                         train_for_cost, train_N, dev_docs, dev_vectors,
                         dev_vecs, output)

        for i, doc in enumerate(test_docs):
            infer_vecs[i, :] = model.infer_vector(doc.words, alpha=alpha,
                                                  min_alpha=alpha,
                                                  steps=passes)
            test_vectors[i] = tuple([infer_vecs[i, :], doc.tags])
        test = cost(model, test_vectors, test_docs, len(test_docs))
        #print (test)
        whole_duration += elapsed()

    model.save(output)
    with open(output + 'test', 'wb') as f:
        pickle.dump(test_vectors, f)

    if diagnostics:
        with open(diag_folder + 'dev.npy', 'wb') as dev_pickle:
            pickle.dump(dev, dev_pickle)
        with open(diag_folder + 'train.npy', 'wb') as train_pickle:
            pickle.dump(train, train_pickle)
        print('dev_cost (%d documents)' % len(dev_docs), dev)
        print('train_cost', train)
    print('infer_cost', test)
    print("END %s" % str(datetime.datetime.now()))
    print("duration %s" % str(whole_duration))
# Persist the fitted logistic-regression classifier for this label.
file_logistic_reg = "clsf_logistic_reg_" + label_name + ".pickle"
# `with` closes the file once the classifier has been written.
with open(file_logistic_reg, 'wb') as model_file:
    p.dump(clsf_logistic_reg, model_file)

# Report - Logistic Regression
pred_logistic_reg = clsf_logistic_reg.predict(x_test)
acc_logistic_reg = accuracy_score(pred_logistic_reg, y_test) * 100

## Doc2vec and Logistic Regression
x_train_labeled = label_sentences(x_train, 'train')
x_test_labeled = label_sentences(x_test, 'test')
all_data = x_train_labeled + x_test_labeled

# PV-DBOW model; alpha == min_alpha keeps the rate constant within a pass.
clsf_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1,
                    alpha=0.065, min_alpha=0.065)
clsf_dbow.build_vocab([x for x in tqdm(all_data)])

# 30 single-epoch passes over reshuffled data, lowering the learning rate by
# 0.002 after each pass (0.065 down to 0.007 on the last pass).
for epoch in range(30):
    clsf_dbow.train(utils.shuffle([x for x in tqdm(all_data)]),
                    total_examples=len(all_data),
                    epochs=1)
    clsf_dbow.alpha -= 0.002
    clsf_dbow.min_alpha = clsf_dbow.alpha

train_vectors_dbow = get_vectors(clsf_dbow, len(x_train_labeled), 300,
                                 'train')
test_vectors_dbow = get_vectors(clsf_dbow, len(x_test_labeled), 300, 'test')

clsf_d2v = LogisticRegression(n_jobs=1, C=1e5)
clsf_d2v = clsf_d2v.fit(train_vectors_dbow, y_train)
from gensim import models
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
from collections import namedtuple
import gensim.models.doc2vec

# Load data
# (A stray argument-less ``LabeledSentence()`` call used to follow this list;
# it raised TypeError because the type requires ``words`` and ``tags``.)
doc1 = ["latest categories", "my events", "get all my tags", "all my feeds",
        "new groups"]
# doc1 = ["This is a sentence", "This is another sentence"]

# Transform data (you can add more data preprocessing steps)
docs = []
analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
for i, text in enumerate(doc1):
    words = text.lower().split()
    # The position of the text in `doc1` is its only tag.
    tags = [i]
    docs.append(analyzedDocument(words, tags))

# Train model (set min_count = 1, if you want the model to work with the
# provided example data set)
model = Doc2Vec(docs, size = 100, window = 300, min_count = 1, workers = 4)

# Get the vectors
print(model.docvecs[0])
print(model.docvecs[1])
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
from datetime import datetime
import sys


class LabeledLineSentence(object):
    """Stream a text file as labelled sentences, one per line.

    The first whitespace-separated token of a line is used as the label, the
    remaining tokens are the words. Blank lines are skipped.
    """

    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        # `with` closes the file when iteration ends, including when the
        # consumer stops early and the generator is closed.
        with open(self.filename) as corpus:
            for line in corpus:
                words = line.strip().split()
                if not words:
                    # A blank line has no label token to read.
                    continue
                yield LabeledSentence(words[1:], [words[0]])


if __name__=='__main__':
    # Usage: <script> <input_file> <output_file>
    print(sys.argv[0])
    input_file=sys.argv[1]
    output_file=sys.argv[2]
    sentences=LabeledLineSentence(input_file)
    model = Doc2Vec(alpha=0.025, min_alpha=0.025)  # use fixed learning rate
    model.build_vocab(sentences)
    # NOTE(review): train() is called without total_examples/epochs; recent
    # gensim releases require both -- confirm against the installed version.
    for epoch in range(10):
        print(datetime.now(),'epoch:',epoch)
        model.train(sentences)
        model.alpha -= 0.002  # decrease the learning rate
        model.min_alpha = model.alpha  # fix the learning rate, no decay
    model.save(output_file)
    print('运行完毕,文档向量已输出到',output_file)
with open("sentiment labelled sentences/%s_labelled.txt" % fname) as f:
    for item_no, line in enumerate(f):
        # Each line is "<sentence>\t<sentiment>".
        line_split = line.strip().split('\t')
        # Normalise the sentence: lowercase, drop apostrophes, turn every
        # other non-word character into a space, then squeeze whitespace.
        sent = re.sub(r'\'', '', line_split[0].lower())
        sent = re.sub(r'\s+', ' ', re.sub(r'\W', ' ', sent)).strip()
        # Tag is "<source name>_<line number>".
        sentences.append(
            TaggedDocument(sent.split(), ["%s_%d" % (fname, item_no)]))
        sentiments.append(int(line_split[1]))

sentences

import random


class PermuteSentences(object):
    """Iterable that yields all of `sentences` in a fresh random order."""

    def __iter__(self):
        # Shuffle a copy so the module-level list keeps its order.
        order = list(sentences)
        random.shuffle(order)
        for doc in order:
            yield doc


permuter = PermuteSentences()
model = Doc2Vec(permuter, min_count=1)

#model.most_similar('tasty')
model.wv.most_similar('tasty')
# Turn every text into a tagged document; `i` is the running document index.
for text in entity:
    words = text.split()
    tags = [i]
    alldocs.append(SentimentDocument(words, tags))
    i += 1

doc_list = alldocs[:]  # for reshuffling per pass

cores = multiprocessing.cpu_count()

# Only one configuration is trained here: PV-DBOW with word vectors
# (dbow_words=1).
simple_models = [
    Doc2Vec(
        dm=0,
        size=400,
        negative=5,
        hs=0,
        min_count=20,
        workers=cores,
        dbow_words=1,
    ),
]

# Scan the corpus once to build the vocabulary of the single model.
simple_models[0].build_vocab(alldocs)
print(simple_models[0])

# Models keyed by their string representation, in declaration order.
models_by_name = OrderedDict(zip(map(str, simple_models), simple_models))

# to selectively-print only best errors achieved
best_error = defaultdict(lambda: 1.0)
def main():
    """Train a doc2vec model on the n-grams of a CSV dataset and fit an SVM.

    Command line:
        -c/--csv_filename  CSV file with an ``ngram`` and a ``label`` column.
        -d/--debug         Print progress messages.

    Without ``--csv_filename`` the help text is printed and the program exits.
    The doc2vec model is saved to ``doc2vec_model<test_num>`` in the current
    working directory, the classifier to a fixed path, and the prediction
    statistics are printed via ``print_stats``.
    """
    # create a parser to read the command line arguments;
    # arguments are returned as an object
    parser = argparse.ArgumentParser(description="machine learning model")
    parser.add_argument('-c',
                        '--csv_filename',
                        default=None,
                        help='filename for the csv dataset')
    parser.add_argument('-d',
                        '--debug',
                        action='store_true',
                        help='turn on debug statements')
    args = parser.parse_args()

    if not args.csv_filename:
        parser.print_help()
        parser.exit()
    csv_filename = args.csv_filename
    debug = args.debug

    # open the csv file for processing
    if debug:
        print("Starting to read CSV")
    df = process_csv(csv_filename)
    if debug:
        print("Read CSV ---> DONE!")

    # obtain the test number from the csv filename (text before the first
    # "_" of the base name); this is used later for saving model and
    # classifier. Splitting a non-empty string cannot fail, so no fallback
    # is needed here.
    test_num = csv_filename.rsplit("/")[-1].rsplit("_")[0]

    # perform labelling of given n-grams;
    # doc2vec requires each document to be labelled
    if debug:
        print("Starting to process ngrams")
    # Positional numbering keeps the tags aligned with the row order used for
    # `target` below, whatever index the DataFrame carries.
    labelled_ngrams = [
        TaggedDocument(ngram.split(), [i])
        for i, ngram in enumerate(df['ngram'])
    ]
    if debug:
        print("Process ngrams ---> DONE!")

    # create a model and export it to the given path
    if debug:
        print("Creating model")
    model = Doc2Vec(dm=1,
                    min_count=1,
                    window=10,
                    vector_size=150,
                    sample=1e-4,
                    negative=10)
    if debug:
        print("Model --> CREATED!")

    if debug:
        print("Training model")
    trained_model = train_d2v_model(model, labelled_ngrams, n_epochs=20)
    if debug:
        print("Model --> TRAINED!")

    # os.getcwd() is the correct call; os.get_pwd() does not exist.
    d2v_path = os.path.join(os.getcwd(), "doc2vec_model{}".format(test_num))
    save_d2v_model(trained_model, d2v_path)
    model_loaded = load_d2v_model(d2v_path)

    # generate the target array: the classifier needs integer targets while
    # the dataset contains the 'Benign' and 'Malware' tags; the
    # transformation is done with sklearn's LabelEncoder
    le = LabelEncoder()
    le.fit(["Benign", "Malware"])
    target = le.transform(df['label'])

    # get the document vectors from the d2v model, one per n-gram row
    data = [model_loaded[i] for i in range(len(df['ngram']))]

    # create a split for test and training data; currently its 70% and 30%
    x_train, x_test, y_train, y_test = train_test_split(data,
                                                        target,
                                                        test_size=0.3,
                                                        random_state=0)

    # initialize an SVM classifier and fit it on the training data using the
    # dask joblib backend
    client = Client(processes=False)
    clf = svm.SVC(kernel='linear')
    with joblib.parallel_backend('dask'):
        clf.fit(x_train, y_train)

    # create a prediction array to later compare with the test array
    y_pred = clf.predict(x_test)

    # describe the classifier path and then save the classifier
    clf_path = "/home/architp/projects/def-daknox/architp/mitacs/ml/model{}.pkl".format(
        test_num)
    save_clf(clf, clf_path)

    # finally print the stats
    print_stats(y_pred, y_test)
import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

# In[24]:

# One tagged document per entry of `x`, tagged with its position.
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(x)]

# In[25]:

len(tagged_data)

# In[26]:

model = Doc2Vec(tagged_data, vector_size=20, epochs=100, min_count=0)

#Save this doc2vec model
# `with` closes the file once the model has been written.
with open('docvec.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# In[27]:

model.corpus_count

# In[28]:

# Matrix of the trained document vectors.
o_model = model.docvecs.vectors_docs

# In[29]:

o_model.shape
train_docs = [doc for doc in alldocs if doc.split == 'train'] test_docs = [doc for doc in alldocs if doc.split == 'test'] doc_list = train_docs[:] # for reshuffling per pass print('%d docs: %d train-sentiment, %d test-sentiment' % (len(doc_list), len(train_docs), len(test_docs))) ############################################################################################## cores = multiprocessing.cpu_count() simple_models = [ # PV-DM w/concatenation - window=5 (both sides) approximates paper's 10-word total window size Doc2Vec(dm=1, dm_concat=1, size=150, window=10, sample=1e-2, negative=25, hs=0, min_count=2, workers=cores), # PV-DBOW Doc2Vec(dm=0, size=150, negative=25, window=10, hs=0, min_count=2, workers=cores), # PV-DM w/average Doc2Vec(dm=1, dm_mean=1,
log.info('source load')
# Maps each corpus file to the prefix used for its document tags.
train_source = {
    'train-neg.txt': 'TRAIN_NEG',
    'train-pos.txt': 'TRAIN_POS',
    'test-neg.txt': 'TEST_NEG',
    'test-pos.txt': 'TEST_POS',
    'train-unsup.txt': 'TRAIN_UNS',
}
# train_source = {'train-neg.txt':'TRAIN_NEG', 'train-pos.txt':'TRAIN_POS', 'test-neg.txt':'TEST_NEG', 'test-pos.txt':'TEST_POS'}
test_source = {
    'test-neg.txt': 'TEST_NEG',
    'test-pos.txt': 'TEST_POS',
}

log.info('TaggedDocument')
train_sentences = TaggedLineSentence(train_source)
test_sentences = TaggedLineSentence(test_source)
marcos = train_sentences.to_array()

log.info('D2V')
model = Doc2Vec(
    min_count=1,
    window=10,
    vector_size=150,
    sample=1e-4,
    negative=5,
    workers=7,
    iter=40,
)
model.build_vocab(train_sentences.to_array())

log.info('Epoch')
# log.info('EPOCH: {}'.format(epoch))
# One training call over a shuffled corpus for the configured number of
# iterations.
model.train(
    train_sentences.sentences_perm(),
    total_examples=model.corpus_count,
    epochs=model.iter,
)

log.info('Model Save')
model.save('./imdbc.d2v')
# Reload from disk so the rest of the script works on the saved model.
model = Doc2Vec.load('./imdbc.d2v')

log.info('Sentiment')
# Feature matrix (one 150-dimensional row per document) and label vector.
train_arrays = numpy.zeros((25000, 150))
train_labels = numpy.zeros(25000)
from gensim.models import Doc2Vec
import gensim.models.doc2vec

# Load the whole corpus into memory and report how long it took.
print('loading docs...')
start_time = time()
documents = list(TaggedLineDocument('volume2/processed_body_docs.txt'))
print("--- %s seconds ---" % (time() - start_time))

#documents = []
#with open('/volume/processed_body_docs.txt') as f:
#    for line in f:
#        documents.append(TaggedLineDocument(line))

# Passing the corpus to the constructor builds the vocabulary and trains in
# one step.
print('training doc2vec model...')
start_time = time()
model = Doc2Vec(
    documents,
    vector_size=200,
    window=5,
    min_count=5,
    workers=14,
    epochs=20,
)
print("--- %s seconds ---" % (time() - start_time))

# Store the document-vector matrix as .npy and the full model next to it.
print('saving model...')
np.save('volume2/new_models/body_features-w2v-200.npy',
        model.docvecs.doctag_syn0)
model.save('volume2/new_models/body_features-w2v-200.doc2vec')
print('complete!')
#Tagging document sentences comp_docs = [TaggedDocument( words=[word for word in document[0].lower().split()], tags = [i] ) for i, document in enumerate(train_docs)] # In[48]: max_epoch = 2 vec_size = 20 # Train model model = Doc2Vec(size = vec_size, dm = 0, dbow_words = 1, window = 2, alpha = 0.2) model.build_vocab(comp_docs) for epoch in range(max_epoch): model.train(comp_docs, total_examples = model.corpus_count, epochs = epoch) model.save("Doc2Vec.model") print("Model Saved") # In[78]: def build_model(test_doc, compiled_doc): ''' Parameters -----------
def __generate(self):
    """Train and save a Doc2Vec model, then fit and score the classifier.

    Collects the labelled sentences of every entry in
    ``self.data_source_map``, trains a Doc2Vec model on them, saves it under
    ``constants.GENERATED_MODEL_OUTPUT_DIR`` with a millisecond timestamp in
    the file name, fits ``self.classifier`` on the positive/negative training
    vectors (labels 1/0) and prints its score on the testing vectors.
    """
    # Generate full set of LabeledSentences.
    # Only the values are needed; .values() works on Python 2 and 3, unlike
    # the Python-2-only .iteritems().
    full_labeled_sentences = []
    for metadata_map in self.data_source_map.values():
        full_labeled_sentences.extend(
            metadata_map[self.LABELED_SENTENCES_KEY])

    # Generate the model
    print('Instantiating Doc2Vec model...')
    self.model = Doc2Vec(documents=full_labeled_sentences,
                         min_count=10,
                         size=100,
                         workers=1,
                         sample=1e-4,
                         negative=5)

    # Save model with metadata such as current epoch time in filename for
    # later processing
    # TODO: add in a utility to load historical models for comparison
    model_file_name = "review_model_%d.d2v" % int(round(
        time.time() * 1000))
    save_path = constants.GENERATED_MODEL_OUTPUT_DIR + model_file_name
    print('Finished model generation. Saving model to %s' % save_path)
    self.model.save(save_path)

    print('Finished model generation. Begin fitting classifier...')
    print('Constructing training vectors')
    positive_training_vectors = self.__get_vectors_list(
        constants.POSITIVE_TRAINING_TAG)
    negative_training_vectors = self.__get_vectors_list(
        constants.NEGATIVE_TRAINING_TAG)
    full_training_vectors = positive_training_vectors + negative_training_vectors

    # Labels follow the vector order: positives (1) first, then negatives (0).
    print('Constructing training labels')
    positive_training_labels = numpy.ones(
        shape=len(positive_training_vectors))
    negative_training_labels = numpy.zeros(
        shape=len(negative_training_vectors))
    full_training_labels = numpy.concatenate(
        (positive_training_labels, negative_training_labels), axis=0)

    print('Fitting classifier to training data')
    self.classifier.fit(full_training_vectors, full_training_labels)

    print('Finished fitting classifier. Begin scoring classifier...')
    print('Constructing testing vectors')
    positive_testing_vectors = self.__get_vectors_list(
        constants.POSITIVE_TESTING_TAG)
    negative_testing_vectors = self.__get_vectors_list(
        constants.NEGATIVE_TESTING_TAG)
    full_testing_vectors = positive_testing_vectors + negative_testing_vectors

    print('Constructing testing labels')
    positive_testing_labels = numpy.ones(
        shape=len(positive_testing_vectors))
    negative_testing_labels = numpy.zeros(
        shape=len(negative_testing_vectors))
    full_testing_labels = numpy.concatenate(
        (positive_testing_labels, negative_testing_labels), axis=0)

    print('Scoring classifier')
    score = self.classifier.score(full_testing_vectors,
                                  full_testing_labels)
    print('Classifier received a score of %.4f' % score)
def train(self):
    """Build a default Doc2Vec model and run ten training passes.

    The model is stored on ``self.model``, its vocabulary is scanned from
    ``self.labelledSents`` and ``train`` is then called ten times on the same
    corpus, with a tqdm progress bar.

    NOTE(review): ``train`` is called without ``total_examples``/``epochs``;
    recent gensim releases require both -- confirm against the installed
    version.
    """
    model = Doc2Vec()
    self.model = model
    corpus = self.labelledSents
    model.build_vocab(corpus)
    for _ in tqdm(range(10)):
        model.train(self.labelledSents)
[prefix + '_%s' % item_no])) return self.sentences def sentences_perm(self): shuffle(self.sentences) return self.sentences from pathlib import Path if not Path("./imdb.d2v").is_file(): # file exists sentences = LabeledLineSentence(sources) model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4, negative=5, workers=16) model.build_vocab(sentences.to_array()) for epoch in range(20): log.info('Epoch %d' % epoch) model.train( sentences.sentences_perm(), total_examples=model.corpus_count, epochs=model.iter, ) model.save('./imdb.d2v') model = Doc2Vec.load('./imdb.d2v')
def run(self, documents, output_base_dir, vocab_min_count, num_epochs,
        algorithm, vector_size, alpha, min_alpha, train, window, cores):
    """Build (and optionally train) a Doc2Vec model and save it with metadata.

    A timestamped target directory is created through
    ``self.make_timestamped_dir``. The vocabulary (term, count; most frequent
    first) is written to ``vocabulary``, the model to ``doc2vec.model`` and
    the run parameters to ``model.meta`` (JSON) inside that directory.

    Args:
        documents: List of tagged documents; shuffled in place when
            ``train`` is true.
        output_base_dir: Base directory for the timestamped output folder.
        vocab_min_count: ``min_count`` passed to Doc2Vec.
        num_epochs: Number of training epochs.
        algorithm: One of ``'pv_dmc'`` (PV-DM with concatenation),
            ``'pv_dma'`` (PV-DM with averaging) or ``'pv_dbow'``.
        vector_size: Dimensionality of the vectors.
        alpha: Start learning rate for training.
        min_alpha: End learning rate for training.
        train: Whether to train after building the vocabulary.
        window: Context window size.
        cores: Worker count; ``None`` means all available CPUs.

    Raises:
        ValueError: If ``algorithm`` is not one of the supported names.
    """
    # As soon as FAST_VERSION is not -1, there are compute-intensive codepaths
    # that avoid holding the python global interpreter lock, and thus you
    # should start to see multiple cores engaged.
    # For more details see: https://github.com/RaRe-Technologies/gensim/issues/532
    # assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"
    if cores is None:
        cores = multiprocessing.cpu_count()
    negative = 5
    hs = 0
    docvecs_mapfile = 'docvecs_map.txt'

    # Only these keyword arguments differ between the three algorithms.
    # PV-DM w/ concatenation adds a special null token to the vocabulary:
    # '\x00'
    mode_params = {
        'pv_dmc': {'dm': 1, 'dm_concat': 1},
        'pv_dma': {'dm': 1, 'dm_mean': 1},
        'pv_dbow': {'dm': 0},
    }
    if algorithm not in mode_params:
        raise ValueError('Unknown algorithm: %s' % algorithm)
    model = Doc2Vec(vector_size=vector_size,
                    window=window,
                    negative=negative,
                    hs=hs,
                    min_count=vocab_min_count,
                    workers=cores,
                    docvecs_mapfile=docvecs_mapfile,
                    **mode_params[algorithm])

    logging.info('Algorithm: %s', model)

    logging.info('Build vocabulary')
    model.build_vocab(documents)
    vocab_size = len(model.wv.vocab)
    logging.info('Vocabulary size: %d', vocab_size)

    target_dir = self.make_timestamped_dir(output_base_dir, algorithm,
                                           model.vector_size, num_epochs,
                                           window)
    vocab_path = os.path.join(target_dir, 'vocabulary')
    logging.info('Save vocabulary to: %s', vocab_path)
    with open(vocab_path, 'w') as f:
        # Most frequent terms first.
        term_counts = sorted(
            ([term, value.count] for term, value in model.wv.vocab.items()),
            key=lambda term_count: -term_count[1])
        f.writelines('%s, %d\n' % (term, count)
                     for term, count in term_counts)

    if train:
        logging.info('Shuffle documents')
        shuffle(documents)
        logging.info('Train model')
        model.train(documents,
                    total_examples=len(documents),
                    epochs=num_epochs,
                    start_alpha=alpha,
                    end_alpha=min_alpha)

    logging.info('Save model to: %s', target_dir)
    model.delete_temporary_training_data(keep_doctags_vectors=True,
                                         keep_inference=True)
    model.save(os.path.join(target_dir, 'doc2vec.model'))

    model_meta = {
        'argv': sys.argv,
        'target_dir': target_dir,
        'algorithm': algorithm,
        'window': window,
        'vector_size': vector_size,
        'alpha': alpha,
        'min_alpha': min_alpha,
        'num_epochs': num_epochs,
        'vocab_min_count': vocab_min_count,
        'vocab_size': vocab_size,
        'cores': cores,
        'negative': negative,
        'hs': hs
    }
    model_meta_path = os.path.join(target_dir, 'model.meta')
    logging.info('Save model metadata to: %s', model_meta_path)
    with open(model_meta_path, 'w') as outfile:
        json.dump(model_meta, outfile)
'15同居多年未办理结婚登记,是否可以向法院起诉要求离婚' ] # 使用doc2vec来判断 cores = multiprocessing.cpu_count() print(cores) corpora_documents = [] for i, item_text in enumerate(raw_documents): #words_list = util_words_cut.get_class_words_list(item_text) words_list = list(jieba.cut(item_text)) document = TaggedDocument(words=words_list, tags=[i]) corpora_documents.append(document) print(corpora_documents[:2]) model = Doc2Vec(size=89, min_count=1, iter=10) model.build_vocab(corpora_documents) model.train(corpora_documents, total_examples=model.corpus_count, epochs=model.iter) print('#########', model.vector_size) test_data_1 = '你好,我想问一下我想离婚他不想离,孩子他说不要,是六个月就自动生效离婚' #test_cut_raw_1 = util_words_cut.get_class_words_list(test_data_1) test_cut_raw_1 = list(jieba.cut(test_data_1)) print(test_cut_raw_1) inferred_vector = model.infer_vector(test_cut_raw_1) print(inferred_vector) sims = model.docvecs.most_similar([inferred_vector], topn=3) print(sims)
train_corpus[2] ## A TaggedDocument(List of Word Tokens, Int of Tag) ## Model Training %%time from gensim.models import Doc2Vec model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=100) model.build_vocab(train_corpus) model.train(train_corpus, total_examples=model.corpus_count, epochs=model.iter) models = [ # PV-DBOW (Skip-Gram equivalent of Word2Vec) Doc2Vec(dm=0, dbow_words=1, vector_size=200, window=8, min_count=10, epochs=50), # PV-DM w/average (CBOW equivalent of Word2Vec) Doc2Vec(dm=1, dm_mean=1, vector_size=200, window=8, min_count=10, epochs =50), ] ## Concatenated Model ## Train both PV-DBOW and PV-DM and combine the two documents = train_corpus models[0].build_vocab(documents) models[1].reset_from(models[0]) for model in models: model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)
from stop_words import get_stop_words from nltk.stem.porter import PorterStemmer import sys # reload(sys) # sys.setdefaultencoding('utf-8') p_stemmer = PorterStemmer() en_stop = get_stop_words('en') tokenizer = RegexpTokenizer(r'\w+') #This project is a small sample, applying doc2vec with small set of sample (1 million instances) #Main ideas of doc2vec: convert a list of words into a specific vector, #then compare the similarity of two sentences by compare distance between two vectors #1/ Create a model for clustering problem # **Note: Must determine parameter in this model depend on the user # model=Doc2Vec(dm=1,dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=2) model = Doc2Vec(alpha=0.025, min_alpha=0.025) #2/ Create a data set for training # - Use all questions( question in attribute question1 and question2) in both file train and test as data for training # - Each question is a separate "doc" need to be converted into a "vec" at some specific space # sent_raw="What is the step by step guide to invest in share market in india?" # # model.build_vocab(sentences=[LabeledSentence()]) # sent=LabeledSentence(words=sent_raw.split(" "),tags=[1]) # model.build_vocab(sentences=[sent]) # print(model.vocab) # model.train(sentences=[sent]) def read_data(): df = pd.read_csv("train.csv", encoding='utf8')
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing
import time

cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

# Settings shared by both configurations.
_shared_params = dict(window=10, negative=5, hs=0, workers=cores)
simple_models = [
    # PV-DBOW
    Doc2Vec(dm=0, vector_size=100, min_count=3, **_shared_params),
    # PV-DM
    Doc2Vec(dm=1, vector_size=300, min_count=2, **_shared_params),
]

# Only the PV-DBOW model gets its vocabulary built here, from scratch
# (update=False).
model = simple_models[0]
model.build_vocab(docData, update=False)
# Read every negative test review, keep the tokens whose lemma is not a stop
# word, and remember the review under a 'neg_<file name>' label.
for file in files:
    with open('aclImdb/test/neg/{}'.format(file), 'r',
              encoding='utf-8') as f:
        tokens = word_tokenize(f.read())
    # Every kept token is prefixed with a space, so a non-empty review starts
    # with one.
    review = ''.join(' ' + word for word in tokens
                     if lemm.lemmatize(word) not in stop_words)
    review_list.append(review)
    labels_list.append('neg_' + file)

it = LabeledLineSentence(doc_list=review_list, labels_list=labels_list)

# PV-DBOW with a constant learning rate (alpha == min_alpha).
model = Doc2Vec(
    size=3000,
    window=10,
    dm=0,
    alpha=0.025,
    min_alpha=0.025,
    min_count=5,
    workers=multiprocessing.cpu_count(),
)
model.build_vocab(it)
model.train(it, total_examples=4000, epochs=20)

# Save, then continue with the model as loaded from disk.
model.save('partial_Doc2Vec.model')
model = Doc2Vec.load('partial_Doc2Vec.model')

# Feature matrix and label vector for the classifier.
x_train = np.zeros((2000, 3000))
y_train = np.zeros(2000)

files = os.listdir('aclImdb/train/pos')[:1000]
def _pad_punctuation(text):
    """Return *text* with a space on each side of . , ! ? ; and :."""
    for char in ('.', ',', '!', '?', ';', ':'):
        text = text.replace(char, ' ' + char + ' ')
    return text


def convert_sentences(sentence_list):
    """Turn raw strings into TaggedDocument objects tagged by list position.

    Punctuation marks are split off as separate tokens and the text is then
    split on whitespace. The input list is left unmodified.

    sentence_list: sequence of str, one document per element.
    Returns a list of TaggedDocument, where element i has tags=[i].
    """
    return [
        TaggedDocument(words=_pad_punctuation(sentence).split(), tags=[i])
        for i, sentence in enumerate(sentence_list)
    ]


def normalize(x, p=2):
    """Scale *x* to unit p-norm; return it unchanged when its norm is zero.

    x: array-like accepted by np.linalg.norm.
    p: norm order forwarded to np.linalg.norm (default 2).
    """
    xx = np.linalg.norm(x, p)
    return x / xx if xx else x


cores = multiprocessing.cpu_count()

# Toy corpus: nine paper abstracts, one document each.
content = [
    "A head motion during brain imaging has been recognized as a source of image degradation and introduces distortion in positron emission tomography (PET) image. There are several techniques to correct the motion artifact, but these techniques cannot correct the motion during scanning. The aim of this study is to develop a sinogram-based motion correction (SBMC) method to correct directly the head motion during PET scanning using a motion tracking system and list-mode data acquisition. This method is a rebinning procedure by which the lines of response (LOR) are geometrically transformed according to the current values of the six-dimensional motion data. Michelogram was recomposed using rebinned LOR and motion corrected sinogram was generated. In the motion corrected image, the blurring artifact due to motion was reduced by SBMC method.",
    "A hierarchical controller for dealing with faults and adverse environmental conditions on an automated highway system is proposed. The controller extends a previous control hierarchy designed to work under normal conditions of operation. The faults are classified according to the capabilities remaining on the vehicle or roadside after the fault has occurred. Information about these capabilities is used by supervisors in each of the layers of the hierarchy to select appropriate fault handling strategies. We outline the strategies needed by the supervisors and give examples of their detailed operation",
    "A highly dependable embedded fault-tolerant memory architecture for high performance massively parallel computing applications and its dependability assurance techniques are proposed and discussed in this paper. The proposed fault tolerant memory provides two distinctive repair mechanisms: the permanent laser redundancy reconfiguration during the wafer probe stage in the factory to enhance its manufacturing yield and the dynamic BIST/BISD/BISR (built-in-self-test-diagnosis-repair)-based reconfiguration of the redundant resources in field to maintain high field reliability. The system reliability which is mainly determined by hardware configuration demanded by software and field reconfiguration/repair utilizing unused processor and memory modules is referred to as HW/SW Co-reliability. Various system configuration options in terms of parallel processing unit size and processor/memory intensity are also introduced and their HW/SW Co-reliability characteristics are discussed. A modeling and assurance technique for HW/SW Co-reliability with emphasis on the dependability assurance techniques based on combinatorial modeling suitable for the proposed memory design is developed and validated by extensive parametric simulations. Thereby, design and Implementation of memory-reliability-optimized and highly reliable fault-tolerant field reconfigurable massively parallel computing systems can be achieved.",
    "A highly efficient color correction approach based on color-encoded fringe projection is proposed, which combine color image segmentation and color intensity interpolation technique. Only 24 designed color patterns are projected and recorded to implement the process with a high brightness DLP projector and a color camera. To establish the correspondence between the designed color intensity and recorded color intensity, the recorded image is firstly segmented into some adjacent grid region by neighboring pixel intensity fitting error, the grid region is then grown to the region boundary employing some process algorithm, thirdly, the region number is labeled and adjusted based on the designed color pattern by searching the region centre coordinate and applying a man-machine conversation method, finally, the color correspondence relation is established according to the designed color pattern pixel index and the labeled grid region number of recorded image. While doing the color correction, firstly, the initial color intensity is searched according to the minimum color distance between the recorded color and designed color. Secondly, color interpolation is implemented to obtain the true color intensity correspondence to recorded color. The proposed approach validity is testified by experiment results.",
    "A high-performance line conditioner with excellent efficiency and power factor is proposed. The line conditioner consists of a three-leg rectifier-inverter, which operates as a boost converter and a buck converter. This boost-buck topology enables constant output voltage regulation, irrespective of input voltage disturbances. In addition the three-leg bridge can reduce the number of switching devices and system loss, while maintaining the capabilities of power factor correction and good output voltage regulation. The power factor controller for the single-phase pulse-width modulated (PWM) rectifier is derived using the feedback linearisation concept. The inverter side acts as a voltage regulator with current-limiting capability for impulsive loads. The disturbance of input voltage is detected using a fast-sensing technique. Experimental results obtained on a 3 kVA prototype show a normal efficiency of over 95% and input power factor of over 99%.",
    "A high-tech information electronic equipment of some given type is designed in order to proceed automatically fault detection and improve the efficiency and accuracy of diagnosis. This thesis which is a part of the program introduces the research of algorithm of fault diagnose expert system of a power supply circuit board of an electronic device and algorithm realization and example proving on the hardware platform. It's quicker and more convenient to locate fault on the circuit boards with this equipment. It's proved that this expert system can solve the problems of high cost and long intervals of maintenance and keep the equipment in a stable status",
    "A hydrogen-powered fuel cell vehicle is developed, in which a distributed control and communication system based on CAN (Controller Area Network) is built. For vehicle diagnostic purpose, a new on-board fault diagnosis strategy is presented. There are two efficient automotive diagnostic systems based on CAN designed and implemented in this paper: (1)CANoe is a powerful CAN development tool. A fault diagnosis environment based on CANoe is established to satisfy the needs of on-board and off-board fault diagnosis application of FCV. By setting up the communication interface between CANoe and Access, the vehicle fault codes are collected and stored. Meanwhile a database is designed for the management of fault information. (2) A hand-held fault diagnosis equipment as well as a windows analyzer interface is set up. All fault information from FCVpsilas CAN network can be gotten easily by the equipment. With the Serial Communication between the equipment and PC, the fault codes stored in the equipment can be read, analyzed and disposed by PC.",
    "A key attribute of any tester for FLIR systems is a calibrated uniform source. A uniform source ensures that any anomalies in performance are artifacts of the FLIR being tested and not the tester. Achieving a uniform source from a resistor array based portable infrared scene projector requires implementation of nonuniformity correction algorithms instead of controlling the bonding integrity of a source to a cooler, and the coating properties of the source typical of a conventional blackbody. The necessity to perform the non-uniformity correction on the scene projector is because the source is a two-dimensional array comprised of discrete resistive emitters. Ideally, each emitter of the array would have the same resistance and thus produce the same output for a given drive current. However, there are small variations from emitter to emitter over the thousands of emitters that comprise an array. Once a uniform output is achieved then the output must be calibrated for the system to be used as test equipment. Since the radiance emitted from the monolithic array is created by flowing current through micro resistors, a radiometric approach is used to calibrate the differential output of the scene projector over its dynamic range. The focus of this paper is to describe the approach and results of implementing non-uniformity correction and calibration on a portable infrared scene projector.",
    "A kind of routing scheme with the ability to tolerate the faults is necessary in the massively parallel multiprocessors. In this paper, we have proposed a kind of fault-tolerant routing scheme in the tori networks. The new routing scheme is called the two-level-turn-model routing scheme, which is based on our investigation of the fault-tolerant properties of the turn-model. Through employing two specific kinds of turn model, our routing scheme could tolerate the convex faults and the concave faults both with a few limitations to their shape. At most five virtual channels would be used to avoid the deadlock occurrence in the tori, no matter whether the fault regions are connected and no matter where the faults locate. Actually, if the fault regions encompass no physical boundary nodes in the tori, totally four virtual channels, each pair for each turn model, would be sufficient to preclude the occurrence of the deadlock. At last, the simulation shows the effectiveness of our scheme.",
]

content1 = convert_sentences(content)

model = Doc2Vec(size=10, window=10, min_count=5, workers=cores, alpha=0.025, min_alpha=0.025)
model.build_vocab(content1)

# Ten training rounds of 1000 epochs each. The learning rate is held constant
# inside a round (alpha == min_alpha) and lowered by 0.002 between rounds.
# NOTE(review): gensim's documentation discourages calling train() repeatedly
# with hand-managed alpha; a single train() call with the desired
# alpha/min_alpha/epochs is the recommended form. Left as-is to keep the
# resulting learning-rate schedule unchanged.
for epoch in range(10):
    model.train(content1, total_examples=model.corpus_count, epochs=1000)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay

# Debugger breakpoint left in by the author; `set_trace` is imported outside
# this excerpt. (A dangling, incomplete `model.` statement that followed it
# was a syntax error and has been removed.)
set_trace()

# infer_vector() expects a list of tokens. Passing the raw string would be
# treated as a sequence of single characters, so use the tokenized form of
# the first document.
query_words = content1[0].words

# Infer the same document twice and unit-normalize both vectors.
# NOTE(review): no exit condition is visible in this excerpt.
while True:
    x = model.infer_vector(query_words, alpha=model.alpha, min_alpha=model.min_alpha)
    y = model.infer_vector(query_words, alpha=model.alpha, min_alpha=model.min_alpha)
    x = normalize(x)
    y = normalize(y)
def __init__(self, model='doc2v', **params):
    """Remember which embedding backend was requested and build its model.

    model: backend identifier; 'doc2v' is the only value handled in the
        visible code.
    **params: keyword arguments forwarded unchanged to the Doc2Vec
        constructor.
    """
    self.embedding = model
    wants_doc2vec = self.embedding == 'doc2v'
    if wants_doc2vec:
        self.fn_model = Doc2Vec(**params)
return tokens train_tagged = train.apply( lambda r: TaggedDocument(words=tokenize_text(r['Desc']), tags=[r.Code]), axis=1) test_tagged = test.apply( lambda r: TaggedDocument(words=tokenize_text(r['Desc']), tags=[r.Code]), axis=1) print(train_tagged.values[30]) model_dmm = Doc2Vec(dm=1, dm_mean=1, window=10, negative=5, min_count=1, workers=5, alpha=0.065, min_alpha=0.065) model_dmm.build_vocab([x for x in tqdm(train_tagged.values)]) # for epoch in range(30): # model_dmm.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), # epochs=1) # model_dmm.alpha -= 0.002 # model_dmm.min_alpha = model_dmm.alpha print("Load modello doc2vec") model_dmm = Doc2Vec.load("0Model_Dmm.bin") y_train, X_train = vec_for_learning(model_dmm, train_tagged)
# This function does all cleaning of data using the two objects above
# (`tokenizer` and `stopword_set`).
def nlp_clean(data):
    """Lower-case, tokenize and remove stop words from one line of text.

    Tokens are returned in their original order and repeated words are kept.
    The earlier set-difference approach returned tokens in arbitrary set order
    and dropped repeats, which scrambles the context windows Doc2Vec trains on
    and makes runs non-reproducible.

    data: str, raw text of one document.
    Returns a list of str tokens.
    """
    tokens = tokenizer.tokenize(data.lower())
    return [token for token in tokens if token not in stopword_set]


class LabeledLineSentence(object):
    """Re-iterable corpus pairing line i of a text file with line i of a tag file."""

    def __init__(self, filename1, filename2):
        # filename1: path to the document file, one document per line.
        # filename2: path to the tag file; each line is split on whitespace
        #            to form the tags of the matching document.
        self.filename1 = filename1
        self.filename2 = filename2

    def __iter__(self):
        # Open both files on every pass and close them when the pass ends;
        # gensim iterates the corpus several times, so bare open() calls
        # would leak two file handles per pass.
        with open(self.filename1) as doc_file, open(self.filename2) as tag_file:
            for line1, line2 in zip(doc_file, tag_file):
                yield gensim.models.doc2vec.LabeledSentence(nlp_clean(line1), line2.split())


sentences = LabeledLineSentence('/home/eric/Data/reviewlist', '/home/eric/Data/idlist')

import gensim
from gensim.models import Doc2Vec
import os
import logging

logging.basicConfig(level=logging.INFO)

# Show the CPU affinity of this process (pid 0 means the calling process),
# then pin it to CPUs 0-3.
print(os.sched_getaffinity(0))
os.sched_setaffinity(0, range(4))

assert gensim.models.doc2vec.FAST_VERSION > -1

# Passing the corpus to the constructor builds the vocabulary and trains
# immediately.
model = Doc2Vec(sentences, size=300, min_count=10, alpha=0.025, min_alpha=0.001,
                workers=multiprocessing.cpu_count(), iter=15)
model.save("/home/eric/Data/doc2vec.model")