from glove import Glove
from glove import Corpus

'''Import the dataset'''
# Read the text lines into a list.
i = 1
lines = []
for line in open('clean_data.txt', encoding='utf-8'):
    lines.append(line.split(' '))
    print("appending line " + str(i))
    i += 1

# Prepare the dataset.
corpus_model = Corpus()
corpus_model.fit(lines, window=10)
#corpus_model.save('corpus.model')
print('Dictionary size: %s' % len(corpus_model.dictionary))
print('Collocations: %s' % corpus_model.matrix.nnz)

'''Train the model'''
gl = Glove(no_components=200, learning_rate=0.05)
gl.fit(corpus_model.matrix, epochs=5, no_threads=1, verbose=True)
gl.add_dictionary(corpus_model.dictionary)

'''Save the model'''
gl.save('glove.model')
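# Usage sketch for the model saved above. This assumes the glove_python API,
# where Glove.load is a classmethod and most_similar returns a list of
# (word, similarity) pairs; 'word' below is a placeholder token.
from glove import Glove

gl = Glove.load('glove.model')
print(gl.most_similar('word', number=10))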
def train_glove(inst, meta_data=None):
    # Avoid the mutable-default pitfall and make the default call runnable:
    # the function below assumes "models" and "most_similar" keys exist.
    if meta_data is None:
        meta_data = {}
    meta_data.setdefault("models", {})
    meta_data.setdefault("most_similar", {})

    start_total = datetime.now()
    meta_data["glove_params"] = settings.GLOVE_PARAMS
    glove_paramgrid = ParameterGrid(settings.GLOVE_PARAMS)
    for params in glove_paramgrid:
        start = datetime.now()
        # MAKE CORPUS
        # set corpus filepath
        corpus_fp = os.path.join(settings.WVEC_OPT_DIRP,
                                 '{}_window{}.glovecorpus'.format(settings.DATASET,
                                                                  params["window"]))
        # load if corpus exists
        if os.path.isfile(corpus_fp):
            logging.info("Loading existing corpus {}.".format(corpus_fp))
            corpus_model = Corpus.load(corpus_fp)
            logging.info("Successfully loaded existing corpus {}.".format(corpus_fp))
        # make a new cooccurrence corpus if it does not exist
        else:
            logging.info("Creating new corpus at {}.".format(corpus_fp))
            corpus_model = Corpus()
            corpus_model.fit(inst, window=params["window"])
            os.makedirs(settings.WVEC_OPT_DIRP, exist_ok=True)
            corpus_model.save(corpus_fp)
        logging.info("Dict size: {}.".format(len(corpus_model.dictionary)))
        logging.info("Collocations: {}.".format(corpus_model.matrix.nnz))

        # GLOVE VECTOR TRAINING
        glove = Glove(no_components=params["dims"], learning_rate=params["lr"])
        logging.info("Start fitting GloVe with parameters: {}.".format(params))
        glove.fit(corpus_model.matrix, epochs=params["epochs"],
                  no_threads=params["njobs"], verbose=False)
        glove.add_dictionary(corpus_model.dictionary)
        os.makedirs(settings.WVEC_OPT_DIRP, exist_ok=True)
        model_name = 'glove.{}_w{}_lr{}_ep{}.{}d.glovemodel'.format(
            settings.DATASET, params["window"], params["lr"],
            params["epochs"], params["dims"])
        glove.save(os.path.join(settings.WVEC_OPT_DIRP, model_name))
        duration = (datetime.now() - start).total_seconds()
        meta_data["models"][model_name] = params
        meta_data["models"][model_name]["duration_training"] = duration
        logging.info("Finished fitting GloVe {} in {}s with parameters: {}.".format(
            model_name, duration, params))

        # SIMILARITY TEST
        for test_word in settings.TESTSIM_WORDS:
            if test_word not in meta_data["most_similar"]:
                meta_data["most_similar"][test_word] = {}
            logging.info("Querying model {} for {} most similar to '{}':".format(
                model_name, settings.N_TESTSIM, test_word))
            sim = glove.most_similar(test_word, number=settings.N_TESTSIM)
            meta_data["most_similar"][test_word][model_name] = sim
            logging.info(pprint.pformat(sim))
    total_duration = (datetime.now() - start_total).total_seconds()
    meta_data["glove_duration_training"] = total_duration
    return meta_data
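# Hypothetical invocation sketch for train_glove. `sentences` stands in for
# `inst` (an iterable of token lists); the GLOVE_PARAMS shape shown here is an
# assumption about what ParameterGrid expects, not the project's actual values.
#
# settings.GLOVE_PARAMS = {'window': [5, 10], 'dims': [100, 300],
#                          'lr': [0.05], 'epochs': [30], 'njobs': [4]}
meta = train_glove(sentences, meta_data={"models": {}, "most_similar": {}})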
def main(model_select):
    data = pd.read_excel("./data/doc_set_final4.xlsx")
    data.token = data.token.apply(lambda x: literal_eval(x))
    data = data.sample(frac=1, random_state=1234)
    token_list = data.token.tolist()
    target = data[['new_class', 'new_small_class']]
    train_x_data, test_x_data, train_y, test_y = train_test_split(
        token_list, target, test_size=0.3, stratify=target,
        shuffle=True, random_state=1234)
    if model_select == 'w2v':
        w2v_name = 'base_token'
        print("Training model")
        word2vec_kargs = {'num_features': 300, 'num_workers': 4, 'window': 8,
                          'seed': 1234, 'min_word_count': 5,
                          'min_alpha': 0.025, 'iter': 30}
        model = word2vec_model(train_x_data, **word2vec_kargs)
        print("Saving model")
        model_name = './model/word_embedding/Word2vec1({}).model'.format(w2v_name)
        model.save(model_name)
    elif model_select == 'd2v':
        TaggedDocument = namedtuple('TaggedDocument', 'words tags')
        tagged_train_docs = [
            TaggedDocument(d, [c[1]['new_class'], c[1]['new_small_class']])
            for d, c in zip(train_x_data, train_y.iterrows())
        ]
        print("Training model")
        doc2vec_kargs = {'size': 300, 'window': 8, 'min_count': 5,
                         'alpha': 0.025, 'min_alpha': 0.025, 'workers': 4,
                         'seed': 1234, 'iter': 50}
        model = doc2vec_model(tagged_train_docs, **doc2vec_kargs)
        print("Saving model")
        model.save('./model/word_embedding/Doc2vec_new_small2_4.model')
    elif model_select == 'fasttext':
        print("Training model")
        ft_kargs = {'size': 300, 'window': 5, 'min_count': 3,
                    'workers': 4, 'seed': 1234}
        model = fasttext_model(train_x_data, **ft_kargs)
        print("Saving model")
        model.save('./model/word_embedding/FastText.model')
    elif model_select == 'glove':
        glove_kargs = {'size': 300, 'lr': 0.005, 'random_state': 1234,
                       'no_threads': 4, 'epoch': 30}
        corpus = Corpus()
        corpus.fit(train_x_data, window=8)
        glove = Glove(no_components=glove_kargs['size'],
                      learning_rate=glove_kargs['lr'])
        glove.fit(corpus.matrix, epochs=glove_kargs['epoch'],
                  no_threads=glove_kargs['no_threads'], verbose=True)
        glove.add_dictionary(corpus.dictionary)
        print("Saving model")
        glove.save('./model/word_embedding/glove.model')
    else:
        print("Choose one of: w2v, d2v, fasttext, glove")
for sublist in description:
    for item in sublist:
        desc_text.append(item)

"""
print(len(fulltext))
print(len(fulltext[0]))
print(len(fulltext[1]))
"""

corpus = Corpus()
desc = Corpus()
corpus.fit(fulltext, window=10)  # length of the (symmetric) context window used for cooccurrence
desc.fit(description, window=10)

desc_glove = Glove(no_components=100, learning_rate=0.05)
desc_glove.fit(desc.matrix, epochs=30, no_threads=4, verbose=True)
desc_glove.add_dictionary(desc.dictionary)
desc_glove.save('/Volumes/Untitled/WithDescription/CS/desc_glove.tsv')

glove = Glove(no_components=390, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)

#tsne = TSNE(n_components=2, verbose=1, perplexity=2, method='exact')
#tsne_results = tsne.fit_transform(desc_glove.word_vectors)

print(corpus.dictionary)
content_vector = glove.word_vectors  # matrix of word embeddings
with open("/Volumes/Untitled/WithDescription/CS/content.tsv", "w+") as my_csv:
    csvWriter = csv.writer(my_csv, delimiter=' ')
    csvWriter.writerows(content_vector)
# In[19]:
corpus = Corpus()

# In[20]:
corpus.fit(sentence_corpus, window=10)

# In[21]:
import sys

# In[23]:
glove = Glove(no_components=300, learning_rate=0.01)

# In[24]:
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)

# In[25]:
glove.add_dictionary(corpus.dictionary)

# In[26]:
len(glove.dictionary)

# In[29]:
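# Sketch for querying the fitted model (glove_python attribute names assumed):
# look up one token's vector and its nearest neighbours.
token = next(iter(glove.dictionary))  # any token known to the model
vec = glove.word_vectors[glove.dictionary[token]]
glove.most_similar(token, number=5)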
# tokenized = ['<rt>' if token == 'rt' else token for token in tokenized]  ## add <rt> token since it exists in glove
# tokenized = [token for token in tokenized if not token in stopwords.words('english')]  ## remove stopwords
# print(tokenized)
# tokenized = list(map(lambda x: lemmatizer.lemmatize(x), tokenized))

lr = 1e-4
num_epoch = 9
batch_size_train = 64
batch_size_test = 32
gradient_clip = 0.25

torch.manual_seed(7052020)
np.random.seed(7052020)

########################## choose embeddings ##########################
embedding_source = 'glove'
embeddings = Glove(glove_path).glove_dict
# embedding_source = 'bert'
# embeddings = Path(bert_path_train)

_time = time()
# bert_dict_train = None
# with open(Path(bert_path_train), 'r') as f:
#     bert_dict_train = json.load(f)
# print("Initialized train embeddings!")
# bert_dict_test = None
# with open(Path(bert_path_test), 'r') as f:
#     bert_dict_test = json.load(f)
# print("Initialized test embeddings!")

train_dataset = DatasetPyTorch(
train = idx[:split]
test = idx[split:]
X_train = data['review'].values[train]
del data
gc.collect()

preprocessor = get_preprocessor(stem=True, stop=True, min_length=3)

folder = 'dump'
os.makedirs(folder, exist_ok=True)
file = 'setup.npz'
if file in os.listdir(folder):
    logger(f'found saved {file}')
    glove = Glove(None, preprocessor, random_state=2021)
    glove.load(f'{folder}/{file}')
else:
    glove = Glove(X_train, preprocessor, random_state=2021)
    glove.dump_co_occurance(f'{folder}/{file}')
    logger(f'saved {file}')
del X_train
gc.collect()
print()

for dim in [2, 10, 50, 100, 200, 300, 400, 500, 600]:
    # Needs more iterations to converge for higher dims
    glove.fit(dim, eta=1e-2, epochs=200 if dim < 300 else 1000,
    print('[{}] Reading corpus from file...'.format(chalk.yellow(CORPUS_FILE)))
    corpus = Corpus.load(CORPUS_FILE)
else:
    nx_G = util.get_nx_graph()
    walks = util.get_node2vec_walks(nx_G)
    corpus = Corpus()
    corpus.fit(walks, window=WINDOW_SIZE)
    print('[{}] Writing corpus file...'.format(chalk.green(CORPUS_FILE)))
    corpus.save(CORPUS_FILE)

if os.path.exists(GLOVE_MODEL_FILE) and not args.train:
    print('[{}] Reading glove model from file...'.format(
        chalk.yellow(GLOVE_MODEL_FILE)))
    glove = Glove.load(GLOVE_MODEL_FILE)
else:
    glove = Glove(no_components=VECTOR_DIMENSION, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=GLOVE_EPOCHS,
              no_threads=PARALLEL_WORKER_COUNT, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    print('[{}] Writing glove file...'.format(chalk.green(GLOVE_MODEL_FILE)))
    glove.save(GLOVE_MODEL_FILE)

if args.query:
    dictionary = glove.dictionary
    print(glove.word_vectors[glove.dictionary[args.query]])
    print(glove.most_similar(args.query, number=40))


def get_glove_model():
    return glove
def corpus_to_glove(corpus):
    """Fit GloVe vectors on a prebuilt co-occurrence Corpus and attach its dictionary."""
    glove_model = Glove(no_components=50, learning_rate=0.07)
    glove_model.fit(corpus.matrix, epochs=1, no_threads=2)
    glove_model.add_dictionary(corpus.dictionary)
    return glove_model
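# Usage sketch for corpus_to_glove, assuming glove_python's Corpus; the toy
# sentences are placeholders.
from glove import Corpus

toy_sentences = [['hello', 'world'], ['hello', 'glove']]
corpus = Corpus()
corpus.fit(toy_sentences, window=5)
model = corpus_to_glove(corpus)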
def generateModel(traces, runID):
    linesentencedataKey = ""
    linesentencetimeKey = ""
    linesentencenumberKey = ""
    linesentencebiburstDataKey = ""
    linesentencebiburstTimeKey = ""
    linesentencePackLen = ""
    sentencesFile = "model/" + runID + "sentences.txt"
    modelFile = "model/" + runID + "mygloveModel"
    myFile = open(sentencesFile, 'w')
    mypackCount = 0
    for trace in traces:
        linesentencedataKey = ""
        linesentencetimeKey = ""
        linesentencenumberKey = ""
        linesentencebiburstDataKey = ""
        linesentencebiburstTimeKey = ""
        linesentencePackLen = ""
        directionCursor = None
        dataCursor = 0
        timeCursor = 0
        prevTimeCursor = 0
        burstTimeRef = 0
        numberCursor = 0
        secondBurstAndUp = False
        prevDataCursor = 0
        prevDirectionCursor = None
        for packet in trace.getPackets():
            if directionCursor == None:
                directionCursor = packet.getDirection()
            if packet.getDirection() != directionCursor:
                dataKey = 'S' + str(directionCursor) + '-' + str(
                    GloveClassifier.roundArbitrary(dataCursor, 600))
                #dataKey = 'S'+str(directionCursor)+'-'+str(dataCursor)
                if config.GLOVE_OPTIONS['burstSize'] == 1:
                    linesentencedataKey = linesentencedataKey + " " + dataKey
                #directionCursor = packet.getDirection()
                #dataCursor = 0
                timeKey = 'T' + str(directionCursor) + '-' + str(timeCursor)
                if config.GLOVE_OPTIONS['burstTime'] == 1:
                    linesentencetimeKey = linesentencetimeKey + " " + timeKey
                burstTimeRef = packet.getTime()
                # number marker
                numberKey = 'N' + str(directionCursor) + '-' + str(numberCursor)
                if config.GLOVE_OPTIONS['burstNumber'] == 1:
                    linesentencenumberKey = linesentencenumberKey + " " + numberKey
                numberCursor = 0
                # BiBurst
                if secondBurstAndUp:
                    biBurstDataKey = 'Bi-' + str(prevDirectionCursor) + '-' + str(directionCursor) + '-' + \
                        str(GloveClassifier.roundArbitrary(prevDataCursor, 600)) + '-' + \
                        str(GloveClassifier.roundArbitrary(dataCursor, 600))
                    if config.GLOVE_OPTIONS['biBurstSize'] == 1:
                        linesentencebiburstDataKey = linesentencebiburstDataKey + " " + biBurstDataKey
                    biBurstTimeKey = 'BiTime-' + str(prevDirectionCursor) + '-' + str(directionCursor) + '-' + \
                        str(prevTimeCursor) + '-' + \
                        str(timeCursor)
                    if config.GLOVE_OPTIONS['biBurstTime'] == 1:
                        linesentencebiburstTimeKey = linesentencebiburstTimeKey + " " + biBurstTimeKey
                prevTimeCursor = timeCursor
                timeCursor = 0
                secondBurstAndUp = True
                prevDataCursor = dataCursor
                dataCursor = 0
                prevDirectionCursor = directionCursor
                directionCursor = packet.getDirection()
            dataCursor += packet.getLength()
            timeCursor = packet.getTime() - burstTimeRef
            numberCursor += 1
            if config.GLOVE_OPTIONS['packetSize'] == 1:
                linesentencePackLen = linesentencePackLen + " " + str(
                    packet.getLength()) + "_" + str(packet.getDirection())
        if dataCursor > 0:
            #key = 'S'+str(directionCursor)+'-'+str(dataCursor)
            key = 'S' + str(directionCursor) + '-' + str(
                GloveClassifier.roundArbitrary(dataCursor, 600))
            if config.GLOVE_OPTIONS['burstSize'] == 1:
                linesentencedataKey = linesentencedataKey + " " + key
            timeKey = 'T' + str(directionCursor) + '-' + str(timeCursor)
            if config.GLOVE_OPTIONS['burstTime'] == 1:
                linesentencetimeKey = linesentencetimeKey + " " + timeKey
            numberKey = 'N' + str(directionCursor) + '-' + str(numberCursor)
            if config.GLOVE_OPTIONS['burstNumber'] == 1:
                linesentencenumberKey = linesentencenumberKey + " " + numberKey
            # BiBurst
            if secondBurstAndUp:
                #biBurstDataKey = 'Bi-'+str(prevDirectionCursor)+'-'+str(directionCursor)+'-'+ \
                #    str(prevDataCursor)+'-'+ \
                #    str(dataCursor)
                biBurstDataKey = 'Bi-' + str(prevDirectionCursor) + '-' + str(directionCursor) + '-' + \
                    str(GloveClassifier.roundArbitrary(prevDataCursor, 600)) + '-' + \
                    str(GloveClassifier.roundArbitrary(dataCursor, 600))
                if config.GLOVE_OPTIONS['biBurstSize'] == 1:
                    linesentencebiburstDataKey = linesentencebiburstDataKey + " " + biBurstDataKey
                biBurstTimeKey = 'BiTime-' + str(prevDirectionCursor) + '-' + str(directionCursor) + '-' + \
                    str(prevTimeCursor) + '-' + \
                    str(timeCursor)
                if config.GLOVE_OPTIONS['biBurstTime'] == 1:
                    linesentencebiburstTimeKey = linesentencebiburstTimeKey + " " + biBurstTimeKey
        myFile.write(linesentencePackLen + linesentencedataKey + linesentencetimeKey +
                     linesentencenumberKey + linesentencebiburstDataKey +
                     linesentencebiburstTimeKey)
        myFile.write("\n")
    myFile.close()
    if config.CLASSIFIER == config.GLOVE_CLASSIFIER:
        sentences = models.word2vec.LineSentence(sentencesFile)
        corpus = Corpus()
        corpus.fit(sentences, window=config.GLOVE_PARAMETERS['window'])
        glove = Glove(no_components=config.GLOVE_PARAMETERS['no_components'],
                      learning_rate=config.GLOVE_PARAMETERS['learning_rate'])
        glove.fit(corpus.matrix, epochs=config.GLOVE_PARAMETERS['epochs'],
                  no_threads=10, verbose=False)
        glove.add_dictionary(corpus.dictionary)
        glove.save(modelFile)
    elif config.CLASSIFIER == config.W2V_CLASSIFIER:
        txt = open(sentencesFile)
        # print txt.read()
        if (len(txt.read()) > 0):
            #print "in here"
            txt.close()
            sentences = models.word2vec.LineSentence(sentencesFile)
            model = models.word2vec.Word2Vec(sentences, size=50, window=15,
                                             min_count=1, workers=4)
            model.save("word2vecModel")
        txt.close()
try:
    with open('text8') as f:
        words = f.read()
except:
    msg = ('Missing "text8" file!\n'
           'Try "wget https://data.deepai.org/text8.zip" and unzipping '
           'text8.zip, then retry this script!')
    raise Exception(msg)


def preprocessor(text, to_tokens=False):
    if to_tokens:
        return text.split()
    return [text.split()]


file = 'setup.npz'
if file in os.listdir(folder):
    logger(f'found saved {file}')
    glove = Glove(None, preprocessor, random_state=2021, x_min=2, x_max=20)
    glove.load(f'{folder}/{file}')
else:
    start = process_time()
    glove = Glove([words], preprocessor, random_state=2021, x_min=2, x_max=20)
    time = process_time() - start
    glove.dump_co_occurance(f'{folder}/{file}', time=time)
del words
gc.collect()
print()

for dim in [2, 10, 50, 100, 200, 300, 400, 500, 600]:
    filename = f'glove-{dim}.npz'
    start = process_time()
    glove.fit(dim, eta=0.5, epochs=500, optimiser='adam', decay=1e-2)
    time = process_time() - start
    def train(self, epochs=30, no_threads=None):
        """
        Train GloVe on your own data: one or more corpora, or a dataframe column.

        Parameters
        ----------
        epochs : int
            Total number of training epochs.
        no_threads : int, optional
            Number of threads to use for training.

        Example
        -------
        >>> from ekushey.feature_extraction import BN_GloVe

        # Training against sentences
        >>> glv = BN_GloVe(sentences=[['আমার', 'প্রিয়', 'জন্মভূমি'], ['বাংলা', 'আমার', 'মাতৃভাষা'], ['বাংলা', 'আমার', 'মাতৃভাষা'], ['বাংলা', 'আমার', 'মাতৃভাষা'], ['বাংলা', 'আমার', 'মাতৃভাষা']])
        >>> glv.train()

        # Training against one text corpus
        >>> glv = BN_GloVe(corpus_file="path_to_corpus.txt")
        >>> glv.train()

        # Training against multiple corpora
        #   path
        #   ->corpus
        #       ->1.txt
        #       ->2.txt
        #       ->3.txt
        >>> glv = BN_GloVe(corpus_path="path/corpus")
        >>> glv.train(epochs=25)

        # Training against a dataframe column
        >>> glv = BN_GloVe(df=news_data['text_content'])
        >>> glv.train(epochs=25)
        """
        if not self.sentences and not self.corpus_file and not self.corpus_path and self.df is None:
            raise Exception('Data is not given')
        elif self.sentences:
            data = self.sentences
            print("Got sentences")
        elif self.corpus_file:
            print("Got corpus file")
            data = PathLineSentences(self.corpus_file)
        elif self.corpus_path:
            print("Got corpus path")
            data = PathLineSentences(self.corpus_path)
        elif self.df is not None:
            print("Got dataframe")
            data = '\n'.join(self.df)
            data = data.split('\n')
            data = [sent.split() for sent in data]
        else:
            print("Unexpected error occurred: please check your data file again.")

        if no_threads is None:
            no_threads = self.cpu_cores

        t = time()
        corpus = Corpus()
        corpus.fit(data, window=self.window)
        print('Dict size: %s' % len(corpus.dictionary))
        glove = Glove(no_components=self.size, learning_rate=self.n)
        glove.fit(corpus.matrix, epochs=epochs, no_threads=no_threads, verbose=True)
        print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))
        glove.add_dictionary(corpus.dictionary)
        print("Saving model to current directory")
        glove.save(self.model_name)
import socket

import settings
from protocolparser import ProtocolParser
from glove import Glove
from datetime import datetime

test_time = 1000 / (10 * settings.polling_rate) * 1000000  # Microseconds

# Open up the connection
serverMACAddress = settings.MAC_Juuso
port = 3
s = socket.socket(socket.AF_BLUETOOTH, socket.SOCK_STREAM, socket.BTPROTO_RFCOMM)
s.connect((serverMACAddress, port))

# Make a virtual glove and parser
glove = Glove(always_send_all_sensor_data=settings.glove_sends_all_data)
parser = ProtocolParser()
parser.init_send(2)

start_time = datetime.now()
last_poll_time = datetime.now()
last_movement_time = datetime.now()
hz_in_microsecs = 1 / (settings.polling_rate * 1000 * 1000)
test_ticks = 10000  # The number of times our finger changes physical position

# Run loop for the duration of the test
while (last_poll_time - start_time).microseconds < test_time:
    # Increment finger position if needed
    if (datetime.now() - last_movement_time).microseconds > test_time / test_ticks:
start_time = time.time()
sentences = []
for word in vocabulary:
    sentences.extend(randomNWalkUniform(triples, word, walks, path_depth))
elapsed_time = time.time() - start_time
print('Time elapsed to generate features:',
      time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

# embedding models
# GloVe, 10 dimensions
corpus = Corpus()
corpus.fit(sentences, window=10)
glove_10 = Glove(no_components=10, learning_rate=0.05)
glove_10.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove_10.add_dictionary(corpus.dictionary)
glove_10.save('glove_10.model')

# GloVe, 15 dimensions
glove_15 = Glove(no_components=15, learning_rate=0.05)
glove_15.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove_15.add_dictionary(corpus.dictionary)
glove_15.save('glove_15.model')

# fastText, 10 dimensions
print("start fast 10")
modelf = gensim.models.FastText(size=10, workers=5, window=10,
#%%
# Train GloVe word vectors
from glove import Glove
from glove import Corpus

# Read the training data and convert it to Corpus form first.
sentense = []
with open("poem_for_embedding.txt") as f:
    for line in f.readlines():
        sentense.append(line.replace("\n", "").split(" "))
corpus_model = Corpus()
corpus_model.fit(sentense, window=5)  # window: sliding-window size

# Train GloVe
embedding_dim = 10
glove = Glove(no_components=embedding_dim, learning_rate=0.05)  # no_components: embedding dimension
glove.fit(corpus_model.matrix, epochs=10, no_threads=4, verbose=True)  # verbose: whether to print training info
glove.add_dictionary(corpus_model.dictionary)
glove.save(f'glove_{embedding_dim}.txt')
# glove = Glove.load(f'glove_{embedding_dim}.txt')
# glove.most_similar('我', number=10)
        self.wseq = wseq

    def __iter__(self):
        for i in range(0, self.mtx.shape[0]):
            b = np.asarray(self.mtx[i, :].todense())[0]
            idx = get_hot_idx(b)
            doc = get_word_seq(idx, self.wseq)
            yield list(doc)


for b in range(len(cuts)):
    corpus = iterate_corpus(d[:cuts[b], :], wseq)
    corpus_model = Corpus()
    corpus_model.fit(corpus, window=8)
    # corpus_model.save('08_Glove/corpus.model')
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)
    glove = Glove(no_components=fsize, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=ite, no_threads=6, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    glove.save('08_Glove/model_d' + str(fsize) + '_size' + str(cuts[b]) + '.model')
import pandas as pd
import numpy as np
import os

from glove import Glove
from ccs_tools import dx_multi, pr_multi

DX_cat = ['missing'] + sorted(dx_multi.ICD9CM_CODE)
PR_cat = ['missing'] + sorted(pr_multi.ICD9CM_CODE)
code_cat = ['missing'] + sorted(dx_multi.ICD9CM_CODE) + sorted(pr_multi.ICD9CM_CODE)
n_DX_cat = len(DX_cat)
n_PR_cat = len(PR_cat)
n_code_cat = len(code_cat)

path = '/nfs/turbo/umms-awaljee/wsliu/Data/NRD/'
model_path = path + 'models/'
if not os.path.exists(model_path):
    os.mkdir(model_path)

g = Glove(input_dim=n_code_cat, embedding_dim=100)
cooccur_df = pd.read_csv(path + 'all/cooccur_df.csv')
g.train_glove(cooccur_df=cooccur_df, cache_path=model_path, epochs=100, verbose=2)
embed_mat = g.get_embed_mat()
np.save(path + 'all/embed_mat0823.npy', embed_mat)
with open('../../output/vocabs_100.txt', 'r') as vbf:
    for line in vbf.readlines():
        vocab.append(line.strip())

# Build the dictionary and compute the co-occurrence matrix.
dictionary = {}
for i, word in enumerate(vocab):
    dictionary[word] = i
corpus = []
with open('../../input/wiki.500.txt', 'r') as cf:
    for line in cf.readlines():
        corpus.append([word for word in line.split()])
corpus_obj = Corpus(dictionary=dictionary)
corpus_obj.fit(corpus, window=10, ignore_missing=True)  # yields a sparse upper-triangular matrix
corpus_obj.save('../../output/corpus_obj')
# corpus_obj = Corpus.load('../../output/corpus_obj')  # attributes: dictionary, matrix

glove = Glove(no_components=100, learning_rate=0.05, alpha=0.75,
              max_count=1000, max_loss=10.0, random_state=None)
glove.fit(corpus_obj.matrix, epochs=100, no_threads=6, verbose=True)
glove.add_dictionary(dictionary=dictionary)
wordvectors = glove.word_vectors.round(decimals=6)
with open('../../output/glove100.wv', 'w') as wvf:
    for i, wv in enumerate(wordvectors):
        wvf.write(vocab[i] + ' ' + str(list(wv))[1:-1].replace(', ', ' ') + '\n')
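# Round-trip sketch for the glove100.wv file written above, assuming the
# "word v1 v2 ... v100" per-line format produced there.
import numpy as np

loaded = {}
with open('../../output/glove100.wv') as wvf:
    for line in wvf:
        parts = line.rstrip('\n').split(' ')
        loaded[parts[0]] = np.array([float(x) for x in parts[1:]])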
def topk_recall_glove_embedding(click_all, dict_label, k=100, dim=88,
                                epochs=30, learning_rate=0.5):
    import psutil
    from glove import Glove
    from glove import Corpus

    data_ = click_all.groupby(['pred', 'user_id'])['item_id'].agg(
        lambda x: ','.join(list(x))).reset_index()
    list_data = list(data_['item_id'].map(lambda x: x.split(',')))

    corpus_model = Corpus()
    corpus_model.fit(list_data, window=999999)

    glove = Glove(no_components=dim, learning_rate=learning_rate)
    glove.fit(corpus_model.matrix, epochs=epochs,
              no_threads=psutil.cpu_count(), verbose=True)
    glove.add_dictionary(corpus_model.dictionary)

    list_user_id = []
    list_item_similar = []
    list_score_similar = []
    print('------- glove recall ---------')
    for _, row in tqdm(data_.iterrows()):
        list_item_id = row['item_id'].split(',')
        dict_item_id_score = {}
        # iterate most recent items first; i is the recency rank
        for i, item in enumerate(list_item_id[::-1]):
            most_topk = glove.most_similar(item, number=k)
            for item_similar, score_similar in most_topk:
                if item_similar not in list_item_id:
                    if item_similar not in dict_item_id_score:
                        dict_item_id_score[item_similar] = 0
                    sigma = 0.8
                    dict_item_id_score[item_similar] += 1.0 / (1 + sigma * i) * score_similar
        dict_item_id_score_topk = sorted(dict_item_id_score.items(),
                                         key=lambda kv: kv[1], reverse=True)[:k]
        assert len(dict_item_id_score_topk) == k
        dict_item_id_set = set([item_similar for item_similar, score_similar
                                in dict_item_id_score_topk])
        assert len(dict_item_id_set) == k
        for item_similar, score_similar in dict_item_id_score_topk:
            list_item_similar.append(item_similar)
            list_score_similar.append(score_similar)
            list_user_id.append(row['user_id'])

    topk_recall = pd.DataFrame({
        'user_id': list_user_id,
        'item_similar': list_item_similar,
        'score_similar': list_score_similar
    })
    topk_recall['next_item_id'] = topk_recall['user_id'].map(dict_label)
    topk_recall['pred'] = topk_recall['user_id'].map(
        lambda x: 'train' if x in dict_label else 'test')
    return topk_recall
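# Hypothetical call sketch: click_all is assumed to be a DataFrame with
# columns ['pred', 'user_id', 'item_id'] and dict_label a user_id -> next-item
# mapping, as implied by the function body above.
topk = topk_recall_glove_embedding(click_all, dict_label, k=100, dim=88,
                                   epochs=30, learning_rate=0.5)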
topics = [[] for i in range(len(df))]
para = [[] for i in range(len(df))]
topics1 = [[] for i in range(len(df))]
para1 = [[] for i in range(len(df))]
for i in range(len(df_combined)):
    text = df_combined.iloc[i][0]
    text = str(text)
    topics[i] = preprocess_text(text)
    text = df_combined.iloc[i][1]
    text = str(text)
    para[i] = preprocess_text(text)

corpus = Corpus()
corpus.fit(para, window=10)
glove = Glove(no_components=5, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)
# glove.save('glove.model')

corpus1 = Corpus()
corpus1.fit(topics, window=10)
glove1 = Glove(no_components=5, learning_rate=0.05)
glove1.fit(corpus1.matrix, epochs=30, no_threads=4, verbose=True)
glove1.add_dictionary(corpus1.dictionary)

import numpy as np


def get_answers(df_combined, query1):
    query = Answer_Pre_Processing(query1)
# DX codes appear twice on purpose: DX1 and DX2+ occupy separate index ranges
# (see DX1_dict and DX_dict below).
code_cat = ['missing'] + sorted(dx_multi.ICD9CM_CODE) + sorted(
    dx_multi.ICD9CM_CODE) + sorted(pr_multi.ICD9CM_CODE)
DX1_dict = dict(zip(DX1_cat, range(len(DX_cat))))
DX_dict = dict(zip(DX_cat, [0] + list(range(len(DX_cat), len(DX_cat) * 2))))
PR_dict = dict(zip(PR_cat, [0] + list(range(len(DX_cat) * 2 - 1,
                                            len(DX_cat) * 2 + len(PR_cat) - 1))))
DXs = ['DX' + str(j) for j in range(2, 31)]
PRs = ['PR' + str(j) for j in range(1, 16)]
unclassified = set(dx_multi.loc[dx_multi.CCS_LVL1 == '18', 'ICD9CM_CODE'])

g = Glove(input_dim=len(code_cat), embedding_dim=100)

#dtypes = dict(zip(DXs, [bytes]*30))
#dtypes.update(zip(PRs, [bytes]*15))
dxpr_df = pd.read_csv(path + 'raw/2014/NRD_2014_Core.CSV', sep=',', header=None,
                      names=core_cols, dtype=core_dtypes_pd,
                      na_values=na_values, chunksize=500000)
chunk_id = 0
for df in dxpr_df:
    start = time.time()
corpus4 = Corpus()
corpus4.fit(inputPosts2, window=10)
glove4 = Glove(no_components=100, learning_rate=0.05)
glove4.fit(corpus4.matrix, epochs=1000, no_threads=10, verbose=True)
glove4.add_dictionary(corpus4.dictionary)
glove4.save('GPStemmedOneList.model')

'''
corpus = Corpus()
corpus.fit(inputPosts, window=10)
glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=1000, no_threads=10, verbose=True)
glove.add_dictionary(corpus.dictionary)
glove.save('GP.model')
'''

corpus2 = Corpus()
corpus2.fit(inputPosts2, window=10)
glove2 = Glove(no_components=100, learning_rate=0.05)
glove2.fit(corpus2.matrix, epochs=1000, no_threads=10, verbose=True)
glove2.add_dictionary(corpus2.dictionary)
glove2.save('MedHelpStemmed.model')
cooccur_all = pd.read_csv(path + 'multi_space_glove/cooccur_df_all_10.csv')
all_df = pd.read_csv(path + 'cohorts20/{}/pred_comorb.csv'.format(cohort),
                     dtype=core_dtypes_pd)
preprocessed = preprocess(all_df, DX1_cat=DX1_cat, DX_cat=DX_cat, PR_cat=PR_cat)
DX1_dict = preprocessed['DX1_dict']
DX_dict = preprocessed['DX_dict']
PR_dict = preprocessed['PR_dict']
code_cat = preprocessed['code_cat']
hosp_cat = preprocessed['hosp_cat']
dx1_ccs_dict = preprocessed['dx1_ccs_dict']
dx_ccs_dict = preprocessed['dx_ccs_dict']
pr_ccs_dict = preprocessed['pr_ccs_dict']
parent_pairs = preprocessed['parent_pairs']

g = Glove(input_dim=len(code_cat), embedding_dim=code_embed_dim, count_cap=count_cap)
g.train_glove(cooccur_df=cooccur_all,
              cache_path=model_path + 'temp/{}/'.format(job_index),
              batch_size=1024 * 8, epochs=80, earlystop_patience=10,
              reducelr_patience=2, parent_pairs=parent_pairs, lamb=penalty,
              metric=penalty_metric, verbose=2)
embed_mat = g.get_embed_mat()

all_df = preprocessed['int_df']
tst_key = pd.read_csv(path + 'cohorts20/{}/tst_key{}.csv'.format(cohort, tst_seed),
                      names=['KEY_NRD'])
tst_df = all_df.loc[all_df.KEY_NRD.isin(tst_key.KEY_NRD)]
train_df0 = all_df.loc[~all_df.KEY_NRD.isin(tst_key.KEY_NRD)].reset_index()

## convert different variables into different np.array
n_DX = 29
n_PR = 15
DXs = ['DX' + str(j) for j in range(2, n_DX + 2)]
PRs = ['PR' + str(j) for j in range(1, n_PR + 1)]
age_mean = train_df0['AGE'].mean()
def train_and_eval_crf(thread_ids, posts, labels, max_posts=20, max_words=400,
                       frac=[0.8, 0.1, 0.1], seed=0, batch_size=9,
                       embedding='glove', max_epoch=500, validate=False,
                       result_dir=None):
    # preliminary checks
    if len(thread_ids) != len(posts) or \
            len(thread_ids) != len(labels) or \
            len(posts) != len(labels):
        raise Exception('Invalid length of data.')
    if len(frac) != 3 or frac[0] + frac[1] + frac[2] != 1:
        raise Exception('Invalid value of frac.')
    if frac[0] <= 0 or frac[1] <= 0 or frac[2] <= 0:
        raise Exception('Invalid value(s) for one or more frac element(s).')
    if embedding not in ['glove']:
        raise Exception('Invalid embedding.')

    train_texts, train_labels, test_texts, test_labels, val_texts, val_labels = \
        utils.filter_and_shuffle_data(thread_ids, posts, labels, max_words,
                                      max_posts, seed, frac)

    # from here on is glove-specific implementation (may need to extract to a function)
    print('Init embedding')
    glove = Glove()
    glove.create_custom_embedding([item for sublist in train_texts for item in sublist])
    glove.add_to_embedding(['.', '!', '?'])

    print('Padding and packing data into data loader')
    for i, thread in enumerate(train_texts):
        for j, post_text in enumerate(thread):
            train_texts[i][j] = glove.sentence_to_indices(post_text, seq_len=max_words)
    for i, thread in enumerate(test_texts):
        for j, post_text in enumerate(thread):
            test_texts[i][j] = glove.sentence_to_indices(post_text, seq_len=max_words)
    for i, thread in enumerate(val_texts):
        for j, post_text in enumerate(thread):
            val_texts[i][j] = glove.sentence_to_indices(post_text, seq_len=max_words)

    # padding at the post level
    post_padding = [glove.word2idx[glove.pad_token]] * max_words
    for posts in [train_texts, test_texts, val_texts]:
        for sublist in posts:
            if len(sublist) < max_posts:
                sublist.extend([post_padding] * (max_posts - len(sublist)))

    train_masks, test_masks, val_masks = [], [], []

    def get_to_append(n_labels):
        # mask: 1 for real posts, 0 for padding
        to_append = [1] * n_labels
        if len(to_append) < max_posts:
            to_append.extend([0] * (max_posts - n_labels))
        return to_append

    for labels in train_labels:
        train_masks.append(get_to_append(len(labels)))
    for labels in test_labels:
        test_masks.append(get_to_append(len(labels)))
    for labels in val_labels:
        val_masks.append(get_to_append(len(labels)))

    for labels in [train_labels, test_labels, val_labels]:
        for sublist in labels:
            if len(sublist) < max_posts:
                sublist.extend([0] * (max_posts - len(sublist)))

    train_loader = utils.to_data_loader(batch_size, train_texts, train_labels, train_masks)
    test_loader = utils.to_data_loader(batch_size, test_texts, test_labels, test_masks)
    val_loader = utils.to_data_loader(batch_size, val_texts, val_labels, val_masks)

    print('Creating model')
    embedding = create_emb_layer(
        torch.from_numpy(glove.weights_matrix).float().to(utils.get_device()))
    model = hLSTM_CRF(num_tags=2,
                      input_size=glove.emb_dim,
                      hidden_size=glove.emb_dim,
                      output_size=glove.emb_dim,
                      batch_size=batch_size,
                      num_layers=1,
                      bidirectional=False,
                      embedding=embedding,
                      drop_prob=0.5,
                      max_output=max_posts,
                      device=utils.get_device())

    labels = [label for sublist in train_labels for label in sublist]
    intervention_ratio = len([label for label in labels if label == 1]) / len(labels)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    writer = None
    if result_dir is not None:
        writer = SummaryWriter(f'runs/{result_dir}')
        if not os.path.exists(f'models/{result_dir}'):
            os.makedirs(f'models/{result_dir}')
    if not validate:
        val_loader = None

    print('Start training model')
    model.zero_grad()
    model.train()
    running_loss = 0.0
    for epoch in range(max_epoch):
        if (epoch + 1) % 20 == 0:
            print(f'Training model ({epoch + 1} / {max_epoch})')
        for i, (inputs, labels, masks) in enumerate(train_loader):
            inputs, labels, masks = (inputs.to(utils.get_device()),
                                     labels.to(utils.get_device()),
                                     masks.to(utils.get_device()))
            optimizer.zero_grad()  # reset gradients for each mini-batch
            loss = model.loss(inputs, labels, masks)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
            optimizer.step()
            running_loss += loss.item()
            if i % 1000 == 999:  # every 1000 mini-batches
                if writer is not None:
                    writer.add_scalar('training loss',
                                      running_loss / 1000,
                                      epoch * len(train_loader) + i)
                running_loss = 0.0
                if val_loader is not None and writer is not None:
                    f1, _, _ = eval_model(model, val_loader)
                    writer.add_scalar('validation f1', f1,
                                      epoch * len(train_loader) + i)

    print('Evaluating model')
    f1, precision, recall = eval_model(model, test_loader, False)
    print(f'''
    Test results:
        F1        = {f1}
        Precision = {precision}
        Recall    = {recall}
    ''')

    if result_dir is not None:
        print('Saving final model')
        torch.save(model.state_dict(), f'models/{result_dir}/final_model.pth')

    print('DONE :)))')
# trg_word2vec = Word2Vec.load('trg_embedd.model')
# for i in range(src_vocabsize):
#     word = list(SRC.vocab.stoi.keys())[i]
#     if word in src_word2vec.wv.index2word:
#         src_embed_mtrx[SRC.vocab.stoi[word]] = torch.tensor(src_word2vec.wv[word].copy()).to(device)
#
# for i in range(trg_vocabsize):
#     word = list(TRG.vocab.stoi.keys())[i]
#     if word in trg_word2vec.wv.index2word:
#         trg_embed_mtrx[TRG.vocab.stoi[word]] = torch.tensor(trg_word2vec.wv[word].copy()).to(device)

''' for glove '''
src_glove = Glove.load('src_glove.model')
trg_glove = Glove.load('trg_glove.model')

for word in list(SRC.vocab.stoi.keys()):
    if word in src_glove.dictionary:
        src_embed_mtrx[SRC.vocab.stoi[word]] = torch.tensor(
            src_glove.word_vectors[src_glove.dictionary[word]].copy()).to(device)

for word in list(TRG.vocab.stoi.keys()):
    if word in trg_glove.dictionary:
        # index into TRG's vocab here (the original indexed SRC by mistake)
        trg_embed_mtrx[TRG.vocab.stoi[word]] = torch.tensor(
            trg_glove.word_vectors[trg_glove.dictionary[word]].copy()).to(device)

print("pretrained word embeddings loaded")
sys.stdout.flush()
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
# Calculate the distribution, to account for the 95th percentile of message lengths.
max_sentence_length = int(np.mean([len(x) for x in texts]) +
                          (norm.ppf(0.95) * np.std([len(x) for x in texts])))
print("Max sentence length: {}, put that in settings.json.".format(max_sentence_length))

corpus = Corpus()
try:
    print("Loading pretrained corpus...")
    corpus = Corpus.load("cache/corpus.p")
except Exception:
    print("Training corpus...")
    corpus.fit(texts, window=max_sentence_length)
    corpus.save("cache/corpus.p")

glove = Glove(no_components=number_components, learning_rate=0.05)
try:
    print("Loading pretrained GloVe vectors...")
    glove = Glove.load("cache/glove.p")
except Exception:
    print("Training GloVe vectors...")
    # More epochs seems to make it worse
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save("cache/glove.p")

# Convert input text
print("Vectorizing input sentences...")
X = vectify(texts, previous_message, glove.dictionary, max_sentence_length, contextual)
y = np.array([x == u'1' for x in classes]).astype(np.int32)
        ETRI_mod_dict=ETRI_dependency_mod_dict,
        weight=weight)
    print('add dependency complete')

    ''' reduce the co-occurrence matrix '''
    remove_pos_list = [
        'JKS', 'JKC', 'JKG', 'JKO', 'JKB', 'JKV', 'JKQ', 'JX', 'JC', 'EP',
        'EF', 'EC', 'ETN', 'ETM', 'XPN', 'XSN', 'XSV', 'XSA', 'XR', 'SF',
        'SE', 'SSO', 'SSC', 'SC', 'SY', 'SL', 'SH', 'SN', 'UNKNOWN',
        'UNDETERMINED'
    ]
    co_occurrence_mat, vocab2idx, idx2vocab = reduce_matrix(
        co_occurrence_mat, remove_pos_list, vocab2idx, idx2vocab)
    print("Remove unnecessary POS complete")

    ''' train the glove '''
    co_occurrence_csrmat = sparse.csr_matrix(co_occurrence_mat)
    glove = Glove(no_components=n_dim)
    ret_dict = glove.fit(co_occurrence_csrmat.tocoo(), epochs=n_epoch, verbose=True)
    glove.add_dictionary(vocab2idx)
    print("Training glove complete")

    for i in range(n_epoch):
        if i % 100 == 0 or i == n_epoch - 1:
            one_word_vectors = ret_dict[i]
            ''' explore the glove '''
            emotion_centroid_dict = find_emotion_centroid(
                one_word_vectors, emotion2word_dict, vocab2idx)
            ''' convert word vectors to 6-dim emotion vectors '''
            matrix_based_emotion, emotion_order = transform_based_emotion(
                one_word_vectors,
import tensorflow as tf
import pandas as pd
from glove import Corpus, Glove

MAX_WORDS = 200000
MAX_LEN = 200000

patents = pd.read_csv("txtheaders.csv")
patent_text = patents["txt"].str.lower()

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=MAX_WORDS,
                                                  lower=True, oov_token="OOV")
tokenizer.fit_on_texts(patent_text)
patent_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(patent_text), maxlen=MAX_LEN)

patent_words = []
for t in range(patent_sequences.shape[0]):
    pt1 = tokenizer.sequences_to_texts(patent_sequences[t].reshape(1, MAX_LEN))
    pt2 = [x for x in pt1[0].split(" ")
           if not any(char.isdigit() for char in x) and len(x) < 16]
    # append (not extend): Corpus.fit expects an iterable of token lists,
    # so keep each patent's tokens as their own list
    patent_words.append(pt2)

corpus = Corpus()
corpus.fit(patent_words, window=20)

embeddings = Glove(no_components=200, learning_rate=0.05)
embeddings.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
embeddings.add_dictionary(corpus.dictionary)
embeddings.save('patent.glove')
# embeddings = Glove.load('patent.glove')
print(embeddings.word_vectors[embeddings.dictionary['computer']])
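# Follow-up query sketch (glove_python API): nearest neighbours of a token
# present in the fitted dictionary.
print(embeddings.most_similar('computer', number=10))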
    newfather[s] = f

id2code = {}
for k in types:
    id2code[types[k]] = k

corMat = np.zeros((len(types), len(types)))
for i in range(len(types)):
    corMat[i, i] = 1
    curId = i
    while curId in newfather:
        f = newfather[curId]
        corMat[i, f] = 1
        curId = f

trainFile = '../data/mimic/mimic.train'
coOccurMat = np.zeros((len(types), len(types)))
trainSet = pickle.load(open(trainFile, 'rb'))[0]
for patient in trainSet:
    for visit in patient:
        augmented = np.nonzero(sum(corMat[visit]))[0]
        listNum = len(augmented)
        for i in range(listNum):
            for j in range(i + 1, listNum):
                coOccurMat[augmented[i]][augmented[j]] += 1
                coOccurMat[augmented[j]][augmented[i]] += 1

coOccurMatzip = coo_matrix(coOccurMat.astype(float))
glove = Glove(no_components=128, learning_rate=0.05)
glove.fit(coOccurMatzip, epochs=50, no_threads=1, verbose=True)
res = glove.word_vectors.astype(np.float32)
# save types (node to id), newfather, corMat and the pre-trained embeddings
pickle.dump((types, newfather, corMat, res), open(resFile, 'wb'), -1)
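# Round-trip sketch for the pickle written above; resFile is the same path
# used when saving.
import pickle

with open(resFile, 'rb') as f:
    types, newfather, corMat, res = pickle.load(f)
print(res.shape)  # (number of codes, 128)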
def generateModel(traces):
    linesentence = ""
    myFile = open("sentences.txt", 'w')
    mypackCount = 0
    for trace in traces:
        for packet in trace.getPackets():
            mypackCount = mypackCount + 1
            linesentence = linesentence + " " + str(
                packet.getLength()) + "_" + str(packet.getDirection())
            # reset sentences
            if mypackCount % 40 == 0:
                myFile.write(linesentence)
                myFile.write("\n")
                mypackCount = 0
                linesentence = ""
        directionCursor = None
        dataCursor = 0
        for packet in trace.getPackets():
            if directionCursor == None:
                directionCursor = packet.getDirection()
            if packet.getDirection() != directionCursor:
                #dataKey = 'S'+str(directionCursor)+'-'+str(GloveClassifier.roundArbitrary(dataCursor, 600))
                dataKey = 'S' + str(directionCursor) + '-' + str(dataCursor)
                linesentence = linesentence + " " + dataKey
                directionCursor = packet.getDirection()
                dataCursor = 0
            dataCursor += packet.getLength()
        if dataCursor > 0:
            key = 'S' + str(directionCursor) + '-' + str(dataCursor)
            linesentence = linesentence + " " + key
        myFile.write(linesentence)
        myFile.write("\n")
    myFile.close()
    if config.CLASSIFIER == config.GLOVE_CLASSIFIER:
        sentences = models.word2vec.LineSentence("sentences.txt")
        corpus = Corpus()
        corpus.fit(sentences, window=8)
        glove = Glove(no_components=25, learning_rate=0.05)
        glove.fit(corpus.matrix, epochs=100, no_threads=4, verbose=True)
        glove.add_dictionary(corpus.dictionary)
        glove.save("mygloveModel")
    elif config.CLASSIFIER == config.W2V_CLASSIFIER:
        txt = open("sentences.txt")
        # print(txt.read())
        if len(txt.read()) > 0:
            print("in here")
            txt.close()
            sentences = models.word2vec.LineSentence("sentences.txt")
            model = models.word2vec.Word2Vec(sentences, size=50, window=15,
                                             min_count=1, workers=4)
            model.save("word2vecModel")
        txt.close()