def main(sentences, wordfile: str, weightfile: str, weightpara: float = 1e-3, rmpc: int = 1):
    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences
    x, m, _ = data_io.sentences2idx(sentences, words)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # set parameters (use a local name so the imported `params` module is not shadowed,
    # which would otherwise raise UnboundLocalError inside this function)
    param = params.params()
    param.rmpc = rmpc
    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(We, x, w, param)  # embedding[i,:] is the embedding for sentence i
    return embedding
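# Hypothetical usage sketch for main(): the GloVe vector file and the word-frequency
# file below are placeholders, not files provided with this code.
sentences = ["this is an example sentence",
             "this is another sentence that is slightly longer"]
emb = main(sentences,
           wordfile="glove.840B.300d.txt",
           weightfile="enwiki_vocab_min200.txt",
           weightpara=1e-3,
           rmpc=1)
print(emb.shape)  # (number of sentences, embedding dimension)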
def SIFDocEmbedding(w2vdict, weighttxt, txtfile):
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    (words, We) = getWordMap(w2vdict)
    word2weight = data_io.getWordWeight(weighttxt, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
    DocVectorDict = {}
    DocSentVecDict = {}
    docNum = 0
    with open(txtfile, 'r') as reader:
        txt = reader.readlines()
    for doc in txt:
        doc = doc.strip()
        sentEm = SIFSentEmbedding(weighttxt, doc, words, We, weight4ind,
                                  weightpara=weightpara, paramm=1)
        DocSentVecDict[docNum] = sentEm
        # document vector = mean of its sentence embeddings
        # (assumes sentEm has shape [num_sentences, embedding_dim])
        docVector = np.sum(sentEm, axis=0) / sentEm.shape[0]
        DocVectorDict[docNum] = docVector
        docNum += 1
    return DocVectorDict, DocSentVecDict, We
def load_model(self):
    sys.path.append('../src')
    weightpara = 1e-5  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    print('Loading Chinese model')
    self.words_chi, self.We_chi = data_io.getWordmap(
        '../models/wiki_news_model2_vector.txt')
    self.word2weight_chi = data_io.getWordWeight(
        '../models/word_count.txt',  # each line is a word and its frequency
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    print('Chinese model loaded')
    print('Loading English model')
    weightpara = 1e-3
    self.words_eng, self.We_eng = data_io.getWordmap(
        '../models/glove_large.txt')
    self.word2weight_eng = data_io.getWordWeight(
        '../models/enwiki_vocab_min200.txt',  # each line is a word and its frequency
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    print('English model loaded')
def load_embeddings(wordfile, weightfile, weightpara=5e-4, word2vec=False):
    if word2vec:
        (words, We) = getWordmapWord2Vec(wordfile)
    else:
        (words, We) = data_io.getWordmap(wordfile)
    word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
    return words, We, weight4ind
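# Hypothetical follow-up (file paths are placeholders): feed the loaded vocabulary,
# vectors and weights through the rest of the SIF pipeline. Note that some data_io
# versions return (x, m, _) from sentences2idx instead of (x, m).
words, We, weight4ind = load_embeddings("glove.6B.300d.txt",
                                        "enwiki_vocab_min200.txt",
                                        weightpara=1e-3)
x, m = data_io.sentences2idx(["a short example sentence"], words)
w = data_io.seq2weight(x, m, weight4ind)
param = params.params()
param.rmpc = 1
emb = SIF_embedding.SIF_embedding(We, x, w, param)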
def get_embs(sentences, params):
    # wordfile, weightfile and weightpara are assumed to be module-level globals
    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences
    x, m = data_io.sentences2idx(sentences, words)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(We, x, w, params)  # embedding[i,:] is the embedding for sentence i
    return embedding
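# For reference, a simplified numpy sketch of what the SIF_embedding.SIF_embedding(We, x, w, params)
# call computes, per the SIF method: a weighted average of word vectors, then removal of the
# projection onto the top principal component(s) when rmpc > 0. This is an illustration under
# those assumptions, not the library's exact implementation.
import numpy as np

def sif_embedding_sketch(We, x, w, rmpc=1):
    n_samples = x.shape[0]
    emb = np.zeros((n_samples, We.shape[1]))
    for i in range(n_samples):
        # weighted average over the (masked) words of sentence i
        emb[i, :] = w[i, :].dot(We[x[i, :], :]) / np.count_nonzero(w[i, :])
    if rmpc > 0:
        # top right singular vectors = principal components of the embedding matrix
        _, _, vt = np.linalg.svd(emb, full_matrices=False)
        pc = vt[:rmpc]
        emb = emb - emb.dot(pc.T).dot(pc)
    return emb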
def __init__(self):
    self.weightfile = config.url_enwiki
    self.weightpara = 1e-3
    print("Getting embeddings from the Glove pickles")
    with open(config.url_glove_pickle_we_1, "rb") as file:
        We_1 = pickle.load(file)
    with open(config.url_glove_pickle_words_1, "rb") as file:
        words_1 = pickle.load(file)
    with open(config.url_glove_pickle_we_2, "rb") as file:
        We_2 = pickle.load(file)
    with open(config.url_glove_pickle_words_2, "rb") as file:
        words_2 = pickle.load(file)
    with open(config.url_glove_pickle_we_3, "rb") as file:
        We_3 = pickle.load(file)
    with open(config.url_glove_pickle_words_3, "rb") as file:
        words_3 = pickle.load(file)
    self.We = []
    self.We.extend(We_1)
    self.We.extend(We_2)
    self.We.extend(We_3)
    self.words = {}
    self.words.update(words_1)
    self.words.update(words_2)
    self.words.update(words_3)
    with open(config.url_snli_pc1, "rb") as file:
        self.snli_pc_1 = pickle.load(file)
    with open(config.url_snli_pc2, "rb") as file:
        self.snli_pc_2 = pickle.load(file)
    print("Successfully got the embeddings from the pickle")
    self.word2weight = data_io.getWordWeight(
        self.weightfile, self.weightpara)  # word2weight['str'] is the weight for the word 'str'
    self.weight4ind = data_io.getWeight(
        self.words, self.word2weight)  # weight4ind[i] is the weight for the i-th word
def get_sif(dataset):
    wordfile = '../data/glove.6B.50d.txt'  # word vector file, can be downloaded from GloVe website
    weightfile = '../auxiliary_data/enwiki_vocab_min200.txt'  # each line is a word and its frequency
    weightpara = 2.7e-4  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 0  # number of principal components to remove in SIF weighting scheme
    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
    param = params.params()
    param.rmpc = rmpc
    sentence_embedding_all = get_sentences_embedding(dataset, words, weight4ind, param, We)
    # sentence_embedding_all = turn2std(sentence_embedding_all)  # convert the matrix to a standardized matrix
    return sentence_embedding_all
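# For context, weightpara is the "a" in the SIF word weighting a / (a + p(w)).
# A rough sketch of that computation from a "word count" frequency file, assuming one
# "word count" pair per line; an illustration of the scheme, not the library's exact code.
def word_weights_sketch(weightfile, a=1e-3):
    counts = {}
    with open(weightfile) as f:
        for line in f:
            parts = line.split()
            if len(parts) == 2:
                counts[parts[0]] = float(parts[1])
    total = sum(counts.values())
    # weight(w) = a / (a + p(w)), where p(w) is the word's relative frequency
    return {w: a / (a + c / total) for w, c in counts.items()}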
def main(word_embeddings_path, word_weight_path, out_dir):
    wordfile = word_embeddings_path
    weightfile = word_weight_path
    weightparas = [1e-2, 1e-3, 1e-4]
    (words, We) = getWordmap(wordfile)
    # pickle files must be opened in binary mode
    with open(os.path.join(out_dir, "vectors"), "wb") as vector_file:
        pickle.dump(We, vector_file)
    with open(os.path.join(out_dir, "words"), "wb") as words_file:
        pickle.dump(words, words_file)
    for weightpara in weightparas:
        print("calculating word2weight with a = {}.".format(weightpara))
        word2weight = data_io.getWordWeight(weightfile, weightpara)
        print("calculating weight4ind with a = {}.".format(weightpara))
        weight4ind = data_io.getWeight(words, word2weight)
        weight4ind_path = os.path.join(
            out_dir, "weight4ind_weightpara_%.E" % Decimal(weightpara))  # Decimal from the decimal module
        with open(weight4ind_path, "wb") as weight4ind_file:
            pickle.dump(weight4ind, weight4ind_file)
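# Sketch of reading the dumped pickles back (out_dir as above; the weight4ind filename
# suffix comes from the "%.E" formatting, e.g. "1E-03" for a = 1e-3).
with open(os.path.join(out_dir, "vectors"), "rb") as f:
    We = pickle.load(f)
with open(os.path.join(out_dir, "words"), "rb") as f:
    words = pickle.load(f)
with open(os.path.join(out_dir, "weight4ind_weightpara_1E-03"), "rb") as f:
    weight4ind = pickle.load(f)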
def sif_embedding(sen):
    import sys
    # sys.path.append("../src")
    # sys.path.append("../data")
    import data_io
    import params
    import SIF_embedding

    # input
    wordfile = 'data/dic_files.txt'  # word vector file, can be downloaded from GloVe website
    weightfile = 'data/dic_freq.txt'  # each line is a word and its frequency
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 1  # number of principal components to remove in SIF weighting scheme
    # sentences = ['这是一个例句', '这是一个更长一些的例句']
    # sentences = ['昨天天气不错', '这是一个更长一些的例句']
    sentences = sen
    # sentences = ['this is an example sentence', 'this is another sentence that is slightly longer']

    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # print(words, We)  # words and word vectors
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences
    # x, m, _ = data_io.sentences2idx(sentences, words)  # variant for data_io versions whose sentences2idx returns three values
    x, m = data_io.sentences2idx(sentences, words)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    # print(x, m)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # print('word weight:', w)
    # set parameters (use a distinct local name so the `params` module is not shadowed)
    param = params.params_all()
    param.rmpc = rmpc
    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(We, x, w, param)  # embedding[i,:] is the embedding for sentence i
    return embedding
def vectorize_sif(filename):

    class params(object):
        def __init__(self):
            self.LW = 1e-5
            self.LC = 1e-5
            self.eta = 0.05

        def __str__(self):
            t = "LW", self.LW, ", LC", self.LC, ", eta", self.eta
            t = map(str, t)
            return ' '.join(t)

    # input
    wordfile = 'glove.6B.100d.txt'  # word vector file, can be downloaded from GloVe website
    weightfile = 'enwiki_vocab_min200.txt'  # each line is a word and its frequency
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 1  # number of principal components to remove in SIF weighting scheme
    # sentiment_file = '../data/sentiment-test'  # sentiment data file
    # cleanfile = "2/D1026-A.M.100.E.10.segs.cl"
    # sentiment_file = '../data/clean-5.txt'

    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences (here use sentiment data as an example)
    # x, m, _ = data_io.sentiment2idx(sentiment_file, words)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    x, m = data_io.sentiment2idx(filename, words)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # parameters
    params = params()  # instantiate the local params class defined above
    # params = params.params()
    params.rmpc = rmpc
    # get SIF embedding
    embedding = SIF_embedding_lib.SIF_embedding(We, x, w, params)  # embedding[i,:] is the embedding for sentence i
    return embedding
def load_model():
    wordfile = "glove path (glove.840B.300d.txt file)"  # you can download glove from https://www.kaggle.com/takuok/glove840b300dtxt
    weightfile = artifact_path + '/SIF/enwiki_vocab_min200.txt'  # each line is a word and its frequency
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 1  # number of principal components to remove in SIF weighting scheme
    (words, We) = data_io.getWordmap(wordfile)
    # the loaded vocabulary keys are bytes; re-key the dict with decoded strings
    for v in list(words.keys()):
        words[v.decode("utf-8")] = words.pop(v)
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
    return (words, weight4ind, rmpc, We)
def load_embed(wordfile, weightfile, weightpara=1e-3, param=None, rmpc=0):
    '''
    wordfile:   location of embedding data (e.g., glove embeddings)
    weightfile: location of TF data for words
    weightpara: the parameter in the SIF weighting scheme, usually in range [3e-5, 3e-3]
    rmpc:       number of principal components to remove in SIF weighting scheme
    '''
    # input
    wordfile = '/home/francisco/GitHub/SIF/data/glove.840B.300d.txt'  # word vector file, can be downloaded from GloVe website
    weightfile = '/home/francisco/GitHub/SIF/auxiliary_data/enwiki_vocab_min200.txt'  # each line is a word and its frequency
    # load word vectors
    (words, Weights) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # set parameters (guard against the default param=None)
    if param is not None:
        param.rmpc = rmpc
    return Weights, words, word2weight, weight4ind
def get_sent_vec(sentences):
    import params
    # wordfile, weightfile, weightpara and rmpc are assumed to be module-level globals
    # see data_io.py for details
    (words, We) = data_io.getWordmap(wordfile)
    # see data_io.py for details
    word2weight = data_io.getWordWeight(weightfile, weightpara)
    weight4ind = data_io.getWeight(words, word2weight)
    # see data_io.py for details
    x, m = data_io.sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind)
    # set parameters (use a local name so the imported `params` module is not shadowed)
    param = params.params()
    param.rmpc = rmpc
    # call the SIF core algorithm to compute sentence vectors; see SIF_core for details
    embedding = SIF_core.SIF_embedding(We, x, w, param)
    sent_vec = {}
    for i in range(len(embedding)):
        sent_vec[sentences[i]] = embedding[i]
    return sent_vec
## run
wordfiles = [
    # '../data/paragram_sl999_small.txt',  # need to download it from John Wieting's github (https://github.com/jwieting/iclr2016)
    '/Users/sherryruan/data/glove/glove.6B/glove.6B.300d.txt'  # need to download it first
]
weightfile = '../auxiliary_data/enwiki_vocab_min200.txt'
weightparas = [-1, 1e-3]  # [-1, 1e-1, 1e-2, 1e-3, 1e-4]
rmpcs = [0, 1]  # [0, 1, 2]

params = params.params()
parr4para = {}
sarr4para = {}

for wordfile in wordfiles:
    (words, We) = data_io.getWordmap(wordfile)
    for weightpara in weightparas:
        word2weight = data_io.getWordWeight(weightfile, weightpara)
        weight4ind = data_io.getWeight(words, word2weight)
        for rmpc in rmpcs:
            print('word vectors loaded from %s' % wordfile)
            print('word weights computed from %s using parameter a=%f' % (weightfile, weightpara))
            params.rmpc = rmpc
            print('remove the first %d principal components' % rmpc)
            ## eval just one example dataset
            parr, sarr = eval.sim_evaluate_one(
                We, words, weight4ind, sim_algo.weighted_average_sim_rmpc, params)
            ## eval all datasets; need to obtain datasets from John Wieting (https://github.com/jwieting/iclr2016)
            # parr, sarr = eval.sim_evaluate_all(We, words, weight4ind, sim_algo.weighted_average_sim_rmpc, params)
            paras = (wordfile, weightfile, weightpara, rmpc)
            parr4para[paras] = parr
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="gpt",
                        help="Model type (gpt or gpt2)")
    parser.add_argument("--model_checkpoint", type=str, default="",
                        help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2,
                        help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0,
                        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        args.model_checkpoint = download_pretrained_model()

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class = GPT2Tokenizer if "gpt2" == args.model else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model_class = GPT2LMHeadModel if "gpt2" == args.model else OpenAIGPTLMHeadModel
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    # personalities = get_dataset_personalities(tokenizer, args.dataset_path, args.dataset_cache)
    # personality = random.choice(personalities)
    # logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))

    wordfile = './data/truncate.txt'  # word vector file, can be downloaded from GloVe website
    weightfile = './auxiliary_data/enwiki_vocab_min200.txt'  # each line is a word and its frequency
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word

    p = 0
    start_time = time.time()
    with open('data_volunteers.json') as json_file:
        json_data = json.load(json_file)
    for i in json_data:
        p += 1
        # if p < 1100:
        #     continue
        history = []
        personality = []
        query_set = []
        json_dialog = i["dialog"]
        json_bot = i["bot_profile"]
        for j in json_bot:
            personality.append(tokenizer.encode(j))
        # logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))
        persona = tokenizer.decode(chain(*personality))
        row = {"Personality": persona}
        text = []
        for j in json_dialog:
            if j["sender_class"] == "Human":
                json_text = j["text"]
                raw_text = json_text
                check = tokenizer.decode(tokenizer.encode(raw_text), skip_special_tokens=True)
                if check == "":
                    history.append(tokenizer.encode(raw_text))
                    with torch.no_grad():
                        out_ids = normal_sample_sequence(personality, history, tokenizer, model, args)
                    # history.append(out_ids)
                    history = history[-(2 * args.max_history + 1):]
                    out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
                    text.append({
                        "evaluation_score": j["evaluation_score"],
                        "id": j["id"],
                        "sender": j["sender"],
                        "sender_class": j["sender_class"],
                        "text": raw_text,
                        "generated_text": out_text
                    })
                    continue
                history.append(tokenizer.encode(raw_text))
                with torch.no_grad():
                    out_ids = sample_sequence(personality, history, tokenizer, model, args,
                                              words, weight4ind, We)
                # history.append(out_ids)
                history = history[-(2 * args.max_history + 1):]
                out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
                text.append({
                    "evaluation_score": j["evaluation_score"],
                    "id": j["id"],
                    "sender": j["sender"],
                    "sender_class": j["sender_class"],
                    "text": raw_text,
                    "generated_text": out_text
                })
            else:
                json_text = j["text"]
                raw_text = json_text
                history.append(tokenizer.encode(raw_text))
                text.append({
                    "evaluation_score": j["evaluation_score"],
                    "id": j["id"],
                    "sender": j["sender"],
                    "sender_class": j["sender_class"],
                    "text": raw_text
                })
        row["dialog"] = text
        query_set.append(row)
        # print(query_set)
        with open('./sif_set/sif' + str(p) + '.json', 'w', encoding='utf-8') as make_file:
            json.dump(query_set, make_file)
        if not p % 10:
            print(str(p * 100 / 1111) + '%, ' + str(time.time() - start_time) + 'sec')
            items.append(str(embedding[i, j]))
        line = " ".join(items) + "\n"
        writer.write(line)
    writer.close()


if __name__ == "__main__":
    # input
    wordfile = '../data/glove.840B.300d-freq500K.txt'  # word vector file, can be downloaded from GloVe website
    weightfile = '../auxiliary_data/enwiki_vocab_min200.txt'  # each line is a word and its frequency
    input_dir = sys.argv[1]
    output_dir = sys.argv[2]
    rmpc = int(sys.argv[3])  # number of principal components to remove in SIF weighting scheme

    params = params.params()
    params.rmpc = rmpc

    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, 1e-3)
    weight4ind = data_io.getWeight(words, word2weight)

    for f in os.listdir(input_dir):
        try:
            input_file = "%s/%s" % (input_dir, f)
            output_file = "%s/%s" % (output_dir, f)
            print_embeddings(
                get_embeddings(words, We, word2weight, weight4ind, input_file, params),
                output_file)
        except Exception as ex:
            print("an error occurred. skipped %s, due to err: %s" % (input_file, ex))
        params.nonlinearity = lasagne.nonlinearities.rectify
    if args.nonlinearity == 4:
        params.nonlinearity = lasagne.nonlinearities.sigmoid

# load data
(words, We) = data_io.getWordmap(params.wordfile)
if args.task == "sim" or args.task == "ent":
    train_data = data_io.getSimEntDataset(params.traindata, words, params.task)
elif args.task == "sentiment":
    train_data = data_io.getSentimentDataset(params.traindata, words)
else:
    raise ValueError('Task should be ent, sim, or sentiment.')

# load weight
if params.weightfile:
    word2weight = data_io.getWordWeight(params.weightfile, params.weightpara)
    params.weight4ind = data_io.getWeight(words, word2weight)
    print('word weights computed using parameter a=' + str(params.weightpara))
else:
    params.weight4ind = []
if params.npc > 0:
    params.pc = get_pc(train_data, We, params.weight4ind, params)
else:
    params.pc = []

# load model
model = None
if params.nntype == 'proj':
    model = proj_model_sim(We, params)
elif params.nntype == 'proj_sentiment':
wordFilePath = '../data/vectors-en.txt'
model_300 = gensim.models.KeyedVectors.load_word2vec_format(wordFilePath, binary=False)
words = {}
for index, word in enumerate(model_300.wv.index2entity):
    words[word] = index
We = model_300.wv.vectors

weightparas = [-1, 1e-3]
rmpcs = [0, 1]

params = params.params()
parr4para = {}
sarr4para = {}

# (words, We) = data_io.getWordmap(wordfile)
for weightpara in weightparas:
    word2weight = data_io.getWordWeight(model_300.wv.vocab, weightpara)
    weight4ind = data_io.getWeight(words, word2weight)
    for rmpc in rmpcs:
        print('word vectors loaded from %s' % wordFilePath)
        # print('word weights computed from %s using parameter a=%f' % (weightfile, weightpara))
        params.rmpc = rmpc
        print('remove the first %d principal components' % rmpc)
        ## eval just one example dataset
        parr, sarr = eval.sim_evaluate_one(
            We, words, weight4ind, sim_algo.weighted_average_sim_rmpc, params)
        ## eval all datasets; need to obtain datasets from John Wieting (https://github.com/jwieting/iclr2016)
        # parr, sarr = eval.sim_evaluate_all(We, words, weight4ind, sim_algo.weighted_average_sim_rmpc, params)
        # use only hashable values in the result key (the gensim vocab dict cannot be a dict key)
        paras = (wordFilePath, weightpara, rmpc)
        parr4para[paras] = parr
        sarr4para[paras] = sarr
## save results
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dataname', default='t6',
                        help='dataset name', choices=['t6', 't26', '2C'])
    parser.add_argument('-c', '--classifiername', default='RF',
                        help='which classifier to use',
                        choices=['GaussianNB', 'RF', 'SVM', 'KNN'])
    args = parser.parse_args()
    data_name = args.dataname  # t6 or t26, 2C, 4C
    clf_name = args.classifiername  # classifier

    # Original SIF paper used glove.840B.300d; we use the ones that were trained on twitter.
    embed_dims = [100]  # can add 25, 50, 200 dimension if needed
    wordfile_list = [
        '../data/glove.twitter.27B.{}d.txt'.format(dim) for dim in embed_dims
    ]
    # each line is a word and its frequency
    weightfile = 'SIF-master/auxiliary_data/enwiki_vocab_min200.txt'
    # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    weightpara = 1e-3
    # number of principal components to remove in SIF weighting scheme
    rmpc = 1

    for wordfile, dim in zip(wordfile_list, embed_dims):
        # load word vectors
        (words, We) = data_io.getWordmap(wordfile)
        # load word weights
        # word2weight['str'] is the weight for the word 'str'
        word2weight = data_io.getWordWeight(weightfile, weightpara)
        # weight4ind[i] is the weight for the i-th word
        weight4ind = data_io.getWeight(words, word2weight)

        data_path = "../data/"
        if data_name == "t6":
            file_path = data_path + "CrisisLexT6_cleaned/"
            disasters = [
                "sandy", "queensland", "boston", "west_texas", "oklahoma", "alberta"
            ]
            test_list = [
                "{}_glove_token.csv.unique.csv".format(disaster) for disaster in disasters
            ]
            train_list = [
                "{}_training.csv".format(disaster) for disaster in disasters
            ]
        if data_name == "t26":
            file_path = data_path + "CrisisLexT26_cleaned/"
            disasters = [
                "2012_Colorado_wildfires", "2013_Queensland_floods",
                "2013_Boston_bombings", "2013_West_Texas_explosion",
                "2013_Alberta_floods", "2013_Colorado_floods", "2013_NY_train_crash"
            ]
            test_list = [
                "{}-tweets_labeled.csv.unique.csv".format(disaster) for disaster in disasters
            ]
            train_list = [
                "{}_training.csv".format(disaster) for disaster in disasters
            ]
        if data_name == "2C":
            file_path = data_path + "2CTweets_cleaned/"
            disasters = [
                "Memphis", "Seattle", "NYC", "Chicago", "SanFrancisco",
                "Boston", "Brisbane", "Dublin", "London", "Sydney"
            ]
            test_list = [
                "{}2C.csv.token.csv.unique.csv".format(disaster) for disaster in disasters
            ]
            train_list = [
                "{}2C_training.csv".format(disaster) for disaster in disasters
            ]

        accu_list = []
        roc_list = []
        precision_list = []
        recall_list = []
        f1_list = []
        for train, test in zip(train_list, test_list):
            train_file = os.path.join(file_path, train)
            test_file = os.path.join(file_path, test)
            xtrain, ytrain = load_data(data_name, train_file)
            xtest, ytest = load_data(data_name, test_file)

            # load train
            # xtrain_windx is the array of word indices, m_train is the binary mask indicating whether there is a word in that location
            xtrain_windx, m_train = data_io.sentences2idx(xtrain, words)
            w_train = data_io.seq2weight(xtrain_windx, m_train, weight4ind)  # get word weights
            # set parameters
            params_train = params.params()
            params_train.rmpc = rmpc
            # get SIF embedding
            train_embed = SIF_embedding.SIF_embedding(
                We, xtrain_windx, w_train, params_train)  # embedding[i,:] is the embedding for sentence i

            # load target
            # xtest_windx is the array of word indices, m_test is the binary mask indicating whether there is a word in that location
            xtest_windx, m_test = data_io.sentences2idx(xtest, words)
            # get word weights
            w_test = data_io.seq2weight(xtest_windx, m_test, weight4ind)
            # set parameters
            params_test = params.params()
            params_test.rmpc = rmpc
            # get SIF embedding
            test_embed = SIF_embedding.SIF_embedding(
                We, xtest_windx, w_test, params_test)  # embedding[i,:] is the embedding for sentence i

            print(test)
            accu, roc, precision, recall, f1 = run_classifier(
                train_embed, ytrain, test_embed, ytest, clf_name, 100)
            accu_list.append(accu)
            roc_list.append(roc)
            precision_list.append(precision)
            recall_list.append(recall)
            f1_list.append(f1)

        print("{}_SIF_{}_LOO_accuracy {}".format(data_name, clf_name + str(dim), accu_list))
        print("{}_SIF_{}_LOO_roc {}".format(data_name, clf_name + str(dim), roc_list))
        print("{}_SIF_{}_LOO_precision {}".format(data_name, clf_name + str(dim), precision_list))
        print("{}_SIF_{}_LOO_recall {}".format(data_name, clf_name + str(dim), recall_list))
        print("{}_SIF_{}_LOO_f1 {}".format(data_name, clf_name + str(dim), f1_list))
        print("{0}_SIF_LOO_{1} {2:.4f} + {3:.4f} {4:.4f} + {5:.4f} {6:.4f} + {7:.4f} {8:.4f} + {9:.4f} {10:.4f} + {11:.4f}"
              .format(data_name, clf_name + str(dim),
                      np.mean(accu_list), np.std(accu_list),
                      np.mean(roc_list), np.std(roc_list),
                      np.mean(f1_list), np.std(f1_list),
                      np.mean(precision_list), np.std(precision_list),
                      np.mean(recall_list), np.std(recall_list)))
def SIF_master(segfile, cleanfile, directory, summ_ind):
    print("segfile: ", segfile)
    print("clean file: ", cleanfile)
    # cleanfile = cleanfile + ".ls"

    class params(object):
        def __init__(self):
            self.LW = 1e-5
            self.LC = 1e-5
            self.eta = 0.05

        def __str__(self):
            t = "LW", self.LW, ", LC", self.LC, ", eta", self.eta
            t = map(str, t)
            return ' '.join(t)

    # input
    wordfile = 'glove.6B.100d.txt'  # word vector file, can be downloaded from GloVe website
    weightfile = 'enwiki_vocab_min200.txt'  # each line is a word and its frequency
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 1  # number of principal components to remove in SIF weighting scheme
    # sentiment_file = '../data/sentiment-test'  # sentiment data file
    # cleanfile = "2/D1026-A.M.100.E.10.segs.cl"
    # sentiment_file = '../data/clean-5.txt'

    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences (here use sentiment data as an example)
    # x, m, _ = data_io.sentiment2idx(sentiment_file, words)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    x, m = data_io.sentiment2idx(cleanfile, words)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # parameters
    params = params()  # instantiate the local params class defined above
    # params = params.params()
    params.rmpc = rmpc
    # get SIF embedding
    embedding = SIF_embedding_lib.SIF_embedding(We, x, w, params)  # embedding[i,:] is the embedding for sentence i

    # segfile = segfile + ".segs"
    f = open(segfile).readlines()
    indexes = []
    matches = []
    for item in f:
        ind = item.rfind("&")
        indexes.append(item[:ind + 1])
    if len(indexes) == len(embedding):
        for ind in range(0, len(indexes)):
            lines = indexes[ind] + str(list(embedding[ind]))
            matches.append(lines)
    else:
        print("length doesn't match!! Check if there is empty line!!")
    # fname = directory + '/' + str(summ_ind) + '/' + getRealName(segfile) + '.ls'
    # fname = directory + '/' + str(summ_ind) + '/' + segfile + '.ls'
    fname = directory + '/' + str(summ_ind) + '/' + getRealName(segfile)
    print(fname)
    with open(fname + ".ls", "w") as out_file:
        for item in matches:
            out_file.write(item + "\n")
    return embedding
tgtsent = open(tgtSentFilePath, 'r').readlines()
# srcsent = ['Pada mulanya, waktu Allah mulai menciptakan alam semesta']
# tgtsent = ['God saw the light, and saw that it was good. God divided the light from the darkness.']

# params = params.params()
weightpara = 1e-3
rmpc = 1

# def srcEmbedding(srcWordFilePath, srcsent):
src_model_300 = gensim.models.KeyedVectors.load_word2vec_format(srcWordFilePath, binary=False)
srcwords = {}
for index, word in enumerate(src_model_300.wv.index2entity):
    srcwords[word] = index
srcWe = src_model_300.wv.vectors
srcword2weight = data_io.getWordWeight(src_model_300.wv.vocab, weightpara)
srcweight4ind = data_io.getWeight(srcwords, srcword2weight)
srcx, srcm = data_io.sentences2idx(srcsent, srcwords)
srcw = data_io.seq2weight(srcx, srcm, srcweight4ind)
srcparams = params.params()
srcparams.rmpc = rmpc
srcEmbedding = SIF_embedding.SIF_embedding(srcWe, srcx, srcw, srcparams)
# return embedding

# def tgtEmbedding(tgtWordFilePath, tgtsent):
tgtmodel_300 = gensim.models.KeyedVectors.load_word2vec_format(tgtWordFilePath, binary=False)
tgtwords = {}
for index, word in enumerate(tgtmodel_300.wv.index2entity):
    tgtwords[word] = index
tgtWe = tgtmodel_300.wv.vectors
# Arabic GloVe embedding pre-trained model
wordfile = '../models/glove_full_grams_sg_300_wiki.txt'
weightfile = '../AraSIF_word_counts/arwiki_vocab_min200.txt'  # each line is a word and its frequency
weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
rmpc = 1  # number of principal components to remove in SIF weighting scheme

# load word vectors
print("Reading embedding matrix. Hang on! this will take a while ...")
(glove_words, We) = data_io.getWordmap(wordfile)
print("shape of Word embedding is: " + str(We.shape))

# load word weights
word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
weight4ind = data_io.getWeight(glove_words, word2weight)  # weight4ind[i] is the weight for the i-th word

# set parameters
params = params.params()
params.rmpc = rmpc

# load sentences
print("reading the input sentences now & converting to indices ...\n")
sample_sents = read_NMT_data.read_data(sample_ara)

# AraSIF embedding for sample sentences
print("computing AraSIF embedding now ...\n")
# `sentences` is assumed to be initialized earlier in the script (e.g., sentences = [])
with open(douban_cropus_path) as f:
    for line in f:
        line = line.strip()
        line = line.split(':')[1]
        sentences.append(line)

glove_word2vector_path = './chinese_data_douban_cropus_vectors.txt'  # word vector file, can be downloaded from GloVe website
word_frequency_path = './douban_cropus_vocab.txt'  # each line is a word and its frequency
weightpara = 1e-3
rmpc = 1

# load word vectors
(Word2Indx, Word2vector) = data_io.getWordmap(glove_word2vector_path)
# load word weights
word2weight = data_io.getWordWeight(word_frequency_path, weightpara)  # word2weight['str'] is the weight for the word 'str'
Index2Weight = data_io.getWeight(Word2Indx, word2weight)  # weight4ind[i] is the weight for the i-th word

word_idx_seq_of_sentence, mask = data_io.sentences2idx(sentences, Word2Indx)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
word_weight_of_sentence = data_io.seq2weight(word_idx_seq_of_sentence, mask, Index2Weight)  # get word weights

# set parameters
params = params.params()
params.rmpc = rmpc

embedding = SIF_embedding.SIF_embedding(Word2vector, word_idx_seq_of_sentence,
                                        word_weight_of_sentence, params)
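# Example follow-up (not part of the original script): compare two of the resulting
# SIF sentence embeddings with cosine similarity.
import numpy as np

def cosine(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v) + 1e-12))

print(cosine(embedding[0], embedding[1]))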