def __init__(self):
    # paths to the pretrained word-vector and word-frequency files
    self.wordvec = './model/wordvec.txt'
    self.wordfreq = './model/wordfreq.txt'
    self.params_ = params.params()
    # filled in once the model files are loaded
    self.words = None       # word -> index mapping
    self.We = None          # word embedding matrix, one row per word
    self.weight4ind = None  # index -> SIF weight
    self.pc = None          # principal component(s) removed by SIF
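# A minimal sketch (assumed, not in the original source) of how the fields
# above might be populated using the data_io helpers called elsewhere in
# this code; the method name `load` and the weight parameter a = 1e-3
# (the usual SIF default) are hypothetical.
def load(self):
    # words: word -> index; We: embedding matrix, one row per word
    (self.words, self.We) = data_io.getWordmap(self.wordvec)
    # per-word SIF weights a / (a + p(w)), re-indexed by word index
    word2weight = data_io.getWordWeight(self.wordfreq, 1e-3)
    self.weight4ind = data_io.getWeight(self.words, word2weight)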
def embeding_sentence_cosine_similarity(s1, s2):
    # x is the array of word indices, m is the binary mask indicating
    # whether there is a word in that location
    word_idx_seq_of_sentence, mask = data_io.sentences2idx([s1, s2], Word2Indx)
    print(s1, s2)
    print('word_idx_seq_of_sentence')
    print(word_idx_seq_of_sentence)
    print('mask')
    print(mask)
    # get word weights
    word_weight_of_sentence = data_io.seq2weight(
        word_idx_seq_of_sentence, mask, Index2Weight)
    # set parameters
    param = params.params()
    param.rmpc = rmpc
    embedding = SIF_embedding.SIF_embedding(
        Word2vector, word_idx_seq_of_sentence, word_weight_of_sentence, param)
    s1_embed = embedding[0]
    s2_embed = embedding[1]
    # scipy's distance.cosine returns the cosine *distance* (1 - similarity);
    # convert so the return value matches the function's name
    return 1 - distance.cosine(s1_embed, s2_embed)
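# Hedged usage sketch: assumes the module-level globals used above
# (Word2Indx, Index2Weight, Word2vector, rmpc) have already been built,
# e.g. with data_io.getWordmap / getWordWeight / getWeight as in the
# setup code further down.
sim = embeding_sentence_cosine_similarity(
    'this is an example sentence',
    'this is another sentence that is slightly longer')
print('cosine similarity:', sim)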
def cosine_distance_by_sentence_vector(s1, s2):
    # s1 and s2 arrive as token lists, so join them back into strings;
    # x is the array of word indices, m is the binary mask indicating
    # whether there is a word in that location
    word_idx_seq_of_sentence, mask = data_io.sentences2idx(
        [' '.join(s1), ' '.join(s2)], Word2Indx)
    # get word weights
    word_weight_of_sentence = data_io.seq2weight(
        word_idx_seq_of_sentence, mask, Index2Weight)
    # set parameters
    param = params.params()
    param.rmpc = rmpc
    embedding = SIF_embedding.SIF_embedding(
        Word2vector, word_idx_seq_of_sentence, word_weight_of_sentence, param)
    s1_embed = embedding[0]
    s2_embed = embedding[1]
    # sklearn's cosine_similarity requires 2-D inputs, so reshape the 1-D
    # sentence vectors; note it returns a *similarity*, despite the
    # "distance" in this function's name
    return cosine_similarity(s1_embed.reshape(1, -1),
                             s2_embed.reshape(1, -1))[0][0]
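# Hedged usage sketch for the token-list variant above; the token lists are
# illustrative and the same preloaded globals are assumed.
tokens_a = ['this', 'is', 'an', 'example', 'sentence']
tokens_b = ['this', 'is', 'another', 'sentence']
print(cosine_distance_by_sentence_vector(tokens_a, tokens_b))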
parser.add_argument("-task", help="Either sim, ent, or sentiment.")
parser.add_argument(
    "-weightfile",
    help="The file containing the weights for words; used in weighted_proj_model_sim.")
parser.add_argument(
    "-weightpara",
    help="The parameter a used in computing word weights.",
    type=float)
parser.add_argument(
    "-npc",
    help="The number of principal components to use.",
    type=int,
    default=0)
args = parser.parse_args()

# note: the attributes read below (LW, LC, batchsize, dim, memsize, ...)
# are registered on the parser elsewhere in this script
params = params()
params.LW = args.LW
params.LC = args.LC
params.batchsize = args.batchsize
params.hiddensize = args.dim
params.memsize = args.memsize
params.wordfile = args.wordfile
params.nntype = args.nntype
params.layersize = args.layersize
params.updatewords = str2bool(args.updatewords)
params.traindata = args.traindata
params.devdata = args.devdata
params.testdata = args.testdata
params.epochs = args.epochs
params.learner = learner2bool(args.learner)
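# Hedged example invocation (the script name train.py and the flag values
# shown are assumptions, not from the original source):
#
#   python train.py -task sim \
#       -weightfile ../auxiliary_data/enwiki_vocab_min200.txt \
#       -weightpara 1e-3 -npc 1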
# input
import data_io, params, SIF_embedding

wordfile = '../data/glove.840B.300d.txt'  # word vector file, can be downloaded from the GloVe website
weightfile = '../auxiliary_data/enwiki_vocab_min200.txt'  # each line is a word and its frequency
weightpara = 1e-3  # the parameter a in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
rmpc = 1  # number of principal components to remove in the SIF weighting scheme
sentences = [
    'this is an example sentence',
    'this is another sentence that is slightly longer'
]

# load word vectors
(words, We) = data_io.getWordmap(wordfile)
# load word weights
word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word
# load sentences
x, m = data_io.sentences2idx(sentences, words)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
w = data_io.seq2weight(x, m, weight4ind)  # get word weights

# set parameters (note: this rebinds the name `params` from the module to an instance)
params = params.params()
params.rmpc = rmpc
# get SIF embedding
embedding = SIF_embedding.SIF_embedding(We, x, w, params)  # embedding[i,:] is the embedding for sentence i
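# Follow-up sketch (not in the original source): compare the two example
# sentences via scipy's cosine distance, which equals 1 - cosine similarity.
from scipy.spatial import distance

sim = 1 - distance.cosine(embedding[0, :], embedding[1, :])
print('cosine similarity between the two sentences:', sim)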