Esempio n. 1
0
 def __init__(self):
     """Record the default model file paths and create empty state slots.

     Nothing is loaded here: the constructor only stores two relative
     file paths, builds a parameter object, and sets the remaining
     attributes to ``None`` (presumably filled in by a later loading
     step -- confirm against the rest of the class).
     """
     # Relative paths to the word-vector and word-frequency files.
     self.wordvec = './model/wordvec.txt'
     self.wordfreq = './model/wordfreq.txt'
     # NOTE(review): the meaning of the positional ``1`` passed to
     # ``params.params`` is not visible from this block.
     self.params_ = params.params(1)
     # Placeholders; chained assignment binds them left to right.
     self.words = self.We = self.weight4ind = self.pc = None
def embeding_sentence_cosine_similarity(s1, s2):
    """Embed two sentences with SIF and compare the resulting vectors.

    The sentences are converted to word-index sequences using the
    module-level ``Word2Indx``, weighted via ``Index2Weight``, embedded
    with ``SIF_embedding.SIF_embedding`` over ``Word2vector`` (with
    ``rmpc`` taken from the module-level setting), and the two rows of
    the result are passed to ``distance.cosine``.

    Intermediate index and mask arrays are printed to stdout.

    Args:
        s1: First sentence, passed as-is to ``data_io.sentences2idx``.
        s2: Second sentence, passed as-is to ``data_io.sentences2idx``.

    Returns:
        Whatever ``distance.cosine`` returns for the two sentence vectors.
        NOTE(review): if ``distance`` is ``scipy.spatial.distance`` this is
        a cosine *distance* (1 - similarity), despite the function name --
        confirm what callers expect.
    """
    # idx_matrix holds word indices; presence_mask marks which positions
    # actually contain a word.
    idx_matrix, presence_mask = data_io.sentences2idx([s1, s2], Word2Indx)

    # Debug output, kept in the original order.
    print(s1, s2)
    for label, value in (('word_idx_seq_of_sentence', idx_matrix),
                         ('mask', presence_mask)):
        print(label)
        print(value)

    # Per-word weights for every position of both sentences.
    weights = data_io.seq2weight(idx_matrix, presence_mask, Index2Weight)

    # Fresh parameter object carrying the module-level rmpc setting.
    sif_params = params.params()
    sif_params.rmpc = rmpc

    sentence_vectors = SIF_embedding.SIF_embedding(
        Word2vector, idx_matrix, weights, sif_params)

    return distance.cosine(sentence_vectors[0], sentence_vectors[1])
Esempio n. 3
0
def cosine_distance_by_sentence_vector(s1, s2):
    """Embed two token sequences with SIF and compare the two vectors.

    Each argument is joined with single spaces before lookup, so both are
    expected to be iterables of strings. The joined sentences are turned
    into index sequences via the module-level ``Word2Indx``, weighted with
    ``Index2Weight``, and embedded with ``SIF_embedding.SIF_embedding``
    over ``Word2vector`` using the module-level ``rmpc``.

    Args:
        s1: Iterable of strings making up the first sentence.
        s2: Iterable of strings making up the second sentence.

    Returns:
        The result of ``cosine_similarity`` applied to the two sentence
        vectors. NOTE(review): the function name says "distance" while the
        value comes from ``cosine_similarity`` -- confirm which one callers
        rely on.
    """
    first_sentence = ' '.join(s1)
    second_sentence = ' '.join(s2)

    # idx_matrix holds word indices; presence_mask marks which positions
    # actually contain a word.
    idx_matrix, presence_mask = data_io.sentences2idx(
        [first_sentence, second_sentence], Word2Indx)

    # Per-word weights for every position of both sentences.
    weights = data_io.seq2weight(idx_matrix, presence_mask, Index2Weight)

    # Fresh parameter object carrying the module-level rmpc setting.
    sif_params = params.params()
    sif_params.rmpc = rmpc

    sentence_vectors = SIF_embedding.SIF_embedding(
        Word2vector, idx_matrix, weights, sif_params)

    return cosine_similarity(sentence_vectors[0], sentence_vectors[1])
Esempio n. 4
0
# Remaining command-line options; ``parser`` is created earlier in the script.
parser.add_argument("-task", help="Either sim, ent, or sentiment.")
parser.add_argument(
    "-weightfile",
    help=
    "The file containing the weights for words; used in weighted_proj_model_sim."
)
parser.add_argument("-weightpara",
                    help="The parameter a used in computing word weights.",
                    type=float)
parser.add_argument("-npc",
                    help="The number of principal components to use.",
                    type=int,
                    default=0)
args = parser.parse_args()

# Copy the parsed options onto a parameter object.
# NOTE: this rebinds the name ``params`` from the callable to the instance
# it returns, so ``params()`` cannot be called again after this line.
params = params()
params.LW = args.LW
params.LC = args.LC
params.batchsize = args.batchsize
params.hiddensize = args.dim  # the CLI option is ``dim``; the attribute is ``hiddensize``
params.memsize = args.memsize
params.wordfile = args.wordfile
params.nntype = args.nntype  # assigned once; the original repeated this line further down
params.layersize = args.layersize
params.updatewords = str2bool(args.updatewords)  # converted by the str2bool helper
params.traindata = args.traindata
params.devdata = args.devdata
params.testdata = args.testdata
params.epochs = args.epochs
params.learner = learner2bool(args.learner)  # converted by the learner2bool helper
Esempio n. 5
0
# ---- input ----
# Word vector file; can be downloaded from the GloVe website.
wordfile = '../data/glove.840B.300d.txt'
# Each line of this file is a word and its frequency.
weightfile = '../auxiliary_data/enwiki_vocab_min200.txt'
# The parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3].
weightpara = 1e-3
# Number of principal components to remove in the SIF weighting scheme.
rmpc = 1
sentences = [
    'this is an example sentence',
    'this is another sentence that is slightly longer',
]

# ---- load word vectors ----
words, We = data_io.getWordmap(wordfile)

# ---- load word weights ----
# word2weight['str'] is the weight for the word 'str'.
word2weight = data_io.getWordWeight(weightfile, weightpara)
# weight4ind[i] is the weight for the i-th word.
weight4ind = data_io.getWeight(words, word2weight)

# ---- load sentences ----
# x is the array of word indices; m is the binary mask indicating whether
# there is a word in that location.
x, m = data_io.sentences2idx(sentences, words)
# w holds the word weights matching x.
w = data_io.seq2weight(x, m, weight4ind)

# ---- set parameters ----
# NOTE: this rebinds the name ``params`` to the object returned by
# ``params.params()``; whatever ``params`` referred to before is no longer
# reachable under that name.
params = params.params()
params.rmpc = rmpc

# ---- get SIF embedding ----
# embedding[i, :] is the embedding for sentence i.
embedding = SIF_embedding.SIF_embedding(We, x, w, params)