Example #1
def main(sentences,
         wordfile: str,
         weightfile: str,
         weightpara: float = 1e-3,
         rmpc: int = 1):
    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(
        weightfile,
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(
        words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences
    x, m, _ = data_io.sentences2idx(
        sentences, words
    )  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights

    # set parameters
    sif_params = params.params()  # use a distinct name so the params module is not shadowed inside the function
    sif_params.rmpc = rmpc
    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(
        We, x, w, sif_params)  # embedding[i,:] is the embedding for sentence i
    return embedding
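A minimal usage sketch for the function above, assuming main returns the embedding matrix as written; the vector and frequency file paths below are hypothetical placeholders, not from the original source:

import numpy as np

sentences = ['this is an example sentence',
             'this is another sentence that is slightly longer']
emb = main(sentences,
           wordfile='data/glove.840B.300d.txt',        # hypothetical path
           weightfile='data/enwiki_vocab_min200.txt',  # hypothetical path
           weightpara=1e-3,
           rmpc=1)
# cosine similarity between the two sentence embeddings
sim = emb[0].dot(emb[1]) / (np.linalg.norm(emb[0]) * np.linalg.norm(emb[1]))
print(sim)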
Example #2
    def fit(self, sentences, We, lowercase_tokens, embeddings_format, embeddings_filepath, params, word_map, weight4ind):

        # store these off for pickling or extra transforms
        self.word_map = word_map
        self.weight4ind = weight4ind
        self.params = params
        self.lowercase_tokens = lowercase_tokens
        self.embeddings_format = embeddings_format
        self.embeddings_filepath = embeddings_filepath

        self.sentence_count = len(sentences)

        x, m = data_io.sentences2idx(sentences, self.word_map) # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
        w = data_io.seq2weight(x, m, self.weight4ind) # get word weights

        # now let's do some of what happens in src/SIF_embedding.py
        # but also keep some pieces along the way
        #weighted_emb = get_weighted_average(We, x, w)
        weighted_emb = get_weighted_average_alternate(We, x, w)

        self.compute_pc(weighted_emb)

        self.trained = True

        return self.remove_pc(weighted_emb)
    def compute_sif_emb(self, sentences):
        # load sentences
        x1, m = data_io.sentences2idx(sentences, self.words)
        w1 = data_io.seq2weight(x1, m, self.weight4ind)

        result = get_emb(self.We, x1, w1)
        return result
Example #4
def getSIFscore(sentences: list, words, weight4ind, rmpc, We, params, sx: int,
                sy: int):
    # load sentences
    x, m = data_io.sentences2idx(
        sentences, words
    )  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # print('load sentences finished')

    # set parameters
    params = params.params()
    params.rmpc = rmpc
    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(
        We, x, w, params)  # embedding[i,:] is the embedding for sentence i

    embeddingSize = len(embedding)
    # print('embeddingSize= ',embeddingSize)

    emb = list()
    for i in range(embeddingSize):  # use a fresh loop variable so the word-index array x is not overwritten
        emb.append(embedding[i, :])

    emb1 = emb[sx]
    emb2 = emb[sy]
    inn = (emb1 * emb2).sum()
    emb1norm = numpy.sqrt((emb1 * emb1).sum())
    emb2norm = numpy.sqrt((emb2 * emb2).sum())
    score = inn / emb1norm / emb2norm

    # print(sentences[sx],'--------',sentences[sy],' = ',score,'\n')
    return score
def return_sif(sentences, words, weight4ind, param, Weights):
    # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    x, m = data_io.sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind) # get word weights
    # get SIF embedding
    embeddings = SIF_embedding.SIF_embedding(Weights, x, w, param) # embedding[i,:] is the embedding for sentence i
    return embeddings
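A small follow-on sketch showing how the matrix returned by return_sif above could be turned into a pairwise cosine-similarity matrix (assumes numpy is available and that sentences, words, weight4ind, param and Weights are already set up as in the call above):

import numpy as np

embeddings = return_sif(sentences, words, weight4ind, param, Weights)
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
normalized = embeddings / np.clip(norms, 1e-12, None)  # guard against zero-norm rows
pairwise_cosine = normalized @ normalized.T  # pairwise_cosine[i, j] = similarity of sentences i and j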
Example #6
def get_embedding(sentence, words, weight4ind, params, We):
    # load sentences
    xx, mm = data_io.sentences2idx(sentence, words)
    ww = data_io.seq2weight(xx, mm, weight4ind)  # get word weights
    # get SIF embedding
    em = SIF_embedding.SIF_embedding(
        We, xx, ww, params)  # embedding[i,:] is the embedding for sentence i
    return em
Example #7
def get_embs(sentences, params):
    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara) # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight) # weight4ind[i] is the weight for the i-th word
    # load sentences
    x, m = data_io.sentences2idx(sentences, words) # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    w = data_io.seq2weight(x, m, weight4ind) # get word weights
    
    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(We, x, w, params) # embedding[i,:] is the embedding for sentence i
    return embedding
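get_embs above reads wordfile, weightfile and weightpara from the enclosing module scope; a minimal sketch of what that setup might look like (the file paths are hypothetical placeholders):

import data_io, params, SIF_embedding

wordfile = 'data/glove.840B.300d.txt'        # hypothetical word-vector file
weightfile = 'data/enwiki_vocab_min200.txt'  # hypothetical word-frequency file
weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in [3e-5, 3e-3]

sif_params = params.params()
sif_params.rmpc = 1  # remove the first principal component
embedding = get_embs(['this is an example sentence'], sif_params)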
Example #8
def generate_vecs(models, document):
    words, weight4ind, rmpc, We = models

    x, m = data_io.sentences2idx(document, words)
    # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights

    # set parameters
    param = params.params()
    param.rmpc = rmpc
    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(
        We, x, w, param)  # embedding[i,:] is the embedding for sentence i
    return embedding
Example #9
def SIFSentEmbedding(weighttxt,
                     docfile,
                     words,
                     We,
                     weight4ind,
                     weightpara=1e-3,
                     paramm=1):
    # weightpara: the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    # paramm: number of principal components to remove in the SIF weighting scheme
    sentences = sent_tokenize(docfile)
    x, m = data_io.sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    param = params.params()
    param.rmpc = paramm  # SIF_embedding expects a params object with rmpc set
    embedding = SIF_embedding.SIF_embedding(
        We, x, w, param)  # embedding[i,:] is the embedding for sentence i
    return embedding
Example #10
def sif_embedding(sen):
    import sys
    #sys.path.append("../src")
    #sys.path.append("../data")
    import data_io, params, SIF_embedding
    # input
    wordfile = 'data/dic_files.txt'  # word vector file, can be downloaded from GloVe website
    weightfile = 'data/dic_freq.txt'  # each line is a word and its frequency
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 1  # number of principal components to remove in SIF weighting scheme
    # sentences = ['这是一个例句', '这是一个更长一些的例句']
    # sentences = ['昨天天气不错', '这是一个更长一些的例句']
    sentences = sen
    # sentences = ['this is an example sentence', 'this is another sentence that is slightly longer']

    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # print(words, We)  # words and their word vectors
    # load word weights
    word2weight = data_io.getWordWeight(
        weightfile,
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(
        words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences
    # x, m, _ = data_io.sentences2idx(sentences, words) # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    x, m = data_io.sentences2idx(
        sentences, words
    )  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    # print(x,m)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # print('word weight:',w)
    # set parameters
    # params = params.params()
    params = params.params_all()
    params.rmpc = rmpc
    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(
        We, x, w, params)  # embedding[i,:] is the embedding for sentence i
    return embedding
def get_sent_vec(sentences):
    import params
    # see data_io.py for details
    (words, We) = data_io.getWordmap(wordfile)
    # see data_io.py for details
    word2weight = data_io.getWordWeight(weightfile, weightpara)
    weight4ind = data_io.getWeight(words, word2weight)
    # see data_io.py for details
    x, m = data_io.sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind)

    # set parameters
    params = params.params()
    params.rmpc = rmpc
    # call the SIF core algorithm to compute sentence vectors; see SIF_core
    embedding = SIF_core.SIF_embedding(We, x, w, params)

    get_sent_vec = {}
    for i in range(len(embedding)):
        get_sent_vec[sentences[i]] = embedding[i]

    return get_sent_vec
Example #12
def sentences2embeddings(sentences):
	"""
	Input: sentences - a list of sentences
	Output: sentence_embeddings - a list of sentence embeddings (numpy vectors of shape (1,300))

	"""
	# load sentences
	x, m = data_io.sentences2idx(sentences, words) # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
	w = data_io.seq2weight(x, m, weight4ind) # get word weights

	# set parameters
	parameters = params.params()
	parameters.rmpc = rmpc
	# get SIF embedding
	embedding = SIF_embedding.SIF_embedding(We, x, w, parameters) # embedding[i,:] is the embedding for sentence i

	sentence_embeddings = []
	for i in range(len(sentences)):
		e = np.array(embedding[i,:]).reshape((1,300)) # reshape to fit into the function semantic_similarity
		sentence_embeddings.append(e)

	return sentence_embeddings
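The reshape above targets a semantic_similarity function that is not shown in this snippet; a plausible cosine-based sketch of such a helper, with an assumed signature, would be:

import numpy as np

def semantic_similarity(vec_a, vec_b):
    # cosine similarity between two (1, 300) row vectors
    num = float(np.dot(vec_a, vec_b.T))
    denom = float(np.linalg.norm(vec_a) * np.linalg.norm(vec_b))
    return num / denom if denom != 0.0 else 0.0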
Example #13
def get_sent_vec(sentences):
    '''
    Compute sentence vectors with the SIF algorithm.
    :param sentences: a list of sentences; each element is a sentence string (no jieba.cut tokenization is required)
    :return: a dict whose keys are the sentence strings and whose values are the sentence vectors
    '''
    import params
    # see data_io.py for details
    x, m = data_io.sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind)

    # set parameters
    rmpc = 1  # number of principal components to remove in SIF weighting scheme
    params = params.params()
    params.rmpc = rmpc

    # call the SIF core algorithm to compute sentence vectors; see SIF_core
    embedding = SIF_core.SIF_embedding(We, x, w, params)

    get_sent_vec = {}
    for i in range(len(embedding)):
        get_sent_vec[sentences[i]] = embedding[i]

    return get_sent_vec
Example #14
	#if '20-30' in sentence_file or '25-35' in sentence_file or '30-40' \
	#in sentence_file or '5-15' in sentence_file:
	batch_num = 0
	with open(sentence_file, 'r', encoding='utf-8') as fr:
		print('Processing file', sentence_file, '...')
		p = hnswlib.Index(space='cosine', dim=dimension)
		p.init_index(max_elements = num_elements, ef_construction = 2000, M = 80)
		p.set_ef(1000)
		# Set number of threads used during batch search/construction
		# By default using all available cores
		p.set_num_threads(30)
		for n_lines in iter(lambda: tuple(islice(fr, batch_size)), ()):
			sents = list(map(str.strip, n_lines))
			sent_id = list(map(lambda x:int(x.split('\t')[0]), sents))
			sentences = list(map(lambda x:x.split('\t')[-1], sents))
			x, m = data_io.sentences2idx(sentences, words)
			w = data_io.seq2weight(x, m, weight4ind)

			# get SIF embedding
			embedding = SIF_embedding.SIF_embedding(We, x, w, params) # embedding[i,:] is the embedding for sentence i
			embeddings.normalize(embedding, ["unit", "center"])

			p.add_items(embedding,sent_id)
			print('Finished batch', batch_num, '.', end = '\r')
			batch_num += 1
	print('\nFinished loading', sentence_file, '.')
	out_file = sentence_file+'.ann'
	p.save_index(out_file)
	print('Finished saving', out_file, '.')
	del p
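Once the index has been written out, it can be reloaded and queried with hnswlib's standard API; a short sketch (the index file name mirrors the save above, and query_embedding is a placeholder for one or more SIF vectors of shape (n, dimension)):

import hnswlib

p = hnswlib.Index(space='cosine', dim=dimension)
p.load_index(sentence_file + '.ann')  # reload the index saved above
p.set_ef(1000)                        # ef should be at least as large as k
labels, distances = p.knn_query(query_embedding, k=10)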
def top_filtering(logits,
                  words,
                  weight4ind,
                  We,
                  tokenizer,
                  history,
                  args,
                  params,
                  embedding1,
                  top_k=0,
                  top_p=0.0,
                  threshold=-float('Inf'),
                  filter_value=-float('Inf'),
                  current_output=None):
    """ Filter a distribution of logits using top-k, top-p (nucleus) and/or threshold filtering
        Args:
            logits: logits distribution shape (vocabulary size)
            top_k: <=0: no filtering, >0: keep only top k tokens with highest probability.
            top_p: <=0.0: no filtering, >0.0: keep only a subset S of candidates, where S is the smallest subset
                whose total probability mass is greater than or equal to the threshold top_p.
                In practice, we select the highest probability tokens whose cumulative probability mass exceeds
                the threshold top_p.
            threshold: a minimal threshold to keep logits
    """
    if current_output is None:
        current_output = []
    assert logits.dim() == 1  # Only works for batch size 1 for now - could update but it would obfuscate the code a bit
    top_k = min(top_k, logits.size(-1))

    if top_k > 0:
        # Remove all tokens with a probability less than the last token in the top-k tokens
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1,
                                                                  None]
        logits[indices_to_remove] = filter_value

    if top_p > 0.0:
        # Compute cumulative probabilities of sorted tokens
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probabilities = torch.cumsum(F.softmax(sorted_logits,
                                                          dim=-1),
                                                dim=-1)

        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probabilities > top_p
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
            ..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        # Back to unsorted indices and set them to -infinity
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        indices_to_use = sorted_indices[~sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
        if len(indices_to_use) > 1:
            cands = [current_output + [idx] for idx in indices_to_use.tolist()]
            raw_cands = [
                tokenizer.decode(cand, skip_special_tokens=True)
                for cand in cands
            ]
            scores = []
            for i in raw_cands:
                sentences = [i]
                x, m = data_io.sentences2idx(sentences, words)
                w = data_io.seq2weight(x, m, weight4ind)
                embedding2 = SIF_embedding.SIF_embedding(We, x, w, params)
                inn = (embedding1 * embedding2).sum(axis=1)
                emb1norm = np.sqrt((embedding1 * embedding1).sum(axis=1))
                emb2norm = np.sqrt((embedding2 * embedding2).sum(axis=1))
                scores.append(inn / emb1norm / emb2norm)
                #print(sentences)
            for idx, sim in zip(indices_to_use, scores):
                logits[idx] += sim.item()
            """
            probs = F.softmax(logits, dim=-1)
            index = []
            for i in probs:
                if i > 0:
                    index.append(i)
            prev = torch.topk(probs, 1)[1] if args.no_sample else torch.multinomial(probs, len(index))
            text = []
            last_utt = history[-1]
            last = tokenizer.decode(last_utt, skip_special_tokens=True)
            sentences = [last]
            # load sentences
            x, m = data_io.sentences2idx(sentences,
                                            words)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
            w = data_io.seq2weight(x, m, weight4ind)  # get word weights

            # set parameters
            global params
            params.rmpc = rmpc
            # get SIF embedding
            embedding1 = SIF_embedding.SIF_embedding(We, x, w, params)  # embedding[i,:] is the embedding for sentence i
            for i in prev:
                text.append(i.item())
            for i in text:
                cand = current_output.copy()
                cand.append(i)
                indice = i
                raw_text=tokenizer.decode(cand, skip_special_tokens=True)
                sentences = [raw_text]
                x, m= data_io.sentences2idx(sentences,
                                                words)
                w = data_io.seq2weight(x, m, weight4ind)
                embedding2 = SIF_embedding.SIF_embedding(We, x, w, params)
                inn = (embedding1 * embedding2 ).sum(axis=1)
                emb1norm = np.sqrt((embedding1 * embedding1).sum(axis=1))
                emb2norm = np.sqrt((embedding2  * embedding2 ).sum(axis=1))
                scores = inn / emb1norm / emb2norm
                #print(scores)
                logits[indice] += scores.item()
                cand.clear()
                """

    indices_to_remove = logits < threshold
    logits[indices_to_remove] = filter_value

    return logits
def sample_sequence(personality,
                    history,
                    tokenizer,
                    model,
                    args,
                    words,
                    weight4ind,
                    We,
                    current_output=None):
    special_tokens_ids = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)
    if current_output is None:
        current_output = []
    last_utt = history[-1]
    last = tokenizer.decode(last_utt, skip_special_tokens=True)
    sentences = [last]
    # load sentences
    x, m = data_io.sentences2idx(
        sentences, words
    )  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights

    rmpc = 1  # number of principal components to remove in SIF weighting scheme

    # set parameters
    global params
    params.rmpc = rmpc
    # get SIF embedding
    embedding1 = SIF_embedding.SIF_embedding(
        We, x, w, params)  # embedding[i,:] is the embedding for sentence i

    for i in range(args.max_length):
        instance, _ = build_input_from_segments(personality,
                                                history,
                                                current_output,
                                                tokenizer,
                                                with_eos=False)

        input_ids = torch.tensor(instance["input_ids"],
                                 device=args.device).unsqueeze(0)
        token_type_ids = torch.tensor(instance["token_type_ids"],
                                      device=args.device).unsqueeze(0)
        temperature = 1.0
        top_k = 0
        top_p = 0.9

        logits = model(input_ids, token_type_ids=token_type_ids)
        if isinstance(logits, tuple):  # for gpt2 and maybe others
            logits = logits[0]
        logits = logits[0, -1, :] / args.temperature
        logits = top_filtering(logits,
                               words,
                               weight4ind,
                               We,
                               tokenizer,
                               history,
                               args,
                               params,
                               embedding1,
                               top_k=top_k,
                               top_p=top_p,
                               current_output=current_output)
        probs = F.softmax(logits, dim=-1)

        prev = torch.topk(
            probs, 1)[1] if args.no_sample else torch.multinomial(probs, 1)
        if i < args.min_length and prev.item() in special_tokens_ids:
            while prev.item() in special_tokens_ids:
                if probs.max().item() == 1:
                    warnings.warn(
                        "Warning: model generating special token with probability 1."
                    )
                    break  # avoid infinitely looping over special token
                prev = torch.multinomial(probs, num_samples=1)

        if prev.item() in special_tokens_ids:
            break
        current_output.append(prev.item())

    return current_output
Example #17
'''
model_100 = Word2Vec.load(os.path.join('/media/brx/TOSHIBA EXT/wiki_zh_word2vec/', 'ngram_100_5_90w.bin'))
words = {}
for index, word in enumerate(model_100.wv.index2entity):
    words[word] = index
We = model_100.wv.vectors
'''

# input
wordfile = '../newsif/datafile/without_stopwords/word2vec_format.txt' # word vector file, can be downloaded from GloVe website
weightfile = '../newsif/datafile/without_stopwords/words_count.txt' # each line is a word and its frequency
weightpara = 1e-3 # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
rmpc = 1 # number of principal components to remove in SIF weighting scheme
sentences_test = ['会议充分肯定了2019年金融市场和信贷政策工作取得的成绩,在推动金融市场规范、创新、发展、开放,加大金融支持国家战略和重点领域。', '民营小微企业、精准扶贫力度,稳妥开展互联网金融风险专项整治以及房地产金融宏观审慎管理等方面做了大量卓有成效的工作。', '为实施稳健货币政策、防范化解重大金融风险、推动经济结构调整和转型升级提供了有力支撑。']


# load word vectors
(words, We) = data_io.getWordmap(wordfile)
# load word weights
word2weight = data_io.getWordWeight(weightfile, weightpara) # word2weight['str'] is the weight for the word 'str'
weight4ind = data_io.getWeight(words, word2weight) # weight4ind[i] is the weight for the i-th word
# load sentences
x, m = data_io.sentences2idx(sentences_test, words) # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
w = data_io.seq2weight(x, m, weight4ind) # get word weights

# set parameters
params = params.params()
params.rmpc = rmpc
# get SIF embedding
embedding = SIF_core.SIF_embedding(We, x, w, params) # embedding[i,:] is the embedding for sentence i
print(embedding)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d',
                        '--dataname',
                        default='t6',
                        help='dataset name',
                        choices=['t6', 't26', '2C'])
    parser.add_argument('-c',
                        '--classifiername',
                        default='RF',
                        help='which classifier to use',
                        choices=['GaussianNB', 'RF', 'SVM', 'KNN'])
    args = parser.parse_args()
    data_name = args.dataname  # t6 or t26, 2C, 4C
    clf_name = args.classifiername  # classfier

    # Original SIF paper used glove.840B.300d, we use the ones that were trained on twitter.
    embed_dims = [100]  # can add 25, 50, 200 dimension if needed
    wordfile_list = [
        '../data/glove.twitter.27B.{}d.txt'.format(dim) for dim in embed_dims
    ]
    # each line is a word and its frequency
    weightfile = 'SIF-master/auxiliary_data/enwiki_vocab_min200.txt'
    # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    weightpara = 1e-3
    # number of principal components to remove in SIF weighting scheme
    rmpc = 1

    for wordfile, dim in zip(wordfile_list, embed_dims):
        # load word vectors
        (words, We) = data_io.getWordmap(wordfile)
        # load word weights
        # word2weight['str'] is the weight for the word 'str'
        word2weight = data_io.getWordWeight(weightfile, weightpara)
        # weight4ind[i] is the weight for the i-th word
        weight4ind = data_io.getWeight(words, word2weight)

        data_path = "../data/"
        if data_name == "t6":
            file_path = data_path + "CrisisLexT6_cleaned/"
            disasters = [
                "sandy", "queensland", "boston", "west_texas", "oklahoma",
                "alberta"
            ]
            test_list = [
                "{}_glove_token.csv.unique.csv".format(disaster)
                for disaster in disasters
            ]
            train_list = [
                "{}_training.csv".format(disaster) for disaster in disasters
            ]
        if data_name == "t26":
            file_path = data_path + "CrisisLexT26_cleaned/"
            disasters = [
                "2012_Colorado_wildfires", "2013_Queensland_floods",
                "2013_Boston_bombings", "2013_West_Texas_explosion",
                "2013_Alberta_floods", "2013_Colorado_floods",
                "2013_NY_train_crash"
            ]
            test_list = [
                "{}-tweets_labeled.csv.unique.csv".format(disaster)
                for disaster in disasters
            ]
            train_list = [
                "{}_training.csv".format(disaster) for disaster in disasters
            ]
        if data_name == "2C":
            file_path = data_path + "2CTweets_cleaned/"
            disasters = [
                "Memphis", "Seattle", "NYC", "Chicago", "SanFrancisco",
                "Boston", "Brisbane", "Dublin", "London", "Sydney"
            ]
            test_list = [
                "{}2C.csv.token.csv.unique.csv".format(disaster)
                for disaster in disasters
            ]
            train_list = [
                "{}2C_training.csv".format(disaster) for disaster in disasters
            ]

        accu_list = []
        roc_list = []
        precision_list = []
        recall_list = []
        f1_list = []
        for train, test in zip(train_list, test_list):
            train_file = os.path.join(file_path, train)
            test_file = os.path.join(file_path, test)
            xtrain, ytrain = load_data(data_name, train_file)
            xtest, ytest = load_data(data_name, test_file)

            # load train
            # xtrain_windx is the array of word indices, m_train is the binary mask indicating whether there is a word in that location
            xtrain_windx, m_train = data_io.sentences2idx(xtrain, words)
            w_train = data_io.seq2weight(xtrain_windx, m_train,
                                         weight4ind)  # get word weights

            # set parameters
            paramss = params.params()
            paramss.rmpc = rmpc
            # get SIF embedding
            train_embed = SIF_embedding.SIF_embedding(
                We, xtrain_windx, w_train,
                paramss)  # embedding[i,:] is the embedding for sentence i

            # load target
            # xtest_windx is the array of word indices, m_test is the binary mask indicating whether there is a word in that location
            xtest_windx, m_test = data_io.sentences2idx(xtest, words)
            # get word weights
            w_test = data_io.seq2weight(xtest_windx, m_test, weight4ind)

            # set parameters
            paramsss = params.params()
            paramsss.rmpc = rmpc
            # get SIF embedding
            test_embed = SIF_embedding.SIF_embedding(
                We, xtest_windx, w_test,
                paramsss)  # embedding[i,:] is the embedding for sentence i

            print(test)
            accu, roc, precision, recall, f1 = run_classifier(
                train_embed, ytrain, test_embed, ytest, clf_name, 100)
            accu_list.append(accu)
            roc_list.append(roc)
            precision_list.append(precision)
            recall_list.append(recall)
            f1_list.append(f1)

        print("{}_SIF_{}_LOO_accuracy {}".format(data_name,
                                                 clf_name + str(dim),
                                                 accu_list))
        print("{}_SIF_{}_LOO_roc {}".format(data_name, clf_name + str(dim),
                                            roc_list))
        print("{}_SIF_{}_LOO_precision {}".format(data_name,
                                                  clf_name + str(dim),
                                                  precision_list))
        print("{}_SIF_{}_LOO_recall {}".format(data_name, clf_name + str(dim),
                                               recall_list))
        print("{}_SIF_{}_LOO_f1 {}".format(data_name, clf_name + str(dim),
                                           f1_list))
        print(
            "{0}_SIF_LOO_{1} {2:.4f} + {3:.4f} {4:.4f} + {5:.4f} {6:.4f} + {7:.4f} {8:.4f} + {9:.4f} {10:.4f} + {11:.4f}"
            .format(data_name, clf_name + str(dim), np.mean(accu_list),
                    np.std(accu_list), np.mean(roc_list), np.std(roc_list),
                    np.mean(f1_list), np.std(f1_list), np.mean(precision_list),
                    np.std(precision_list), np.mean(recall_list),
                    np.std(recall_list)))
Example #19
for index, word in enumerate(model_300.wv.index2entity):
    words[word] = index
We = model_300.wv.vectors
weightpara = 1e-3
rmpc = 1
sentences = [
    'this is an example sentence',
    'this is another sentence that is slightly longer'
]

# load word vectors
# (words, We) = data_io.getWordmap(wordfile)
# load word weights
# word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
word2weight = data_io.getWordWeight(model_300.wv.vocab, weightpara)
weight4ind = data_io.getWeight(
    words, word2weight)  # weight4ind[i] is the weight for the i-th word
# load sentences
x, m = data_io.sentences2idx(
    sentences,
    words)  # x is the array of word indices, m is the binary mask indicating
# whether there is a word in that location
w = data_io.seq2weight(x, m, weight4ind)  # get word weights

# set parameters
params = params.params()
params.rmpc = rmpc
# get SIF embedding
embedding = SIF_embedding.SIF_embedding(
    We, x, w, params)  # embedding[i,:] is the embedding for sentence i
Example #20
    def transform(self, We, sentences):
        x, m = data_io.sentences2idx(sentences, self.word_map) # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
        w = data_io.seq2weight(x, m, self.weight4ind) # get word weights
        weighted_emb = get_weighted_average(We, x, w)
        # now use the model we've already loaded
        return self.remove_pc(weighted_emb)
Example #21
    def get_embedding(self, sentences, language='Chinese', weightpara=1e-3):
        """
        This function return the embeddings for all sentences in the input parameter: sentences
        sentences is a list of sentencs need for SIF embeddings
        """
        if language == 'Chinese':
            # word vector file
            # For model2:
            # wordfile =
            # For model1:
            # wordfile='../models/wiki_news_word_vector_small2.txt'
            # word frequency file
            # weightfile =
            words = self.words_chi
            word2weight = self.word2weight_chi
            We = self.We_chi
        else:
            # for english use:
            # wordfile =
            # wordfile='../models/glove.840B.300d.txt'
            # weightfile =
            # weightpara = 1e-5
            # weightpara = 1e-3 # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
            words = self.words_eng
            word2weight = self.word2weight_eng
            We = self.We_eng
        rmpc = 1  # number of principal components to remove in SIF weighting scheme

        weight4ind = data_io.getWeight(words, word2weight)
        print('weight4ind finished ')

        # load sentences
        if language == 'Chinese':
            x, m = data_io.sentences2idx_c(sentences, words)
        else:
            x, m = data_io.sentences2idx(sentences, words)
        # print (x.shape)  # (number of sentences, length of the longest sentence)
        # print (m.shape)  # (number of sentences, length of the longest sentence)

        print('sentences2idx finished ')
        w = data_io.seq2weight(x, m, weight4ind)  # get word weights

        print('seq2weight finished ')

        # set parameters
        param = params.params()
        param.rmpc = rmpc

        # get SIF embedding
        """
        return 所有需要计算similarity的title,全文,句子的embedding。
        paper 里面用的是TruncatedSVD,project 要求我们用PCA方法decomposite
        """
        print('embedding start ')
        embedding = SIF_embedding.SIF_embedding(
            We, x, w, param,
            method='PCA')  # embedding[i,:] is the embedding for sentence i

        print('embedding finished ')
        print(embedding.shape)
        return embedding
Example #22
# srcsent = ['Pada mulanya, waktu Allah mulai menciptakan alam semesta']
# tgtsent = ['God saw the light, and saw that it was good. God divided the light from the darkness.']
# params = params.params()
weightpara = 1e-3
rmpc = 1

# def srcEmbedding(srcWordFilePath, srcsent):
src_model_300 = gensim.models.KeyedVectors.load_word2vec_format(
    srcWordFilePath, binary=False)
srcwords = {}
for index, word in enumerate(src_model_300.wv.index2entity):
    srcwords[word] = index
srcWe = src_model_300.wv.vectors
srcword2weight = data_io.getWordWeight(src_model_300.wv.vocab, weightpara)
srcweight4ind = data_io.getWeight(srcwords, srcword2weight)
srcx, srcm = data_io.sentences2idx(srcsent, srcwords)
srcw = data_io.seq2weight(srcx, srcm, srcweight4ind)
srcparams = params.params()
srcparams.rmpc = rmpc
srcEmbedding = SIF_embedding.SIF_embedding(srcWe, srcx, srcw, srcparams)
# return embedding

# def tgtEmbedding(tgtWordFilePath, tgtsent):
tgtmodel_300 = gensim.models.KeyedVectors.load_word2vec_format(tgtWordFilePath,
                                                               binary=False)
tgtwords = {}
for index, word in enumerate(tgtmodel_300.wv.index2entity):
    tgtwords[word] = index
tgtWe = tgtmodel_300.wv.vectors
tgtword2weight = data_io.getWordWeight(tgtmodel_300.wv.vocab, weightpara)
tgtweight4ind = data_io.getWeight(tgtwords, tgtword2weight)
Example #23
(glove_words, We) = data_io.getWordmap(wordfile)
print("shape of Word embedding is: " + str(We.shape))

# load word weights
word2weight = data_io.getWordWeight(
    weightfile,
    weightpara)  # word2weight['str'] is the weight for the word 'str'
weight4ind = data_io.getWeight(
    glove_words, word2weight)  # weight4ind[i] is the weight for the i-th word

# set parameters
params = params.params()
params.rmpc = rmpc

# load sentences
print("reading the input sentences now & converting to indices .. \n")
sample_sents = read_NMT_data.read_data(sample_ara)

# AraSIF embedding for sample sentences
print("computing AraSIF embedding now ...\n")

# x is the array of word indices, m is the binary mask indicating whether there is a word in that location
x, m = data_io.sentences2idx(sample_sents, glove_words)
w = data_io.seq2weight(x, m, weight4ind)  # get word weights
sample_embedding = SIF_embedding.SIF_embedding(
    We, x, w, params)  # embedding[i,:] is the embedding for sentence i
print("shape of sample sentence embedding is: " + str(sample_embedding.shape))

# serialize for future use
numpy.save('sample_sentence_embedding.npy', sample_embedding)
Example #24
        line = line.strip()
        line = line.split(':')[1]
        sentences.append(line)

glove_word2vector_path = './chinese_data_douban_cropus_vectors.txt'  # word vector file, can be downloaded from GloVe website
word_frequency_path = './douban_cropus_vocab.txt'  # each line is a word and its frequency
weightpara = 1e-3
rmpc = 1

# load word vectors
(Word2Indx, Word2vector) = data_io.getWordmap(glove_word2vector_path)
# load word weights
word2weight = data_io.getWordWeight(
    word_frequency_path,
    weightpara)  # word2weight['str'] is the weight for the word 'str'
Index2Weight = data_io.getWeight(
    Word2Indx, word2weight)  # weight4ind[i] is the weight for the i-th word

word_idx_seq_of_sentence, mask = data_io.sentences2idx(
    sentences, Word2Indx
)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
word_weight_of_sentence = data_io.seq2weight(word_idx_seq_of_sentence, mask,
                                             Index2Weight)  # get word weights

# set parameters
params = params.params()
params.rmpc = rmpc
embedding = SIF_embedding.SIF_embedding(Word2vector, word_idx_seq_of_sentence,
                                        word_weight_of_sentence, params)
np.save("douban_sentence2vector.npy", embedding)