Code example #1
import data_io, params, SIF_embedding

def main(sentences,
         wordfile: str,
         weightfile: str,
         weightpara: float = 1e-3,
         rmpc: int = 1):
    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(
        weightfile,
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(
        words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences
    x, m, _ = data_io.sentences2idx(
        sentences, words
    )  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights

    # set parameters (use a name other than `params` so the module is not shadowed inside the function)
    param = params.params()
    param.rmpc = rmpc
    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(
        We, x, w, param)  # embedding[i,:] is the embedding for sentence i
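For reference, SIF_embedding.SIF_embedding in these snippets amounts to a weighted average of word vectors followed by removal of the leading principal component(s), as in the SIF paper ("A Simple but Tough-to-Beat Baseline for Sentence Embeddings"). The following is a minimal NumPy sketch of that computation, written for illustration only; sif_embedding_sketch is a hypothetical stand-in, not the function shipped with the SIF package, and it assumes We, x and w are the NumPy arrays produced above.

import numpy as np

def sif_embedding_sketch(We, x, w, rmpc=1):
    # weighted average of word vectors: emb[i] = sum_j w[i, j] * We[x[i, j]] / sum_j w[i, j]
    emb = np.zeros((x.shape[0], We.shape[1]))
    for i in range(x.shape[0]):
        total = w[i, :].sum()
        if total > 0:
            emb[i, :] = w[i, :].dot(We[x[i, :], :]) / total
    # remove the projection onto the first rmpc principal component(s)
    if rmpc > 0:
        _, _, vt = np.linalg.svd(emb, full_matrices=False)
        pc = vt[:rmpc, :]                      # (rmpc, dim)
        emb = emb - emb.dot(pc.T).dot(pc)
    return emb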
Code example #2
def load_model(self):
    sys.path.append('../src')
    weightpara = 1e-5  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    print('读取中文模型')  # "loading the Chinese model"
    self.words_chi, self.We_chi = data_io.getWordmap(
        '../models/wiki_news_model2_vector.txt')
    self.word2weight_chi = data_io.getWordWeight(
        '../models/word_count.txt',  # each line is a word and its frequency
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    print('中文模型读取完毕')  # "Chinese model loaded"
    print('读取英文模型')  # "loading the English model"
    weightpara = 1e-3
    self.words_eng, self.We_eng = data_io.getWordmap(
        '../models/glove_large.txt')
    self.word2weight_eng = data_io.getWordWeight(
        '../models/enwiki_vocab_min200.txt',  # each line is a word and its frequency
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    print('英文模型读取完毕')  # "English model loaded"
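The weightpara values above (1e-5 for the Chinese model, 1e-3 for the English one) are the smoothing parameter a of the SIF weighting scheme, which down-weights frequent words via a / (a + p(w)). Assuming data_io.getWordWeight follows that formula over a "<word> <count>" frequency file, a rough sketch looks like this (get_word_weight_sketch is an illustrative stand-in, not the library function):

def get_word_weight_sketch(weightfile, a=1e-3):
    # read "<word> <count>" lines and estimate unigram probabilities
    counts = {}
    with open(weightfile, encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if len(parts) >= 2:
                counts[parts[0]] = float(parts[1])
    total = sum(counts.values())
    # SIF weight: a / (a + p(w)); frequent words get weights close to 0, rare words close to 1
    return {word: a / (a + count / total) for word, count in counts.items()}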
Code example #3
def load_embeddings(wordfile, weightfile, weightpara=5e-4, word2vec=False):
    if word2vec:
        (words, We) = getWordmapWord2Vec(wordfile)
    else:
        (words, We) = data_io.getWordmap(wordfile)
    word2weight = data_io.getWordWeight(
        weightfile,
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(
        words, word2weight)  # weight4ind[i] is the weight for the i-th word
    return words, We, weight4ind
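load_embeddings, like the other snippets on this page, reads the word vectors from a plain GloVe-style text file: one word followed by its vector components per line. Under that assumption, data_io.getWordmap can be pictured roughly as below (get_wordmap_sketch is a simplified illustration, not the actual implementation):

import numpy as np

def get_wordmap_sketch(wordfile):
    # one "<word> <v1> <v2> ... <vd>" entry per line
    words, vectors = {}, []
    with open(wordfile, encoding='utf-8') as f:
        for i, line in enumerate(f):
            parts = line.rstrip().split(' ')
            words[parts[0]] = i                 # word -> row index into the embedding matrix
            vectors.append([float(v) for v in parts[1:]])
    return words, np.array(vectors)             # (words, We)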
Code example #4
def get_embs(sentences, params):
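    # NOTE: wordfile, weightfile and weightpara are not defined in this snippet;
    # the original project sets them at module level (see the other examples on this page).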
    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara) # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight) # weight4ind[i] is the weight for the i-th word
    # load sentences
    x, m = data_io.sentences2idx(sentences, words) # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    w = data_io.seq2weight(x, m, weight4ind) # get word weights
    
    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(We, x, w, params) # embedding[i,:] is the embedding for sentence i
    return embedding
Code example #5
File: sentiment_analysis.py  Project: kiminh/CwVW-SIF
def get_sif(dataset):
    wordfile = '../data/glove.6B.50d.txt'  # word vector file, can be downloaded from GloVe website
    weightfile = '../auxiliary_data/enwiki_vocab_min200.txt'  # each line is a word and its frequency
    weightpara = 2.7e-4  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 0  # number of principal components to remove in SIF weighting scheme
    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(
        weightfile,
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(
        words, word2weight)  # weight4ind[i] is the weight for the i-th word
    param = params.params()
    param.rmpc = rmpc
    sentence_embedding_all = get_sentences_embedding(dataset, words,
                                                     weight4ind, param, We)
    # sentence_embedding_all = turn2std(sentence_embedding_all)  # convert the matrix to a standardized matrix
    return sentence_embedding_all
Code example #6
def vectorize_sif(filename):
    class params(object):
        def __init__(self):
            self.LW = 1e-5
            self.LC = 1e-5
            self.eta = 0.05

        def __str__(self):
            t = "LW", self.LW, ", LC", self.LC, ", eta", self.eta
            t = map(str, t)
            return ' '.join(t)

    # input
    wordfile = 'glove.6B.100d.txt'  # word vector file, can be downloaded from GloVe website
    weightfile = 'enwiki_vocab_min200.txt'  # each line is a word and its frequency
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 1  # number of principal components to remove in SIF weighting scheme
    #sentiment_file = '../data/sentiment-test' # sentiment data file
    #cleanfile = "2/D1026-A.M.100.E.10.segs.cl"
    #sentiment_file = '../data/clean-5.txt'
    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(
        weightfile,
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(
        words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences (here use sentiment data as an example)
    #x, m, _ = data_io.sentiment2idx(sentiment_file, words) # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    x, m = data_io.sentiment2idx(filename, words)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # parameters
    params = params()
    #params = params.params()
    params.rmpc = rmpc

    # get SIF embedding
    embedding = SIF_embedding_lib.SIF_embedding(
        We, x, w, params)  # embedding[i,:] is the embedding for sentence i

    return embedding
Code example #7
def sif_embedding(sen):
    import sys
    #sys.path.append("../src")
    #sys.path.append("../data")
    import data_io, params, SIF_embedding
    # input
    wordfile = 'data/dic_files.txt'  # word vector file, can be downloaded from GloVe website
    weightfile = 'data/dic_freq.txt'  # each line is a word and its frequency
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 1  # number of principal components to remove in SIF weighting scheme
    # sentences = ['这是一个例句', '这是一个更长一些的例句']
    # sentences = ['昨天天气不错', '这是一个更长一些的例句']
    sentences = sen
    # sentences = ['this is an example sentence', 'this is another sentence that is slightly longer']

    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # print(words, We)  # the words and their word vectors
    # load word weights
    word2weight = data_io.getWordWeight(
        weightfile,
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(
        words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences
    # x, m, _ = data_io.sentences2idx(sentences, words) # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    x, m = data_io.sentences2idx(
        sentences, words
    )  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    # print(x,m)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # print('word weight:',w)
    # set parameters
    params = params.params()
    params.rmpc = rmpc
    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(
        We, x, w, params)  # embedding[i,:] is the embedding for sentence i
    return embedding
Code example #8
File: inference.py  Project: pirate3166/LFIP_SUM
def load_model():
    wordfile = "glove path (glove.840B.300d.txt file)"  # you can download glove from https://www.kaggle.com/takuok/glove840b300dtxt
    weightfile = artifact_path + '/SIF/enwiki_vocab_min200.txt'  # each line is a word and its frequency
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 1  # number of principal components to remove in SIF weighting scheme

    (words, We) = data_io.getWordmap(wordfile)

    # re-key the vocabulary from bytes to str
    for v in list(words.keys()):
        words[v.decode("utf-8")] = words.pop(v)

    # load word weights
    word2weight = data_io.getWordWeight(
        weightfile,
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(
        words, word2weight)  # weight4ind[i] is the weight for the i-th word

    return (words, weight4ind, rmpc, We)
Code example #9
def load_embed(wordfile, weightfile, weightpara=1e-3, param=None, rmpc=0):
    '''
    wordfile:   location of embedding data (e.g., GloVe embeddings)
    weightfile: location of term-frequency data for the words
    weightpara: the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc:       number of principal components to remove in the SIF weighting scheme
    '''
    # NOTE: these hard-coded paths override the wordfile/weightfile arguments
    wordfile = '/home/francisco/GitHub/SIF/data/glove.840B.300d.txt'   # word vector file, can be downloaded from GloVe website
    weightfile = '/home/francisco/GitHub/SIF/auxiliary_data/enwiki_vocab_min200.txt' # each line is a word and its frequency

    # load word vectors
    (words, Weights) = data_io.getWordmap(wordfile)

    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara) # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight) # weight4ind[i] is the weight for the i-th word

    # set parameters (skip when no params object is supplied)
    if param is not None:
        param.rmpc = rmpc

    return Weights, words, word2weight, weight4ind
Code example #10
def get_sent_vec(sentences):
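    # NOTE: wordfile, weightfile, weightpara, rmpc and SIF_core are not defined in this
    # snippet; they come from module-level settings and imports in the original project.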
    import params
    # see data_io.py for details
    (words, We) = data_io.getWordmap(wordfile)
    # see data_io.py for details
    word2weight = data_io.getWordWeight(weightfile, weightpara)
    weight4ind = data_io.getWeight(words, word2weight)
    # see data_io.py for details
    x, m = data_io.sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind)

    # set parameters
    params = params.params()
    params.rmpc = rmpc
    # call the SIF core algorithm to compute the sentence vectors; see SIF_core
    embedding = SIF_core.SIF_embedding(We, x, w, params)

    sent_vec = {}
    for i in range(len(embedding)):
        sent_vec[sentences[i]] = embedding[i]

    return sent_vec
Code example #11
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

import sys
import gensim
import numpy
import pandas as pd
import data_io
import os
from scipy.stats import spearmanr

filename = sys.argv[1]
dataset = sys.argv[2]

f_base = os.path.splitext(os.path.basename(filename))[0]
d_base = os.path.splitext(os.path.basename(dataset))[0]

(words, We) = data_io.getWordmap(filename)

wordsim = pd.read_csv(dataset,delimiter=';', names=['word1','word2','sim'], index_col=None)

similarities = []
i = 0
tot = 0
for index, row in wordsim.iterrows():
    try:
        v1 = We[words[row['word1']]]
        v2 = We[words[row['word2']]]
        similarity = numpy.dot(v1, v2) / (numpy.linalg.norm(v1) * numpy.linalg.norm(v2))
        tot += 1
    except KeyError:
        similarity = numpy.nan
        i += 1
    similarities.append(similarity)
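The snippet stops after collecting the cosine similarities, and spearmanr is imported but never used. A natural follow-up (not part of the original code) is to report the Spearman correlation against the gold scores in the sim column, restricted to the pairs covered by the vocabulary:

wordsim['pred'] = similarities
covered = wordsim.dropna(subset=['pred'])   # keep only pairs where both words were in the vocabulary
rho, pval = spearmanr(covered['sim'], covered['pred'])
print('{} on {}: Spearman rho={:.4f} (p={:.3g}), {}/{} pairs covered, {} OOV'.format(
    f_base, d_base, rho, pval, tot, len(wordsim), i))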
Code example #12
def run():
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model",
                        type=str,
                        default="gpt",
                        help="Model type (gpt or gpt2)")
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="",
                        help="Path, url or short name of the model")
    parser.add_argument(
        "--max_history",
        type=int,
        default=2,
        help="Number of previous utterances to keep in history")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")

    parser.add_argument("--no_sample",
                        action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length",
                        type=int,
                        default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length",
                        type=int,
                        default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature",
                        type=float,
                        default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument(
        "--top_k",
        type=int,
        default=0,
        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.9,
        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        args.model_checkpoint = download_pretrained_model()

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class = GPT2Tokenizer if "gpt2" == args.model else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model_class = GPT2LMHeadModel if "gpt2" == args.model else OpenAIGPTLMHeadModel
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    #personalities = get_dataset_personalities(tokenizer, args.dataset_path, args.dataset_cache)
    #personality = random.choice(personalities)
    #logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))
    wordfile = './data/truncate.txt'  # word vector file, can be downloaded from GloVe website
    weightfile = './auxiliary_data/enwiki_vocab_min200.txt'  # each line is a word and its frequency
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(
        weightfile,
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(
        words, word2weight)  # weight4ind[i] is the weight for the i-th word

    p = 0
    start_time = time.time()
    with open('data_volunteers.json') as json_file:
        json_data = json.load(json_file)
        for i in json_data:
            p += 1
            #if p <1100:
            #    continue
            history = []
            personality = []
            query_set = []
            json_dialog = i["dialog"]
            json_bot = i["bot_profile"]
            for j in json_bot:
                personality.append(tokenizer.encode(j))
            #logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))
            persona = tokenizer.decode(chain(*personality))
            row = {"Personality": persona}
            text = []
            for j in json_dialog:
                if j["sender_class"] == "Human":
                    json_text = j["text"]
                    raw_text = json_text
                    check = tokenizer.decode(tokenizer.encode(raw_text),
                                             skip_special_tokens=True)
                    if check == "":
                        history.append(tokenizer.encode(raw_text))
                        with torch.no_grad():
                            out_ids = normal_sample_sequence(
                                personality, history, tokenizer, model, args)
                        # history.append(out_ids)
                        history = history[-(2 * args.max_history + 1):]
                        out_text = tokenizer.decode(out_ids,
                                                    skip_special_tokens=True)
                        text.append({
                            "evaluation_score": j["evaluation_score"],
                            "id": j["id"],
                            "sender": j["sender"],
                            "sender_class": j["sender_class"],
                            "text": raw_text,
                            "generated_text": out_text
                        })
                        continue
                    history.append(tokenizer.encode(raw_text))
                    with torch.no_grad():
                        out_ids = sample_sequence(personality, history,
                                                  tokenizer, model, args,
                                                  words, weight4ind, We)
                    # history.append(out_ids)
                    history = history[-(2 * args.max_history + 1):]
                    out_text = tokenizer.decode(out_ids,
                                                skip_special_tokens=True)
                    text.append({
                        "evaluation_score": j["evaluation_score"],
                        "id": j["id"],
                        "sender": j["sender"],
                        "sender_class": j["sender_class"],
                        "text": raw_text,
                        "generated_text": out_text
                    })
                else:
                    json_text = j["text"]
                    raw_text = json_text
                    history.append(tokenizer.encode(raw_text))
                    text.append({
                        "evaluation_score": j["evaluation_score"],
                        "id": j["id"],
                        "sender": j["sender"],
                        "sender_class": j["sender_class"],
                        "text": raw_text
                    })
            row["dialog"] = text
            query_set.append(row)
            #print(query_set)
            with open('./sif_set/sif' + str(p) + '.json',
                      'w',
                      encoding='utf-8') as make_file:
                json.dump(query_set, make_file)
            if not p % 10:
                print(
                    str(p * 100 / 1111) + '%, ' +
                    str(time.time() - start_time) + 'sec')
Code example #13
if params.clip == 0:
    params.clip = None
params.minval = args.minval
params.maxval = args.maxval
if args.nonlinearity:
    if args.nonlinearity == 1:
        params.nonlinearity = lasagne.nonlinearities.linear
    if args.nonlinearity == 2:
        params.nonlinearity = lasagne.nonlinearities.tanh
    if args.nonlinearity == 3:
        params.nonlinearity = lasagne.nonlinearities.rectify
    if args.nonlinearity == 4:
        params.nonlinearity = lasagne.nonlinearities.sigmoid

# load data
(words, We) = data_io.getWordmap(params.wordfile)
if args.task == "sim" or args.task == "ent":
    train_data = data_io.getSimEntDataset(params.traindata, words, params.task)
elif args.task == "sentiment":
    train_data = data_io.getSentimentDataset(params.traindata, words)
else:
    raise ValueError('Task should be ent, sim, or sentiment.')

# load weight
if params.weightfile:
    word2weight = data_io.getWordWeight(params.weightfile, params.weightpara)
    params.weight4ind = data_io.getWeight(words, word2weight)
    print('word weights computed using parameter a=' + str(params.weightpara))
else:
    params.weight4ind = []
Code example #14
File: run.py  Project: Taolan/SynTree-WordVec

import datapre
import data_io, params
import Embedding

Wordweight_file = 'weight/word_weight_3a.txt'
Clauseweight_file = 'weight/Clause_weight.txt'
Phraseweight_file = 'weight/Phrase_weight.txt'
word_weight = datapre.TreeNode_Weight(Wordweight_file)
clause_weight = datapre.TreeNode_Weight(Clauseweight_file)
phrase_weight = datapre.TreeNode_Weight(Phraseweight_file)

wordfile = 'wordvector/glove.6B.50d.txt' # word vector file, can be downloaded from GloVe website

(words, word_emb) = data_io.getWordmap(wordfile)
###########################################

prefix = "datapre/"

farr1 = [
            "MSRpar2012-1.txt",
            #"MSRpar2012-2.txt",
            "MSRvid2012-1.txt",
            #"MSRvid2012-2.txt",
            "OnWN2012-1.txt",
            #"OnWN2012-2.txt",
            "OnWN2013-1.txt",
            #"OnWN2013-2.txt",
            "OnWN2014-1.txt",
            #"OnWN2014-2.txt",
Code example #15
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d',
                        '--dataname',
                        default='t6',
                        help='dataset name',
                        choices=['t6', 't26', '2C'])
    parser.add_argument('-c',
                        '--classifiername',
                        default='RF',
                        help='which classifier to use',
                        choices=['GaussianNB', 'RF', 'SVM', 'KNN'])
    args = parser.parse_args()
    data_name = args.dataname  # t6, t26, or 2C
    clf_name = args.classifiername  # classifier

    # The original SIF paper used glove.840B.300d; here we use GloVe vectors trained on Twitter.
    embed_dims = [100]  # can add 25, 50, or 200 dimensions if needed
    wordfile_list = [
        '../data/glove.twitter.27B.{}d.txt'.format(dim) for dim in embed_dims
    ]
    # each line is a word and its frequency
    weightfile = 'SIF-master/auxiliary_data/enwiki_vocab_min200.txt'
    # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    weightpara = 1e-3
    # number of principal components to remove in SIF weighting scheme
    rmpc = 1

    for wordfile, dim in zip(wordfile_list, embed_dims):
        # load word vectors
        (words, We) = data_io.getWordmap(wordfile)
        # load word weights
        # word2weight['str'] is the weight for the word 'str'
        word2weight = data_io.getWordWeight(weightfile, weightpara)
        # weight4ind[i] is the weight for the i-th word
        weight4ind = data_io.getWeight(words, word2weight)

        data_path = "../data/"
        if data_name == "t6":
            file_path = data_path + "CrisisLexT6_cleaned/"
            disasters = [
                "sandy", "queensland", "boston", "west_texas", "oklahoma",
                "alberta"
            ]
            test_list = [
                "{}_glove_token.csv.unique.csv".format(disaster)
                for disaster in disasters
            ]
            train_list = [
                "{}_training.csv".format(disaster) for disaster in disasters
            ]
        if data_name == "t26":
            file_path = data_path + "CrisisLexT26_cleaned/"
            disasters = [
                "2012_Colorado_wildfires", "2013_Queensland_floods",
                "2013_Boston_bombings", "2013_West_Texas_explosion",
                "2013_Alberta_floods", "2013_Colorado_floods",
                "2013_NY_train_crash"
            ]
            test_list = [
                "{}-tweets_labeled.csv.unique.csv".format(disaster)
                for disaster in disasters
            ]
            train_list = [
                "{}_training.csv".format(disaster) for disaster in disasters
            ]
        if data_name == "2C":
            file_path = data_path + "2CTweets_cleaned/"
            disasters = [
                "Memphis", "Seattle", "NYC", "Chicago", "SanFrancisco",
                "Boston", "Brisbane", "Dublin", "London", "Sydney"
            ]
            test_list = [
                "{}2C.csv.token.csv.unique.csv".format(disaster)
                for disaster in disasters
            ]
            train_list = [
                "{}2C_training.csv".format(disaster) for disaster in disasters
            ]

        accu_list = []
        roc_list = []
        precision_list = []
        recall_list = []
        f1_list = []
        for train, test in zip(train_list, test_list):
            train_file = os.path.join(file_path, train)
            test_file = os.path.join(file_path, test)
            xtrain, ytrain = load_data(data_name, train_file)
            xtest, ytest = load_data(data_name, test_file)

            # load train
            # xtrain_windx is the array of word indices, m_train is the binary mask indicating whether there is a word in that location
            xtrain_windx, m_train = data_io.sentences2idx(xtrain, words)
            w_train = data_io.seq2weight(xtrain_windx, m_train,
                                         weight4ind)  # get word weights

            # set parameters
            paramss = params.params()
            paramss.rmpc = rmpc
            # get SIF embedding
            train_embed = SIF_embedding.SIF_embedding(
                We, xtrain_windx, w_train,
                paramss)  # embedding[i,:] is the embedding for sentence i

            # load target
            # xtest_windx is the array of word indices, m_test is the binary mask indicating whether there is a word in that location
            xtest_windx, m_test = data_io.sentences2idx(xtest, words)
            # get word weights
            w_test = data_io.seq2weight(xtest_windx, m_test, weight4ind)

            # set parameters
            paramsss = params.params()
            paramsss.rmpc = rmpc
            # get SIF embedding
            test_embed = SIF_embedding.SIF_embedding(
                We, xtest_windx, w_test,
                paramsss)  # embedding[i,:] is the embedding for sentence i

            print(test)
            accu, roc, precision, recall, f1 = run_classifier(
                train_embed, ytrain, test_embed, ytest, clf_name, 100)
            accu_list.append(accu)
            roc_list.append(roc)
            precision_list.append(precision)
            recall_list.append(recall)
            f1_list.append(f1)

        print("{}_SIF_{}_LOO_accuracy {}".format(data_name,
                                                 clf_name + str(dim),
                                                 accu_list))
        print("{}_SIF_{}_LOO_roc {}".format(data_name, clf_name + str(dim),
                                            roc_list))
        print("{}_SIF_{}_LOO_precision {}".format(data_name,
                                                  clf_name + str(dim),
                                                  precision_list))
        print("{}_SIF_{}_LOO_recall {}".format(data_name, clf_name + str(dim),
                                               recall_list))
        print("{}_SIF_{}_LOO_f1 {}".format(data_name, clf_name + str(dim),
                                           f1_list))
        print(
            "{0}_SIF_LOO_{1} {2:.4f} + {3:.4f} {4:.4f} + {5:.4f} {6:.4f} + {7:.4f} {8:.4f} + {9:.4f} {10:.4f} + {11:.4f}"
            .format(data_name, clf_name + str(dim), np.mean(accu_list),
                    np.std(accu_list), np.mean(roc_list), np.std(roc_list),
                    np.mean(f1_list), np.std(f1_list), np.mean(precision_list),
                    np.std(precision_list), np.mean(recall_list),
                    np.std(recall_list)))
Code example #16
File: run_mittens.py  Project: flo3003/mittens
                    '-lr',
                    action='store',
                    default='0.01',
                    help='Learning rate.')

parser.add_argument('--iterations',
                    '-i',
                    action='store',
                    default=250,
                    help=('Number of iterations.'))

args = parser.parse_args()

pretrained_vectors = args.pretrained_vectors

(words, weights) = data_io.getWordmap(pretrained_vectors)

print(weights.shape)
initial_embeddings = {v: weights[words[v]] for v in words}

my_vocabulary = open(args.my_vocabulary, 'r')
vocab = my_vocabulary.read().split('\n')
vocab_len = len(vocab) - 1

print("Reading co-occurrence matrix...")
data = np.genfromtxt(args.my_coo_matrix, names=True, dtype=None, delimiter=',')
my_coo_matrix = sparse.coo_matrix(
    (data['cooccurrence'], (data['word_a'], data['word_b'])),
    shape=(vocab_len, vocab_len))

print("Converting co-occurrence matrix to CSR format...")
Code example #17
def SIF_master(segfile, cleanfile, directory, summ_ind):
    print("segfile: ", segfile)
    print("clean file: ", cleanfile)

    #cleanfile = cleanfile+".ls"
    class params(object):
        def __init__(self):
            self.LW = 1e-5
            self.LC = 1e-5
            self.eta = 0.05

        def __str__(self):
            t = "LW", self.LW, ", LC", self.LC, ", eta", self.eta
            t = map(str, t)
            return ' '.join(t)

    # input
    wordfile = 'glove.6B.100d.txt'  # word vector file, can be downloaded from GloVe website
    weightfile = 'enwiki_vocab_min200.txt'  # each line is a word and its frequency
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 1  # number of principal components to remove in SIF weighting scheme
    #sentiment_file = '../data/sentiment-test' # sentiment data file
    #cleanfile = "2/D1026-A.M.100.E.10.segs.cl"
    #sentiment_file = '../data/clean-5.txt'
    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(
        weightfile,
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(
        words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences (here use sentiment data as an example)
    #x, m, _ = data_io.sentiment2idx(sentiment_file, words) # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    x, m = data_io.sentiment2idx(cleanfile, words)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # parameters
    params = params()
    #params = params.params()
    params.rmpc = rmpc

    # get SIF embedding
    embedding = SIF_embedding_lib.SIF_embedding(
        We, x, w, params)  # embedding[i,:] is the embedding for sentence i
    #segfile = segfile+".segs"
    f = open(segfile).readlines()
    indexes = []
    matches = []
    for item in f:
        ind = item.rfind("&")
        indexes.append(item[:ind + 1])

    if len(indexes) == len(embedding):
        for ind in range(0, len(indexes)):
            lines = indexes[ind] + str(list(embedding[ind]))
            matches.append(lines)
    else:
        print("lengths don't match! Check whether there is an empty line!")

    #fname = directory +'/'+str(summ_ind)+ '/' + getRealName(segfile) + '.ls'
    #fname = directory +'/'+str(summ_ind)+ '/' + segfile + '.ls'
    fname = directory + '/' + str(summ_ind) + '/' + getRealName(segfile)
    print(fname)
    with open(fname + ".ls", "w") as out_file:
        for item in matches:
            out_file.write(item + "\n")

    return embedding
Code example #18
                        header=0,
                        delimiter="\t",
                        quoting=3)
    test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data',
                                    'testData.tsv'),
                       header=0,
                       delimiter="\t",
                       quoting=3)

    #Data Leak
    test["sentiment"] = test["id"].map(
        lambda x: 1 if int(x.strip('"').split("_")[1]) >= 5 else 0)
    y_test = test["sentiment"]

    vectors = sys.argv[1]
    (words, We) = data_io.getWordmap(vectors)

    num_features = We.shape[1]

    p, trainDataVecs = getAvgFeatureVecs(getCleanReviews(train), We, words,
                                         num_features)
    print('Train: {0} '.format(p))
    p, testDataVecs = getAvgFeatureVecs(getCleanReviews(test), We, words,
                                        num_features)
    print('Test: {0} '.format(p))

    log_reg = LogisticRegression()

    print("Fitting a logistic regression model to labeled training data...")
    log_reg = log_reg.fit(trainDataVecs, train["sentiment"])
Code example #19
import data_io
import params
import SIF_embedding
import read_NMT_data

# input arabic file
sample_ara = '../NMT_data/sample.ara'  # to compute sif embeddings for all sentences in this file

# Arabic GloVe embedding pre-trained model
wordfile = '../models/glove_full_grams_sg_300_wiki.txt'
weightfile = '../AraSIF_word_counts/arwiki_vocab_min200.txt'  # each line is a word and its frequency

weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
rmpc = 1  # number of principal components to remove in SIF weighting scheme

# load word vectors
print("Reading embedding matrix. Hang on! this will take a while ...")
(glove_words, We) = data_io.getWordmap(wordfile)
print("shape of Word embedding is: " + str(We.shape))

# load word weights
word2weight = data_io.getWordWeight(
    weightfile,
    weightpara)  # word2weight['str'] is the weight for the word 'str'
weight4ind = data_io.getWeight(
    glove_words, word2weight)  # weight4ind[i] is the weight for the i-th word

# set parameters
params = params.params()
params.rmpc = rmpc

# load sentences
print("reading the input sentences now & converting to indices .. \n")
Code example #20
File: example.py  Project: Rxma1805/Sentence2Vector
import data_io
import params

douban_corpus_path = '/bigdata/xiaoma/Assi12/douban.txt'
sentences = []
with open(douban_corpus_path) as f:
    for line in f:
        line = line.strip()
        line = line.split(':')[1]
        sentences.append(line)

glove_word2vector_path = './chinese_data_douban_cropus_vectors.txt'  # word vector file, can be downloaded from GloVe website
word_frequency_path = './douban_cropus_vocab.txt'  # each line is a word and its frequency
weightpara = 1e-3
rmpc = 1

# load word vectors
(Word2Indx, Word2vector) = data_io.getWordmap(glove_word2vector_path)
# load word weights
word2weight = data_io.getWordWeight(
    word_frequency_path,
    weightpara)  # word2weight['str'] is the weight for the word 'str'
Index2Weight = data_io.getWeight(
    Word2Indx, word2weight)  # Index2Weight[i] is the weight for the i-th word

word_idx_seq_of_sentence, mask = data_io.sentences2idx(
    sentences, Word2Indx
)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
word_weight_of_sentence = data_io.seq2weight(word_idx_seq_of_sentence, mask,
                                             Index2Weight)  # get word weights

# set parameters
params = params.params()