Example #1
def main(sentences,
         wordfile: str,
         weightfile: str,
         weightpara: float = 1e-3,
         rmpc: int = 1):
    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(
        weightfile,
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(
        words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences
    x, m, _ = data_io.sentences2idx(
        sentences, words
    )  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights

    # set parameters (use a new name so the `params` module is not shadowed inside the function)
    sif_params = params.params()
    sif_params.rmpc = rmpc
    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(
        We, x, w, sif_params)  # embedding[i,:] is the embedding for sentence i
    return embedding
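For context, a minimal call to the function above might look as follows; the GloVe vector file and the enwiki frequency file paths are placeholders, not part of the original example.

# Hypothetical usage; both file paths are assumptions.
sentences = ['this is an example sentence',
             'this is another sentence that is slightly longer']
embedding = main(sentences,
                 wordfile='../data/glove.840B.300d.txt',
                 weightfile='../auxiliary_data/enwiki_vocab_min200.txt',
                 weightpara=1e-3,
                 rmpc=1)
print(embedding.shape)  # (number of sentences, embedding dimension)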
Example #2
def SIFDocEmbedding(w2vdict, weighttxt, txtfile):

    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    (words, We) = getWordMap(w2vdict)
    word2weight = data_io.getWordWeight(
        weighttxt, word2weight_pickle_file,
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(
        words, word2weight)  # weight4ind[i] is the weight for the i-th word

    DocVectorDict = {}
    DocSentVecDict = {}
    docNum = 0
    with open(txtfile, 'r') as reader:
        txt = reader.readlines()  # the with-block closes the file automatically
    for doc in txt:
        doc = doc.strip()
        sentEm = SIFSentEmbedding(weighttxt,
                                  doc,
                                  words,
                                  We,
                                  weight4ind,
                                  weightpara=1e-3,
                                  paramm=1)
        DocSentVecDict[docNum] = sentEm
        docVector = np.mean(sentEm, axis=0)  # average the sentence embeddings into one document vector
        DocVectorDict[docNum] = docVector
        docNum += 1
    return DocVectorDict, DocSentVecDict, We
Example #3
def load_model(self):
    sys.path.append('../src')
    weightpara = 1e-5  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    print('Loading the Chinese model')
    self.words_chi, self.We_chi = data_io.getWordmap(
        '../models/wiki_news_model2_vector.txt')
    self.word2weight_chi = data_io.getWordWeight(
        '../models/word_count.txt',  # each line is a word and its frequency
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    print('Finished loading the Chinese model')
    print('Loading the English model')
    weightpara = 1e-3
    self.words_eng, self.We_eng = data_io.getWordmap(
        '../models/glove_large.txt')
    self.word2weight_eng = data_io.getWordWeight(
        '../models/enwiki_vocab_min200.txt',  # each line is a word and its frequency
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    print('Finished loading the English model')
def load_embeddings(wordfile, weightfile, weightpara=5e-4, word2vec=False):
    if word2vec:
        (words, We) = getWordmapWord2Vec(wordfile)
    else:
        (words, We) = data_io.getWordmap(wordfile)
    word2weight = data_io.getWordWeight(
        weightfile,
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(
        words, word2weight)  # weight4ind[i] is the weight for the i-th word
    return words, We, weight4ind
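A hedged usage sketch for load_embeddings above; the paths below are placeholders, and the function defaults to GloVe-format text vectors.

# Hypothetical call; the file paths are assumptions, not part of the original snippet.
words, We, weight4ind = load_embeddings(
    wordfile='../data/glove.840B.300d.txt',
    weightfile='../auxiliary_data/enwiki_vocab_min200.txt',
    weightpara=5e-4,
    word2vec=False)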
Example #5
def get_embs(sentences, params):
    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara) # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight) # weight4ind[i] is the weight for the i-th word
    # load sentences
    x, m = data_io.sentences2idx(sentences, words) # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    w = data_io.seq2weight(x, m, weight4ind) # get word weights
    
    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(We, x, w, params) # embedding[i,:] is the embedding for sentence i
    return embedding
    def __init__(self):
        self.weightfile = config.url_enwiki
        self.weightpara = 1e-3

        print("Getting embeddings from the Glove pickles")

        with open(config.url_glove_pickle_we_1, "rb") as file:
            We_1 = pickle.load(file)

        with open(config.url_glove_pickle_words_1, "rb") as file:
            words_1 = pickle.load(file)

        with open(config.url_glove_pickle_we_2, "rb") as file:
            We_2 = pickle.load(file)

        with open(config.url_glove_pickle_words_2, "rb") as file:
            words_2 = pickle.load(file)

        with open(config.url_glove_pickle_we_3, "rb") as file:
            We_3 = pickle.load(file)

        with open(config.url_glove_pickle_words_3, "rb") as file:
            words_3 = pickle.load(file)

        self.We = []
        self.We.extend(We_1)
        self.We.extend(We_2)
        self.We.extend(We_3)

        self.words = {}
        self.words.update(words_1)
        self.words.update(words_2)
        self.words.update(words_3)

        with open(config.url_snli_pc1, "rb") as file:
            self.snli_pc_1 = pickle.load(file)

        with open(config.url_snli_pc2, "rb") as file:
            self.snli_pc_2 = pickle.load(file)

        print("Successfully got the embeddings from the pickle")

        self.word2weight = data_io.getWordWeight(
            self.weightfile, self.weightpara
        )  # word2weight['str'] is the weight for the word 'str'
        self.weight4ind = data_io.getWeight(
            self.words,
            self.word2weight)  # weight4ind[i] is the weight for the i-th word
Example #7
def get_sif(dataset):
    wordfile = '../data/glove.6B.50d.txt'  # word vector file, can be downloaded from GloVe website
    weightfile = '../auxiliary_data/enwiki_vocab_min200.txt'  # each line is a word and its frequency
    weightpara = 2.7e-4  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 0  # number of principal components to remove in SIF weighting scheme
    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(
        weightfile,
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(
        words, word2weight)  # weight4ind[i] is the weight for the i-th word
    param = params.params()
    param.rmpc = rmpc
    sentence_embedding_all = get_sentences_embedding(dataset, words,
                                                     weight4ind, param, We)
    # sentence_embedding_all = turn2std(sentence_embedding_all)  # convert the matrix to a standardized matrix
    return sentence_embedding_all
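The weight file used throughout these examples is a plain word-frequency list, and the scheme behind getWordWeight is the SIF weighting from Arora et al. (2017): a word w with relative frequency p(w) gets weight a / (a + p(w)), where a is weightpara. A rough re-implementation sketch (my own helper, not the library's code):

def sif_word_weights(weightfile, a=1e-3):
    """Sketch of the SIF weighting scheme: weight(w) = a / (a + p(w))."""
    counts, total = {}, 0.0
    with open(weightfile, encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if len(parts) == 2:                  # each line: word frequency
                counts[parts[0]] = float(parts[1])
                total += float(parts[1])
    # relative frequency p(w) = count(w) / total, then SIF weight a / (a + p(w))
    return {w: a / (a + c / total) for w, c in counts.items()}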
def main(word_embeddings_path, word_weight_path, out_dir):
    wordfile = word_embeddings_path
    weightfile = word_weight_path
    weightparas = [1e-2, 1e-3, 1e-4]
    (words, We) = getWordmap(wordfile)
    vector_file = open(os.path.join(out_dir, "vectors"), "wb")  # pickle needs a binary-mode file
    pickle.dump(We, vector_file)
    words_file = open(os.path.join(out_dir, "words"), "wb")
    pickle.dump(words, words_file)
    for weightpara in weightparas:
        print("calculating word2weight with a = {}.".format(weightpara))
        word2weight = data_io.getWordWeight(weightfile, weightpara)
        print("calculating weight4ind with a = {}.".format(weightpara))
        weight4ind = data_io.getWeight(words, word2weight)
        weight4ind_file = open(
            os.path.join(out_dir,
                         "weight4ind_weightpara_%.E" % Decimal(weightpara)),
            'wb')  # binary mode for pickle
        pickle.dump(weight4ind, weight4ind_file)
Example #9
def sif_embedding(sen):
    import sys
    #sys.path.append("../src")
    #sys.path.append("../data")
    import data_io
    import params
    import SIF_embedding
    # input
    wordfile = 'data/dic_files.txt'  # word vector file, can be downloaded from GloVe website
    weightfile = 'data/dic_freq.txt'  # each line is a word and its frequency
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 1  # number of principal components to remove in SIF weighting scheme
    # sentences = ['这是一个例句', '这是一个更长一些的例句']
    # sentences = ['昨天天气不错', '这是一个更长一些的例句']
    sentences = sen
    # sentences = ['this is an example sentence', 'this is another sentence that is slightly longer']

    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # print(words, We)  # the words and their word vectors
    # load word weights
    word2weight = data_io.getWordWeight(
        weightfile,
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(
        words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences
    # x, m, _ = data_io.sentences2idx(sentences, words) # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    x, m = data_io.sentences2idx(
        sentences, words
    )  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    # print(x,m)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # print('word weight:',w)
    # set parameters (use a new name so the imported `params` module is not shadowed)
    sif_params = params.params()
    sif_params.rmpc = rmpc
    # get SIF embedding
    embedding = SIF_embedding.SIF_embedding(
        We, x, w, sif_params)  # embedding[i,:] is the embedding for sentence i
    return embedding
def vectorize_sif(filename):
    class params(object):
        def __init__(self):
            self.LW = 1e-5
            self.LC = 1e-5
            self.eta = 0.05

        def __str__(self):
            t = "LW", self.LW, ", LC", self.LC, ", eta", self.eta
            t = map(str, t)
            return ' '.join(t)

    # input
    wordfile = 'glove.6B.100d.txt'  # word vector file, can be downloaded from GloVe website
    weightfile = 'enwiki_vocab_min200.txt'  # each line is a word and its frequency
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 1  # number of principal components to remove in SIF weighting scheme
    #sentiment_file = '../data/sentiment-test' # sentiment data file
    #cleanfile = "2/D1026-A.M.100.E.10.segs.cl"
    #sentiment_file = '../data/clean-5.txt'
    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(
        weightfile,
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(
        words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences (here use sentiment data as an example)
    #x, m, _ = data_io.sentiment2idx(sentiment_file, words) # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    x, m = data_io.sentiment2idx(filename, words)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # parameters
    params = params()
    #params = params.params()
    params.rmpc = rmpc

    # get SIF embedding
    embedding = SIF_embedding_lib.SIF_embedding(
        We, x, w, params)  # embedding[i,:] is the embedding for sentence i

    return embedding
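For reference, the SIF embedding itself is a weighted average of word vectors per sentence followed, when rmpc > 0, by removing the projection onto the first rmpc principal components of the sentence matrix. A compact sketch with numpy and scikit-learn (an illustration of the algorithm, not the library's exact implementation):

import numpy as np
from sklearn.decomposition import TruncatedSVD

def sif_embedding_sketch(We, x, w, rmpc=1):
    """Weighted average of word vectors per sentence, then remove top principal components."""
    emb = np.zeros((x.shape[0], We.shape[1]))
    for i in range(x.shape[0]):
        # weighted average over the word positions that carry a weight
        emb[i, :] = w[i, :].dot(We[x[i, :], :]) / np.count_nonzero(w[i, :])
    if rmpc > 0:
        svd = TruncatedSVD(n_components=rmpc, n_iter=7, random_state=0)
        svd.fit(emb)
        pc = svd.components_                   # shape (rmpc, dim)
        emb = emb - emb.dot(pc.T).dot(pc)      # subtract the projection onto the top components
    return emb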
Example #11
def load_model():
    wordfile = "glove path (glove.840B.300d.txt file)"  # you can download glove from https://www.kaggle.com/takuok/glove840b300dtxt
    weightfile = artifact_path + '/SIF/enwiki_vocab_min200.txt'  # each line is a word and its frequency
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 1  # number of principal components to remove in SIF weighting scheme

    (words, We) = data_io.getWordmap(wordfile)

    a = list(words.keys())
    for i, v in enumerate(a):
        words[v.decode("utf-8")] = words.pop(v)

    # load word weights
    word2weight = data_io.getWordWeight(
        weightfile,
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(
        words, word2weight)  # weight4ind[i] is the weight for the i-th word

    return (words, weight4ind, rmpc, We)
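A hedged sketch of how the tuple returned by load_model() would typically be used, following the same pattern as the other examples; the sentences are placeholders and the params/SIF_embedding modules are assumed to be imported.

words, weight4ind, rmpc, We = load_model()
sentences = ['this is an example sentence',
             'this is another sentence that is slightly longer']
x, m = data_io.sentences2idx(sentences, words)
w = data_io.seq2weight(x, m, weight4ind)
param = params.params()
param.rmpc = rmpc
embedding = SIF_embedding.SIF_embedding(We, x, w, param)  # embedding[i,:] is the embedding for sentence i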
def load_embed(wordfile, weightfile, weightpara=1e-3, param=None, rmpc=0):
    '''
    wordfile:   : location of embedding data (e.g., glove embedings)
    weightfile: : location of TF data for words
    weightpara: : the parameter in the SIF weighting scheme, usually in range [3e-5, 3e-3]
    rmpc:       : number of principal components to remove in SIF weighting scheme
    '''
    # input
    wordfile = '/home/francisco/GitHub/SIF/data/glove.840B.300d.txt'   # word vector file, can be downloaded from GloVe website
    weightfile = '/home/francisco/GitHub/SIF/auxiliary_data/enwiki_vocab_min200.txt' # each line is a word and its frequency

    # load word vectors
    (words, Weights) = data_io.getWordmap(wordfile)

    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara) # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight) # weight4ind[i] is the weight for the i-th word

    # set parameters (guard against the default param=None)
    if param is not None:
        param.rmpc = rmpc

    return Weights, words, word2weight, weight4ind
def get_sent_vec(sentences):
    import params
    # see data_io.py for details
    (words, We) = data_io.getWordmap(wordfile)
    # see data_io.py for details
    word2weight = data_io.getWordWeight(weightfile, weightpara)
    weight4ind = data_io.getWeight(words, word2weight)
    # see data_io.py for details
    x, m = data_io.sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind)

    # set parameters
    params = params.params()
    params.rmpc = rmpc
    # call the SIF core algorithm to compute the sentence vectors; see SIF_core for details
    embedding = SIF_core.SIF_embedding(We, x, w, params)

    get_sent_vec = {}
    for i in range(len(embedding)):
        get_sent_vec[sentences[i]] = embedding[i]

    return get_sent_vec
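Once the sentence-to-vector dictionary is built, a typical downstream use is a cosine-similarity lookup. A small hedged example (assumes numpy and that the module-level wordfile, weightfile, weightpara, and rmpc used by get_sent_vec are defined; the sentences are placeholders):

import numpy as np

def cosine(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v) + 1e-12))

sent_vec = get_sent_vec(['this is an example sentence',
                         'this is another sentence that is slightly longer'])
s1, s2 = list(sent_vec)
print(cosine(sent_vec[s1], sent_vec[s2]))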
Example #14
## run
wordfiles = [  #'../data/paragram_sl999_small.txt', # need to download it from John Wieting's github (https://github.com/jwieting/iclr2016)
    '/Users/sherryruan/data/glove/glove.6B/glove.6B.300d.txt'  # need to download it first
]

weightfile = '../auxiliary_data/enwiki_vocab_min200.txt'
weightparas = [-1, 1e-3]  #[-1,1e-1,1e-2,1e-3,1e-4]
rmpcs = [0, 1]  # [0,1,2]

params = params.params()
parr4para = {}
sarr4para = {}
for wordfile in wordfiles:
    (words, We) = data_io.getWordmap(wordfile)
    for weightpara in weightparas:
        word2weight = data_io.getWordWeight(weightfile, weightpara)
        weight4ind = data_io.getWeight(words, word2weight)
        for rmpc in rmpcs:
            print('word vectors loaded from %s' % wordfile)
            print('word weights computed from %s using parameter a=%f' %
                  (weightfile, weightpara))
            params.rmpc = rmpc
            print('remove the first %d principal components' % rmpc)
            ## eval just one example dataset
            parr, sarr = eval.sim_evaluate_one(
                We, words, weight4ind, sim_algo.weighted_average_sim_rmpc,
                params)
            ## eval all datasets; need to obtained datasets from John Wieting (https://github.com/jwieting/iclr2016)
            # parr, sarr = eval.sim_evaluate_all(We, words, weight4ind, sim_algo.weighted_average_sim_rmpc, params)
            paras = (wordfile, weightfile, weightpara, rmpc)
            parr4para[paras] = parr
            sarr4para[paras] = sarr
def run():
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model",
                        type=str,
                        default="gpt",
                        help="Model type (gpt or gpt2)")
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="",
                        help="Path, url or short name of the model")
    parser.add_argument(
        "--max_history",
        type=int,
        default=2,
        help="Number of previous utterances to keep in history")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")

    parser.add_argument("--no_sample",
                        action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length",
                        type=int,
                        default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length",
                        type=int,
                        default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature",
                        type=int,
                        default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument(
        "--top_k",
        type=int,
        default=0,
        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.9,
        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        args.model_checkpoint = download_pretrained_model()

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class = GPT2Tokenizer if "gpt2" == args.model else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model_class = GPT2LMHeadModel if "gpt2" == args.model else OpenAIGPTLMHeadModel
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    #personalities = get_dataset_personalities(tokenizer, args.dataset_path, args.dataset_cache)
    #personality = random.choice(personalities)
    #logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))
    wordfile = './data/truncate.txt'  # word vector file, can be downloaded from GloVe website
    weightfile = './auxiliary_data/enwiki_vocab_min200.txt'  # each line is a word and its frequency
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(
        weightfile,
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(
        words, word2weight)  # weight4ind[i] is the weight for the i-th word

    p = 0
    start_time = time.time()
    with open('data_volunteers.json') as json_file:
        json_data = json.load(json_file)
        for i in json_data:
            p += 1
            #if p <1100:
            #    continue
            history = []
            personality = []
            query_set = []
            json_dialog = i["dialog"]
            json_bot = i["bot_profile"]
            for j in json_bot:
                personality.append(tokenizer.encode(j))
            #logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))
            persona = tokenizer.decode(chain(*personality))
            row = {"Personality": persona}
            text = []
            for j in json_dialog:
                if j["sender_class"] == "Human":
                    json_text = j["text"]
                    raw_text = json_text
                    check = tokenizer.decode(tokenizer.encode(raw_text),
                                             skip_special_tokens=True)
                    if check == "":
                        history.append(tokenizer.encode(raw_text))
                        with torch.no_grad():
                            out_ids = normal_sample_sequence(
                                personality, history, tokenizer, model, args)
                        # history.append(out_ids)
                        history = history[-(2 * args.max_history + 1):]
                        out_text = tokenizer.decode(out_ids,
                                                    skip_special_tokens=True)
                        text.append({
                            "evaluation_score": j["evaluation_score"],
                            "id": j["id"],
                            "sender": j["sender"],
                            "sender_class": j["sender_class"],
                            "text": raw_text,
                            "generated_text": out_text
                        })
                        continue
                    history.append(tokenizer.encode(raw_text))
                    with torch.no_grad():
                        out_ids = sample_sequence(personality, history,
                                                  tokenizer, model, args,
                                                  words, weight4ind, We)
                    # history.append(out_ids)
                    history = history[-(2 * args.max_history + 1):]
                    out_text = tokenizer.decode(out_ids,
                                                skip_special_tokens=True)
                    text.append({
                        "evaluation_score": j["evaluation_score"],
                        "id": j["id"],
                        "sender": j["sender"],
                        "sender_class": j["sender_class"],
                        "text": raw_text,
                        "generated_text": out_text
                    })
                else:
                    json_text = j["text"]
                    raw_text = json_text
                    history.append(tokenizer.encode(raw_text))
                    text.append({
                        "evaluation_score": j["evaluation_score"],
                        "id": j["id"],
                        "sender": j["sender"],
                        "sender_class": j["sender_class"],
                        "text": raw_text
                    })
            row["dialog"] = text
            query_set.append(row)
            #print(query_set)
            with open('./sif_set/sif' + str(p) + '.json',
                      'w',
                      encoding='utf-8') as make_file:
                json.dump(query_set, make_file)
            if not p % 10:
                print(
                    str(p * 100 / 1111) + '%, ' +
                    str(time.time() - start_time) + 'sec')
Example #16
      items.append(str(embedding[i, j]))
    line = " ".join(items) + "\n"
    writer.write(line)
  writer.close()

if __name__ == "__main__":
  # input
  wordfile = '../data/glove.840B.300d-freq500K.txt' # word vector file, can be downloaded from GloVe website
  weightfile = '../auxiliary_data/enwiki_vocab_min200.txt' # each line is a word and its frequency
  input_dir = sys.argv[1]
  output_dir = sys.argv[2]
  rmpc = int(sys.argv[3]) # number of principal components to remove in SIF weighting scheme

  params = params.params()
  params.rmpc = rmpc

  # load word vectors
  (words, We) = data_io.getWordmap(wordfile)

  # load word weights
  word2weight = data_io.getWordWeight(weightfile, 1e-3)
  weight4ind = data_io.getWeight(words, word2weight)

  for f in os.listdir(input_dir):
    try:
      input_file = "%s/%s" % (input_dir, f)
      output_file = "%s/%s" % (output_dir, f)
      print_embeddings(get_embeddings(words, We, word2weight, weight4ind, input_file, params), output_file)
    except Exception as ex:
      print("an error occured. skipped %s, due to err: %s" % (input_file, ex))
Example #17
        params.nonlinearity = lasagne.nonlinearities.rectify
    if args.nonlinearity == 4:
        params.nonlinearity = lasagne.nonlinearities.sigmoid

# load data
(words, We) = data_io.getWordmap(params.wordfile)
if args.task == "sim" or args.task == "ent":
    train_data = data_io.getSimEntDataset(params.traindata, words, params.task)
elif args.task == "sentiment":
    train_data = data_io.getSentimentDataset(params.traindata, words)
else:
    raise ValueError('Task should be ent, sim, or sentiment.')

# load weight
if params.weightfile:
    word2weight = data_io.getWordWeight(params.weightfile, params.weightpara)
    params.weight4ind = data_io.getWeight(words, word2weight)
    print(
        ('word weights computed using parameter a=' + str(params.weightpara)))
else:
    params.weight4ind = []
if params.npc > 0:
    params.pc = get_pc(train_data, We, params.weight4ind, params)
else:
    params.pc = []

# load model
model = None
if params.nntype == 'proj':
    model = proj_model_sim(We, params)
elif params.nntype == 'proj_sentiment':
Example #18
wordFilePath = '../data/vectors-en.txt'
model_300 = gensim.models.KeyedVectors.load_word2vec_format(wordFilePath,
                                                            binary=False)
words = {}
for index, word in enumerate(model_300.wv.index2entity):
    words[word] = index
We = model_300.wv.vectors
weightparas = [-1, 1e-3]
rmpcs = [0, 1]

params = params.params()
parr4para = {}
sarr4para = {}

# (words, We) = data_io.getWordmap(wordfile)
for weightpara in weightparas:
    word2weight = data_io.getWordWeight(model_300.wv.vocab, weightpara)
    weight4ind = data_io.getWeight(words, word2weight)
    for rmpc in rmpcs:
        print('word vectors loaded from %s' % wordFilePath)
        # print('word weights computed from %s using parameter a=%f' % (weightfile, weightpara))
        params.rmpc = rmpc
        print('remove the first %d principal components' % rmpc)
        ## eval just one example dataset
        parr, sarr = eval.sim_evaluate_one(We, words, weight4ind, sim_algo.weighted_average_sim_rmpc, params)
        ## eval all datasets; need to obtained datasets from John Wieting (https://github.com/jwieting/iclr2016)
        # parr, sarr = eval.sim_evaluate_all(We, words, weight4ind, sim_algo.weighted_average_sim_rmpc, params)
        paras = (wordFilePath, model_300.wv.vocab, weightpara, rmpc)
        parr4para[paras] = parr
        sarr4para[paras] = sarr

## save results
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d',
                        '--dataname',
                        default='t6',
                        help='dataset name',
                        choices=['t6', 't26', '2C'])
    parser.add_argument('-c',
                        '--classifiername',
                        default='RF',
                        help='which classifier to use',
                        choices=['GaussianNB', 'RF', 'SVM', 'KNN'])
    args = parser.parse_args()
    data_name = args.dataname  # t6 or t26, 2C, 4C
    clf_name = args.classifiername  # classifier

    # Original SIF paper used glove.840B.300d, we use the ones that were trained on twitter.
    embed_dims = [100]  # can add 25, 50, 200 dimension if needed
    wordfile_list = [
        '../data/glove.twitter.27B.{}d.txt'.format(dim) for dim in embed_dims
    ]
    # each line is a word and its frequency
    weightfile = 'SIF-master/auxiliary_data/enwiki_vocab_min200.txt'
    # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    weightpara = 1e-3
    # number of principal components to remove in SIF weighting scheme
    rmpc = 1

    for wordfile, dim in zip(wordfile_list, embed_dims):
        # load word vectors
        (words, We) = data_io.getWordmap(wordfile)
        # load word weights
        # word2weight['str'] is the weight for the word 'str'
        word2weight = data_io.getWordWeight(weightfile, weightpara)
        # weight4ind[i] is the weight for the i-th word
        weight4ind = data_io.getWeight(words, word2weight)

        data_path = "../data/"
        if data_name == "t6":
            file_path = data_path + "CrisisLexT6_cleaned/"
            disasters = [
                "sandy", "queensland", "boston", "west_texas", "oklahoma",
                "alberta"
            ]
            test_list = [
                "{}_glove_token.csv.unique.csv".format(disaster)
                for disaster in disasters
            ]
            train_list = [
                "{}_training.csv".format(disaster) for disaster in disasters
            ]
        if data_name == "t26":
            file_path = data_path + "CrisisLexT26_cleaned/"
            disasters = [
                "2012_Colorado_wildfires", "2013_Queensland_floods",
                "2013_Boston_bombings", "2013_West_Texas_explosion",
                "2013_Alberta_floods", "2013_Colorado_floods",
                "2013_NY_train_crash"
            ]
            test_list = [
                "{}-tweets_labeled.csv.unique.csv".format(disaster)
                for disaster in disasters
            ]
            train_list = [
                "{}_training.csv".format(disaster) for disaster in disasters
            ]
        if data_name == "2C":
            file_path = data_path + "2CTweets_cleaned/"
            disasters = [
                "Memphis", "Seattle", "NYC", "Chicago", "SanFrancisco",
                "Boston", "Brisbane", "Dublin", "London", "Sydney"
            ]
            test_list = [
                "{}2C.csv.token.csv.unique.csv".format(disaster)
                for disaster in disasters
            ]
            train_list = [
                "{}2C_training.csv".format(disaster) for disaster in disasters
            ]

        accu_list = []
        roc_list = []
        precision_list = []
        recall_list = []
        f1_list = []
        for train, test in zip(train_list, test_list):
            train_file = os.path.join(file_path, train)
            test_file = os.path.join(file_path, test)
            xtrain, ytrain = load_data(data_name, train_file)
            xtest, ytest = load_data(data_name, test_file)

            # load train
            # xtrain_windx is the array of word indices, m_train is the binary mask indicating whether there is a word in that location
            xtrain_windx, m_train = data_io.sentences2idx(xtrain, words)
            w_train = data_io.seq2weight(xtrain_windx, m_train,
                                         weight4ind)  # get word weights

            # set parameters
            paramss = params.params()
            paramss.rmpc = rmpc
            # get SIF embedding
            train_embed = SIF_embedding.SIF_embedding(
                We, xtrain_windx, w_train,
                paramss)  # embedding[i,:] is the embedding for sentence i

            # load target
            # xtest_windx is the array of word indices, m_test is the binary mask indicating whether there is a word in that location
            xtest_windx, m_test = data_io.sentences2idx(xtest, words)
            # get word weights
            w_test = data_io.seq2weight(xtest_windx, m_test, weight4ind)

            # set parameters
            paramsss = params.params()
            paramsss.rmpc = rmpc
            # get SIF embedding
            test_embed = SIF_embedding.SIF_embedding(
                We, xtest_windx, w_test,
                paramsss)  # embedding[i,:] is the embedding for sentence i

            print(test)
            accu, roc, precision, recall, f1 = run_classifier(
                train_embed, ytrain, test_embed, ytest, clf_name, 100)
            accu_list.append(accu)
            roc_list.append(roc)
            precision_list.append(precision)
            recall_list.append(recall)
            f1_list.append(f1)

        print("{}_SIF_{}_LOO_accuracy {}".format(data_name,
                                                 clf_name + str(dim),
                                                 accu_list))
        print("{}_SIF_{}_LOO_roc {}".format(data_name, clf_name + str(dim),
                                            roc_list))
        print("{}_SIF_{}_LOO_precision {}".format(data_name,
                                                  clf_name + str(dim),
                                                  precision_list))
        print("{}_SIF_{}_LOO_recall {}".format(data_name, clf_name + str(dim),
                                               recall_list))
        print("{}_SIF_{}_LOO_f1 {}".format(data_name, clf_name + str(dim),
                                           f1_list))
        print(
            "{0}_SIF_LOO_{1} {2:.4f} + {3:.4f} {4:.4f} + {5:.4f} {6:.4f} + {7:.4f} {8:.4f} + {9:.4f} {10:.4f} + {11:.4f}"
            .format(data_name, clf_name + str(dim), np.mean(accu_list),
                    np.std(accu_list), np.mean(roc_list), np.std(roc_list),
                    np.mean(f1_list), np.std(f1_list), np.mean(precision_list),
                    np.std(precision_list), np.mean(recall_list),
                    np.std(recall_list)))
def SIF_master(segfile, cleanfile, directory, summ_ind):
    print "segfile: ", segfile
    print "clean file: ", cleanfile

    #cleanfile = cleanfile+".ls"
    class params(object):
        def __init__(self):
            self.LW = 1e-5
            self.LC = 1e-5
            self.eta = 0.05

        def __str__(self):
            t = "LW", self.LW, ", LC", self.LC, ", eta", self.eta
            t = map(str, t)
            return ' '.join(t)

    # input
    wordfile = 'glove.6B.100d.txt'  # word vector file, can be downloaded from GloVe website
    weightfile = 'enwiki_vocab_min200.txt'  # each line is a word and its frequency
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
    rmpc = 1  # number of principal components to remove in SIF weighting scheme
    #sentiment_file = '../data/sentiment-test' # sentiment data file
    #cleanfile = "2/D1026-A.M.100.E.10.segs.cl"
    #sentiment_file = '../data/clean-5.txt'
    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(
        weightfile,
        weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(
        words, word2weight)  # weight4ind[i] is the weight for the i-th word
    # load sentences (here use sentiment data as an example)
    #x, m, _ = data_io.sentiment2idx(sentiment_file, words) # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    x, m = data_io.sentiment2idx(cleanfile, words)
    w = data_io.seq2weight(x, m, weight4ind)  # get word weights
    # parameters
    params = params()
    #params = params.params()
    params.rmpc = rmpc

    # get SIF embedding
    embedding = SIF_embedding_lib.SIF_embedding(
        We, x, w, params)  # embedding[i,:] is the embedding for sentence i
    #segfile = segfile+".segs"
    f = open(segfile).readlines()
    indexes = []
    matches = []
    for item in f:
        ind = item.rfind("&")
        indexes.append(item[:ind + 1])

    if len(indexes) == len(embedding):
        for ind in range(0, len(indexes)):
            lines = indexes[ind] + str(list(embedding[ind]))
            matches.append(lines)
    else:
        print "length doesn't match!! Check if there is empty line!!"

    #fname = directory +'/'+str(summ_ind)+ '/' + getRealName(segfile) + '.ls'
    #fname = directory +'/'+str(summ_ind)+ '/' + segfile + '.ls'
    fname = directory + '/' + str(summ_ind) + '/' + getRealName(segfile)
    print(fname)
    with open(fname + ".ls", "w") as file:
        for item in matches:
            file.write(item + "\n")

    return embedding
Example #21
tgtsent = open(tgtSentFilePath, 'r').readlines()

# srcsent = ['Pada mulanya, waktu Allah mulai menciptakan alam semesta']
# tgtsent = ['God saw the light, and saw that it was good. God divided the light from the darkness.']
# params = params.params()
weightpara = 1e-3
rmpc = 1

# def srcEmbedding(srcWordFilePath, srcsent):
src_model_300 = gensim.models.KeyedVectors.load_word2vec_format(
    srcWordFilePath, binary=False)
srcwords = {}
for index, word in enumerate(src_model_300.wv.index2entity):
    srcwords[word] = index
srcWe = src_model_300.wv.vectors
srcword2weight = data_io.getWordWeight(src_model_300.wv.vocab, weightpara)
srcweight4ind = data_io.getWeight(srcwords, srcword2weight)
srcx, srcm = data_io.sentences2idx(srcsent, srcwords)
srcw = data_io.seq2weight(srcx, srcm, srcweight4ind)
srcparams = params.params()
srcparams.rmpc = rmpc
srcEmbedding = SIF_embedding.SIF_embedding(srcWe, srcx, srcw, srcparams)
# return embedding

# def tgtEmbedding(tgtWordFilePath, tgtsent):
tgtmodel_300 = gensim.models.KeyedVectors.load_word2vec_format(tgtWordFilePath,
                                                               binary=False)
tgtwords = {}
for index, word in enumerate(tgtmodel_300.wv.index2entity):
    tgtwords[word] = index
tgtWe = tgtmodel_300.wv.vectors
Example #22
# Arabic GloVe embedding pre-trained model
wordfile = '../models/glove_full_grams_sg_300_wiki.txt'
weightfile = '../AraSIF_word_counts/arwiki_vocab_min200.txt'  # each line is a word and its frequency

weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
rmpc = 1  # number of principal components to remove in SIF weighting scheme

# load word vectors
print("Reading embedding matrix. Hang on! this will take a while ...")
(glove_words, We) = data_io.getWordmap(wordfile)
print("shape of Word embedding is: " + str(We.shape))

# load word weights
word2weight = data_io.getWordWeight(
    weightfile,
    weightpara)  # word2weight['str'] is the weight for the word 'str'
weight4ind = data_io.getWeight(
    glove_words, word2weight)  # weight4ind[i] is the weight for the i-th word

# set parameters
params = params.params()
params.rmpc = rmpc

# load sentences
print("reading the input sentences now & converting to indices .. \n")
sample_sents = read_NMT_data.read_data(sample_ara)

# AraSIF embedding for sample sentences
print("computing AraSIF embedding now ...\n")
Example #23
with open(douban_cropus_path) as f:
    for line in f:
        line = line.strip()
        line = line.split(':')[1]
        sentences.append(line)

glove_word2vector_path = './chinese_data_douban_cropus_vectors.txt'  # word vector file, can be downloaded from GloVe website
word_frequency_path = './douban_cropus_vocab.txt'  # each line is a word and its frequency
weightpara = 1e-3
rmpc = 1

# load word vectors
(Word2Indx, Word2vector) = data_io.getWordmap(glove_word2vector_path)
# load word weights
word2weight = data_io.getWordWeight(
    word_frequency_path,
    weightpara)  # word2weight['str'] is the weight for the word 'str'
Index2Weight = data_io.getWeight(
    Word2Indx, word2weight)  # weight4ind[i] is the weight for the i-th word

word_idx_seq_of_sentence, mask = data_io.sentences2idx(
    sentences, Word2Indx
)  # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
word_weight_of_sentence = data_io.seq2weight(word_idx_seq_of_sentence, mask,
                                             Index2Weight)  # get word weights

# set parameters
params = params.params()
params.rmpc = rmpc
embedding = SIF_embedding.SIF_embedding(Word2vector, word_idx_seq_of_sentence,
                                        word_weight_of_sentence, params)
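A natural follow-up (not in the original snippet) is to use the resulting embedding matrix for retrieval, e.g. finding the douban sentence closest to a given one by cosine similarity:

import numpy as np

def most_similar(query_idx, embedding, topn=5):
    """Return (index, cosine similarity) of the rows closest to embedding[query_idx]."""
    q = embedding[query_idx]
    sims = embedding.dot(q) / (np.linalg.norm(embedding, axis=1) * np.linalg.norm(q) + 1e-12)
    sims[query_idx] = -np.inf                  # exclude the query itself
    best = np.argsort(-sims)[:topn]
    return [(int(i), float(sims[i])) for i in best]

for idx, score in most_similar(0, embedding):
    print(round(score, 4), sentences[idx])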