Code example #1
    def test_init(self):
        # v0 = vocab.Vocab('bitch', [
            # ('hi', '1101'),
            # ('goodbye', '1000029'),
            # ('au revoir', 'shit')
            # ])
        name = 'test'
        dict1 = {'hi': '1101',
                 'goodbye': '1000029',
                 'au revoir': 'shit'
                }

        v0 = vocab.Vocab(name, dict1)

        self.assertEqual(v0.get_name(), 'test')
        # self.assertTrue(False)
        list0 = v0.get_vocab_list()
        self.assertIsInstance(v0.get_name(), str)
        self.assertIsInstance(list0, dict)
        self.assertIn('hi', list0)
        self.assertEqual(list0['hi'], '1101')
        self.assertIn('goodbye', list0)
        self.assertEqual(list0['goodbye'], '1000029')
        self.assertIn('au revoir', list0)
        self.assertEqual(list0['au revoir'], 'shit')

        v_blank = vocab.Vocab()

        self.assertEqual(v_blank.get_name(), 'default name')
        self.assertEqual(v_blank.get_vocab_list(), {})
        self.assertIsInstance(v_blank.get_name(), str)
        self.assertIsInstance(v_blank.get_vocab_list(), dict)
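The Vocab class exercised by these unit tests is not shown on this page. A minimal sketch that would satisfy the calls above (the two-argument constructor, get_name, get_vocab_list, and the 'default name' fallback) as well as the set_vocab_list and check methods used in the later test examples might look like the following; it is an illustration only, not the project's actual implementation, and it does not model the name-trimming behaviour asserted in code example #23.

class Vocab:
    """Hypothetical minimal Vocab consistent with the tests on this page."""

    def __init__(self, name='default name', vocab_list=None):
        # test_init expects a 'default name' fallback and an empty dict by default.
        self._name = name
        self._vocab_list = dict(vocab_list) if vocab_list else {}

    def get_name(self):
        return self._name

    def get_vocab_list(self):
        return self._vocab_list

    def set_vocab_list(self, vocab_list):
        self._vocab_list = dict(vocab_list)

    def check(self, word, definition):
        # True only when the word is present and maps to exactly this definition.
        return self._vocab_list.get(word) == definition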
Code example #2
 def __init__(self, opts, debug=False):
     """Pass in program options for now."""
     self.opts = opts
     self.source_vocab = vocab.Vocab()
     self.target_vocab = vocab.Vocab()
     self.me = maxent.MaxentModel()
     self.m1_probs = {}
     self.lm = None
     self.dictionary = {}
     self.feature_functions = []
     self.debug = debug
Code example #3
def load_model(model_fn):
    print('Building model and optimizer...')
    epoch = 0
    voc = vocab.Vocab('data/pretrained_embedding/pretrained_embedding_5M.vec')

    model = BiLSTMCrf(voc, const.CHARACTER_LIST, character_embedding_dim,
                      character_hidden_dim, context_hidden_dim, tag_list)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

    if model_fn is not None:
        checkpoint = torch.load(model_fn)
        epoch = checkpoint['epoch']
        model_sd = checkpoint['model']
        optimizer_sd = checkpoint['optimizer']

        model.load_state_dict(model_sd)
        optimizer.load_state_dict(optimizer_sd)

    model = model.to(const.DEVICE)
    for state in optimizer.state.values():
        for k, v in state.items():
            if type(v) is torch.Tensor:
                state[k] = v.to(const.DEVICE)

    return model, optimizer, epoch + 1
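load_model above expects the checkpoint file to contain a dict with 'epoch', 'model', and 'optimizer' entries. A sketch of the matching save step, with the function name and file name purely illustrative, could be:

import torch

def save_checkpoint(model, optimizer, epoch, model_fn='checkpoint.pt'):
    # Store state_dicts rather than the live objects so load_model() can
    # restore them into freshly constructed model/optimizer instances.
    torch.save({
        'epoch': epoch,
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
    }, model_fn)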
Code example #4
    def parseFromLocal(self, file):
        f = open(file, "r")
        vocabId = 0
        prevNode = None
        firstNode = None
        for i in f:
            line = i.split(",")
            regexCheck = re.search(r"(Text)|(Unit_\d)|(^\s*$)", line[0])

            curNode = None

            if (regexCheck):
                unitCheck = re.search(r"Unit_\d", line[0])
                if (unitCheck):
                    unitNum = unitCheck.string[5:]
                    curNode = UnitNode(int(unitNum))

            elif (not regexCheck):
                curNode = vocab.Vocab(line[0], line[1], line[2], line[3],
                                      line[4].rstrip())
                # give vocab an ID
                curNode.setId(vocabId)
                self.defPool.setdefault(vocabId, line[1])
                vocabId += 1
            if (curNode):
                if (not prevNode):
                    firstNode = curNode
                else:
                    curNode.setPrev(prevNode)
                    prevNode.setNext(curNode)
                prevNode = curNode

        return firstNode
Code example #5
 def parseFromCloud(self):
     fetchedData = GoogleSheetConnect.fetchEverySheetData()
     vocabId = 0
     prevNode = None
     firstNode = None
     for i in fetchedData[1:]:
         curNode = None
         if (len(i) != 0):
             unitCheck = re.search(r"Unit_\d", i[0])
             if (unitCheck):
                 unitNum = unitCheck.string[5:]
                 curNode = vocab.UnitNode(int(unitNum))
             else:
                 while (len(i) < 5):
                     i.append("")
                 text = i[0]
                 definition = i[1]
                 root = i[2]
                 errorCount = int(i[3])
                 importancy = i[4]
                 curNode = vocab.Vocab(text, definition, root, errorCount,
                                       importancy)
                 # give vocab an ID
                 curNode.setId(vocabId)
                 self.defPool.setdefault(vocabId, i[1])
                 vocabId += 1
             if (curNode):
                 if (not prevNode):
                     firstNode = curNode
                 else:
                     curNode.setPrev(prevNode)
                     prevNode.setNext(curNode)
                 prevNode = curNode
     return firstNode
Code example #6
def main():
    ''' Main function '''

    parser = argparse.ArgumentParser()
    parser.add_argument('-input_file', required=True)
    parser.add_argument('-output_file', required=True)
    parser.add_argument('-max_len', '--max_seq_len', type=int, default=64)
    parser.add_argument('-vocab',
                        type=str,
                        default="model/multilingual/vocab.txt")
    parser.add_argument('-checkoov', action='store_true')

    opt = parser.parse_args()

    multi_language_vocab = vocab.Vocab(opt.vocab)

    if (opt.checkoov):
        check_examples(opt.input_file, opt.max_seq_len, multi_language_vocab)

    src_lists, tgt_lists = read_examples(opt.input_file, opt.max_seq_len,
                                         multi_language_vocab)

    data = {'settings': opt, 'data': {'src': src_lists, 'tgt': tgt_lists}}

    logger.info('Dumping the processed data to file {}'.format(
        opt.output_file))
    torch.save(data, opt.output_file)
    logger.info('Finish.')
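The script above serializes the processed corpus with torch.save. Reading the dump back in a training script (the path is whatever was passed as -output_file; the variable names are illustrative) would look roughly like:

import torch

data = torch.load('train.atok')        # path: whatever -output_file pointed to
settings = data['settings']            # the argparse Namespace saved above
src_lists = data['data']['src']
tgt_lists = data['data']['tgt']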
Code example #7
    def test_get_name(self):
        name = 'test'
        dict1 = {'hi': '1101',
                 'goodbye': '1000029',
                 'au revoir': 'shit'
                }
        v = vocab.Vocab(name, dict1)

        self.assertIsInstance(v.get_name(), str)
        self.assertEqual(v.get_name(), 'test')
Code example #8
def train():
    # create the vocabulary file first and then submit the job
    input_data = json.loads(request.data)
    files_to_exclude = ",".join(input_data.get("files_to_exclude"))
    v = vocab.Vocab("gs://text-summarization-webapp.appspot.com/data/data",
                    "data/vocab", files_to_exclude)
    v.create_vocab_file()
    os.system("sudo sh submit_training_job.sh {} {}".format(
        str(input_data.get("input")), files_to_exclude))
    return json.dumps({"responseText": "Submitted training job"})
Code example #9
    def test_set_vocab_list(self):
        name = 'test'
        dict1 = {'hi': '1101',
                 'goodbye': '1000029',
                 'au revoir': 'shit'
                }
        v = vocab.Vocab(name, dict1)

        self.assertEqual(v.get_vocab_list(), {'hi': '1101', 'goodbye': '1000029', 'au revoir': 'shit'})
        v.set_vocab_list({'hi': '1101', 'goodbye': '1000029', 'au revoir': 'shit'})
        self.assertEqual(v.get_name(), 'test')
Code example #10
    def test_get_vocab_list(self):
        name = 'test'
        dict1 = {'hi': '1101',
                 'goodbye': '1000029',
                 'au revoir': 'shit'
                }
        v = vocab.Vocab(name, dict1)

        # self.assertTrue(isinstance(v.get_vocab_list(), dict))
        self.assertIsInstance(v.get_vocab_list(), dict)
        self.assertEqual(v.get_vocab_list(), {'hi': '1101', 'goodbye': '1000029', 'au revoir': 'shit'})
Code example #11
File: framework.py Project: Rohithpesala/Dialogue
    def __init__(self, corpus, freq):
        """
        Load Data from the corpus.
        Parameters:
        :: corpus :: filepath to read the whole dataset in txt format
		:: freq :: Min frequency of the word to be included into vocab
        """
        super(EmbedGlove, self).__init__()
        f = open(corpus,'r')
        vc = Counter()
        for l in f:
        	vc.update(Counter(l.split()))
        self.vcb = vocab.Vocab(vc, wv_type = "glove.840B",min_freq=freq,specials = ["EOS","SOS"])
        f.close()        
Code example #12
 def __init__(self, opts, debug=False):
     """Pass in program options for now."""
     # This class will directly read certain program options:
     # TODO: list them
     self.opts = opts
     # Vocab is a mapping from string to integer used for both the source and
     # target.
     self.vocab = vocab.Vocab()
     # p(s|t) and p(t|s): Lexical probabilities. These are stored as a dict from
     # (int, int) tuples to a float - the integers are vocab indices.
     self.pst = {}
     self.pts = {}
     self.feature_functions = []
     self.debug = debug
Code example #13
File: exp.py Project: j-luo93/MUSE
def write_wv_to_file(load_path, vocab_path, output_path, size):
    print('writing wv file to %s' % output_path)
    vc, ic = torch.load(load_path)

    voc = vocab.Vocab(vocab_path)

    with codecs.open(output_path, 'w', 'utf8') as fout:
        fout.write('%d %d\n' % (len(voc), size))
        for v, i in zip(vc, ic):
            v = v.cpu().numpy()
            i = i.cpu().item()
            fout.write(voc[i] + ' ')
            for vi in v:
                fout.write('%.4f ' % vi)
            fout.write('\n')
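write_wv_to_file emits the plain word2vec text format: a 'count dimension' header line followed by one 'word v1 v2 ...' line per entry. A small, purely illustrative reader for that format:

import codecs

def read_wv_file(path):
    vectors = {}
    with codecs.open(path, 'r', 'utf8') as fin:
        # Header: vocabulary size and vector dimension.
        n_words, dim = (int(x) for x in fin.readline().split())
        for line in fin:
            parts = line.split()
            word, values = parts[0], [float(x) for x in parts[1:]]
            assert len(values) == dim   # each row should match the header dimension
            vectors[word] = values
    return vectors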
Code example #14
    def test_check(self):
        name = 'test'
        dict1 = {'hi': '1101',
                 'goodbye': '90210',
                 'au revoir': 'goodbye',
                 'weeaboo shit': 'domo arigatou mr roboto'
                }
        v = vocab.Vocab(name, dict1)
        self.assertTrue(v.check('hi', '1101'))
        self.assertFalse(v.check('hi', 1101))
        self.assertFalse(v.check('hi', []))
        self.assertFalse(v.check('hi', {}))

        self.assertTrue(v.check('goodbye', '90210'))
        self.assertTrue(v.check('au revoir', 'goodbye'))
        self.assertTrue(v.check('weeaboo shit', 'domo arigatou mr roboto'))
        self.assertFalse(v.check('kboo shit', 'domo arigatou mr roboto'))
Code example #15
def build_vocab():
    dataset_file = "train-v1.1.json"
    with open(dataset_file) as dataset_file:
        dataset_json = json.load(dataset_file)
    dataset = dataset_json['data']
    token_list = []
    for i in range(len(dataset)):
        for j in range(len(dataset[i]["paragraphs"])):
            passage = dataset[i]["paragraphs"][j]["context"]
            passage = passage.replace("''", '" ')
            passage = passage.replace("``", '" ')
            token_list.extend(word_tokenize(passage))
            for k in range(len(dataset[i]["paragraphs"][j]["qas"])):
                question = dataset[i]["paragraphs"][j]["qas"][k]["question"]
                token_list.extend(word_tokenize(question))
    c = Counter(token_list)
    v = vocab.Vocab(c, wv_type='glove.840B')
    del c
    del token_list
    return v
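build_vocab above returns a torchtext-style Vocab covering every passage and question token in train-v1.1, with GloVe 840B vectors attached. A quick sanity check, using the same itos/stoi/vectors attributes the other snippets on this page rely on (the probe token is arbitrary):

v = build_vocab()
print(len(v.itos))                  # number of token types in the vocabulary
print(v.vectors[v.stoi['the']])     # pretrained vector for a frequent token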
Code example #16
File: trainandtest.py Project: foxlf823/ADExtractor
def pretrain(train_token, train_entity, train_relation, train_name, test_token,
             test_entity, test_relation, test_name):
    word_alphabet, postag_alphabet, relation_alphabet, entity_type_alphabet, entity_alphabet = dataset_stat(
        train_token, train_entity, train_relation)
    logging.info("training dataset stat completed")
    if opt.full_data:
        test_word_alphabet, test_postag_alphabet, test_relation_alphabet, test_entity_type_alphabet, test_entity_alphabet = dataset_stat(
            test_token, test_entity, test_relation)
        word_alphabet = word_alphabet | test_word_alphabet
        postag_alphabet = postag_alphabet | test_postag_alphabet
        relation_alphabet = relation_alphabet | test_relation_alphabet
        entity_type_alphabet = entity_type_alphabet | test_entity_type_alphabet
        entity_alphabet = entity_alphabet | test_entity_alphabet
        del test_word_alphabet, test_postag_alphabet, test_relation_alphabet, test_entity_type_alphabet, test_entity_alphabet
        logging.info("test dataset stat completed")

    position_alphabet = sortedcontainers.SortedSet()
    for i in range(opt.max_seq_len):
        position_alphabet.add(i)
        position_alphabet.add(-i)

    relation_vocab = vocab.Vocab(relation_alphabet, None,
                                 opt.relation_emb_size)
    word_vocab = vocab.Vocab(word_alphabet, opt.emb, opt.word_emb_size)
    postag_vocab = vocab.Vocab(postag_alphabet, None, opt.pos_emb_size)
    entity_type_vocab = vocab.Vocab(entity_type_alphabet, None,
                                    opt.entity_type_emb_size)
    entity_vocab = vocab.Vocab(entity_alphabet, None, opt.entity_emb_size)
    position_vocab1 = vocab.Vocab(position_alphabet, None,
                                  opt.position_emb_size)
    position_vocab2 = vocab.Vocab(position_alphabet, None,
                                  opt.position_emb_size)
    # we directly use position_alphabet to build them, since they are all numbers
    tok_num_betw_vocab = vocab.Vocab(position_alphabet, None,
                                     opt.entity_type_emb_size)
    et_num_vocab = vocab.Vocab(position_alphabet, None,
                               opt.entity_type_emb_size)
    logging.info("vocab build completed")

    logging.info("saving ... vocab")
    pickle.dump(word_vocab,
                open(os.path.join(opt.pretrain, 'word_vocab.pkl'), "wb"), True)
    pickle.dump(postag_vocab,
                open(os.path.join(opt.pretrain, 'postag_vocab.pkl'), "wb"),
                True)
    pickle.dump(relation_vocab,
                open(os.path.join(opt.pretrain, 'relation_vocab.pkl'), "wb"),
                True)
    pickle.dump(
        entity_type_vocab,
        open(os.path.join(opt.pretrain, 'entity_type_vocab.pkl'), "wb"), True)
    pickle.dump(entity_vocab,
                open(os.path.join(opt.pretrain, 'entity_vocab.pkl'), "wb"),
                True)
    pickle.dump(position_vocab1,
                open(os.path.join(opt.pretrain, 'position_vocab1.pkl'), "wb"),
                True)
    pickle.dump(position_vocab2,
                open(os.path.join(opt.pretrain, 'position_vocab2.pkl'), "wb"),
                True)
    pickle.dump(
        tok_num_betw_vocab,
        open(os.path.join(opt.pretrain, 'tok_num_betw_vocab.pkl'), "wb"), True)
    pickle.dump(et_num_vocab,
                open(os.path.join(opt.pretrain, 'et_num_vocab.pkl'), "wb"),
                True)

    train_X, train_Y, _ = my_utils.getRelationInstance2(
        train_token, train_entity, train_relation, train_name, word_vocab,
        postag_vocab, relation_vocab, entity_type_vocab, entity_vocab,
        position_vocab1, position_vocab2, tok_num_betw_vocab, et_num_vocab)
    logging.info("training instance build completed, total {}".format(
        len(train_Y)))
    pickle.dump(train_X, open(os.path.join(opt.pretrain, 'train_X.pkl'), "wb"),
                True)
    pickle.dump(train_Y, open(os.path.join(opt.pretrain, 'train_Y.pkl'), "wb"),
                True)

    test_X, test_Y, test_other = my_utils.getRelationInstance2(
        test_token, test_entity, test_relation, test_name, word_vocab,
        postag_vocab, relation_vocab, entity_type_vocab, entity_vocab,
        position_vocab1, position_vocab2, tok_num_betw_vocab, et_num_vocab)
    logging.info("test instance build completed, total {}".format(len(test_Y)))
    pickle.dump(test_X, open(os.path.join(opt.pretrain, 'test_X.pkl'), "wb"),
                True)
    pickle.dump(test_Y, open(os.path.join(opt.pretrain, 'test_Y.pkl'), "wb"),
                True)
    pickle.dump(test_other,
                open(os.path.join(opt.pretrain, 'test_Other.pkl'), "wb"), True)
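pretrain above saves each vocab and the training/test instances as separate pickles under opt.pretrain. The corresponding load step (file names mirror the pickle.dump calls above; the helper below is a sketch, not project code) would be symmetric:

import os
import pickle

def load_pretrain(pretrain_dir):
    def load(name):
        with open(os.path.join(pretrain_dir, name), 'rb') as f:
            return pickle.load(f)

    word_vocab = load('word_vocab.pkl')
    postag_vocab = load('postag_vocab.pkl')
    relation_vocab = load('relation_vocab.pkl')
    train_X, train_Y = load('train_X.pkl'), load('train_Y.pkl')
    test_X, test_Y = load('test_X.pkl'), load('test_Y.pkl')
    return word_vocab, postag_vocab, relation_vocab, train_X, train_Y, test_X, test_Y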
Code example #17
def summarize(input):
    # create the vocabulary file first and then submit the job
    v = vocab.Vocab("gs://sasidhar-project1-mlengine", "data/vocab")
    v.create_vocab_file()
    os.system("sudo sh train_textsum_dist.sh " + input)
    return "done"
Code example #18
File: prepareNeuralLM.py Project: zaycev/nnsmt
    if args.validation_text:
        for li, line in enumerate(file(args.validation_text)):
            words = line.split()
            words = [start] * (n - 1) + words + [stop]
            validation_data.append(words)
    else:
        if args.validation_size > 0:
            validation_data = train_data[-args.validation_size:]
            train_data[-args.validation_size:] = []

    c = collections.Counter()
    for words in train_data:
        c.update(words[n - 1:])

    v = vocab.Vocab()
    v.insert_word(start)
    v.insert_word(stop)
    v.insert_word(null)
    inserted = v.from_counts(c, args.n_vocab)
    if inserted == len(c):
        sys.stderr.write(
            "warning: only %d words types in training data; set --n_vocab lower to learn unknown word\n"
        )

    if args.words_file:
        with open(args.words_file, "w") as outfile:
            for w in v.words:
                outfile.write("%s\n" % (w, ))

    if args.train_file == '-':
Code example #19
File: test.py Project: sachindharashivkar/Attention
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Apr  1 16:37:07 2017

@author: sachin
"""

import torch
import vocab

from collections import Counter
c = Counter(['hello', 'world'])
v = vocab.Vocab(c, wv_type='glove.840B')
print(v.itos)
print(v.vectors[v.stoi["hello"]])
Code example #20
File: get_glove_vec.py Project: thu-spmi/LABES
import utils, pickle, vocab
# glove_path = '/home/ouzj01/zhangyc/project/glove/glove.840B.300d.txt'
glove_path = 'data/glove/glove_ori.6B.50d.txt'
# save_path = 'data/glove/glove_multiwoz.840B.300d.txt'
# save_path = 'data/glove/glove_multiwoz.6B.50d.txt'
save_path = 'data/glove/glove_kvret.6B.50d.txt'

vocab = vocab.Vocab(1100)
vocab.load_vocab('data/kvret/vocab')
# vocab.load_vocab('data/MultiWOZ/processed/vocab')
vec_array = []
with open(glove_path, 'r', encoding='UTF-8') as ef:
    with open(save_path, 'w') as f:

        for line in ef.readlines():
            line_sep = line.strip().split(' ')
            word, vec = line_sep[0], line_sep[1:]
            if vocab.has_word(word):
                f.write(line)
ef.close()
Code example #21
File: exp.py Project: j-luo93/MUSE
def pipeline(src_suf,
             tgt_suf,
             arg_string,
             size,
             old_params=None,
             data_dir='13-es-100K',
             center=False):
    import os
    #     arg_string = '--exp_id k0gf0f007v --src_lang es --tgt_lang en '\
    #                  '--n_refinement 5 --emb_dim 500 --normalize_embeddings center --full_vocab'
    lazy_wvs = dict()
    for suf in [src_suf, tgt_suf]:
        if suf is None:
            continue
        load_path = '../EMNLP-NMT/data/%s/torch_save.%s' % (data_dir, suf)
        vocab_path = '../EMNLP-NMT/data/%s/%s' % (data_dir, suf)
        output_path = 'wv.%s' % suf
        #if not os.path.isfile(output_path):
        lazy_wvs[suf] = LazyObject(
            lambda load_path=load_path, vocab_path=
            vocab_path, output_path=output_path: write_wv_to_file(
                load_path, vocab_path, output_path, size))

        if suf == src_suf:
            arg_string += ' --src_emb %s' % output_path
        else:
            arg_string += ' --tgt_emb %s' % output_path

    if old_params is not None and center:
        print('getting test')
        lazy_test = LazyObject(
            lambda: get_test_models(arg_string,
                                    old_params.compute()[0]))
        #params, (test_eval, test_trainer) = get_test_models(arg_string, old_params.compute()[0])
    else:
        lazy_test = LazyObject(lambda: get_saved_models(arg_string))
        #params, (test_eval, test_trainer) = get_saved_models(arg_string)

    import torch
    import vocab

    for i, suf in enumerate([src_suf, tgt_suf]):
        #for emb, suf in zip([src_emb, tgt_emb], [src_suf, tgt_suf]):
        if suf is None:
            continue
        save_path = '../EMNLP-NMT/data/%s/torch_save.MUSE.%s' % (data_dir, suf)
        if os.path.isfile(save_path):
            continue

        if center:
            save_path += '.center'
        vocab_path = '../EMNLP-NMT/data/%s/%s' % (data_dir, suf)
        for k in lazy_wvs:
            lazy_wvs[k].compute()

        voc = vocab.Vocab(vocab_path)
        test_eval = lazy_test.compute()[1][0]

        if i == 0:
            emb = test_eval.src_emb
            # mapped word embeddings
            emb = test_eval.mapping(emb.weight).data

        else:
            emb = lazy_test.compute()[1][0].tgt_emb
            emb = emb.weight.data

        vc = emb.cpu()
        eval_vocab = test_eval.tgt_dico if suf == tgt_suf else test_eval.src_dico
        ic = torch.from_numpy(
            np.asarray([
                voc[w]
                for w in [eval_vocab.id2word[i] for i in range(len(vc))]
            ]))
        torch.save([vc, ic], save_path)
    if 'test_eval' in locals():
        return test_eval
    else:
        return None
Code example #22
        optimizer.load_state_dict(optimizer_sd)

    model = model.to(const.DEVICE)
    for state in optimizer.state.values():
        for k, v in state.items():
            if type(v) is torch.Tensor:
                state[k] = v.to(const.DEVICE)

    return model, optimizer, epoch + 1, all_losses, eval_losses, test_scores, best_test_score


if __name__ == '__main__':
    args = parse()
    print(args)
    print('Loading vocab...')
    voc = vocab.Vocab(args.pretrained_path, freeze=True)

    print('Loading data ...')
    train_sentences = [Sentence(sentence, voc) for sentence in utils.read_data(args.trainpath)]
    dev_sentences = [Sentence(sentence, voc) for sentence in utils.read_data(args.devpath)]
    test_sentences = [Sentence(sentence, voc) for sentence in utils.read_data(args.testpath)]

    train_ds = dataset.Dataset(train_sentences, word_padding_idx=voc.padding_index,
                               pos_padding_idx=const.POS_PADDING_IDX,
                               chunk_padding_idx=const.CHUNK_PADDING_IDX,
                               character_padding_idx=const.CHARACTER2INDEX['<PAD>'],
                               tag_padding_idx=const.CHUNK_PADDING_IDX)
    dev_ds = dataset.Dataset(dev_sentences, word_padding_idx=voc.padding_index,
                             pos_padding_idx=const.POS_PADDING_IDX,
                             chunk_padding_idx=const.CHUNK_PADDING_IDX,
                             character_padding_idx=const.CHARACTER2INDEX['<PAD>'],
Code example #23
 def test_holy_mother_of_symbols(self):
     v = vocab.Vocab('[[[[]gorp!@)(87845,./;[]|\\-_=+)]][][]', {})
     self.assertEqual(v.get_name(), '[[[]gorp!@)(87845,./;[]|\\-_=+)]][][')
Code example #24
run_path = os.path.join('runs', run_hash)
results_path = os.path.join(run_path, 'results.txt')

os.makedirs(run_path)

with open(results_path, 'w+') as f:
    f.write(f'train_loss\ttrain_mrr\tvalid_loss\tvalid_mrr\n')

with open(os.path.join(run_path, 'args.json'), 'w+') as f:
    json.dump(args_dict, f, indent=2)

random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

code_vocab = vocab.Vocab(args.code_vocab, args.vocab_max_size,
                         args.vocab_min_freq, UNK_TOKEN, PAD_TOKEN)

desc_vocab = vocab.Vocab(args.desc_vocab, args.vocab_max_size,
                         args.vocab_min_freq, UNK_TOKEN, PAD_TOKEN)

print(f'code vocab size: {len(code_vocab)}')
print(f'desc vocab size: {len(desc_vocab)}')

train_data = utils.load_retrieval_data(args.train_data, code_vocab, desc_vocab,
                                       args.code_max_length,
                                       args.desc_max_length)

valid_data = utils.load_retrieval_data(args.valid_data, code_vocab, desc_vocab,
                                       args.code_max_length,
                                       args.desc_max_length)
Code example #25
def main():
    parser = OptionParser()
    cwd = os.getcwd()

    parser.add_option(
        "-p",
        "--parallel_data",
        dest="training_file",
        default=cwd + "/data/euro_esen_10k",
        help="Parallel data, expecting \".source\" and \".target\"")

    parser.add_option(
        "-c",
        "--comparable_data",
        dest="comp_data",
        default=cwd + "/data/es_dev",
        help=
        "Annotated comparable data, expecting \".source\", \".target\", and \".alignment\""
    )

    parser.add_option(
        "-r",
        "--raw_data",
        dest="raw_data",
        default=cwd + "/data/esen_docs_small",
        help="Raw comparable data, expecting \".source\" and \".target\"")

    parser.add_option("-t",
                      "--t_table",
                      dest="m1_data",
                      default=cwd + "/data/pruned.model",
                      help="Word alignment parameters from some parallel data")

    parser.add_option(
        "-e",
        "--example_window",
        dest="example_window",
        type="int",
        default=3,
        help="Size of the example window for gathering training data")

    parser.add_option(
        "--length_ratio",
        type="float",
        dest="max_len_ratio",
        default=3.0,
        help="Maximum length ratio for sentences to be considered parallel")

    parser.add_option(
        "--test_max",
        type="int",
        dest="test_max",
        default=100,
        help="Number of sentences from the parallel data to use as test data")

    parser.add_option("--prob_floor",
                      type="float",
                      dest="prob_floor",
                      default=1e-4,
                      help="Lowest probability value for LM and M1")

    parser.add_option("--max_iterations",
                      type="int",
                      dest="max_iterations",
                      default=20,
                      help="Maximum number of L-BFGS iterations")

    parser.add_option("--l2_norm",
                      type="float",
                      dest="l2_norm",
                      default="2.0",
                      help="L2 normalizing value for the Maxent model")

    parser.add_option(
        "--sent_out",
        dest="sent_out",
        default="",
        help="Extract sentences from the raw documents to this location")

    (opts, args) = parser.parse_args()

    # Read available data
    source_vocab = vocab.Vocab()
    target_vocab = vocab.Vocab()
    if opts.training_file:
        source_parallel = read_text(opts.training_file + '.source',
                                    source_vocab)
        target_parallel = read_text(opts.training_file + '.target',
                                    target_vocab)
        t_lm = create_lm(target_parallel, opts)
    if opts.comp_data:
        (source_docs, target_docs,
         alignments) = read_comp_data(opts.comp_data, source_vocab,
                                      target_vocab)
    if opts.raw_data:
        (raw_source, raw_target) = read_comp_data(opts.raw_data, source_vocab,
                                                  target_vocab)
    if opts.m1_data:
        m1 = read_m1_data(opts.m1_data, source_vocab, target_vocab)

    #(train_data, test_data) = create_train_test_data(
    #    me, source_parallel, target_parallel, m1, t_lm, opts)
    comp_data = create_comp_test_data(source_docs, target_docs, alignments, m1,
                                      t_lm, opts)
    print_feature_stats(comp_data)
    folds = range(5)
    for fold in folds:
        comp_test_data = []
        me = maxent.MaxentModel()

        print "\nFold " + str(fold + 1) + ":"
        me.begin_add_event()
        for i, event in enumerate(comp_data):
            if i % len(folds) == fold:
                comp_test_data.append(event)
            else:
                me.add_event(event[0], event[1])
        me.end_add_event()
        me.train(opts.max_iterations, "lbfgs", opts.l2_norm)
        parallel_eval(me, comp_test_data)

    if opts.sent_out:
        full_me = maxent.MaxentModel()
        full_me.begin_add_event()
        for event in comp_data:
            full_me.add_event(event[0], event[1])
        full_me.end_add_event()
        full_me.train(opts.max_iterations, "lbfgs", opts.l2_norm)

        #for threshold in drange(0.05, 0.96, 0.05):
        threshold = 0.5
        out_file = opts.sent_out + '.' + str(threshold)
        extract_sentences(full_me, raw_source, raw_target, out_file, threshold,
                          m1, t_lm, source_vocab, target_vocab, opts)
Code example #26
    criteria = nn.NLLLoss(weight=weights, size_average=True)
    optimizer = torch.optim.Adam(lstm_model.parameters())

    torch.manual_seed(settings.seed)
    random.seed(settings.seed)

    if torch.cuda.is_available():
        lstm_model.cuda()
        criteria.cuda()
        torch.cuda.manual_seed(settings.seed)

    for param in lstm_model.parameters():
        param.data.uniform_(-0.08, 0.08)

    # Load Vocab
    src_vocab = vocab.Vocab(settings.src_vocab_size,
                            os.path.join(args.path, "src_vocab.txt"))
    trg_vocab = vocab.Vocab(settings.trg_vocab_size,
                            os.path.join(args.path, "trg_vocab.txt"))
    biVocab = vocab.BiVocab(src_vocab, trg_vocab)

    # Load Dataset
    train_ner_data = dataset.NERDataset(
        os.path.join(args.path, "atis.train.txt"), biVocab)
    train_ner_data_loader = DataLoader(train_ner_data,
                                       batch_size=settings.batch_size,
                                       shuffle=True,
                                       collate_fn=lambda x: x)
    dev_ner_data = dataset.NERDataset(os.path.join(args.path, "atis.test.txt"),
                                      biVocab)
    dev_ner_data_loader = DataLoader(dev_ner_data,
                                     batch_size=settings.batch_size,
Code example #27
File: app.py Project: thanhlt998/ner-bi-lstm-crf
            max_len = max(len(token), len(tag_)) + 4
            formated_sentence.append(fill(token, max_len))
            formated_tag.append(fill(tag_, max_len))

        no_lines = len(formated_sentence) // max_words_per_line + (
            0 if len(formated_sentence) % max_words_per_line == 0 else 1)

        for i in range(no_lines):
            print(' '.join(formated_sentence[max_words_per_line * i: max_words_per_line * (i + 1)]))
            print(' '.join(formated_tag[max_words_per_line * i: max_words_per_line * (i + 1)]))
            print('\n')


if __name__ == '__main__':
    print('loading vocab......')
    voc = vocab.Vocab(args['pretrained_path'])
    print('loading model.......')
    model = load_model(
        model_fn=args['checkpoint_fn'],
        voc=voc,
        character_embedding_dim=args['character_embedding_dim'],
        character_hidden_dim=args['character_hidden_dim'],
        context_hidden_dim=args['context_hidden_dim'],
        dropout=args['dropout'],
        crf_loss_reduction=args['crf_loss_reduction'],
        using_pos_chunk=args['using_pos_chunk']
    )
    model = model.to(device)
    model.eval()

    print('program is running.....')
Code example #28
def prepare(args):
    """
    Check the data files, build the vocab, and load pretrained embeddings or randomly initialize them.
    """
    logger = logging.getLogger("alibaba")
    logger.info("Checking the data files... ")
    for data in args.data_files:
        assert os.path.exists(data), "{} file does not exist".format(data)
    logger.info("preprocess raw data...")
    jieba.load_userdict(args.dict_file)
    logger.info("segment raw data")
    preposs_file = open(args.preposs_file, "w")
    index = 1
    for data_file in args.data_files:
        with open(data_file, "r") as fin:
            for idx, line in enumerate(fin):
                line = unicode(line, encoding="utf8")
                line_re = re.sub(
                    u"[’!\"#$%&'()*+,-./:;<=>?@,。?★、…【】《》?“”‘’![\\]^_`{|}~]+",
                    "", line)
                line_list = str(line_re).strip("\n").split("\t")
                if len(line_list) != 4:
                    logger.warning("{} - {} from is wrong".format(
                        args.data_files, idx + 1))
                    continue
                document1 = line_list[1].strip().replace(" ", "")
                document2 = line_list[2].strip().replace(" ", "")
                segment_document1 = [_ for _ in jieba.cut(document1)]
                segment_document2 = [_ for _ in jieba.cut(document2)]
                preposs_file.write(str(index))
                preposs_file.write("|")
                preposs_file.write(" ".join(segment_document1))
                preposs_file.write("|")
                preposs_file.write(" ".join(segment_document2))
                preposs_file.write("|")
                preposs_file.write(line_list[3] + "\n")
                index += 1
    preposs_file.close()
    logger.info("Building vocabulary...")
    for dir_path in [
            args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir
    ]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    data = dataset.Dataset(args)
    word_vocab_ = vocab.Vocab()
    for token in data.word_iter(set_name="train"):
        word_vocab_.add(token)
    unfiltered_vocab_size = word_vocab_.size()
    word_vocab_.filter_word_by_count(min_count=2)
    filtered_num = unfiltered_vocab_size - word_vocab_.size()
    logger.info(
        "After filter {} tokens, the final word vocab size is {}".format(
            filtered_num, word_vocab_.size()))

    logger.info("Assigning word embeddings...")
    word_vocab_.random_init_embeddings(args.embedding_size)

    character_vocab_ = vocab.Vocab()
    for character in data.word_iter("train", character=True):
        character_vocab_.add(character)
    unfiltered_vocab_size = character_vocab_.size()
    character_vocab_.filter_word_by_count(min_count=2)
    filtered_num = unfiltered_vocab_size - character_vocab_.size()
    logger.info(
        "After filter {} characters, the final character vocab size is {}".
        format(filtered_num, character_vocab_.size()))
    logger.info("Assigning character embeddings...")
    character_vocab_.random_init_embeddings(args.character_embedding_size)

    logger.info("Saving vocab...")
    with open(os.path.join(args.vocab_dir, "vocab.data"), "wb") as fout:
        pickle.dump(word_vocab_, fout)

    logger.info("Saving character vocab...")
    with open(os.path.join(args.vocab_dir, "vocab_character.data"),
              "wb") as fout:
        pickle.dump(character_vocab_, fout)
    logger.info("Done with preparing!")
Code example #29
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    # parser.add_argument('-data', required=True)

    parser.add_argument('-train_atok', required=True)
    parser.add_argument('-valid_atok', required=True)

    parser.add_argument('-epoch', type=int, default=200)
    parser.add_argument('-batch_size', type=int, default=8)

    parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode',
                        type=str,
                        choices=['all', 'best'],
                        default='best')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    #========= Loading Dataset =========#
    train_atok = torch.load(opt.train_atok)
    valid_atok = torch.load(opt.valid_atok)

    train_vocab = vocab.Vocab(train_atok['settings'].vocab)

    training_data = dataset.translation_dataloader(train_atok,
                                                   opt.batch_size,
                                                   shuffle=True)
    validation_data = dataset.translation_dataloader(valid_atok,
                                                     opt.batch_size,
                                                     shuffle=False)

    # data = torch.load(opt.data)
    opt.max_token_seq_len = train_atok['settings'].max_seq_len

    # training_data, validation_data = prepare_dataloaders(data, opt)

    opt.src_vocab_size = train_vocab.size()
    opt.tgt_vocab_size = train_vocab.size()

    #========= Preparing Model =========#
    # if opt.embs_share_weight:
    #     assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
    #         'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')
    transformer = Transformer(opt.src_vocab_size,
                              opt.tgt_vocab_size,
                              opt.max_token_seq_len,
                              tgt_emb_prj_weight_sharing=opt.proj_share_weight,
                              emb_src_tgt_weight_sharing=opt.embs_share_weight,
                              d_k=opt.d_k,
                              d_v=opt.d_v,
                              d_model=opt.d_model,
                              d_word_vec=opt.d_word_vec,
                              d_inner=opt.d_inner_hid,
                              n_layers=opt.n_layers,
                              n_head=opt.n_head,
                              dropout=opt.dropout).to(device)

    if os.path.exists("trained.chkpt"):
        x = torch.load("trained.chkpt")
        # print(type(x["model"]))
        transformer.load_state_dict(x["model"])

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98),
                   eps=1e-09), opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt)
Code example #30
import sys
import csv

sys.path.append("..")
import vocab as vc
import pickle
import random
import collections

vocab_file = "vocab.txt"
train_file = "train.p"
infer_file = "infer.p"
valid_file = "validation.p"
score_file = "score.txt"

max_seq_length = 40
pad_id = 1

cab = vc.Vocab(vocab_file, verbose=False)
scores = {}
data = collections.defaultdict(list)
infer = collections.defaultdict(list)


def load_scores():
    with open("final-score.csv") as f:
        f_csv = csv.DictReader(f)
        for i, row in enumerate(f_csv):
            scores[row['skuid']] = int(row['score']) - 5


def load_data():
    with open("jd-comment.csv", encoding='utf8') as f:
        f_csv = csv.DictReader(f)