Example #1
def main():
    HOME_DIR = "semeval_parsed"
    input_fname = '200M'

    outdir = HOME_DIR + '_' + input_fname
    print outdir
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    ddir = 'semeval/binary'
    train16 = "task-BD-train-2016.tsv"
    dev2016 = "task-BD-dev-2016.tsv"
    devtest2016 = "task-BD-devtest-2016.tsv"
    test2016 = "SemEval2016-task4-test.subtask-BD.txt"

    fname_vocab = os.path.join(outdir, 'vocab.pickle')
    alphabet = cPickle.load(open(fname_vocab, 'rb'))
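    # alphabet.fid is presumably the next unused feature id, so it doubles
    # as a padding ("dummy word") index one past the real vocabulary.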
    dummy_word_idx = alphabet.fid
    print "alphabet", len(alphabet)
    print 'dummy_word:', dummy_word_idx

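    # A separate alphabet maps topics to ids; its first entry reserves an
    # index for unknown topics.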
    topic_alphabet = Alphabet(start_feature_id=0)
    topic_alphabet.add('UNKNOWN_TOPIC_IDX')
    dummy_topic_idx = topic_alphabet.fid

    print "Loading Semeval Data"
    # save the SemEval tweets separately
    files = [train16, dev2016, devtest2016, test2016]
    for fname in files:
        fname_ext = os.path.join(ddir, fname)
        tid, topics, tweets, sentiments = load_data(fname_ext, topic_alphabet)
        print "Number of tweets:", len(tweets)

        tweet_idx = pts.convert2indices(tweets, alphabet, dummy_word_idx)
        topic_idx = get_topic_indices(tweets, topics, topic_alphabet)

        basename, _ = os.path.splitext(os.path.basename(fname))
        np.save(os.path.join(outdir, '{}.tids.npy'.format(basename)), tid)
        np.save(os.path.join(outdir, '{}.tweets.npy'.format(basename)),
                tweet_idx)
        np.save(os.path.join(outdir, '{}.sentiments.npy'.format(basename)),
                sentiments)
        np.save(os.path.join(outdir, '{}.topics.npy'.format(basename)),
                topic_idx)

    cPickle.dump(
        topic_alphabet,
        open(os.path.join(outdir, 'vocab_{}.pickle'.format('topic')), 'wb'))
Example #2
def build_alphabet(input_file, train_files, test_files):
    event_alphabet = Alphabet("eventid")
    # Train and test directories are scanned identically, so process them
    # in a single pass (train first, then test).
    for day in list(train_files) + list(test_files):
        day_path = input_file + "/" + day
        files = os.listdir(day_path)  # all file names under this folder
        for file_name in files:  # iterate over the files in the mid folder
            with open(day_path + "/" + file_name, 'r',
                      encoding='utf-8') as f:
                for line in f:
                    eventid = line.split('\t')[0]
                    event_alphabet.add(eventid)

    return event_alphabet
Example #3
def main():
    outdir = "preprocessed_data"
    out_file = 'vocal_wembext.pickle'
    fname, delimiter, ndim = (
        'embeddings/smiley_tweets_embedding_multilingual300M', ' ', 52)
    word2vec = load_glove_vec(fname, {}, delimiter, ndim)

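    # Reserve the first two ids for unknown and dummy (padding) tokens
    # before the embedding vocabulary is added.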
    alphabet = Alphabet(start_feature_id=0)
    alphabet.add('UNKNOWN_WORD_IDX')
    alphabet.add('DUMMY_WORD_IDX')
    dummy_word_idx = alphabet.get('DUMMY_WORD_IDX')

    for token in word2vec.keys():
        alphabet.add(token)

    print 'Alphabet before purge:', len(alphabet)
    cPickle.dump(alphabet, open(os.path.join(outdir, out_file), 'wb'))
Example #4
def main():
    data_dir = 'tweets/hashtag_top100_smileys_tweets_{}.gz'
    output_dir_tweets = 'parsed_tweets/hashtag_top100_smiley_tweets_{}.tweets.npy'
    output_dir_hashtags = 'parsed_tweets/hashtag_top100_smiley_tweets_{}.hashtags.npy'
    outdir = 'parsed_tweets'

    alphabet_words = Alphabet(start_feature_id=0)
    alphabet_words.add('UNKNOWN_WORD_IDX')
    alphabet_words.add('DUMMY_WORD_IDX')
    dummy_word_idx = alphabet_words.get('DUMMY_WORD_IDX')

    alphabet_hashtags = Alphabet(start_feature_id=0)
    alphabet_hashtags.add('UNKNOWN_HASHTAG_IDX')

    inp = 'train'
    store_file(data_dir.format(inp), output_dir_tweets.format(inp),
               alphabet_words, alphabet_hashtags, dummy_word_idx,
               output_dir_hashtags.format(inp))
    inp = 'test'
    store_file(data_dir.format(inp), output_dir_tweets.format(inp),
               alphabet_words, alphabet_hashtags, dummy_word_idx,
               output_dir_hashtags.format(inp))

    cPickle.dump(alphabet_words,
                 open(os.path.join(outdir, 'vocab_words.pickle'), 'wb'))
    cPickle.dump(alphabet_hashtags,
                 open(os.path.join(outdir, 'vocab_hashtags.pickle'), 'wb'))
Example #5
  #stoplist.update(punct)

  # merge inputs to compute word frequencies
  _, ext = os.path.splitext(os.path.basename(train))
  all_fname = "/tmp/trec-merged" + ext
  files = ' '.join([train, dev, test])
  subprocess.call("/bin/cat {} > {}".format(files, all_fname), shell=True)
  unique_questions, qids, questions, answers, labels = load_data(
      all_fname, resample=False)

  docs = answers + unique_questions
  word2dfs = compute_dfs(docs)
  print word2dfs.items()[:10]

  # map words to ids
  alphabet = Alphabet(start_feature_id=0)
  alphabet.add('UNKNOWN_WORD_IDX')
  add_to_vocab(answers, alphabet)
  add_to_vocab(questions, alphabet)
  basename = os.path.basename(train)
  cPickle.dump(alphabet, open(os.path.join(outdir, 'vocab.pickle'), 'wb'))
  print "alphabet size=", len(alphabet)

  # dump embedding file
  dummy_word_idx = alphabet.fid
  dump_embedding(outdir, 'embeddings/aquaint+wiki.txt.gz.ndim=50.bin', alphabet)

  # summarize max sentence length
  q_max_sent_length = max(map(len, questions))
  a_max_sent_length = max(map(len, answers))
  print 'q_max_sent_length', q_max_sent_length
  print 'a_max_sent_length', a_max_sent_length
Example #6
def main(args):
    if not os.path.exists(args.test_eval_dir):
        os.makedirs(args.test_eval_dir)
    if not os.path.exists(args.eval_dir):
        os.makedirs(args.eval_dir)
    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir)

    #### print config ####
    print(args)

    #### add label ####
    label_alphabet = Alphabet('label', True)
    label_alphabet.add("O")
    label_alphabet.add("B-T")
    label_alphabet.add("I-T")
    label_alphabet.add("B-P")
    label_alphabet.add("I-P")

    # read data
    print("Loading data....")
    datasets = torch.load(args.data)
    train_set = datasets["train"]
    test_set = datasets["test"]
    train_dataloader = read_data(train_set, "train", args.batchSize)
    eval_dataloader = read_data(test_set, "test", args.batchSize)

    #### load BERT config ####
    print("Loading BERT config....")
    bert_config = BertConfig.from_json_file(args.bert_json_dir)

    #### defined model ####
    model = opinionMining(args, bert_config, label_alphabet)
    if args.mode == "test":
        assert args.test_model != ""
        model = torch.load(args.test_model)
        test_start = time.time()
        # evaluate
        RP, RR, RF, TP, TR, TF, OP, OR, OF = evaluate(
            eval_dataloader, test_set, model,
            args.test_eval_dir + "/test_output", args)
        test_finish = time.time()
        test_cost = test_finish - test_start
        print("test: time: %.2fs, speed: %.2fst/s" % (test_cost, 0))
        print("relation result: Precision: %.4f; Recall: %.4f; F1: %.4f" %
              (RP, RR, RF))
        print("target result: Precision: %.4f; Recall: %.4f; F1: %.4f" %
              (TP, TR, TF))
        print("opinion result: Precision: %.4f; Recall: %.4f; F1: %.4f" %
              (OP, OR, OF))
    else:
        print("Loading model from pretrained checkpoint: " +
              args.bert_checkpoint_dir)
        model = bert_load_state_dict(
            model, torch.load(args.bert_checkpoint_dir, map_location='cpu'))

        #### define optimizer ####
        num_train_steps = int(len(train_set) / args.batchSize * args.iteration)
        param_optimizer = list(model.named_parameters())
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if "bert" in n],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if "bert" not in n],
             'lr': args.lr_rate,
             'weight_decay': 0.01},
        ]
        optimizer_grouped_parameters_r = [
            {'params': [p for n, p in param_optimizer if "bert" in n],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if "relation" in n],
             'lr': args.R_lr_rate,
             'weight_decay': 0.01},
        ]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=2e-05,
                             warmup=0.1,
                             t_total=num_train_steps)
        optimizer_r = BertAdam(optimizer_grouped_parameters_r,
                               lr=2e-05,
                               warmup=0.1,
                               t_total=num_train_steps)

        #### train ####
        print("start training......")
        best_Score = -10000
        lr = args.lr_rate
        for idx in range(args.iteration):
            epoch_start = time.time()
            temp_start = epoch_start
            print("Epoch: %s/%s" % (idx, args.iteration))

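            # After epoch 10, decay only the task-specific learning rate
            # (param group 1); the BERT parameters keep their base rate.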
            if idx > 10:
                lr = lr * args.lr_decay
                print(lr)
                optimizer.param_groups[1]["lr"] = lr
                optimizer_r.param_groups[1]["lr"] = lr

            sample_loss = 0
            total_loss = 0
            right_target_token = 0
            whole_target_token = 0
            right_relation_token = 0
            whole_relation_token = 0

            model.train()
            model.zero_grad()
            for step, batch in enumerate(train_dataloader):
                if args.ifgpu:
                    batch = tuple(t.cuda() for t in batch)
                all_input_ids, all_input_mask, all_segment_ids, all_relations, all_labels = batch
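                # Trim every padded tensor to the longest real sequence in
                # this batch to save memory and compute.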
                max_seq_len = torch.max(torch.sum(all_input_mask, dim=1))
                all_input_ids = all_input_ids[:, :max_seq_len].contiguous()
                all_input_mask = all_input_mask[:, :max_seq_len].contiguous()
                all_segment_ids = all_segment_ids[:, :max_seq_len].contiguous()
                all_relations = all_relations[:, :max_seq_len, :
                                              max_seq_len].contiguous()
                all_labels = all_labels[:, :max_seq_len].contiguous()
                tloss, rloss, targetPredict, relationPredict = model.neg_log_likelihood_loss(
                    all_input_ids, all_segment_ids, all_labels, all_relations,
                    all_input_mask)
                # count correctly predicted tokens
                targetRight, targetWhole = targetPredictCheck(
                    targetPredict, all_labels, all_input_mask)
                relationRight, relationWhole = relationPredictCheck(
                    relationPredict, all_relations)

                # accumulate correct and total token counts
                right_target_token += targetRight
                whole_target_token += targetWhole
                right_relation_token += relationRight
                whole_relation_token += relationWhole
                # accumulate loss (.item() extracts the scalar loss value)
                sample_loss += rloss.item() + tloss.item()
                total_loss += rloss.item() + tloss.item()
                # print train info
                if step % 20 == 0:
                    temp_time = time.time()
                    temp_cost = temp_time - temp_start
                    temp_start = temp_time
                    print(
                        "     Instance: %s; Time: %.2fs; loss: %.4f; target_acc: %s/%s=%.4f; relation_acc: %s/%s=%.4f"
                        % (step * args.batchSize, temp_cost, sample_loss,
                           right_target_token, whole_target_token,
                           (right_target_token + 0.) / whole_target_token,
                           right_relation_token, whole_relation_token,
                           (right_relation_token + 0.) / whole_relation_token))
                    if sample_loss > 1e8 or str(sample_loss) == "nan":
                        print(
                            "ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT...."
                        )
                        exit(1)
                    sys.stdout.flush()
                    sample_loss = 0

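                # Alternate updates: even steps optimize the weighted joint
                # loss, odd steps refine only the relation loss.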
                if step % 2 == 0:
                    loss = 9 * rloss + tloss
                    loss.backward()
                    optimizer.step()
                    optimizer.zero_grad()
                else:
                    rloss.backward()
                    optimizer_r.step()
                    optimizer_r.zero_grad()

            temp_time = time.time()
            temp_cost = temp_time - temp_start
            print(
                "     Instance: %s; Time: %.2fs; loss: %.4f; target_acc: %s/%s=%.4f; relation_acc: %s/%s=%.4f"
                % (step * args.batchSize, temp_cost, sample_loss,
                   right_target_token, whole_target_token,
                   (right_target_token + 0.) / whole_target_token,
                   right_relation_token, whole_relation_token,
                   (right_relation_token + 0.) / whole_relation_token))

            epoch_finish = time.time()
            epoch_cost = epoch_finish - epoch_start
            print(
                "Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s,  total loss: %s"
                % (idx, epoch_cost, len(train_set) / epoch_cost, total_loss))
            print("totalloss:", total_loss)
            if total_loss > 1e8 or str(total_loss) == "nan":
                print(
                    "ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT...."
                )
                exit(1)

            # evaluate
            RP, RR, RF, TP, TR, TF, OP, OR, OF = evaluate(
                eval_dataloader, test_set, model,
                args.eval_dir + "/test_output_" + str(idx), args)
            test_finish = time.time()
            test_cost = test_finish - epoch_finish
            current_Score = RF

            print("test: time: %.2fs, speed: %.2fst/s" % (test_cost, 0))
            print("relation result: Precision: %.4f; Recall: %.4f; F1: %.4f" %
                  (RP, RR, RF))
            print("target result: Precision: %.4f; Recall: %.4f; F1: %.4f" %
                  (TP, TR, TF))
            print("opinion result: Precision: %.4f; Recall: %.4f; F1: %.4f" %
                  (OP, OR, OF))

            if current_Score > best_Score:
                print(
                    "Exceeded previous best F score with target f: %.4f and opinion f: %.4f and relation f: %.4f"
                    % (TF, OF, RF))
                model_name = args.model_dir + "/modelFinal.model"
                print("Save current best model in file:", model_name)
                torch.save(model, model_name)
                best_Score = current_Score

            gc.collect()
Example #7
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = False
        self.norm_char_emb = False
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')
        # self.word_alphabet.add(START)
        # self.word_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(START)
        # self.char_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(PADDING)
        self.label_alphabet = Alphabet('label', True)
        self.tagScheme = "NoSeg"
        self.char_features = "LSTM"  ## "LSTM"/"CNN"

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []

        self.word_emb_dim = 50
        self.char_emb_dim = 30
        self.pretrain_word_embedding = None
        self.pretrain_char_embedding = None
        self.label_size = 0
        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        ### hyperparameters
        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_average_batch_loss = False
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 50
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_use_char = False
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = None
        self.HP_momentum = 0

    def show_data_summary(self):
        print("DATA SUMMARY START:")
        print("     Tag          scheme: %s" % (self.tagScheme))
        print("     MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print("     MAX   WORD   LENGTH: %s" % (self.MAX_WORD_LENGTH))
        print("     Number   normalized: %s" % (self.number_normalized))
        print("     Word  alphabet size: %s" % (self.word_alphabet_size))
        print("     Char  alphabet size: %s" % (self.char_alphabet_size))
        print("     Label alphabet size: %s" % (self.label_alphabet_size))
        print("     Word embedding size: %s" % (self.word_emb_dim))
        print("     Char embedding size: %s" % (self.char_emb_dim))
        print("     Norm   word     emb: %s" % (self.norm_word_emb))
        print("     Norm   char     emb: %s" % (self.norm_char_emb))
        print("     Train instance number: %s" % (len(self.train_texts)))
        print("     Dev   instance number: %s" % (len(self.dev_texts)))
        print("     Test  instance number: %s" % (len(self.test_texts)))
        print("     Raw   instance number: %s" % (len(self.raw_texts)))
        print("     Hyper       iteration: %s" % (self.HP_iteration))
        print("     Hyper      batch size: %s" % (self.HP_batch_size))
        print("     Hyper   average batch: %s" % (self.HP_average_batch_loss))
        print("     Hyper              lr: %s" % (self.HP_lr))
        print("     Hyper        lr_decay: %s" % (self.HP_lr_decay))
        print("     Hyper         HP_clip: %s" % (self.HP_clip))
        print("     Hyper        momentum: %s" % (self.HP_momentum))
        print("     Hyper      hidden_dim: %s" % (self.HP_hidden_dim))
        print("     Hyper         dropout: %s" % (self.HP_dropout))
        print("     Hyper      lstm_layer: %s" % (self.HP_lstm_layer))
        print("     Hyper          bilstm: %s" % (self.HP_bilstm))
        print("     Hyper             GPU: %s" % (self.HP_gpu))
        print("     Hyper        use_char: %s" % (self.HP_use_char))
        if self.HP_use_char:
            print("             Char_features: %s" % (self.char_features))

        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def refresh_label_alphabet(self, input_file):
        old_size = self.label_alphabet_size
        self.label_alphabet.clear(True)
        with open(input_file, 'r') as f:
            in_lines = f.readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                label = pairs[-1]
                self.label_alphabet.add(label)
        self.label_alphabet_size = self.label_alphabet.size()
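        # Infer the tagging scheme from the label prefixes: S- together
        # with B- implies BMES; B- alone implies BIO.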
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"
        self.fix_alphabet()
        print("Refresh label alphabet finished: old:%s -> new:%s" %
              (old_size, self.label_alphabet_size))

    def extend_word_char_alphabet(self, input_file_list):
        old_word_size = self.word_alphabet_size
        old_char_size = self.char_alphabet_size
        for input_file in input_file_list:
            with open(input_file, 'r') as f:
                in_lines = f.readlines()
            for line in in_lines:
                if len(line) > 2:
                    pairs = line.strip().split()
                    word = pairs[0]
                    if self.number_normalized:
                        word = normalize_word(word)
                    self.word_alphabet.add(word)
                    for char in word:
                        self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        print("Extend word/char alphabet finished!")
        print("     old word:%s -> new word:%s" %
              (old_word_size, self.word_alphabet_size))
        print("     old char:%s -> new char:%s" %
              (old_char_size, self.char_alphabet_size))
        for input_file in input_file_list:
            print("     from file:%s" % (input_file))

    def build_alphabet(self, input_file):
        with open(input_file + ".string.txt", 'r') as f:
            in_lines_string = f.readlines()
        with open(input_file + ".label.txt", 'r') as f:
            in_lines_label = f.readlines()
        for line_string, line_label in zip(in_lines_string, in_lines_label):
            print(line_label)
            print(line_string)
            line_label = line_label[:-1].split(',')
            line_string = line_string[:-1]
            assert len(line_label) == len(line_string)
            for i in range(len(line_label)):
                self.label_alphabet.add(line_label[i])
                self.word_alphabet.add(line_string[i])
        self.char_alphabet.add("*")
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()

    def build_word_pretrain_emb(self, emb_path):
        self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
            emb_path, self.word_alphabet, self.word_emb_dim,
            self.norm_word_emb)

    def build_char_pretrain_emb(self, emb_path):
        self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(
            emb_path, self.char_alphabet, self.char_emb_dim,
            self.norm_char_emb)

    def generate_instance(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        else:
            print(
                "Error: you can only generate train/dev/test instance! Illegal input:%s"
                % (name))

    def write_decoded_results(self, output_file, predict_results, name):
        fout = open(output_file, 'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print(
                "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
            )
        assert (sent_num == len(content_list))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                ## content_list[idx] is a list with [word, char, label]
                fout.write(content_list[idx][0][idy].encode('utf-8') + " " +
                           predict_results[idx][idy] + '\n')
            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file. %s" %
              (name, output_file))
Example #8
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.number_normalized = True
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')

        self.feature_name = []
        self.feature_alphabets = []
        self.feature_num = len(self.feature_alphabets)
        self.feat_config = None
        self.feature_name2id = {}

        self.label_alphabet = Alphabet('label', True)
        self.tagScheme = "BMES"

        ### I/O
        self.train_dir = None
        self.dev_dir = None
        self.test_dir = None

        self.word_emb_dir = None

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []

        self.pretrain_word_embedding = None

        self.label_size = 0
        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        self.feature_alphabet_sizes = []
        self.feature_emb_dims = []
        self.word_emb_dim = 50
        self.char_emb_dim = 30

        self.nbest = None

        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.5

        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_l2 = 1e-8

        # both
        self.full_data = False
        self.tune_wordemb = False

        # relation
        self.max_seq_len = 500
        self.pad_idx = 0
        self.sent_window = 3
        # self.output =None
        self.unk_ratio = 1
        self.seq_feature_size = 256

        self.re_feature_name = []
        self.re_feature_name2id = {}
        self.re_feature_alphabets = []
        self.re_feature_num = len(self.re_feature_alphabets)
        self.re_feat_config = None
        self.re_feature_emb_dims = []
        self.re_feature_alphabet_sizes = []

        self.re_train_X = []
        self.re_dev_X = []
        self.re_test_X = []
        self.re_train_Y = []
        self.re_dev_Y = []
        self.re_test_Y = []

        self.patience = 10

        # self.pretrained_model_dir = None

    def copy_alphabet(self, other):
        self.word_alphabet = copy.deepcopy(other.word_alphabet)
        self.char_alphabet = copy.deepcopy(other.char_alphabet)
        for feature_alphabet in other.feature_alphabets:
            self.feature_alphabets.append(copy.deepcopy(feature_alphabet))

        self.label_alphabet = copy.deepcopy(other.label_alphabet)

        self.feature_name = copy.deepcopy(other.feature_name)
        self.feature_alphabets = copy.deepcopy(other.feature_alphabets)
        self.feature_num = len(self.feature_alphabets)
        self.feature_name2id = copy.deepcopy(other.feature_name2id)
        self.feature_alphabet_sizes = copy.deepcopy(
            other.feature_alphabet_sizes)
        self.feature_emb_dims = copy.deepcopy(other.feature_emb_dims)

        for re_feature_alphabet in other.re_feature_alphabets:
            self.re_feature_alphabets.append(
                copy.deepcopy(re_feature_alphabet))

        self.re_feature_name = copy.deepcopy(other.re_feature_name)
        self.re_feature_name2id = copy.deepcopy(other.re_feature_name2id)
        self.re_feature_alphabets = copy.deepcopy(other.re_feature_alphabets)
        self.re_feature_num = len(self.re_feature_alphabets)
        self.re_feature_emb_dims = copy.deepcopy(other.re_feature_emb_dims)
        self.re_feature_alphabet_sizes = copy.deepcopy(
            other.re_feature_alphabet_sizes)

    def show_data_summary(self):
        print("++" * 50)
        print("DATA SUMMARY START:")
        print("     Tag          scheme: %s" % (self.tagScheme))
        print("     MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print("     Number   normalized: %s" % (self.number_normalized))
        print("     Word  alphabet size: %s" % (self.word_alphabet_size))
        print("     Char  alphabet size: %s" % (self.char_alphabet_size))
        print("     Label alphabet size: %s" % (self.label_alphabet_size))
        print("     Word embedding  dir: %s" % (self.word_emb_dir))
        print("     Word embedding size: %s" % (self.word_emb_dim))
        print("     Char embedding size: %s" % (self.char_emb_dim))
        print("     Train  file directory: %s" % (self.train_dir))
        print("     Dev    file directory: %s" % (self.dev_dir))
        print("     Test   file directory: %s" % (self.test_dir))

        print("     Train instance number: %s" % (len(self.train_texts)))
        print("     Dev   instance number: %s" % (len(self.dev_texts)))
        print("     Test  instance number: %s" % (len(self.test_texts)))

        print("     FEATURE num: %s" % (self.feature_num))
        for idx in range(self.feature_num):
            print("         Fe: %s  alphabet  size: %s" %
                  (self.feature_alphabets[idx].name,
                   self.feature_alphabet_sizes[idx]))
            print(
                "         Fe: %s  embedding size: %s" %
                (self.feature_alphabets[idx].name, self.feature_emb_dims[idx]))

        print("     Model char_hidden_dim: %s" % (self.HP_char_hidden_dim))

        print("     Iteration: %s" % (self.HP_iteration))
        print("     BatchSize: %s" % (self.HP_batch_size))

        print("     Hyper              lr: %s" % (self.HP_lr))
        print("     Hyper              l2: %s" % (self.HP_l2))
        print("     Hyper      hidden_dim: %s" % (self.HP_hidden_dim))
        print("     Hyper         dropout: %s" % (self.HP_dropout))
        print("     Hyper             GPU: %s" % (self.HP_gpu))
        print("     Hyper             NBEST: %s" % (self.nbest))

        print("     full data: %s" % (self.full_data))
        print("     Tune  word embeddings: %s" % (self.tune_wordemb))

        print("     max sequence length: %s" % (self.max_seq_len))
        print("     pad index: %s" % (self.pad_idx))
        print("     patience: %s" % (self.patience))
        print("     sentence window: %s" % (self.sent_window))
        # print("     Output directory: %s" % (self.output))
        print("     The ratio using negative instnaces 0~1: %s" %
              (self.unk_ratio))
        print("     Size of seqeuence feature representation: %s" %
              (self.seq_feature_size))

        print("     RE FEATURE num: %s" % (self.re_feature_num))
        for idx in range(self.re_feature_num):
            print("         Fe: %s  alphabet  size: %s" %
                  (self.re_feature_alphabets[idx].name,
                   self.re_feature_alphabet_sizes[idx]))
            print("         Fe: %s  embedding size: %s" %
                  (self.re_feature_alphabets[idx].name,
                   self.re_feature_emb_dims[idx]))

        print("     RE Train instance number: %s" % (len(self.re_train_Y)))
        print("     RE Dev   instance number: %s" % (len(self.re_dev_Y)))
        print("     RE Test  instance number: %s" % (len(self.re_test_Y)))

        # print("     pretrained_model_dir: %s" % (self.pretrained_model_dir))

        print("DATA SUMMARY END.")
        print("++" * 50)
        sys.stdout.flush()

    def initial_feature_alphabets(self):

        feature_prefix = '[Cap]'
        self.feature_alphabets.append(Alphabet(feature_prefix))
        self.feature_name.append(feature_prefix)
        self.feature_name2id[feature_prefix] = 0

        feature_prefix = '[POS]'
        self.feature_alphabets.append(Alphabet(feature_prefix))
        self.feature_name.append(feature_prefix)
        self.feature_name2id[feature_prefix] = 1

        self.feature_num = len(self.feature_alphabets)
        self.feature_emb_dims = [20] * self.feature_num
        self.feature_alphabet_sizes = [0] * self.feature_num
        if self.feat_config:
            for idx in range(self.feature_num):
                if self.feature_name[idx] in self.feat_config:
                    self.feature_emb_dims[idx] = self.feat_config[
                        self.feature_name[idx]]['emb_size']

    def build_alphabet(self, documents):
        for doc in documents:
            for sentence in doc:
                for token in sentence:
                    word = token['word']
                    if self.number_normalized:
                        word = normalize_word(word)
                    label = token['label']
                    self.label_alphabet.add(label)
                    self.word_alphabet.add(word)
                    ## build feature alphabet
                    self.feature_alphabets[0].add(token['cap'])
                    self.feature_alphabets[1].add(token['pos'])

                    for char in word:
                        self.char_alphabet.add(char)

        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        for idx in range(self.feature_num):
            self.feature_alphabet_sizes[idx] = self.feature_alphabets[
                idx].size()

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()
        for idx in range(self.feature_num):
            self.feature_alphabets[idx].close()

    def open_alphabet(self):
        self.word_alphabet.open()
        self.char_alphabet.open()
        # label not open
        # self.label_alphabet.open()
        for idx in range(self.feature_num):
            self.feature_alphabets[idx].open()

    def initial_re_feature_alphabets(self):
        for idx, k in enumerate(self.re_feat_config.keys()):
            self.re_feature_alphabets.append(Alphabet(k))
            self.re_feature_name.append(k)
            self.re_feature_name2id[k] = idx

        self.re_feature_num = len(self.re_feature_alphabets)
        self.re_feature_emb_dims = [20] * self.re_feature_num
        self.re_feature_alphabet_sizes = [0] * self.re_feature_num
        if self.re_feat_config:
            for idx in range(self.re_feature_num):
                if self.re_feature_name[idx] in self.re_feat_config:
                    self.re_feature_emb_dims[idx] = self.re_feat_config[
                        self.re_feature_name[idx]]['emb_size']

    def build_re_feature_alphabets(self, tokens, entities, relations):

        entity_type_alphabet = self.re_feature_alphabets[
            self.re_feature_name2id['[ENTITY_TYPE]']]
        entity_alphabet = self.re_feature_alphabets[
            self.re_feature_name2id['[ENTITY]']]
        relation_alphabet = self.re_feature_alphabets[
            self.re_feature_name2id['[RELATION]']]
        token_num_alphabet = self.re_feature_alphabets[
            self.re_feature_name2id['[TOKEN_NUM]']]
        entity_num_alphabet = self.re_feature_alphabets[
            self.re_feature_name2id['[ENTITY_NUM]']]
        position_alphabet = self.re_feature_alphabets[
            self.re_feature_name2id['[POSITION]']]

        for i, doc_token in enumerate(tokens):

            doc_entity = entities[i]
            doc_relation = relations[i]

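            # Walk the document sentence by sentence; an empty slice marks
            # the end of the document.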
            sent_idx = 0
            sentence = doc_token[(doc_token['sent_idx'] == sent_idx)]
            while sentence.shape[0] != 0:

                entities_in_sentence = doc_entity[(
                    doc_entity['sent_idx'] == sent_idx)]
                for _, entity in entities_in_sentence.iterrows():
                    entity_type_alphabet.add(entity['type'])
                    tk_idx = entity['tf_start']
                    while tk_idx <= entity['tf_end']:
                        entity_alphabet.add(
                            my_utils1.normalizeWord(sentence.iloc[
                                tk_idx, 0]))  # assume 'text' is in 0 column
                        tk_idx += 1

                sent_idx += 1
                sentence = doc_token[(doc_token['sent_idx'] == sent_idx)]

            for _, relation in doc_relation.iterrows():
                relation_alphabet.add(relation['type'])

        for i in range(self.max_seq_len):
            token_num_alphabet.add(i)
            entity_num_alphabet.add(i)
            position_alphabet.add(i)
            position_alphabet.add(-i)

        for idx in range(self.re_feature_num):
            self.re_feature_alphabet_sizes[idx] = self.re_feature_alphabets[
                idx].size()

    def fix_re_alphabet(self):
        for alphabet in self.re_feature_alphabets:
            alphabet.close()

    def open_re_alphabet(self):
        for alphabet in self.re_feature_alphabets:
            if alphabet.name == '[RELATION]':  # label not open
                continue
            alphabet.open()

    def build_pretrain_emb(self):
        if self.word_emb_dir:
            logging.info("Load pretrained word embedding, dir: %s" %
                         (self.word_emb_dir))
            self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
                self.word_emb_dir, self.word_alphabet, self.word_emb_dim)

    def generate_instance(self, name, documents):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(
                documents, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                documents, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                documents, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        else:
            logging.info(
                "Error: you can only generate train/dev/test instance! Illegal input:%s"
                % (name))

    def generate_re_instance(self, name, tokens, entities, relations, names):
        self.fix_re_alphabet()
        if name == "train":
            self.re_train_X, self.re_train_Y = relation_extraction.getRelationInstance2(
                tokens, entities, relations, names, self)
        elif name == "dev":
            self.re_dev_X, self.re_dev_Y = relation_extraction.getRelationInstance2(
                tokens, entities, relations, names, self)
        elif name == "test":
            self.re_test_X, self.re_test_Y = relation_extraction.getRelationInstance2(
                tokens, entities, relations, names, self)
        else:
            logging.info(
                "Error: you can only generate train/dev/test instance! Illegal input:%s"
                % (name))

    def load(self, data_file):
        with open(data_file, 'rb') as f:
            tmp_dict = pickle.load(f)
        self.__dict__.update(tmp_dict)

    def save(self, save_file):
        with open(save_file, 'wb') as f:
            pickle.dump(self.__dict__, f, 2)

    def clear_data(self):
        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []

        self.re_train_X = []
        self.re_dev_X = []
        self.re_test_X = []
        self.re_train_Y = []
        self.re_dev_Y = []
        self.re_test_Y = []

        self.pretrain_word_embedding = None

    def read_config(self, config_file, opt):
        config = config_file_to_dict(config_file)
        ## read data:

        self.train_dir = opt.train_dir
        self.dev_dir = opt.dev_dir
        self.test_dir = opt.test_dir
        self.word_emb_dir = opt.word_emb_file

        the_item = 'MAX_SENTENCE_LENGTH'
        if the_item in config:
            self.MAX_SENTENCE_LENGTH = int(config[the_item])

        the_item = 'number_normalized'
        if the_item in config:
            self.number_normalized = str2bool(config[the_item])

        the_item = 'word_emb_dim'
        if the_item in config:
            self.word_emb_dim = int(config[the_item])
        the_item = 'char_emb_dim'
        if the_item in config:
            self.char_emb_dim = int(config[the_item])

        the_item = 'nbest'
        if the_item in config:
            self.nbest = int(config[the_item])

        the_item = 'feature'
        if the_item in config:
            self.feat_config = config[the_item]  ## feat_config is a dict

        the_item = 'iteration'
        if the_item in config:
            self.HP_iteration = int(config[the_item])
        the_item = 'batch_size'
        if the_item in config:
            self.HP_batch_size = int(config[the_item])

        the_item = 'char_hidden_dim'
        if the_item in config:
            self.HP_char_hidden_dim = int(config[the_item])
        the_item = 'hidden_dim'
        if the_item in config:
            self.HP_hidden_dim = int(config[the_item])
        the_item = 'dropout'
        if the_item in config:
            self.HP_dropout = float(config[the_item])

        the_item = 'gpu'
        if the_item in config:
            self.HP_gpu = int(config[the_item])
        the_item = 'learning_rate'
        if the_item in config:
            self.HP_lr = float(config[the_item])

        the_item = 'l2'
        if the_item in config:
            self.HP_l2 = float(config[the_item])

        # both
        the_item = 'full_data'
        if the_item in config:
            self.full_data = str2bool(config[the_item])

        the_item = 'tune_wordemb'
        if the_item in config:
            self.tune_wordemb = str2bool(config[the_item])

        the_item = 'max_seq_len'
        if the_item in config:
            self.max_seq_len = int(config[the_item])

        the_item = 'pad_idx'
        if the_item in config:
            self.pad_idx = int(config[the_item])

        the_item = 'sent_window'
        if the_item in config:
            self.sent_window = int(config[the_item])

        # the_item = 'output'
        # if the_item in config:
        #     self.output = config[the_item]

        the_item = 'unk_ratio'
        if the_item in config:
            self.unk_ratio = float(config[the_item])

        the_item = 'seq_feature_size'
        if the_item in config:
            self.seq_feature_size = int(config[the_item])

        the_item = 're_feature'
        if the_item in config:
            self.re_feat_config = config[the_item]  ## re_feat_config is a dict

        the_item = 'patience'
        if the_item in config:
            self.patience = int(config[the_item])
Example #9
class Data:
    def __init__(self):
        self.substring_names = ['word', 'pos', 'char', 'bpe', 'word-pos']
        self.substring_maxlen = 10

        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = False
        self.norm_char_emb = False
        self.norm_trans_emb = False
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')

        self.translation_alphabet = Alphabet('translation')
        self.translation_id_format = {}

        self.feature_names = []
        self.feature_alphabets = []
        self.feature_num = len(self.feature_alphabets)
        self.feat_config = None

        self.label_alphabet = Alphabet('label', True)
        self.tagScheme = "NoSeg"  ## BMES/BIO

        self.seg = True
        ###
        self.task_name = None

        ### I/O
        self.data_bin_dir = None
        self.train_dir = None
        self.dev_dir = None
        self.test_dir = None
        self.raw_dir = None
        self.middle_dir = None
        self.viterbi_inputs_model_name = None

        self.trans_dir = None

        self.decode_dir = None
        self.model_dir = None  ## model save  file
        self.load_model_dir = None  ## model load file

        self.word_emb_dir = None
        self.char_emb_dir = None
        self.trans_embed_dir = None
        self.typeinfo_dir = None

        self.feature_emb_dirs = []

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []

        self.pretrain_word_embedding = None
        self.pretrain_char_embedding = None
        self.pretrain_trans_embedding = None
        self.pretrain_feature_embeddings = []

        self.label_size = 0
        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        self.trans_alphabet_size = 0

        self.feature_alphabet_sizes = []
        self.feature_emb_dims = []
        self.norm_feature_embs = []
        self.word_emb_dim = 50
        self.char_emb_dim = 30
        self.trans_emb_dim = 100

        ###Classification
        ## Dataset Plus
        self.substring_dir = None
        self.bpe_emb_dir = None
        self.pos_emb_dir = None
        self.pretrain_bpe_embedding = None
        self.pretrain_pos_embedding = None
        self.bpe_emb_dim = 30
        self.pos_emb_dim = 30
        self.bpe_alphabet_size = 0
        self.pos_alphabet_size = 0
        self.norm_bpe_emb = False
        self.norm_pos_emb = False
        self.bpe_texts = []
        self.bpe_Ids = []
        self.pos_texts = []
        self.pos_Ids = []
        self.label_size = 0
        self.substring_train_texts = None
        self.substring_train_Ids = None
        self.substring_dev_texts = None
        self.substring_dev_Ids = None
        self.substring_test_texts = None
        self.substring_test_Ids = None
        self.substring_label_alphabet = Alphabet('substring_label', True)

        ###Networks
        self.word_feature_extractor = "LSTM"  # "LSTM"/"CNN"/"GRU"/
        self.use_char = True
        self.char_seq_feature = "CNN"  # "LSTM"/"CNN"/"GRU"/None
        self.use_trans = False
        self.use_crf = True
        self.nbest = None
        self.use_mapping = False
        self.mapping_func = None  # tanh or sigmoid

        # Training
        self.save_model = True
        self.state_training_name = 'default'
        self.average_batch_loss = False
        self.optimizer = "SGD"  # "SGD"/"Adam"
        self.status = "train"
        self.show_loss_per_batch = 100
        # Hyperparameters
        self.seed_num = None
        self.cnn_layer = 4
        self.iteration = 100
        self.batch_size = 10
        self.char_hidden_dim = 50
        self.trans_hidden_dim = 50
        self.hidden_dim = 200
        self.dropout = 0.5
        self.lstm_layer = 1
        self.bilstm = True

        self.gpu = False
        self.lr = 0.015
        self.lr_decay = 0.05
        self.clip = None
        self.momentum = 0
        self.l2 = 1e-8

        # circul
        self.circul_time = 4
        self.circul_deepth = 2
        self.circul_gather_output_mode = "concat"

        # decode prepare
        self.decode_prepare_mode = 'example'

    def init_substring_instance(self):
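        # Allocate [feature-name][substring-length] nested lists of empty
        # buckets for each data split.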
        len_names = len(self.substring_names)
        self.substring_train_texts = [[[]
                                       for _ in range(self.substring_maxlen)]
                                      for _ in range(len_names)]
        self.substring_train_Ids = [[[] for _ in range(self.substring_maxlen)]
                                    for _ in range(len_names)]
        self.substring_dev_texts = [[[] for _ in range(self.substring_maxlen)]
                                    for _ in range(len_names)]
        self.substring_dev_Ids = [[[] for _ in range(self.substring_maxlen)]
                                  for _ in range(len_names)]
        self.substring_test_texts = [[[] for _ in range(self.substring_maxlen)]
                                     for _ in range(len_names)]
        self.substring_test_Ids = [[[] for _ in range(self.substring_maxlen)]
                                   for _ in range(len_names)]

    def show_data_summary(self):
        print("++" * 50)
        print("DATA SUMMARY START:")
        print(" I/O:")
        print("     Tag          scheme: %s" % (self.tagScheme))
        print("     MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print("     MAX   WORD   LENGTH: %s" % (self.MAX_WORD_LENGTH))
        print("     Number   normalized: %s" % (self.number_normalized))
        print("     Word  alphabet size: %s" % (self.word_alphabet_size))
        print("     Char  alphabet size: %s" % (self.char_alphabet_size))
        print("     Label alphabet size: %s" % (self.label_alphabet_size))
        print("     Trans alphabet size: %s" % (self.trans_alphabet_size))
        print("     Word embedding  dir: %s" % (self.word_emb_dir))
        print("     Char embedding  dir: %s" % (self.char_emb_dir))
        print("     Tran embedding  dir: %s" % (self.trans_embed_dir))
        print("     Word embedding size: %s" % (self.word_emb_dim))
        print("     Char embedding size: %s" % (self.char_emb_dim))
        print("     Tran embedding size: %s" % (self.trans_emb_dim))
        print("     Norm   word     emb: %s" % (self.norm_word_emb))
        print("     Norm   char     emb: %s" % (self.norm_char_emb))
        print("     Norm   tran     emb: %s" % (self.norm_trans_emb))
        print("++" * 50)
        print("   task name: %s" % (self.task_name))
        print("++" * 50)
        print("   Data bin file directory: %s" % (self.data_bin_dir))
        print("     Train  file directory: %s" % (self.train_dir))
        print("     Dev    file directory: %s" % (self.dev_dir))
        print("     Test   file directory: %s" % (self.test_dir))
        print("     Raw    file directory: %s" % (self.raw_dir))
        print("     Middle file directory: %s" % (self.middle_dir))
        print(" viterbi inputs model name: %s" %
              (self.viterbi_inputs_model_name))
        if self.typeinfo_dir:
            print("     typeinfo    directory: %s" % (self.typeinfo_dir))
        print("     Model  file directory: %s" % (self.model_dir))
        print("     Loadmodel   directory: %s" % (self.load_model_dir))
        print("     Decode file directory: %s" % (self.decode_dir))
        print("     Train instance number: %s" % (len(self.train_texts)))
        print("     Dev   instance number: %s" % (len(self.dev_texts)))
        print("     Test  instance number: %s" % (len(self.test_texts)))
        print("     Raw   instance number: %s" % (len(self.raw_texts)))
        print("     FEATURE num: %s" % (self.feature_num))
        for idx in range(self.feature_num):
            print("         Fe: %s  alphabet  size: %s" %
                  (self.feature_alphabets[idx].name,
                   self.feature_alphabet_sizes[idx]))
            print(
                "         Fe: %s  embedding  dir: %s" %
                (self.feature_alphabets[idx].name, self.feature_emb_dirs[idx]))
            print(
                "         Fe: %s  embedding size: %s" %
                (self.feature_alphabets[idx].name, self.feature_emb_dims[idx]))
            print("         Fe: %s  norm       emb: %s" %
                  (self.feature_alphabets[idx].name,
                   self.norm_feature_embs[idx]))
        print(" " + "++" * 20)
        print(" Model Network:")
        print("     Model        use_crf: %s" % (self.use_crf))
        print("     Model word extractor: %s" % (self.word_feature_extractor))
        print("     Model       use_char: %s" % (self.use_char))
        if self.use_char:
            print("     Model char_seq_feature: %s" % (self.char_seq_feature))
            print("     Model char_hidden_dim: %s" % (self.char_hidden_dim))
        if self.use_trans:
            print("     Model trans_hidden_dim: %s" % (self.trans_hidden_dim))
        if self.use_mapping:
            print("     Model mapping function: %s" % (self.mapping_func))
        print(" " + "++" * 20)
        print(" Training:")
        print("     show_loss_per_batch: %s" % (self.show_loss_per_batch))
        print("     save_model: %s" % (self.save_model))
        print("     state_training_name: %s" % (self.state_training_name))
        print("     Optimizer: %s" % (self.optimizer))
        print("     Iteration: %s" % (self.iteration))
        print("     BatchSize: %s" % (self.batch_size))
        print("     Average  batch   loss: %s" % (self.average_batch_loss))

        print(" " + "++" * 20)
        print(" Hyperparameters:")

        print("     Hyper        seed_num: %s" % (self.seed_num))
        print("     Hyper              lr: %s" % (self.lr))
        print("     Hyper        lr_decay: %s" % (self.lr_decay))
        print("     Hyper            clip: %s" % (self.clip))
        print("     Hyper        momentum: %s" % (self.momentum))
        print("     Hyper              l2: %s" % (self.l2))
        print("     Hyper      hidden_dim: %s" % (self.hidden_dim))
        print("     Hyper         dropout: %s" % (self.dropout))
        print("     Hyper      lstm_layer: %s" % (self.lstm_layer))
        print("     Hyper          bilstm: %s" % (self.bilstm))
        print("     Hyper             GPU: %s" % (self.gpu))
        print("DATA SUMMARY END.")
        print("++" * 50)

        print("      substring dir : %s" % (self.substring_dir))
        print("    bpe_emb_dir dir : %s" % (self.bpe_emb_dir))
        print("    pos_emb_dir dir : %s" % (self.pos_emb_dir))
        print("++" * 50)

        print("      circul time   : %s" % (self.circul_time))
        print("      circul deepth : %s" % (self.circul_deepth))
        print(" gather output mode : %s" % (self.circul_gather_output_mode))
        print("++" * 50)

        print(" decode prepare mode : %s" % (self.decode_prepare_mode))
        print("++" * 50)

        sys.stdout.flush()

    def make_substring_label_alphabet(self):
        for label in self.label_alphabet.instances:
            label = label.split('-')[-1]
            self.substring_label_alphabet.add(label)
        self.substring_label_alphabet.close()
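
    # Illustrative sketch (not from the original source): stripping the BIO/BMES
    # prefix keeps only the entity type, e.g.
    #   'B-PER'.split('-')[-1]  -> 'PER'
    #   'O'.split('-')[-1]      -> 'O'
    # so the substring label alphabet holds prefix-free labels only.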

    def initial_feature_alphabets(self):
        items = open(self.train_dir, 'r').readline().strip('\n').split()
        total_column = len(items)
        if total_column > 2:
            for idx in range(1, total_column - 1):
                feature_prefix = 'feature_' + str(idx)
                self.feature_alphabets.append(Alphabet(feature_prefix))
                self.feature_names.append(feature_prefix)
                print "Find feature: ", feature_prefix
        self.feature_num = len(self.feature_alphabets)
        self.pretrain_feature_embeddings = [None] * self.feature_num
        self.feature_emb_dims = [20] * self.feature_num
        self.feature_emb_dirs = [None] * self.feature_num
        self.norm_feature_embs = [False] * self.feature_num
        self.feature_alphabet_sizes = [0] * self.feature_num
        if self.feat_config:
            for idx in range(self.feature_num):
                self.feature_emb_dims[idx] = self.feat_config[
                    self.feature_names[idx]]['emb_size']
                self.feature_emb_dirs[idx] = self.feat_config[
                    self.feature_names[idx]]['emb_dir']
                self.norm_feature_embs[idx] = self.feat_config[
                    self.feature_names[idx]]['emb_norm']
        # exit(0)
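
    # Assumed input layout, reconstructed from the column parsing above: each
    # line is "word feat_1 ... feat_k label", e.g. a first line such as
    #   EU [POS]NNP [Cap]1 B-ORG
    # gives total_column == 4 and registers feature_1 and feature_2.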

    def build_alphabet(self, input_file):
        in_lines = open(input_file, 'r').readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                word = pairs[0].decode('windows-1252')
                # word = pairs[0].decode('utf-8')
                if self.number_normalized:
                    word = normalize_word(word)
                label = pairs[-1]
                self.label_alphabet.add(label)
                self.word_alphabet.add(word)
                ## build feature alphabet
                for idx in range(self.feature_num):
                    feat_idx = pairs[idx + 1].split(']', 1)[-1]
                    self.feature_alphabets[idx].add(feat_idx)
                for char in word:
                    self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        for idx in range(self.feature_num):
            self.feature_alphabet_sizes[idx] = self.feature_alphabets[
                idx].size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

    def build_alphabet_substring(self, input_file_dir, substring_file_prefix):
        ## will not read labels
        input_files = os.listdir(input_file_dir)
        print(input_files)
        for input_file in input_files:
            plus_feature = ''
            input_file_name = os.path.split(input_file)[1]
            if input_file_name.split('.')[0] != substring_file_prefix:
                continue
            if 'bpe' in input_file_name:
                plus_feature = 'bpe'
            elif 'word' in input_file_name:
                plus_feature = 'word'
            if plus_feature == '':
                continue
            in_lines = open(os.path.join(input_file_dir, input_file),
                            'r').readlines()
            for line in in_lines:
                if len(line.strip()) > 0:
                    pairs = line.strip().split('\t')
                    words = pairs[0].decode('windows-1252')
                    # word = pairs[0].decode('utf-8')
                    if self.number_normalized:
                        words = normalize_word(words)
                    labels = pairs[-1]
                    for word in words.split():
                        self.word_alphabet.add(word)
                        for char in word:
                            self.char_alphabet.add(char)
            self.word_alphabet_size = self.word_alphabet.size()
            self.char_alphabet_size = self.char_alphabet.size()

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()
        self.translation_alphabet.close()
        for idx in range(self.feature_num):
            self.feature_alphabets[idx].close()
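
    # Typical call order (a usage sketch; the driver script is not shown here):
    #   data.initial_feature_alphabets()
    #   for f in (data.train_dir, data.dev_dir, data.test_dir):
    #       data.build_alphabet(f)
    #   data.build_translation_alphabet(data.trans_dir)  # if translations are used
    #   data.fix_alphabet()  # freezes word/char/label/translation/feature alphabets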

    def build_pretrain_emb(self):
        if self.word_emb_dir:
            print("Load pretrained word embedding, norm: %s, dir: %s" %
                  (self.norm_word_emb, self.word_emb_dir))
            self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
                self.word_emb_dir, self.word_alphabet, self.word_emb_dim,
                self.norm_word_emb)

            if self.typeinfo_dir:
                type_info_matrix = []
                with codecs.open(self.typeinfo_dir, 'r') as typeinfo_file:
                    type_info_lines = typeinfo_file.readlines()
                    for line in type_info_lines:
                        line = line.rstrip().split()
                        for i, _ in enumerate(line):
                            line[i] = float(line[i])
                        line = np.array(line)
                        type_info_matrix.append(line)

                print(
                    "Calculate type info distribution, and concatenate word and type......"
                )
                cos_res = []
                for i, word_embed in enumerate(self.pretrain_word_embedding):
                    word_type_info = []
                    if i == 0:
                        word_type_info = np.random.random(
                            size=len(type_info_matrix))
                        cos_res.append(word_type_info)
                    else:
                        for type_info in type_info_matrix:
                            cos_sim = 1 - spatial.distance.cosine(
                                word_embed, type_info)
                            word_type_info.append(cos_sim)
                        cos_res.append(word_type_info)
                cos_res = np.array(cos_res)
                cos_res = sigmoid(cos_res)
                self.pretrain_word_embedding = np.concatenate(
                    [self.pretrain_word_embedding, cos_res], axis=1)
                print "type info length:{}".format(len(type_info_matrix))
                self.word_emb_dim += len(type_info_matrix)
                print "new word dim is :{}".format(self.word_emb_dim)

        if self.char_emb_dir:
            print("Load pretrained char embedding, norm: %s, dir: %s" %
                  (self.norm_char_emb, self.char_emb_dir))
            self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(
                self.char_emb_dir, self.char_alphabet, self.char_emb_dim,
                self.norm_char_emb)
        if self.trans_embed_dir:
            print("Load pretrained trans embedding, norm: %s, dir: %s" %
                  (self.norm_trans_emb, self.trans_embed_dir))
            self.pretrain_trans_embedding, self.trans_emb_dim = build_chi_pretrain_embedding(
                self.trans_embed_dir, self.translation_alphabet,
                self.trans_emb_dim, self.norm_trans_emb)

        for idx in range(self.feature_num):
            if self.feature_emb_dirs[idx]:
                print(
                    "Load pretrained feature %s embedding, norm: %s, dir: %s"
                    % (self.feature_names[idx], self.norm_feature_embs[idx],
                       self.feature_emb_dirs[idx]))
                self.pretrain_feature_embeddings[idx], self.feature_emb_dims[
                    idx] = build_pretrain_embedding(
                        self.feature_emb_dirs[idx],
                        self.feature_alphabets[idx],
                        self.feature_emb_dims[idx],
                        self.norm_feature_embs[idx])

    def generate_instance(self, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(
                self.train_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.translation_id_format)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                self.dev_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.translation_id_format)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                self.test_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.translation_id_format)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance(
                self.raw_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.translation_id_format)
        else:
            print(
                "Error: you can only generate train/dev/test instance! Illegal input:%s"
                % (name))

    def generate_instance_substring(self, substring_file_prefix):
        self.init_substring_instance()
        self.make_substring_label_alphabet()
        input_files = os.listdir(self.substring_dir)
        print(input_files)
        for input_file in input_files:
            input_file_name = os.path.split(input_file)[1]
            input_file_dir = os.path.join(self.substring_dir, input_file_name)
            input_file_name_split = input_file_name.split('.')
            if input_file_name_split[0] != substring_file_prefix:
                continue
            print('dealing %s' % (input_file_name))
            name = input_file_name_split[1]
            feature_name = input_file_name_split[2]
            f_l = int(input_file_name_split[-1][3:])  #feature_len

            if feature_name == 'word':
                alphabet = self.word_alphabet
            elif feature_name == 'char':
                alphabet = self.char_alphabet
            elif feature_name == 'pos':
                alphabet = self.feature_alphabets[0]
            elif feature_name == 'bpe':
                alphabet = self.feature_alphabets[1]
            else:
                continue  # unknown feature type; skip to avoid an unbound alphabet

            s_f_id = self.substring_names.index(
                feature_name)  #substring_feature_id
            if name == "train":
                self.substring_train_texts[s_f_id][f_l], self.substring_train_Ids[s_f_id][f_l]\
                    = read_instance_substring(input_file_dir, alphabet, self.substring_label_alphabet, self.number_normalized)
            elif name == "testa":
                self.substring_dev_texts[s_f_id][f_l], self.substring_dev_Ids[s_f_id][f_l] \
                    = read_instance_substring(input_file_dir, alphabet, self.substring_label_alphabet, self.number_normalized)
            elif name == "testb":
                self.substring_test_texts[s_f_id][f_l], self.substring_test_Ids[s_f_id][f_l] \
                    = read_instance_substring(input_file_dir, alphabet, self.substring_label_alphabet, self.number_normalized)
            else:
                print(
                    "Error: you can only generate train/testa/testb instance! Illegal input:%s"
                    % (name))
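
    # Assumed substring file naming, reconstructed from the split('.') logic
    # above:
    #   <prefix>.<name>.<feature>.len<K>   e.g.  conll.train.word.len3
    # where <name> is train/testa/testb, <feature> selects the alphabet
    # (word/char/pos/bpe) and K is parsed from the trailing "lenK" field.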

    def write_decoded_results(self, predict_results, name):
        fout = open(self.decode_dir, 'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print(
                "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
            )
        assert (sent_num == len(content_list))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                ## content_list[idx] is a list with [word, char, label]
                fout.write(content_list[idx][0][idy].encode('utf-8') + " " +
                           predict_results[idx][idy] + '\n')
            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file. %s" %
              (name, self.decode_dir))
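
    # Decoded output format (one token per line, a blank line between sentences):
    #   word_1 PRED_LABEL_1
    #   word_2 PRED_LABEL_2
    #
    #   word_1 PRED_LABEL_1
    #   ...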

    def load(self, data_file):
        f = open(data_file, 'rb')
        tmp_dict = pickle.load(f)
        f.close()
        self.__dict__.update(tmp_dict)

    def save(self, save_file):
        f = open(save_file, 'wb')
        pickle.dump(self.__dict__, f, 2)
        f.close()
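
    # Save/load roundtrip sketch (the whole __dict__ is pickled, protocol 2):
    #   data.save('data.dset')
    #   restored = Data()           # construction details depend on the driver script
    #   restored.load('data.dset')  # __dict__.update restores every field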

    def write_nbest_decoded_results(self, predict_results, pred_scores, name):
        ## predict_results : [whole_sent_num, nbest, each_sent_length]
        ## pred_scores: [whole_sent_num, nbest]
        fout = open(self.decode_dir, 'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print(
                "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
            )
        assert (sent_num == len(content_list))
        assert (sent_num == len(pred_scores))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx][0])
            nbest = len(predict_results[idx])
            score_string = "# "
            for idz in range(nbest):
                score_string += format(pred_scores[idx][idz], '.4f') + " "
            fout.write(score_string.strip() + "\n")

            for idy in range(sent_length):
                label_string = content_list[idx][0][idy].encode('utf-8') + " "
                for idz in range(nbest):
                    label_string += predict_results[idx][idz][idy] + " "
                label_string = label_string.strip() + "\n"
                fout.write(label_string)
            fout.write('\n')
        fout.close()
        print("Predict %s %s-best result has been written into file. %s" %
              (name, nbest, self.decode_dir))
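
    # N-best output format sketch: each sentence starts with a "#" score line,
    # then one line per token with the nbest candidate labels side by side:
    #   # 0.9876 0.0123
    #   word_1 B-PER O
    #   word_2 I-PER O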

    def read_config(self, config_file):
        config = config_file_to_dict(config_file)
        ## task:
        the_item = 'task_name'
        if the_item in config:
            self.task_name = config[the_item]

        ## read data:
        the_item = 'data_bin_dir'
        if the_item in config:
            self.data_bin_dir = config[the_item]
        the_item = 'train_dir'
        if the_item in config:
            self.train_dir = config[the_item]
        the_item = 'dev_dir'
        if the_item in config:
            self.dev_dir = config[the_item]
        the_item = 'test_dir'
        if the_item in config:
            self.test_dir = config[the_item]
        the_item = 'trans_dir'
        if the_item in config:
            self.trans_dir = config[the_item]
        the_item = 'middle_dir'
        if the_item in config:
            self.middle_dir = config[the_item]
        the_item = 'viterbi_inputs_model_name'
        if the_item in config:
            self.viterbi_inputs_model_name = config[the_item]

        the_item = 'substring_dir'
        if the_item in config:
            self.substring_dir = config[the_item]
        the_item = 'bpe_emb_dir'
        if the_item in config:
            self.bpe_emb_dir = config[the_item]
        the_item = 'pos_emb_dir'
        if the_item in config:
            self.pos_emb_dir = config[the_item]

        the_item = 'raw_dir'
        if the_item in config:
            self.raw_dir = config[the_item]
        the_item = 'decode_dir'
        if the_item in config:
            self.decode_dir = config[the_item]
        the_item = 'model_dir'
        if the_item in config:
            self.model_dir = config[the_item]
        the_item = 'load_model_dir'
        if the_item in config:
            self.load_model_dir = config[the_item]

        the_item = 'word_emb_dir'
        if the_item in config:
            self.word_emb_dir = config[the_item]
        the_item = 'char_emb_dir'
        if the_item in config:
            self.char_emb_dir = config[the_item]
        the_item = 'trans_embed_dir'
        if the_item in config:
            self.trans_embed_dir = config[the_item]
        the_item = 'typeinfo_dir'
        if the_item in config:
            self.typeinfo_dir = config[the_item]

        the_item = 'MAX_SENTENCE_LENGTH'
        if the_item in config:
            self.MAX_SENTENCE_LENGTH = int(config[the_item])
        the_item = 'MAX_WORD_LENGTH'
        if the_item in config:
            self.MAX_WORD_LENGTH = int(config[the_item])

        the_item = 'norm_word_emb'
        if the_item in config:
            self.norm_word_emb = str2bool(config[the_item])
        the_item = 'norm_char_emb'
        if the_item in config:
            self.norm_char_emb = str2bool(config[the_item])
        the_item = 'number_normalized'
        if the_item in config:
            self.number_normalized = str2bool(config[the_item])

        the_item = 'seg'
        if the_item in config:
            self.seg = str2bool(config[the_item])
        the_item = 'word_emb_dim'
        if the_item in config:
            self.word_emb_dim = int(config[the_item])
        the_item = 'char_emb_dim'
        if the_item in config:
            self.char_emb_dim = int(config[the_item])
        the_item = 'trans_emb_dim'
        if the_item in config:
            self.trans_emb_dim = int(config[the_item])

        ## read network:
        the_item = 'use_crf'
        if the_item in config:
            self.use_crf = str2bool(config[the_item])
        the_item = 'use_char'
        if the_item in config:
            self.use_char = str2bool(config[the_item])
        the_item = 'use_trans'
        if the_item in config:
            self.use_trans = str2bool(config[the_item])
        the_item = 'use_mapping'
        if the_item in config:
            self.use_mapping = str2bool(config[the_item])
        the_item = 'mapping_func'
        if the_item in config:
            self.mapping_func = config[the_item]
        the_item = 'word_seq_feature'
        if the_item in config:
            self.word_feature_extractor = config[the_item]
        the_item = 'char_seq_feature'
        if the_item in config:
            self.char_seq_feature = config[the_item]
        the_item = 'nbest'
        if the_item in config:
            self.nbest = int(config[the_item])

        the_item = 'feature'
        if the_item in config:
            self.feat_config = config[the_item]  ## feat_config is a dict

        ## read training setting:
        the_item = 'save_model'
        if the_item in config:
            self.save_model = str2bool(config[the_item])
        the_item = 'state_training_name'
        if the_item in config:
            self.state_training_name = config[the_item]
        the_item = 'optimizer'
        if the_item in config:
            self.optimizer = config[the_item]
        the_item = 'ave_batch_loss'
        if the_item in config:
            self.average_batch_loss = str2bool(config[the_item])
        the_item = 'status'
        if the_item in config:
            self.status = config[the_item]
        the_item = 'show_loss_per_batch'
        if the_item in config:
            self.show_loss_per_batch = int(config[the_item])

        ## read Hyperparameters:
        the_item = 'seed_num'
        if the_item in config:
            if config[the_item] != 'None':
                self.seed_num = int(config[the_item])
        the_item = 'cnn_layer'
        if the_item in config:
            self.cnn_layer = int(config[the_item])
        the_item = 'iteration'
        if the_item in config:
            self.iteration = int(config[the_item])
        the_item = 'batch_size'
        if the_item in config:
            self.batch_size = int(config[the_item])

        the_item = 'char_hidden_dim'
        if the_item in config:
            self.char_hidden_dim = int(config[the_item])

        the_item = 'trans_hidden_dim'
        if the_item in config:
            self.trans_hidden_dim = int(config[the_item])

        the_item = 'hidden_dim'
        if the_item in config:
            self.hidden_dim = int(config[the_item])
        the_item = 'dropout'
        if the_item in config:
            self.dropout = float(config[the_item])
        the_item = 'lstm_layer'
        if the_item in config:
            self.lstm_layer = int(config[the_item])
        the_item = 'bilstm'
        if the_item in config:
            self.bilstm = str2bool(config[the_item])

        the_item = 'gpu'
        if the_item in config:
            self.gpu = str2bool(config[the_item])
        the_item = 'learning_rate'
        if the_item in config:
            self.lr = float(config[the_item])
        the_item = 'lr_decay'
        if the_item in config:
            self.lr_decay = float(config[the_item])
        the_item = 'clip'
        if the_item in config:
            if config[the_item] == 'None':
                self.clip = None
            else:
                self.clip = float(config[the_item])
        the_item = 'momentum'
        if the_item in config:
            self.momentum = float(config[the_item])
        the_item = 'l2'
        if the_item in config:
            self.l2 = float(config[the_item])

        ###base2
        the_item = 'feature_name'
        if the_item in config:
            self.feature_name = config[the_item]
        the_item = 'feature_length'
        if the_item in config:
            self.feature_length = int(config[the_item])
        the_item = 'class_num'
        if the_item in config:
            self.class_num = int(config[the_item])
        the_item = 'feature_ans'
        if the_item in config:
            self.feature_ans = config[the_item]

        ###circul
        the_item = 'circul_time'
        if the_item in config:
            self.circul_time = config[the_item]
        the_item = 'circul_deepth'
        if the_item in config:
            self.circul_deepth = config[the_item]
        the_item = 'circul_gather_output_mode'
        if the_item in config:
            self.circul_gather_output_mode = config[the_item]

        ###decode_prepare
        the_item = 'decode_prepare_mode'
        if the_item in config:
            self.decode_prepare_mode = config[the_item]

    def read_arg(self, args):
        if args.task_name != None: self.task_name = args.task_name

        if args.data_bin_dir != None: self.data_bin_dir = args.data_bin_dir
        if args.train_dir != None: self.train_dir = args.train_dir
        if args.dev_dir != None: self.dev_dir = args.dev_dir
        if args.test_dir != None: self.test_dir = args.test_dir
        if args.trans_dir != None: self.trans_dir = args.trans_dir
        if args.word_emb_dir != None: self.word_emb_dir = args.word_emb_dir
        if args.trans_embed_dir != None:
            self.trans_embed_dir = args.trans_embed_dir
        if args.middle_dir != None: self.middle_dir = args.middle_dir
        if args.viterbi_inputs_model_name != None:
            self.viterbi_inputs_model_name = args.viterbi_inputs_model_name

        if args.substring_dir != None: self.substring_dir = args.substring_dir
        if args.bpe_emb_dir != None: self.bpe_emb_dir = args.bpe_emb_dir
        if args.pos_emb_dir != None: self.pos_emb_dir = args.pos_emb_dir

        if args.model_dir != None: self.model_dir = args.model_dir
        if args.norm_word_emb != None: self.norm_word_emb = args.norm_word_emb
        if args.norm_char_emb != None: self.norm_char_emb = args.norm_char_emb
        if args.word_emb_dim != None: self.word_emb_dim = args.word_emb_dim
        if args.char_emb_dim != None: self.char_emb_dim = args.char_emb_dim
        if args.trans_emb_dim != None: self.trans_emb_dim = args.trans_emb_dim

        if args.number_normalized != None:
            self.number_normalized = args.number_normalized
        if args.seg != None: self.seg = args.seg

        if args.use_crf != None: self.use_crf = args.use_crf
        if args.use_char != None: self.use_char = args.use_char
        if args.use_trans != None: self.use_trans = args.use_trans

        if args.word_seq_feature != None:
            self.word_feature_extractor = args.word_seq_feature
        if args.char_seq_feature != None:
            self.char_seq_feature = args.char_seq_feature

        if args.nbest != None: self.nbest = args.nbest

        if args.status != None: self.status = args.status
        if args.state_training_name != None:
            self.state_training_name = args.state_training_name
        if args.save_model != None: self.save_model = args.save_model
        if args.optimizer != None: self.optimizer = args.optimizer
        if args.iteration != None: self.iteration = args.iteration
        if args.batch_size != None: self.batch_size = args.batch_size
        if args.ave_batch_loss != None:
            self.average_batch_loss = args.ave_batch_loss
        if args.show_loss_per_batch != None:
            self.show_loss_per_batch = args.show_loss_per_batch

        if args.seed_num != None: self.seed_num = args.seed_num
        if args.cnn_layer != None: self.cnn_layer = args.cnn_layer
        if args.char_hidden_dim != None:
            self.char_hidden_dim = args.char_hidden_dim
        if args.trans_hidden_dim != None:
            self.trans_hidden_dim = args.trans_hidden_dim
        if args.hidden_dim != None: self.hidden_dim = args.hidden_dim
        if args.dropout != None: self.dropout = args.dropout
        if args.lstm_layer != None: self.lstm_layer = args.lstm_layer
        if args.bilstm != None: self.bilstm = args.bilstm
        if args.learning_rate != None: self.lr = args.learning_rate
        if args.lr_decay != None: self.lr_decay = args.lr_decay
        if args.momentum != None: self.momentum = args.momentum
        if args.l2 != None: self.l2 = args.l2
        if args.gpu != None: self.gpu = args.gpu
        if args.clip != None: self.clip = args.clip

        ###base2
        if args.feature_name != None: self.feature_name = args.feature_name
        if args.feature_length != None:
            self.feature_length = args.feature_length
        if args.class_num != None: self.class_num = args.class_num
        if args.feature_ans != None:
            self.feature_ans = args.feature_ans

        ###circul
        if args.circul_time != None: self.circul_time = args.circul_time
        if args.circul_deepth != None: self.circul_deepth = args.circul_deepth
        if args.circul_gather_output_mode != None:
            self.circul_gather_output_mode = args.circul_gather_output_mode

        ###decode_prepare
        if args.decode_prepare_mode != None:
            self.decode_prepare_mode = args.decode_prepare_mode
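
    # read_arg expects an argparse-style namespace; a hypothetical sketch (every
    # attribute referenced above must exist on the namespace, defaulting to None):
    #   import argparse
    #   parser = argparse.ArgumentParser()
    #   parser.add_argument('--train_dir')
    #   parser.add_argument('--batch_size', type=int)
    #   data.read_arg(parser.parse_args())  # None values leave the config untouched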

    def build_translation_alphabet(self, trans_path):
        print("Creating translation alphabet......")
        with codecs.open(trans_path, 'r', "utf-8") as f:
            lines = f.readlines()
            for line in lines:
                if len(line.strip().split(":")) == 2:
                    temp = line.strip().split(":", 1)
                    words = temp[1].split()
                    for word in words:
                        self.translation_alphabet.add(word.strip())
        self.trans_alphabet_size = self.translation_alphabet.size()
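
    # Assumed translation-file layout, reconstructed from the parsing above
    # (one entry per line; everything after the first ':' is split on whitespace):
    #   source_word:translation_1 translation_2 ...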

    def build_translation_dict(self, trans_path):
        print("Creating Id to Id translation dictionary......")
        translation_id_format_temp = {}
        with codecs.open(trans_path, 'r', "utf-8") as f:
            lines = f.readlines()
            for line in lines:
                ids = []
                if len(line.strip().split(":", 1)) == 2:
                    temp = line.strip().split(":", 1)
                    word_id = self.word_alphabet.get_index(temp[0].strip())
                    translations = temp[1].split()
                    for translation in translations:
                        ids.append(
                            self.translation_alphabet.get_index(
                                translation.strip()))
                    if ids == []:
                        ids = [0]
                    translation_id_format_temp[word_id] = ids

        for word in self.word_alphabet.instances:
            if self.word_alphabet.get_index(
                    word) in translation_id_format_temp.keys():
                self.translation_id_format[self.word_alphabet.get_index(
                    word)] = translation_id_format_temp[
                        self.word_alphabet.get_index(word)]
            else:
                self.translation_id_format[self.word_alphabet.get_index(
                    word)] = [0]
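
        # Resulting structure (sketch): translation_id_format maps every word id
        # to a non-empty list of translation ids, e.g.
        #   {word_alphabet.get_index(u'maison'): [12, 47], <unseen word id>: [0]}
        # so downstream lookups never hit a missing key.
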
        seen = set()
        unique_questions = []
        for q, qid in zip(questions, qids):
            if qid not in seen:
                seen.add(qid)
                unique_questions.append(q)

        docs = answers + unique_questions
        # compute document frequency for each word
        word2dfs = compute_dfs(docs)
        print word2dfs.items()[:10]
        #########

        # vocabulary: word-to-index dict
        alphabet = Alphabet(start_feature_id=0)
        alphabet.add('UNKNOWN_WORD_IDX')

        add_to_vocab(answers, alphabet)
        add_to_vocab(questions, alphabet)

        basename = os.path.basename(train)
        cPickle.dump(alphabet, open(os.path.join(outdir, 'vocab.pickle'), 'w'))
        print "alphabet", len(alphabet)

        # number of distinct words in the vocabulary
        dummy_word_idx = alphabet.fid
        # map(function, iterable, ...) applies function to each element of the
        # iterable and returns the results as a list
        # here: the lengths of the longest question and the longest answer
        q_max_sent_length = max(map(lambda x: len(x), questions))
        a_max_sent_length = max(map(lambda x: len(x), answers))
        print 'q_max_sent_length', q_max_sent_length
Exemple #12
0
import cPickle
import os
from alphabet import Alphabet
import operator

data_dir = 'preprocessed_data'

fnames = [
    'vocab_en300M', 'vocab_german40M', 'vocab_italian_44M',
    'vocab_netherlands40M'
]

new_alphabet = Alphabet(start_feature_id=0)
new_alphabet.add('UNKNOWN_WORD_IDX')
dummy_word_idx = new_alphabet.fid

for fname in fnames:
    appfname = '{}.pickle'.format(fname)
    fname_vocab = os.path.join(data_dir, appfname)

    alphabet = cPickle.load(open(fname_vocab))
    print "alphabet", len(alphabet)
    word_freq = map(lambda x: (x[0], x[1][1]), alphabet.items())

    sorted_x = sorted(word_freq, key=operator.itemgetter(1),
                      reverse=True)[:650000]
    print len(sorted_x)
    print sorted_x[0]

    for word, freq in sorted_x:
        new_alphabet.add(word)
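
# A plausible final step (an assumption, mirroring the dump pattern used in the
# other examples here; 'vocab_merged' is a hypothetical output name):
#   cPickle.dump(new_alphabet,
#                open(os.path.join(data_dir, 'vocab_merged.pickle'), 'w'))
#   print "merged alphabet", len(new_alphabet)
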
Exemple #13
0
class Data:
    def __init__(self, args):

        # Alphabet
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')
        self.label_alphabet = Alphabet('label', True)

        # data
        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []

        self.input_size = 0

        self.pretrain_word_embedding = None
        self.pretrain_char_embedding = None

        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0

        # hyper parameters
        self.HP_word_emb_dim = args.embedding_size
        self.HP_char_emb_dim = args.char_embedding_size
        self.HP_iteration = args.max_epoch
        self.HP_batch_size = args.batch_size
        self.HP_char_hidden_dim = args.char_hidden_dim
        self.HP_hidden_dim = args.hidden_size
        self.HP_dropout = args.dropout
        self.HP_char_dropout = args.char_dropout
        self.HP_use_char = True if args.char_encoder else False
        self.HP_char_features = args.char_encoder
        self.HP_gpu = torch.cuda.is_available() and args.gpu
        self.HP_lr = args.lr
        self.HP_model_name = args.model_name
        self.HP_encoder_type = args.encoder
        self.HP_optim = args.optim
        self.HP_number_normalized = args.number_normalized
        self.HP_seed = args.seed
        self.HP_l2 = args.l2
        self.HP_kernel_size = args.kernel_size
        self.HP_kernel_num = args.kernel_num

        # self.HP_lr_decay = 0.05
        # self.HP_clip = None
        # self.HP_momentum = 0
        # self.HP_lstm_layer = 1
        # self.HP_bilstm = True

    def show_data_summary(self):
        print("DATA SUMMARY START:")
        print("     Word  alphabet size: %s" % self.word_alphabet_size)
        print("     Char  alphabet size: %s" % self.char_alphabet_size)
        print("     Label alphabet size: %s" % self.label_alphabet_size)
        print("     Word embedding size: %s" % self.HP_word_emb_dim)
        print("     Char embedding size: %s" % self.HP_char_emb_dim)
        print("     Train instance number: %s" % (len(self.train_texts)))
        print("     Dev   instance number: %s" % (len(self.dev_texts)))
        print("     Test  instance number: %s" % (len(self.test_texts)))
        print("     Hyper       iteration: %s" % self.HP_iteration)
        print("     Hyper      batch size: %s" % self.HP_batch_size)
        print("     Hyper              lr: %s" % self.HP_lr)
        print("     Hyper      hidden_dim: %s" % self.HP_hidden_dim)
        print("     Hyper         dropout: %s" % self.HP_dropout)
        print("     Hyper             GPU: %s" % self.HP_gpu)
        print("     Hyper        use_char: %s" % self.HP_use_char)
        if self.HP_use_char:
            print("             Char_features: %s" % self.HP_char_features)
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def build_alphabet(self, input_file):
        in_lines = open(input_file, 'r').readlines()
        for line in in_lines:
            line = line.strip()
            if line:
                pairs = line.strip().split()
                label = pairs[0].strip()
                self.label_alphabet.add(label)
                for word in pairs[2:]:
                    if self.HP_number_normalized:
                        word = normalize_word(word)
                    self.word_alphabet.add(word)
                    for char in word:
                        self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()

    def extend_word_char_alphabet(self, input_file_list):
        """

        :param
        :return:
        """
        old_word_size = self.word_alphabet_size
        old_char_size = self.char_alphabet_size
        for input_file in input_file_list:
            in_lines = open(input_file, 'r').readlines()
            for line in in_lines:
                line = line.strip()
                if line:
                    pairs = line.strip().split()
                    for word in pairs[2:]:
                        if self.HP_number_normalized:
                            word = normalize_word(word)  # digits in the word are mapped to 0
                        self.word_alphabet.add(word)
                        for char in word:
                            self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        print("Extend word/char alphabet finished!")
        print("     old word:%s -> new word:%s" %
              (old_word_size, self.word_alphabet_size))
        print("     old char:%s -> new char:%s" %
              (old_char_size, self.char_alphabet_size))
        for input_file in input_file_list:
            print("     from file:%s" % input_file)

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()

    def generate_instance(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.HP_number_normalized)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.HP_number_normalized)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.HP_number_normalized)
        else:
            print(
                "Error: you can only generate train/dev/test instance! Illegal input:%s"
                % name)

    def build_word_pretrain_emb(self, emb_path):
        """
        预训练词向量
        :param emb_path:
        :return:
        """
        self.pretrain_word_embedding, self.HP_word_emb_dim = build_pretrain_embedding(
            emb_path, self.word_alphabet, self.HP_word_emb_dim)

    def build_char_pretrain_emb(self, emb_path):
        """

        :param emb_path:
        :return:
        """

        self.pretrain_char_embedding, self.HP_char_emb_dim = build_pretrain_embedding(
            emb_path, self.char_alphabet, self.HP_char_emb_dim)
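
    # Hedged sketch of the helper's contract (build_pretrain_embedding itself is
    # defined elsewhere): it returns (embedding_matrix, emb_dim); words found in
    # emb_path keep their pretrained vectors, out-of-vocabulary words get small
    # random vectors, e.g.
    #   emb, dim = build_pretrain_embedding(path, self.word_alphabet,
    #                                       self.HP_word_emb_dim)
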
Exemple #14
0
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = False
        self.norm_char_emb = False
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')

        self.feature_name = []
        self.feature_alphabets = []
        self.feature_num = len(self.feature_alphabets)
        self.feat_config = None
        self.feature_name2id = {}


        self.label_alphabet = Alphabet('label',True)
        self.tagScheme = "NoSeg" ## BMES/BIO
        
        self.seg = True

        ### I/O
        self.train_dir = None 
        self.dev_dir = None 
        self.test_dir = None


        self.model_dir = None ## model save  file


        self.word_emb_dir = None 
        self.char_emb_dir = None
        self.feature_emb_dirs = []

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []


        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []


        self.pretrain_word_embedding = None
        self.pretrain_char_embedding = None
        self.pretrain_feature_embeddings = []

        self.label_size = 0
        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        self.feature_alphabet_sizes = []
        self.feature_emb_dims = []
        self.norm_feature_embs = []
        self.word_emb_dim = 50
        self.char_emb_dim = 30

        ###Networks
        self.word_feature_extractor = "LSTM" ## "LSTM"/"CNN"/"GRU"/
        self.use_char = True
        self.char_feature_extractor = "CNN" ## "LSTM"/"CNN"/"GRU"/None
        self.use_crf = True
        self.nbest = None
        
        ## Training
        self.average_batch_loss = False

        ### Hyperparameters
        self.HP_cnn_layer = 4
        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = None
        self.HP_momentum = 0
        self.HP_l2 = 1e-8

        # both
        self.full_data = False
        self.tune_wordemb = False

        # relation
        self.pretrain = None
        self.max_seq_len = 500
        self.pad_idx = 1
        self.sent_window = 3
        self.output = None
        self.unk_ratio = 1
        self.seq_feature_size = 256
        self.max_epoch = 100
        self.feature_extractor = None

        self.re_feature_name = []
        self.re_feature_name2id = {}
        self.re_feature_alphabets = []
        self.re_feature_num = len(self.re_feature_alphabets)
        self.re_feat_config = None

        self.re_train_X = []
        self.re_dev_X = []
        self.re_test_X = []
        self.re_train_Y = []
        self.re_dev_Y = []
        self.re_test_Y = []

        
    def show_data_summary(self):
        print("++"*50)
        print("DATA SUMMARY START:")
        print(" I/O:")
        print("     Tag          scheme: %s"%(self.tagScheme))
        print("     MAX SENTENCE LENGTH: %s"%(self.MAX_SENTENCE_LENGTH))
        print("     MAX   WORD   LENGTH: %s"%(self.MAX_WORD_LENGTH))
        print("     Number   normalized: %s"%(self.number_normalized))
        print("     Word  alphabet size: %s"%(self.word_alphabet_size))
        print("     Char  alphabet size: %s"%(self.char_alphabet_size))
        print("     Label alphabet size: %s"%(self.label_alphabet_size))
        print("     Word embedding  dir: %s"%(self.word_emb_dir))
        print("     Char embedding  dir: %s"%(self.char_emb_dir))
        print("     Word embedding size: %s"%(self.word_emb_dim))
        print("     Char embedding size: %s"%(self.char_emb_dim))
        print("     Norm   word     emb: %s"%(self.norm_word_emb))
        print("     Norm   char     emb: %s"%(self.norm_char_emb))
        print("     Train  file directory: %s"%(self.train_dir))
        print("     Dev    file directory: %s"%(self.dev_dir))
        print("     Test   file directory: %s"%(self.test_dir))


        print("     Model  file directory: %s"%(self.model_dir))


        print("     Train instance number: %s"%(len(self.train_texts)))
        print("     Dev   instance number: %s"%(len(self.dev_texts)))
        print("     Test  instance number: %s"%(len(self.test_texts)))

        print("     FEATURE num: %s"%(self.feature_num))
        for idx in range(self.feature_num):
            print("         Fe: %s  alphabet  size: %s"%(self.feature_alphabets[idx].name, self.feature_alphabet_sizes[idx]))
            print("         Fe: %s  embedding  dir: %s"%(self.feature_alphabets[idx].name, self.feature_emb_dirs[idx]))
            print("         Fe: %s  embedding size: %s"%(self.feature_alphabets[idx].name, self.feature_emb_dims[idx]))
            print("         Fe: %s  norm       emb: %s"%(self.feature_alphabets[idx].name, self.norm_feature_embs[idx]))
        # for k, v in self.feat_config.items():
        #     print("         Feature: %s, size %s, norm %s, dir %s"%(k, v['emb_size'], v['emb_norm'], v['emb_dir']))

        print(" "+"++"*20)
        print(" Model Network:")
        print("     Model        use_crf: %s"%(self.use_crf))
        print("     Model word extractor: %s"%(self.word_feature_extractor))
        print("     Model       use_char: %s"%(self.use_char))
        if self.use_char:
            print("     Model char extractor: %s"%(self.char_feature_extractor))
            print("     Model char_hidden_dim: %s"%(self.HP_char_hidden_dim))
        print(" "+"++"*20)
        print(" Training:")
        print("     Optimizer: %s"%(self.optimizer))
        print("     Iteration: %s"%(self.HP_iteration))
        print("     BatchSize: %s"%(self.HP_batch_size))
        print("     Average  batch   loss: %s"%(self.average_batch_loss))

        print(" "+"++"*20)
        print(" Hyperparameters:")
        
        print("     Hyper              lr: %s"%(self.HP_lr))
        print("     Hyper        lr_decay: %s"%(self.HP_lr_decay))
        print("     Hyper         HP_clip: %s"%(self.HP_clip))
        print("     Hyper        momentum: %s"%(self.HP_momentum))
        print("     Hyper              l2: %s"%(self.HP_l2))
        print("     Hyper      hidden_dim: %s"%(self.HP_hidden_dim))
        print("     Hyper         dropout: %s"%(self.HP_dropout))
        print("     Hyper      lstm_layer: %s"%(self.HP_lstm_layer))
        print("     Hyper          bilstm: %s"%(self.HP_bilstm))
        print("     Hyper             GPU: %s"%(self.HP_gpu))
        print("     Hyper             NBEST: %s"%(self.nbest))

        print(" " + "++" * 20)
        print(" Both:")

        print("     full data: %s" % (self.full_data))
        print("     Tune  word embeddings: %s" % (self.tune_wordemb))

        print(" "+"++"*20)
        print(" Relation:")

        print("     Pretrain directory: %s" % (self.pretrain))
        print("     max sequence length: %s" % (self.max_seq_len))
        print("     pad index: %s" % (self.pad_idx))
        print("     sentence window: %s" % (self.sent_window))
        print("     Output directory: %s" % (self.output))
        print("     The ratio using negative instnaces 0~1: %s" % (self.unk_ratio))
        print("     Size of seqeuence feature representation: %s" % (self.seq_feature_size))
        print("     Iteration for relation training: %s" % (self.max_epoch))
        print("     feature_extractor: %s" % (self.feature_extractor))

        print("     RE FEATURE num: %s"%(self.re_feature_num))
        for idx in range(self.re_feature_num):
            print("         Fe: %s  alphabet  size: %s"%(self.re_feature_alphabets[idx].name, self.re_feature_alphabet_sizes[idx]))
            print("         Fe: %s  embedding  dir: %s"%(self.re_feature_alphabets[idx].name, self.re_feature_emb_dirs[idx]))
            print("         Fe: %s  embedding size: %s"%(self.re_feature_alphabets[idx].name, self.re_feature_emb_dims[idx]))
            print("         Fe: %s  norm       emb: %s"%(self.re_feature_alphabets[idx].name, self.re_norm_feature_embs[idx]))

        print("     RE Train instance number: %s"%(len(self.re_train_Y)))
        print("     RE Dev   instance number: %s"%(len(self.re_dev_Y)))
        print("     RE Test  instance number: %s"%(len(self.re_test_Y)))

        print("DATA SUMMARY END.")
        print("++"*50)
        sys.stdout.flush()


    def initial_feature_alphabets(self, input_file):
        items = open(input_file,'r').readline().strip('\n').split()
        total_column = len(items)
        if total_column > 2:
            id = 0
            for idx in range(1, total_column-1):
                feature_prefix = items[idx].split(']',1)[0]+"]"
                self.feature_alphabets.append(Alphabet(feature_prefix))
                self.feature_name.append(feature_prefix)
                self.feature_name2id[feature_prefix] = id
                id += 1
                print "Find feature: ", feature_prefix 
        self.feature_num = len(self.feature_alphabets)
        self.pretrain_feature_embeddings = [None]*self.feature_num
        self.feature_emb_dims = [20]*self.feature_num
        self.feature_emb_dirs = [None]*self.feature_num 
        self.norm_feature_embs = [False]*self.feature_num
        self.feature_alphabet_sizes = [0]*self.feature_num
        if self.feat_config:
            for idx in range(self.feature_num):
                if self.feature_name[idx] in self.feat_config:
                    self.feature_emb_dims[idx] = self.feat_config[self.feature_name[idx]]['emb_size']
                    self.feature_emb_dirs[idx] = self.feat_config[self.feature_name[idx]]['emb_dir']
                    self.norm_feature_embs[idx] = self.feat_config[self.feature_name[idx]]['emb_norm']
        # exit(0)


    def build_alphabet(self, input_file):
        in_lines = open(input_file,'r').readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                word = pairs[0].decode('utf-8')
                if self.number_normalized:
                    word = normalize_word(word)
                label = pairs[-1]
                self.label_alphabet.add(label)
                self.word_alphabet.add(word)
                ## build feature alphabet 
                for idx in range(self.feature_num):
                    feat_idx = pairs[idx+1].split(']',1)[-1]
                    self.feature_alphabets[idx].add(feat_idx)
                for char in word:
                    self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        for idx in range(self.feature_num):
            self.feature_alphabet_sizes[idx] = self.feature_alphabets[idx].size()
        startS = False
        startB = False
        for label,_ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"


    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close() 
        for idx in range(self.feature_num):
            self.feature_alphabets[idx].close()

    def initial_re_feature_alphabets(self):
        id = 0
        for k, v in self.re_feat_config.items():
            self.re_feature_alphabets.append(Alphabet(k))
            self.re_feature_name.append(k)
            self.re_feature_name2id[k] = id
            id += 1

        self.re_feature_num = len(self.re_feature_alphabets)
        self.re_pretrain_feature_embeddings = [None]*self.re_feature_num
        self.re_feature_emb_dims = [20]*self.re_feature_num
        self.re_feature_emb_dirs = [None]*self.re_feature_num
        self.re_norm_feature_embs = [False]*self.re_feature_num
        self.re_feature_alphabet_sizes = [0]*self.re_feature_num
        if self.re_feat_config:
            for idx in range(self.re_feature_num):
                if self.re_feature_name[idx] in self.re_feat_config:
                    self.re_feature_emb_dims[idx] = self.re_feat_config[self.re_feature_name[idx]]['emb_size']
                    self.re_feature_emb_dirs[idx] = self.re_feat_config[self.re_feature_name[idx]]['emb_dir']
                    self.re_norm_feature_embs[idx] = self.re_feat_config[self.re_feature_name[idx]]['emb_norm']


    def build_re_feature_alphabets(self, tokens, entities, relations):

        entity_type_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[ENTITY_TYPE]']]
        entity_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[ENTITY]']]
        relation_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[RELATION]']]
        token_num_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[TOKEN_NUM]']]
        entity_num_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[ENTITY_NUM]']]
        position_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[POSITION]']]

        for i, doc_token in enumerate(tokens):

            doc_entity = entities[i]
            doc_relation = relations[i]

            sent_idx = 0
            sentence = doc_token[(doc_token['sent_idx'] == sent_idx)]
            while sentence.shape[0] != 0:

                entities_in_sentence = doc_entity[(doc_entity['sent_idx'] == sent_idx)]
                for _, entity in entities_in_sentence.iterrows():
                    entity_type_alphabet.add(entity['type'])
                    tk_idx = entity['tf_start']
                    while tk_idx <= entity['tf_end']:
                        entity_alphabet.add(
                            my_utils1.normalizeWord(sentence.iloc[tk_idx, 0]))  # assume 'text' is in 0 column
                        tk_idx += 1

                sent_idx += 1
                sentence = doc_token[(doc_token['sent_idx'] == sent_idx)]

            for _, relation in doc_relation.iterrows():
                relation_alphabet.add(relation['type'])


        for i in range(self.max_seq_len):  # note: was data.max_seq_len, but no 'data' is in scope here
            token_num_alphabet.add(i)
            entity_num_alphabet.add(i)
            position_alphabet.add(i)
            position_alphabet.add(-i)


        for idx in range(self.re_feature_num):
            self.re_feature_alphabet_sizes[idx] = self.re_feature_alphabets[idx].size()


    def fix_re_alphabet(self):
        for alphabet in self.re_feature_alphabets:
            alphabet.close()


    def build_pretrain_emb(self):
        if self.word_emb_dir:
            print("Load pretrained word embedding, norm: %s, dir: %s"%(self.norm_word_emb, self.word_emb_dir))
            self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(self.word_emb_dir, self.word_alphabet, self.word_emb_dim, self.norm_word_emb)
        if self.char_emb_dir:
            print("Load pretrained char embedding, norm: %s, dir: %s"%(self.norm_char_emb, self.char_emb_dir))
            self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(self.char_emb_dir, self.char_alphabet, self.char_emb_dim, self.norm_char_emb)
        for idx in range(self.feature_num):
            if self.feature_emb_dirs[idx]:
                print("Load pretrained feature %s embedding:, norm: %s, dir: %s"%(self.feature_name[idx], self.norm_feature_embs[idx], self.feature_emb_dirs[idx]))
                self.pretrain_feature_embeddings[idx], self.feature_emb_dims[idx] = build_pretrain_embedding(self.feature_emb_dirs[idx], self.feature_alphabets[idx], self.feature_emb_dims[idx], self.norm_feature_embs[idx])

    def build_re_pretrain_emb(self):
        for idx in range(self.re_feature_num):
            if self.re_feature_emb_dirs[idx]:
                print("Load pretrained re feature %s embedding:, norm: %s, dir: %s" % (self.re_feature_name[idx], self.re_norm_feature_embs[idx], self.re_feature_emb_dirs[idx]))
                self.re_pretrain_feature_embeddings[idx], self.re_feature_emb_dims[idx] = build_pretrain_embedding(
                    self.re_feature_emb_dirs[idx], self.re_feature_alphabets[idx], self.re_feature_emb_dims[idx],
                    self.re_norm_feature_embs[idx])

    def generate_instance(self, name, input_file):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(input_file, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(input_file, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(input_file, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH)
        else:
            print("Error: you can only generate train/dev/test instance! Illegal input:%s"%(name))



    def generate_re_instance(self, name, tokens, entities, relations, names):
        self.fix_re_alphabet()
        if name == "train":
            self.re_train_X, self.re_train_Y = relation_extraction.getRelationInstance2(tokens, entities, relations, names, self)
        elif name == "dev":
            self.re_dev_X, self.re_dev_Y = relation_extraction.getRelationInstance2(tokens, entities, relations, names, self)
        elif name == "test":
            self.re_test_X, self.re_test_Y = relation_extraction.getRelationInstance2(tokens, entities, relations, names, self)
        else:
            print("Error: you can only generate train/dev/test instance! Illegal input:%s"%(name))


    def load(self,data_file):
        f = open(data_file, 'rb')
        tmp_dict = pickle.load(f)
        f.close()
        self.__dict__.update(tmp_dict)

    def save(self,save_file):
        f = open(save_file, 'wb')
        pickle.dump(self.__dict__, f, 2)
        f.close()

    def read_config(self,config_file):
        config = config_file_to_dict(config_file)
        ## read data:
        the_item = 'train_dir'
        if the_item in config:
            self.train_dir = config[the_item]
        the_item = 'dev_dir'
        if the_item in config:
            self.dev_dir = config[the_item]
        the_item = 'test_dir'
        if the_item in config:
            self.test_dir = config[the_item]


        the_item = 'model_dir'
        if the_item in config:
            self.model_dir = config[the_item]


        the_item = 'word_emb_dir'
        if the_item in config:
            self.word_emb_dir = config[the_item]
        the_item = 'char_emb_dir'
        if the_item in config:
            self.char_emb_dir = config[the_item]


        the_item = 'MAX_SENTENCE_LENGTH'
        if the_item in config:
            self.MAX_SENTENCE_LENGTH = int(config[the_item])
        the_item = 'MAX_WORD_LENGTH'
        if the_item in config:
            self.MAX_WORD_LENGTH = int(config[the_item])

        the_item = 'norm_word_emb'
        if the_item in config:
            self.norm_word_emb = str2bool(config[the_item])
        the_item = 'norm_char_emb'
        if the_item in config:
            self.norm_char_emb = str2bool(config[the_item])
        the_item = 'number_normalized'
        if the_item in config:
            self.number_normalized = str2bool(config[the_item])


        the_item = 'seg'
        if the_item in config:
            self.seg = str2bool(config[the_item])
        the_item = 'word_emb_dim'
        if the_item in config:
            self.word_emb_dim = int(config[the_item])
        the_item = 'char_emb_dim'
        if the_item in config:
            self.char_emb_dim = int(config[the_item])

        ## read network:
        the_item = 'use_crf'
        if the_item in config:
            self.use_crf = str2bool(config[the_item])
        the_item = 'use_char'
        if the_item in config:
            self.use_char = str2bool(config[the_item])
        the_item = 'word_seq_feature'
        if the_item in config:
            self.word_feature_extractor = config[the_item]
        the_item = 'char_seq_feature'
        if the_item in config:
            self.char_feature_extractor = config[the_item]
        the_item = 'nbest'
        if the_item in config:
            self.nbest = int(config[the_item])

        the_item = 'feature'
        if the_item in config:
            self.feat_config = config[the_item] ## feat_config is a dict 

        ## read training setting:
        the_item = 'optimizer'
        if the_item in config:
            self.optimizer = config[the_item]
        the_item = 'ave_batch_loss'
        if the_item in config:
            self.average_batch_loss = str2bool(config[the_item])


        ## read Hyperparameters:
        the_item = 'cnn_layer'
        if the_item in config:
            self.HP_cnn_layer = int(config[the_item])
        the_item = 'iteration'
        if the_item in config:
            self.HP_iteration = int(config[the_item])
        the_item = 'batch_size'
        if the_item in config:
            self.HP_batch_size = int(config[the_item])

        the_item = 'char_hidden_dim'
        if the_item in config:
            self.HP_char_hidden_dim = int(config[the_item])
        the_item = 'hidden_dim'
        if the_item in config:
            self.HP_hidden_dim = int(config[the_item])
        the_item = 'dropout'
        if the_item in config:
            self.HP_dropout = float(config[the_item])
        the_item = 'lstm_layer'
        if the_item in config:
            self.HP_lstm_layer = int(config[the_item])
        the_item = 'bilstm'
        if the_item in config:
            self.HP_bilstm = str2bool(config[the_item])

        the_item = 'gpu'
        if the_item in config:
            self.HP_gpu = int(config[the_item])
        the_item = 'learning_rate'
        if the_item in config:
            self.HP_lr = float(config[the_item])
        the_item = 'lr_decay'
        if the_item in config:
            self.HP_lr_decay = float(config[the_item])
        the_item = 'clip'
        if the_item in config:
            self.HP_clip = float(config[the_item])
        the_item = 'momentum'
        if the_item in config:
            self.HP_momentum = float(config[the_item])
        the_item = 'l2'
        if the_item in config:
            self.HP_l2 = float(config[the_item])

        ## read settings shared by NER and RE:
        the_item = 'full_data'
        if the_item in config:
            self.full_data = str2bool(config[the_item])

        the_item = 'tune_wordemb'
        if the_item in config:
            self.tune_wordemb = str2bool(config[the_item])

        ## read relation extraction settings:
        the_item = 'pretrain'
        if the_item in config:
            self.pretrain = config[the_item]

        the_item = 'max_seq_len'
        if the_item in config:
            self.max_seq_len = int(config[the_item])

        the_item = 'pad_idx'
        if the_item in config:
            self.pad_idx = int(config[the_item])

        the_item = 'sent_window'
        if the_item in config:
            self.sent_window = int(config[the_item])

        the_item = 'output'
        if the_item in config:
            self.output = config[the_item]

        the_item = 'unk_ratio'
        if the_item in config:
            self.unk_ratio = float(config[the_item])

        the_item = 'seq_feature_size'
        if the_item in config:
            self.seq_feature_size = int(config[the_item])

        the_item = 'max_epoch'
        if the_item in config:
            self.max_epoch = int(config[the_item])

        the_item = 'feature_extractor'
        if the_item in config:
            self.feature_extractor = config[the_item]

        the_item = 're_feature'
        if the_item in config:
            self.re_feat_config = config[the_item] ## re_feat_config is a dict
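
read_config above pulls plain key=value pairs out of config_file_to_dict, which is not shown in this example. Below is a hedged, hypothetical stand-in for that parser, plus a sample config using keys read_config actually consumes.

# Hypothetical stand-in for config_file_to_dict (the real parser is not
# shown above): one key=value pair per line, '#' starts a comment.
def config_file_to_dict_sketch(config_file):
    config = {}
    with open(config_file) as f:
        for raw in f:
            line = raw.split('#')[0].strip()
            if '=' in line:
                key, value = line.split('=', 1)
                config[key.strip()] = value.strip()
    return config

# A sample config file using keys read_config looks up (paths hypothetical):
#   train_dir=data/train.txt
#   word_emb_dir=data/glove.50d.txt
#   use_crf=True
#   batch_size=10
#   learning_rate=0.015
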
Exemple #15
0
class Data:
    def __init__(self, input_file):
        self.original_data = open(input_file, 'r').readlines()
        self.index_data = []
        self.word_alphabet = Alphabet('word')
        self.gloss_alphabet = Alphabet('gloss')
        self.entity_alphabet = Alphabet('entity')
        self.gaz_alphabet = Alphabet('gaz')
        self.label_alphabet = Alphabet('label')
        self.word_alphabet_size = 0
        self.gloss_alphabet_size = 0
        self.entity_alphabet_size = 0
        self.gaz_alphabet_size = 0
        self.label_alphabet_size = 0
        ### hyperparameters
        self.HP_iteration = 100
        self.HP_batch_size = 1
        self.HP_gaz_hidden_dim = 50
        self.HP_lstm_hidden_dim = 200
        self.HP_dropout = 0.5
        self.gaz_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = False
        self.HP_use_entity = False
        self.HP_use_gloss = True
        self.HP_use_gaz = False
        self.HP_gpu = True
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = 5.0
        self.HP_momentum = 0
        self.HP_iteration = 100
        # embedding hyperparameter
        self.word_emb_dim = 200
        self.entity_emb_dim = 50
        self.gloss_features = "CNN"  #["CNN","LSTM"]
        self.gloss_emb_dim = 200
        self.gloss_hidden_dim = 300
        self.pretrain_word_embedding = np.array([])
        self.pretrain_gaz_embedding = None
        self.word_embed_path = "../LOVECC/NYM.6B.200d.txt"  #"NYM_200.txt"
        self.gaz_embed_path = None
        self.gaz_emb_dim = 200
        self.HP_fix_gaz_emb = True

    def build_alphabet(self):
        in_lines = self.original_data
        for idx in range(len(in_lines)):
            line = json.loads(in_lines[idx])
            words = line["word_context"]
            for word in words:
                self.word_alphabet.add(word)

            sentence_gloss = line["babel_gloss"]
            for word_gloss in sentence_gloss:
                for phrase_gloss in word_gloss:  # a word may match several phrases
                    if "EN" in phrase_gloss:
                        phrase_gloss_EN = phrase_gloss["EN"]
                        final_gloss = " . ".join(phrase_gloss_EN)
                        for de_word in final_gloss:
                            # for definates in phrase_gloss_EN:
                            # for de_word in definates.split():
                            self.gloss_alphabet.add(de_word)

            entitys = line["entity_context"]
            for entity in entitys:
                self.entity_alphabet.add(entity)

            gazs = line["babel_phase"]
            for gaz in gazs:
                for item in gaz:
                    self.gaz_alphabet.add(item)

            labels = line["detection_label"]
            for label in labels:
                self.label_alphabet.add(label)
        print(self.label_alphabet.get_content())
        self.word_alphabet_size = self.word_alphabet.size()
        self.gloss_alphabet_size = self.gloss_alphabet.size()
        self.entity_alphabet_size = self.entity_alphabet.size()
        self.gaz_alphabet_size = self.gaz_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        self.word_alphabet.close()
        self.gloss_alphabet.close()
        self.entity_alphabet.close()
        self.gaz_alphabet.close()
        self.label_alphabet.close()

    def generate_instance_Ids(self):  # map each input sentence to its index (Id) form
        in_lines = self.original_data
        for idx in range(len(in_lines)):
            line = json.loads(in_lines[idx])
            words = line["word_context"]
            words_Id = []
            for word in words:
                words_Id.append(self.word_alphabet.get_index(word))

            sentence_gloss = line["babel_gloss"]
            sentence_glosses_Id = []
            for word_gloss in sentence_gloss:
                word_glosses_Id = []
                for phrase_gloss in word_gloss:  # a word may match several phrases
                    if "EN" in phrase_gloss:
                        phrase_gloss_EN = phrase_gloss["EN"]  # this is a list
                        final_gloss = " . ".join(phrase_gloss_EN)
                        for de_word in final_gloss.split():  # word-level, as in build_alphabet
                            word_glosses_Id.append(
                                self.gloss_alphabet.get_index(de_word))
                sentence_glosses_Id.append(word_glosses_Id)

            entitys = line["entity_context"]
            entitys_Id = []
            for entity in entitys:
                entitys_Id.append(self.entity_alphabet.get_index(entity))

            gazs = line["babel_phase"]
            # sentence_gazs_Id = [[Ids, Lens], ...], e.g.
            # [[[take over, take over of, ...], [2, 3, ...]],
            #  [[legal, legal procedures, ...], [1, 2, ...]], ...]
            sentence_gazs_Id = []
            for gaz in gazs:
                word_gazs_Id = []
                Ids = []
                Lens = []
                for item in gaz:
                    Ids.append(self.gaz_alphabet.get_index(item))
                    Lens.append(len(item.split()))
                word_gazs_Id = [Ids, Lens]
                sentence_gazs_Id.append(word_gazs_Id)

            labels = line["detection_label"]
            labels_Id = []
            for label in labels:
                labels_Id.append(self.label_alphabet.get_index(label))
            self.index_data.append([
                words_Id, entitys_Id, sentence_gazs_Id, sentence_glosses_Id,
                labels_Id
            ])

    def load_pretrain_emb(self, embedding_path):
        lines = open(embedding_path, 'r', encoding="utf-8").readlines()
        statistic = lines[0].strip()  # header line: word count and vector dimension
        # print(statistic)
        embedd_dim = int(statistic.split()[1])
        embedd_dict = dict()
        embedd_dict["<pad>"] = [0.0 for i in range(embedd_dim)]  #填充词对应的向量置为全零
        # print(len(embedd_dict["<pad>"]))
        for line in lines[1:]:
            line = line.strip()
            if len(line) == 0:
                continue
            tokens = line.split()
            if embedd_dim < 0:
                embedd_dim = len(tokens) - 1
            else:
                assert (embedd_dim + 1 == len(tokens))
            embedd_dict[tokens[0]] = [float(i) for i in tokens[1:]]
        return embedd_dict, embedd_dim

    def norm2one(self, vec):
        if np.sum(vec) == 0:
            return vec
        root_sum_square = np.sqrt(np.sum(np.square(vec)))
        return vec / root_sum_square

    def build_pretrain_embedding(self,
                                 embedding_path,
                                 word_alphabet,
                                 embedd_dim=200,
                                 norm=True):
        embedd_dict = dict()
        if embedding_path != None:
            # load the pretrained embedding dictionary
            embedd_dict, embedd_dim = self.load_pretrain_emb(embedding_path)
        scale = np.sqrt(3.0 / embedd_dim)
        # pretrain_emb is the embedding matrix reordered to the alphabet order
        pretrain_emb = np.zeros([word_alphabet.size(), embedd_dim])
        perfect_match = 0
        case_match = 0
        not_match = 0
        for word, index in word_alphabet.get_alphabet().items():
            if word in embedd_dict:
                # print(word,index)
                # print(len(embedd_dict[word]))
                if norm:
                    pretrain_emb[index] = self.norm2one(embedd_dict[word])
                else:
                    pretrain_emb[index] = embedd_dict[word]
                perfect_match += 1
            elif word.lower() in embedd_dict:
                if norm:
                    pretrain_emb[index] = self.norm2one(
                        embedd_dict[word.lower()])
                else:
                    pretrain_emb[index] = embedd_dict[word.lower()]
                case_match += 1
            else:
                pretrain_emb[index] = np.random.uniform(
                    -scale, scale, [1, embedd_dim])
                not_match += 1
        pretrained_size = len(embedd_dict)
        # print("pad's embedding:",pretrain_emb[word_alphabet.get_index(",")])
        print(
            "Embedding:\n  pretrain word:%s, perfect match:%s, case_match:%s, oov:%s, oov%%:%s"
            % (pretrained_size, perfect_match, case_match, not_match,
               (not_match + 0.) / word_alphabet.size()))
        return pretrain_emb, embedd_dim  # the embedding matrix reordered by alphabet order, and the vector dimension

    def generate_embedding(self):
        self.pretrain_word_embedding, self.word_pretrain_dim = self.build_pretrain_embedding(
            self.word_embed_path, self.word_alphabet)
        self.pretrain_gloss_embedding, self.gloss_pretrain_dim = self.build_pretrain_embedding(
            self.word_embed_path, self.gloss_alphabet)
        self.pretrain_gaz_embedding, self.gaz_pretrain_dim = self.build_pretrain_embedding(
            self.word_embed_path, self.gaz_alphabet)
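
A hedged usage sketch for the Data class above, assuming 'train.json' is a hypothetical JSON-lines file whose records carry the keys build_alphabet reads (word_context, babel_gloss, entity_context, babel_phase, detection_label):

data = Data('train.json')        # hypothetical JSON-lines input
data.build_alphabet()            # populate and close every alphabet
data.generate_instance_Ids()     # map each sentence to index lists
data.generate_embedding()        # load and reorder the pretrained embeddings
print(len(data.index_data), "instances ready")
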
Exemple #16
0
# -*- coding: utf-8 -*-
# @Author: Shaowei Chen,   Contact: [email protected]
# @Date:   2020-4-27

import sys
import argparse
import torch
from alphabet import Alphabet
sys.path.append("../")

word_alphabet = Alphabet('word', True)
label_alphabet = Alphabet('label', True)
label_alphabet.add("O")
label_alphabet.add("B")
label_alphabet.add("I")
relation_alphabet = Alphabet('relation', True)
char_alphabet = Alphabet('char', True)


class InputFeatures(object):
    """A single set of features of data."""
    def __init__(self, tokens, token_ids, token_mask, chars, char_ids,
                 char_mask, charLength, tokenLength, labels, label_ids,
                 relations, gold_relations):
        self.tokens = tokens
        self.token_ids = token_ids
        self.token_mask = token_mask
        self.chars = chars
        self.char_ids = char_ids
        self.char_mask = char_mask
        self.charLength = charLength
        self.tokenLength = tokenLength
        self.labels = labels
        self.label_ids = label_ids
        self.relations = relations
        self.gold_relations = gold_relations
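
A small hedged sketch of how the module-level alphabets above are typically filled; Alphabet.add is assumed to register unseen entries and get_index to return the id it assigned, as in the other Alphabet classes in this collection.

for token in ["the", "drug", "caused", "nausea"]:
    word_alphabet.add(token)
    for ch in token:
        char_alphabet.add(ch)
relation_alphabet.add("adverse_effect")
print(word_alphabet.get_index("drug"))
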
Exemple #17
0
def main(argv):
    vocab_dir = 'preprocessed_data'
    load_vocab = False
    parse_200M = True

    smiley_tweets_fname = ''
    smiley_tweets = ''
    fname_vocab = ''
    n_max_tweets = np.inf
    outdir = ''
    parse_random_tweets = False

    try:
        opts, args = getopt.getopt(
            argv, "v:t:m:nr", ["vocab=", "tweets=", "max_tweets=", "no_big="])
    except getopt.GetoptError:
        print 'usage: -v <vocab> -t <tweets> -m <max_tweets> [-n] [-r]'
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-v", "--vocab"):
            load_vocab = True
            fname_vocab = os.path.join(vocab_dir, '{}.pickle'.format(arg))
        elif opt in ("-t", "--tweets"):
            smiley_tweets_fname = arg
            smiley_tweets = 'semeval/{}.gz'.format(arg)
            outdir = 'preprocessed_data_{}'.format(arg)
            if not os.path.exists(outdir):
                os.makedirs(outdir)
            model_dir = 'misc/{}'.format(arg)
            if not os.path.exists(model_dir):
                os.makedirs(model_dir)
        elif opt in ("-m", "--max_tweets"):
            n_max_tweets = int(arg)
        elif opt == '-n':
            parse_200M = False
        elif opt == '-r':
            parse_random_tweets = True

    dev2013 = "semeval/dev2013-task-B.tsv"
    dev2016 = "semeval/dev2016-task-A.tsv"
    devtest2016 = "semeval/devtest2016-task-A.tsv"
    test2013_sms = "semeval/test2013sms-task-B.tsv"
    test2013_twitter = "semeval/test2013-task-B.tsv"
    test2014_livejournal = "semeval/test2014lj-task-B.tsv"
    test2014_sarcasm = "semeval/test2014sarcasm-task-B.tsv"
    test2014_twitter = "semeval/test2014-task-B.tsv"
    test2015 = "semeval/test2015-task-B.tsv"
    test2016 = "semeval/test2016-task-A.tsv"
    train2013 = "semeval/train2013-task-B.tsv"
    train16 = "semeval/train2016-task-A.tsv"
    de_train = "semeval/de_train.tsv"
    de_test = "semeval/de_test.tsv"
    it_test = "semeval/it_test.tsv"
    it_train = "semeval/it_train.tsv"
    nl_train = "semeval/nl_train.tsv"
    nl_test = "semeval/nl_test.tsv"
    de_en_test = "semeval/de_eng_n.tsv"
    de_no_en_test = "semeval/de_no_eng_n.tsv"

    if load_vocab:
        alphabet = cPickle.load(open(fname_vocab))
        dummy_word_idx = alphabet.get('DUMMY_WORD_IDX', DUMMY_WORD_IDX)
        print "alphabet", len(alphabet)
        print 'dummy_word:', dummy_word_idx
    else:
        alphabet = Alphabet(start_feature_id=0)
        alphabet.add('UNKNOWN_WORD_IDX')
        alphabet.add('DUMMY_WORD_IDX')
        dummy_word_idx = DUMMY_WORD_IDX

    print "Loading Semeval Data"
    # ncols is the number of columns inside the files in semeval
    files = [
        (train2013, 4),
        (dev2013, 4),
        (test2013_sms, 4),
        (test2013_twitter, 4),
        (test2014_twitter, 4),
        (test2014_livejournal, 4),
        (test2014_sarcasm, 4),
        (test2015, 4),
        (train16, 3),
        (dev2016, 3),
        (devtest2016, 3),
        (test2016, 3),
        (de_test, 4),
        (de_train, 4),
        (it_test, 4),
        (it_train, 4),
        (nl_test, 4),
        (nl_train, 4),
        (de_en_test, 4),
        (de_no_en_test, 4),
    ]
    if parse_random_tweets:
        outdir = outdir + '_random'
        files = map(lambda x: (os.path.join('random_tweets', x), 3),
                    os.listdir('random_tweets'))
        if not os.path.exists(outdir):
            os.makedirs(outdir)

    for fname, ncols in files:
        tid, tweets, sentiments = load_data(fname, alphabet, ncols=ncols)
        print "Number of tweets:", len(tweets)

        tweet_idx = p_utils.convert2indices(tweets, alphabet, dummy_word_idx)

        basename, _ = os.path.splitext(os.path.basename(fname))
        np.save(os.path.join(outdir, '{}.tids.npy'.format(basename)), tid)
        np.save(os.path.join(outdir, '{}.tweets.npy'.format(basename)),
                tweet_idx)
        np.save(os.path.join(outdir, '{}.sentiments.npy'.format(basename)),
                sentiments)

    if parse_200M:
        print "Loading Smiley Data"
        basename, _ = os.path.splitext(os.path.basename('smiley_tweets'))
        nTweets = p_utils.store_file(
            smiley_tweets,
            os.path.join(outdir, '{}.tweets.npy'.format(basename)),
            alphabet,
            dummy_word_idx,
            sentiment_fname=os.path.join(outdir,
                                         '{}.sentiments.npy'.format(basename)),
            max_tweets=n_max_tweets)
        print "Number of tweets:", nTweets
        nTf = open('misc/{}/nTweets.txt'.format(smiley_tweets_fname), 'wb')
        nTf.write(str(nTweets))
        nTf.close()

    cPickle.dump(alphabet, open(os.path.join(outdir, 'last_vocab.pickle'),
                                'wb'))
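
The getopt loop above defines the whole command line; a hedged invocation sketch (the script name is hypothetical):

# python parse_tweets.py -v vocab_200M -t smiley_tweets_200M -m 1000000
#   -v  load preprocessed_data/vocab_200M.pickle as the word alphabet
#   -t  read semeval/smiley_tweets_200M.gz and write outputs under
#       preprocessed_data_smiley_tweets_200M/
#   -m  cap the number of smiley tweets parsed
#   -n  skip the 200M smiley corpus; -r parse random_tweets/ instead
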
Exemple #18
0
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        # self.punctuation_filter = True
        self.norm_word_emb = True
        self.norm_biword_emb = True
        self.norm_gaz_emb = False
        self.word_alphabet = Alphabet('word')
        self.biword_alphabet = Alphabet('biword')
        self.char_alphabet = Alphabet('character')
        # self.word_alphabet.add(START)
        # self.word_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(START)
        # self.char_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(PADDING)
        self.label_alphabet = Alphabet('label', True)
        self.gaz_lower = False
        self.gaz = Gazetteer(self.gaz_lower)
        self.gaz_alphabet = Alphabet('gaz')
        self.HP_fix_gaz_emb = False
        self.HP_use_gaz = True

        self.tagScheme = "NoSeg"
        self.char_features = "LSTM"

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []
        self.use_bigram = True
        self.word_emb_dim = 50
        self.biword_emb_dim = 50
        self.char_emb_dim = 30
        self.gaz_emb_dim = 50
        self.gaz_dropout = 0.5
        self.pretrain_word_embedding = None
        self.pretrain_biword_embedding = None
        self.pretrain_gaz_embedding = None
        self.label_size = 0
        self.word_alphabet_size = 0
        self.biword_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        ### hyperparameters
        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_use_char = False
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = 5.0
        self.HP_momentum = 0

    def show_data_summary(self):
        addLogSectionMark("DATA SUMMARY")
        print("DATA SUMMARY START:")
        print("     Tag          scheme: %s" % (self.tagScheme))
        print("     MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print("     MAX   WORD   LENGTH: %s" % (self.MAX_WORD_LENGTH))
        print("     Number   normalized: %s" % (self.number_normalized))
        # print("     Punctuation  filter: %s" % (self.punctuation_filter))
        print("     Use          bigram: %s" % (self.use_bigram))
        print("     Word  alphabet size: %s" % (self.word_alphabet_size))
        print("     Biword alphabet size: %s" % (self.biword_alphabet_size))
        print("     Char  alphabet size: %s" % (self.char_alphabet_size))
        print("     Gaz   alphabet size: %s" % (self.gaz_alphabet.size()))
        print("     Label alphabet size: %s" % (self.label_alphabet_size))
        print("     Word embedding size: %s" % (self.word_emb_dim))
        print("     Biword embedding size: %s" % (self.biword_emb_dim))
        print("     Char embedding size: %s" % (self.char_emb_dim))
        print("     Gaz embedding size: %s" % (self.gaz_emb_dim))
        print("     Norm     word   emb: %s" % (self.norm_word_emb))
        print("     Norm     biword emb: %s" % (self.norm_biword_emb))
        print("     Norm     gaz    emb: %s" % (self.norm_gaz_emb))
        print("     Norm   gaz  dropout: %s" % (self.gaz_dropout))
        print("     Train instance number: %s" % (len(self.train_texts)))
        print("     Dev   instance number: %s" % (len(self.dev_texts)))
        print("     Test  instance number: %s" % (len(self.test_texts)))
        print("     Raw   instance number: %s" % (len(self.raw_texts)))
        print("     Hyperpara  iteration: %s" % (self.HP_iteration))
        print("     Hyperpara  batch size: %s" % (self.HP_batch_size))
        print("     Hyperpara          lr: %s" % (self.HP_lr))
        print("     Hyperpara    lr_decay: %s" % (self.HP_lr_decay))
        print("     Hyperpara     HP_clip: %s" % (self.HP_clip))
        print("     Hyperpara    momentum: %s" % (self.HP_momentum))
        print("     Hyperpara  hidden_dim: %s" % (self.HP_hidden_dim))
        print("     Hyperpara     dropout: %s" % (self.HP_dropout))
        print("     Hyperpara  lstm_layer: %s" % (self.HP_lstm_layer))
        print("     Hyperpara      bilstm: %s" % (self.HP_bilstm))
        print("     Hyperpara         GPU: %s" % (self.HP_gpu))
        print("     Hyperpara     use_gaz: %s" % (self.HP_use_gaz))
        print("     Hyperpara fix gaz emb: %s" % (self.HP_fix_gaz_emb))
        print("     Hyperpara    use_char: %s" % (self.HP_use_char))

        logger.info("     Tag          scheme: %s" % (self.tagScheme))
        logger.info("     MAX SENTENCE LENGTH: %s" %
                    (self.MAX_SENTENCE_LENGTH))
        logger.info("     MAX   WORD   LENGTH: %s" % (self.MAX_WORD_LENGTH))
        logger.info("     Number   normalized: %s" % (self.number_normalized))
        logger.info("     Use          bigram: %s" % (self.use_bigram))
        logger.info("     Word  alphabet size: %s" % (self.word_alphabet_size))
        logger.info("     Biword alphabet size: %s" %
                    (self.biword_alphabet_size))
        logger.info("     Char  alphabet size: %s" % (self.char_alphabet_size))
        logger.info("     Gaz   alphabet size: %s" %
                    (self.gaz_alphabet.size()))
        logger.info("     Label alphabet size: %s" %
                    (self.label_alphabet_size))
        logger.info("     Word embedding size: %s" % (self.word_emb_dim))
        logger.info("     Biword embedding size: %s" % (self.biword_emb_dim))
        logger.info("     Char embedding size: %s" % (self.char_emb_dim))
        logger.info("     Gaz embedding size: %s" % (self.gaz_emb_dim))
        logger.info("     Norm     word   emb: %s" % (self.norm_word_emb))
        logger.info("     Norm     biword emb: %s" % (self.norm_biword_emb))
        logger.info("     Norm     gaz    emb: %s" % (self.norm_gaz_emb))
        logger.info("     Norm   gaz  dropout: %s" % (self.gaz_dropout))
        logger.info("     Train instance number: %s" % (len(self.train_texts)))
        logger.info("     Dev   instance number: %s" % (len(self.dev_texts)))
        logger.info("     Test  instance number: %s" % (len(self.test_texts)))
        logger.info("     Raw   instance number: %s" % (len(self.raw_texts)))
        logger.info("     Hyperpara  iteration: %s" % (self.HP_iteration))
        logger.info("     Hyperpara  batch size: %s" % (self.HP_batch_size))
        logger.info("     Hyperpara          lr: %s" % (self.HP_lr))
        logger.info("     Hyperpara    lr_decay: %s" % (self.HP_lr_decay))
        logger.info("     Hyperpara     HP_clip: %s" % (self.HP_clip))
        logger.info("     Hyperpara    momentum: %s" % (self.HP_momentum))
        logger.info("     Hyperpara  hidden_dim: %s" % (self.HP_hidden_dim))
        logger.info("     Hyperpara     dropout: %s" % (self.HP_dropout))
        logger.info("     Hyperpara  lstm_layer: %s" % (self.HP_lstm_layer))
        logger.info("     Hyperpara      bilstm: %s" % (self.HP_bilstm))
        logger.info("     Hyperpara         GPU: %s" % (self.HP_gpu))
        logger.info("     Hyperpara     use_gaz: %s" % (self.HP_use_gaz))
        logger.info("     Hyperpara fix gaz emb: %s" % (self.HP_fix_gaz_emb))
        print("     Hyperpara    use_char: %s" % (self.HP_use_char))
        if self.HP_use_char:
            print("             Char_features: %s" % (self.char_features))
            logger.info("             Char_features: %s" %
                        (self.char_features))
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def refresh_label_alphabet(self, input_file):
        old_size = self.label_alphabet_size
        self.label_alphabet.clear(True)
        in_lines = open(input_file, 'r').readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                label = pairs[-1]
                self.label_alphabet.add(label)
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"
        self.fix_alphabet()
        print("Refresh label alphabet finished: old:%s -> new:%s" %
              (old_size, self.label_alphabet_size))

    def build_alphabet(self, input_file):
        in_lines = open(input_file, 'r').readlines()
        for idx in xrange(len(in_lines)):
            line = in_lines[idx]
            if len(line) > 2:
                pairs = line.strip().split()
                word = pairs[0].decode('utf-8')
                if self.number_normalized:
                    word = normalize_word(word)
                label = pairs[-1]
                self.label_alphabet.add(label)
                self.word_alphabet.add(word)

                if idx < len(in_lines) - 1 and len(in_lines[idx + 1]) > 2:
                    biword = word + in_lines[
                        idx + 1].strip().split()[0].decode('utf-8')
                else:
                    biword = word + NULLKEY

                self.biword_alphabet.add(biword)
                for char in word:
                    self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.biword_alphabet_size = self.biword_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

    def build_gaz_file(self, gaz_file):
        ## build the gaz trie; initially read entries from the gaz embedding file
        if gaz_file:
            fins = open(gaz_file, 'r').readlines()
            for fin in fins:
                fin = fin.strip().split()[0].decode('utf-8')
                if fin:
                    self.gaz.insert(fin, "one_source")
            print "Load gaz file: ", gaz_file, " total size:", self.gaz.size()
        else:
            print "Gaz file is None, load nothing"

    def build_gaz_alphabet(self, input_file):
        in_lines = open(input_file, 'r').readlines()
        word_list = []
        for line in in_lines:
            if len(line) > 3:
                word = line.split()[0].decode('utf-8')
                if self.number_normalized:
                    word = normalize_word(word)
                word_list.append(word)
            else:
                w_length = len(word_list)
                for idx in range(w_length):
                    matched_entity = self.gaz.enumerateMatchList(
                        word_list[idx:])
                    for entity in matched_entity:
                        # print entity, self.gaz.searchId(entity),self.gaz.searchType(entity)
                        self.gaz_alphabet.add(entity)
                word_list = []
        print "gaz alphabet size:", self.gaz_alphabet.size()

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.biword_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()
        self.gaz_alphabet.close()

    def build_word_pretrain_emb(self, emb_path):
        print "build word pretrain emb..."
        self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
            emb_path, self.word_alphabet, self.word_emb_dim,
            self.norm_word_emb)

    def build_biword_pretrain_emb(self, emb_path):
        print "build biword pretrain emb..."
        self.pretrain_biword_embedding, self.biword_emb_dim = build_pretrain_embedding(
            emb_path, self.biword_alphabet, self.biword_emb_dim,
            self.norm_biword_emb)

    def build_gaz_pretrain_emb(self, emb_path):
        print "build gaz pretrain emb..."
        self.pretrain_gaz_embedding, self.gaz_emb_dim = build_pretrain_embedding(
            emb_path, self.gaz_alphabet, self.gaz_emb_dim, self.norm_gaz_emb)

    def generate_instance(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        else:
            print(
                "Error: you can only generate train/dev/test/raw instance! Illegal input:%s"
                % (name))

    def generate_instance_with_gaz(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)

        elif name == "sentence":
            self.raw_texts, self.raw_Ids = read_instance_with_gaz_text(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        else:
            print(
                "Error: you can only generate train/dev/test/raw/sentence instance! Illegal input:%s"
                % (name))

    def write_decoded_results(self, output_file, predict_results, name):
        fout = open(output_file, 'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print(
                "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
            )
        assert (sent_num == len(content_list))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                ## content_list[idx] is a list with [word, char, label]
                fout.write(content_list[idx][0][idy].encode('utf-8') + " " +
                           predict_results[idx][idy] + '\n')

            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file. %s" %
              (name, output_file))

    def write_decoded_results_back(self, predict_results, name):
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print(
                "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
            )

        assert (sent_num == len(content_list))
        result = []
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                ## content_list[idx] is a list with [word, char, label]
                print(content_list[idx][0][idy].encode('utf-8') + " " +
                      predict_results[idx][idy] + '\n')

        for idx in range(sent_num):
            sent_length = len(predict_results[idx])

            data = {'start': '', 'end': "", 'value': '', 'entity': ''}
            value = ''
            for idy in range(sent_length):
                pre_su_item = predict_results[idx][idy].split('-')
                if pre_su_item[0] == 'S':
                    data['start'] = str(idy)
                    data['end'] = str(idy + 1)
                    data['value'] = content_list[idx][0][idy].encode('utf-8')
                    data['entity'] = pre_su_item[1]
                    result.append(data)
                    data = {'start': '', 'end': "", 'value': '', 'entity': ''}
                if pre_su_item[0] == 'B':
                    data['start'] = str(idy)
                    value = value + (content_list[idx][0][idy].encode('utf-8'))
                if pre_su_item[0] == 'E':
                    value = value + (content_list[idx][0][idy].encode('utf-8'))
                    data['end'] = str(idy + 1)
                    data['value'] = value
                    data['entity'] = pre_su_item[1]
                    result.append(data)
                    data = {'start': '', 'end': "", 'value': '', 'entity': ''}
                    value = ''
                if pre_su_item[0] == 'I':
                    value = value + (content_list[idx][0][idy].encode('utf-8'))

        return result

    def write_http_data(self, output_file, inputData, name):
        fout = open(output_file, 'w')
        get_num = len(inputData)

        start = 0
        numOfParagram = int(math.ceil(get_num / 5.0))
        num_start_sentence = start
        num_end_sentence = numOfParagram

        if name == "test":
            num_start_sentence = 0
            num_end_sentence = numOfParagram
        elif name == "dev":
            num_start_sentence = numOfParagram
            num_end_sentence = numOfParagram * 2
        elif name == "train":
            num_start_sentence = numOfParagram * 2
            num_end_sentence = get_num

        for idx in range(num_start_sentence, num_end_sentence):
            text = inputData[idx]["text"]
            entities = inputData[idx]["entities"]

            idText = 1
            inWord = False
            tagReady = False
            entity_name = ''
            for Text in text:
                ## content_list[idx] is a list with [word, char, label]
                tagReady = False

                for entity in entities:
                    if not inWord:
                        if entity['start'] + 1 == entity['end'] and entity[
                                'end'] == idText:
                            fout.write(
                                Text.encode('utf-8') + " " + "S-" +
                                entity['entity'].encode('utf-8') + '\n')
                            tagReady = True
                            break
                        if entity['start'] + 1 == idText:
                            fout.write(
                                Text.encode('utf-8') + " " + "B-" +
                                entity['entity'].encode('utf-8') + '\n')
                            tagReady = True
                            inWord = True
                            entity_name = entity['entity'].encode('utf-8')
                            break
                    else:
                        if entity['end'] == idText:
                            fout.write(
                                Text.encode('utf-8') + " " + "E-" +
                                entity_name + '\n')
                            tagReady = True
                            inWord = False
                            break

                if not tagReady:
                    if not inWord:
                        fout.write(Text.encode('utf-8') + " " + "O" + '\n')
                    else:
                        fout.write(
                            Text.encode('utf-8') + " " + "I-" + entity_name +
                            '\n')

                idText = idText + 1
            fout.write('\n')
        fout.close()

        print("Predict input data has been written into file. %s" %
              (output_file))
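
A hedged end-to-end sketch for the gazetteer-augmented Data class above; the file paths are hypothetical, and the inputs are assumed to be one token per line with the label in the last column, as build_alphabet expects.

data = Data()
data.build_alphabet('data/train.char.bmes')        # words, biwords, chars, labels
data.build_gaz_file('data/gaz_embeddings.txt')     # fill the gazetteer trie
data.build_gaz_alphabet('data/train.char.bmes')    # register matched entities
data.build_word_pretrain_emb('data/char_emb.txt')
data.generate_instance_with_gaz('data/train.char.bmes', 'train')
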
Exemple #19
0
        if opt.use_char:
            enc_char_alphabet = Alphabet('enc_char')
        else:
            enc_char_alphabet = None

        if opt.method == 'cla':
            dec_word_alphabet = None
            dec_char_alphabet = None
        else:
            dec_word_alphabet = Alphabet('dec_word')
            if opt.use_char:
                dec_char_alphabet = Alphabet('dec_char')
            else:
                dec_char_alphabet = None

            dec_word_alphabet.add('<SOS>')
            dec_word_alphabet.add('<EOS>')

        build_alphabet(enc_word_alphabet, enc_char_alphabet, dec_word_alphabet,
                       dec_char_alphabet, train_datapoints)
        build_alphabet_1(enc_word_alphabet, enc_char_alphabet,
                         dec_word_alphabet, dec_char_alphabet, dev_datapoints)
        if len(test_documents) != 0:
            build_alphabet_1(enc_word_alphabet, enc_char_alphabet,
                             dec_word_alphabet, dec_char_alphabet,
                             test_datapoints)
        if opt.pretraining:
            build_alphabet(enc_word_alphabet, enc_char_alphabet,
                           dec_word_alphabet, dec_char_alphabet,
                           dict_datapoints)
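
A short hedged note on the snippet above: <SOS> and <EOS> live only in the decoder alphabet because generation starts from <SOS> and stops at <EOS>, while the encoder never emits either. Assuming get_index behaves as in the other Alphabet classes in this collection:

sos_idx = dec_word_alphabet.get_index('<SOS>')   # first decoder input
eos_idx = dec_word_alphabet.get_index('<EOS>')   # generation stop symbol
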
Exemple #20
0
class BinarySource( Source ):
	""" Source for binary classification data in following format:
	one example per line with feature-value pair separated by
	separator symbol (' ' by default). E.g.:

	1	f1:1.0 f2:1.0 f3:1.0
	-1	f2:1.0 f3:1.0 f8:1.0
	-1	f1:1.0 f2:1.0
	1	f8:1.0 f9:1.0 f10:1.0
	"""
	def __init__( self, data, encoding="utf-8", feature_alphabet=None, alphabet_pop=True, alphabet_lock=True, sep=":", bias=False, bias_prefix="@@BIAS@@" ):
		Source.__init__(self, data, encoding=encoding)
		self._Instance = BinaryClassificationInstance
		if feature_alphabet != None:
			self._feature_alphabet = feature_alphabet
		else:
			self._feature_alphabet = Alphabet(locked=False)
		self._sep = sep
		self._bias = bias
		self._bias_prefix = bias_prefix
		if alphabet_pop:
			self._populate_alphabet()
		if alphabet_lock:
			self.lock_alphabet()
		else:
			self.unlock_alphabet()
		return

	def _parse( self ):
		""" return parsed line """
		sep = self._sep
		for line in self._stream:
			line = line.rstrip()
			items = line.split()
			cl = items[0]
			assert cl in [POS_LAB, NEG_LAB]
			feats = []
			if self._bias:
				feats.append( (self._bias_prefix, 1.0) ) # implicit bias
			for s in items[1:]:
				try:
					f,v = s.rsplit(sep, 1)
					v = float(v)
					feats.append( (f,v) )
				except ValueError:
					sys.exit("Datasource error: make sure you use the right datasource format.")
			yield ( cl, feats )

	def _populate_alphabet( self ):
		print >> sys.stderr, "Populating feature alphabet...             ",
		self.unlock_alphabet()
		if self._stream_type == "generator":
			for i, gen_inst in enumerate(self._stream): # read stream directly
				sys.stderr.write("%s" %"\b"*len(str(i))+str(i))	
				featvals = gen_inst.get_featvals()
				for (f,_) in featvals:
					self._feature_alphabet.add(f)
		else:
			try:
				for tag,feats in self._parse():
					for f,_ in feats:
						self._feature_alphabet.add( f )
			except ValueError:
				sys.exit("Datasource error: make sure you use the right data format.")
			# rewind stream
		try:
			self.rewind()
		except TypeError:
			sys.exit("TypeError: make sure rewind() is used only on files.")
		print >> sys.stderr, " done."
		print >> sys.stderr, "Number of features: %s" %self._feature_alphabet.size()
		return

	def unlock_alphabet( self ):
		self._feature_alphabet.unlock()
		return

	def lock_alphabet( self ):
		self._feature_alphabet.lock()
		return

	def set_alphabet( self, feature_alphabet ):
		self._feature_alphabet = feature_alphabet
		return

	def get_alphabet( self ):
		return self._feature_alphabet

	def get_input( self ):
		for label,feats in self._parse():
			yield label, feats

	def __iter__( self ):
		""" instance generator """
		feature_alphabet = self._feature_alphabet
		assert not (feature_alphabet.empty() and feature_alphabet.locked()), "Feature alphabet is empty!"
		if self._stream_type in ["file","list"]:
			for idx,(label,feats) in enumerate(self._parse()):
				if not feature_alphabet.locked(): # dynamic feature alphabet
					for (f,_) in feats:
						feature_alphabet.add(f)
				instance =  self._Instance(idx, label, feats, feature_alphabet)
				yield instance
		elif self._stream_type == "generator":
			for idx, gen_inst in enumerate(self._stream): # read stream directly
				featvals = gen_inst.get_featvals()
				label = gen_inst.get_label()
				if not feature_alphabet.locked(): # dynamic feature alphabet
					for (f,_) in featvals:
						feature_alphabet.add(f)
				instance = self._Instance(idx, label, featvals, feature_alphabet)
				yield instance

	def size( self ):
		s = len(list(self._stream))
		self.rewind()
		return s
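
A hedged usage sketch for BinarySource above, over an in-memory list (the __iter__ branch handles "list" streams). POS_LAB/NEG_LAB are assumed to be the strings "1" and "-1", and the alphabet is left unlocked so it grows while iterating:

lines = ["1\tf1:1.0 f2:1.0", "-1\tf2:1.0 f8:1.0"]
src = BinarySource(lines, feature_alphabet=Alphabet(locked=False),
                   alphabet_pop=False, alphabet_lock=False)
for inst in src:            # yields BinaryClassificationInstance objects
    pass
print "features:", src.get_alphabet().size()
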
Exemple #21
0
class VsmNormer(nn.Module):
    def __init__(self):
        super(VsmNormer, self).__init__()
        self.word_alphabet = Alphabet('word')
        self.embedding_dim = None
        self.word_embedding = None
        self.dict_alphabet = Alphabet('dict')
        self.dict_embedding = None
        self.gpu = opt.gpu

    def transfer_model_into_gpu(self):
        if torch.cuda.is_available():
            self.word_embedding = self.word_embedding.cuda(self.gpu)
            self.dict_embedding = self.dict_embedding.cuda(self.gpu)

    def batch_name_to_ids(self, name):
        tokens = my_tokenize(name)
        length = len(tokens)
        tokens_id = np.zeros((1, length), dtype=np.int)
        for i, word in enumerate(tokens):
            word = norm_utils.word_preprocess(word)
            tokens_id[0][i] = self.word_alphabet.get_index(word)

        tokens_id = torch.from_numpy(tokens_id)

        if torch.cuda.is_available():
            return tokens_id.cuda(self.gpu)
        else:
            return tokens_id

    def init_vector_for_dict(self, meddra_dict):
        self.dict_embedding = nn.Embedding(len(meddra_dict),
                                           self.embedding_dim)
        if torch.cuda.is_available():
            self.dict_embedding = self.dict_embedding.cuda(self.gpu)

        for concept_id, concept_name in meddra_dict.items():
            self.dict_alphabet.add(concept_id)
            with torch.no_grad():
                tokens_id = self.batch_name_to_ids(concept_name)
                length = tokens_id.size(1)
                emb = self.word_embedding(tokens_id)
                emb = emb.unsqueeze_(1)
                pool = functional.avg_pool2d(emb, (length, 1))
                index = norm_utils.get_dict_index(self.dict_alphabet,
                                                  concept_id)
                self.dict_embedding.weight.data[index] = pool[0][0]

    def compute_similarity(self, mention_rep, concep_rep):
        # mention_rep is (batch, emb_dim) and concep_rep is (concept_num, emb_dim)
        mention_rep_norm = torch.norm(mention_rep, 2, 1, True)  # batch 1
        concep_rep_norm = torch.norm(concep_rep, 2, 1, True)  # concept 1
        a = torch.matmul(mention_rep_norm,
                         torch.t(concep_rep_norm))  # batch, concept
        a = a.clamp(min=1e-8)

        b = torch.matmul(mention_rep, torch.t(concep_rep))  # batch, concept

        return b / a

    def forward(self, mention_word_ids):
        length = mention_word_ids.size(1)
        mention_word_emb = self.word_embedding(mention_word_ids)
        mention_word_emb = mention_word_emb.unsqueeze_(1)
        mention_word_pool = functional.avg_pool2d(mention_word_emb,
                                                  (length, 1))  # batch,1,1,100
        mention_word_pool = mention_word_pool.squeeze_(1).squeeze_(
            1)  # batch,100

        # similarities = torch.t(torch.matmul(self.dict_embedding.weight.data, torch.t(mention_word_pool))) # batch, dict
        similarities = self.compute_similarity(mention_word_pool,
                                               self.dict_embedding.weight.data)

        values, indices = torch.max(similarities, 1)

        return values, indices

    def process_one_doc(self, doc, entities, dict):

        for entity in entities:
            with torch.no_grad():
                tokens_id = self.batch_name_to_ids(entity.name)

                values, indices = self.forward(tokens_id)

                norm_id = norm_utils.get_dict_name(self.dict_alphabet,
                                                   indices.item())
                name = dict[norm_id]
                entity.norm_ids.append(norm_id)
                entity.norm_names.append(name)
                entity.norm_confidences.append(values.item())
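
compute_similarity above is batched cosine similarity; a minimal sketch checking it against torch.nn.functional.cosine_similarity on random inputs (shapes and values are illustrative only):

import torch
import torch.nn.functional as F

mentions = torch.randn(4, 100)   # (batch, emb_dim)
concepts = torch.randn(7, 100)   # (concept_num, emb_dim)

# same computation as compute_similarity: dot products over norm products
norms = torch.matmul(torch.norm(mentions, 2, 1, True),
                     torch.t(torch.norm(concepts, 2, 1, True))).clamp(min=1e-8)
sims = torch.matmul(mentions, torch.t(concepts)) / norms

ref = F.cosine_similarity(mentions.unsqueeze(1), concepts.unsqueeze(0), dim=2)
print(torch.allclose(sims, ref, atol=1e-6))   # True
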
Exemple #22
0
class Data:
    def __init__(self, opt):
        self.train_data = None
        self.dev_data = None
        self.test_data = None

        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')
        self.label_alphabet = Alphabet('label', True)

        self.train_texts = None
        self.train_Ids = None
        self.dev_texts = None
        self.dev_Ids = None
        self.test_texts = None
        self.test_Ids = None

        self.pretrain_word_embedding = None
        self.word_emb_dim = opt.word_emb_dim

        self.config = self.read_config(opt.config)
        self.feat_config = None

        the_item = 'ner_feature'
        if the_item in self.config:
            self.feat_config = self.config[the_item]  ## [POS]:{emb_size:20}
            self.feature_alphabets = []
            self.feature_emb_dims = []
            for k, v in self.feat_config.items():
                self.feature_alphabets.append(Alphabet(k))
                self.feature_emb_dims.append(int(v['emb_size']))

    def clear(self):
        self.train_data = None
        self.dev_data = None
        self.test_data = None

        self.train_texts = None
        self.train_Ids = None
        self.dev_texts = None
        self.dev_Ids = None
        self.test_texts = None
        self.test_Ids = None

        self.pretrain_word_embedding = None

    def build_alphabet(self, data):
        for document in data:
            for sentence in document.sentences:
                for token in sentence:
                    word = token['text']
                    if opt.ner_number_normalized:
                        word = normalize_word(word)
                    self.word_alphabet.add(word)
                    if token.get('label') is not None:
                        self.label_alphabet.add(token['label'])
                    # try:
                    #     self.label_alphabet.add(token['label'])
                    # except Exception, e:
                    #     print("document id {} {} {}".format(document.name))
                    #     exit()
                    if self.feat_config is not None:
                        for alphabet in self.feature_alphabets:
                            if alphabet.name == '[POS]':
                                alphabet.add(token['pos'])
                            elif alphabet.name == '[Cap]':
                                alphabet.add(token['cap'])

                    for char in word:
                        self.char_alphabet.add(char)

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()

    def load(self, data_file):
        f = open(data_file, 'rb')
        tmp_dict = pk.load(f)
        f.close()
        self.__dict__.update(tmp_dict)

    def save(self, save_file):
        f = open(save_file, 'wb')
        pk.dump(self.__dict__, f, 2)
        f.close()

    def read_config(self, config_file):

        config = config_file_to_dict(config_file)
        return config
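
A hedged sketch of the shape the 'ner_feature' config entry is assumed to take here: a dict from feature name to its settings, from which the constructor above builds one Alphabet and one embedding size per feature.

# Assumed shape of config['ner_feature'] (hypothetical values):
feat_config = {'[POS]': {'emb_size': '20'}, '[Cap]': {'emb_size': '10'}}
feature_alphabets = [Alphabet(k) for k in feat_config]
feature_emb_dims = [int(v['emb_size']) for v in feat_config.values()]
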
Exemple #23
0
def main(argv):
    outdir = "preprocessed_data"

    out_file = ''
    out_reduced = ''
    in_file = ''
    max_tweets = np.inf
    fwemb_vocabulary = None
    try:
        opts, args = getopt.getopt(
            argv, "i:o:f:m:", ["ifile=", "ofile=", "wfilter=", "maxTweets="])
    except getopt.GetoptError:
        print 'usage: -i <inputfile> -o <outputfile> -f <wfilter> -m <maxTweets>'
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-o", "--ofile"):
            out_file = '{}.pickle'.format(arg)
            out_reduced = '{}_reduced.pickle'.format(arg)
        elif opt in ("-i", "--ifile"):
            in_file = 'semeval/{}.gz'.format(arg)
        elif opt in ('-f', '--wfilter'):
            fwemb_vocabulary = load_glove_vocabulary(
                'embeddings/{}'.format(arg), ' ')
        elif opt in ('-m', '--maxTweets'):
            max_tweets = int(arg)

    print outdir
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    #unsupervised data
    alphabet = Alphabet(start_feature_id=0)
    alphabet.add('UNKNOWN_WORD_IDX')
    dummy_word_idx = alphabet.fid

    tknzr = TweetTokenizer(reduce_len=True)
    fnames_gz = [in_file]

    counter = 0

    for fname in fnames_gz:
        with gzip.open(fname, 'r') as f:
            for tweet in tqdm(f):
                tweet = tknzr.tokenize(preprocess_tweet(tweet))
                for token in tweet:
                    if fwemb_vocabulary:
                        if token in fwemb_vocabulary:
                            alphabet.add(token)
                    else:
                        alphabet.add(token)
                counter += 1
                if (counter % 1000000) == 0:
                    print 'Processed tweets: {}'.format(counter)
                    print 'Alphabet length: {}'.format(len(alphabet))
                if counter > max_tweets:
                    break
        print len(alphabet)

    print 'Alphabet before purge:', len(alphabet)
    cPickle.dump(alphabet, open(os.path.join(outdir, out_file), 'wb'))

    # Keep only frequent words. Re-adding a word to the alphabet it already
    # belongs to is a no-op, so the purged vocabulary must go into a fresh one.
    reduced_alphabet = Alphabet(start_feature_id=0)
    reduced_alphabet.add('UNKNOWN_WORD_IDX')
    for word, (idx, freq) in tqdm(alphabet.items()):
        if freq > 10:
            reduced_alphabet.add(word)

    reduced_alphabet.add('DUMMY_WORD_IDX')
    print "Alphabet after purge:", len(reduced_alphabet)
    cPickle.dump(reduced_alphabet,
                 open(os.path.join(outdir, out_reduced), 'wb'))
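
A hedged sketch of how this script could be wired up and invoked; the script name and argument values below are assumptions, not taken from the source:

# Hypothetical entry point for the script above.
if __name__ == '__main__':
    main(sys.argv[1:])

# Assumed invocation:
#   python build_vocab.py -i tweets -o vocab -f glove.twitter.txt -m 1000000
# reads semeval/tweets.gz, keeps only tokens present in
# embeddings/glove.twitter.txt, and writes preprocessed_data/vocab.pickle
# plus preprocessed_data/vocab_reduced.pickle.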
Exemple #24
0
class BinarySource(Source):
    """ Source for binary classification data in the following format:
    one example per line: a label, then whitespace-separated feature/value
    pairs, where feature and value are joined by the separator symbol
    (':' by default). E.g.:

        1   f1:1.0 f2:1.0 f3:1.0
        -1  f2:1.0 f3:1.0 f8:1.0
        -1  f1:1.0 f2:1.0
        1   f8:1.0 f9:1.0 f10:1.0
    """
    def __init__(self,
                 data,
                 encoding="utf-8",
                 feature_alphabet=None,
                 alphabet_pop=True,
                 alphabet_lock=True,
                 sep=":",
                 bias=False,
                 bias_prefix="@@BIAS@@"):
        Source.__init__(self, data, encoding=encoding)
        self._Instance = BinaryClassificationInstance
        if feature_alphabet is not None:
            self._feature_alphabet = feature_alphabet
        else:
            self._feature_alphabet = Alphabet(locked=False)
        self._sep = sep
        self._bias = bias
        self._bias_prefix = bias_prefix
        if alphabet_pop:
            self._populate_alphabet()
        if alphabet_lock:
            self.lock_alphabet()
        else:
            self.unlock_alphabet()
        return

    def _parse(self):
        """ return parsed line """
        sep = self._sep
        for line in self._stream:
            line = line.rstrip()
            items = line.split()
            cl = items[0]
            assert cl in [POS_LAB, NEG_LAB]
            feats = []
            if self._bias:
                feats.append((self._bias_prefix, 1.0))  # implicit bias
            for s in items[1:]:
                try:
                    f, v = s.rsplit(sep, 1)
                    v = float(v)
                    feats.append((f, v))
                except ValueError:
                    sys.exit(
                        "Datasource error: make sure you use the right datasource format."
                    )
            yield (cl, feats)

    def _populate_alphabet(self):
        print >> sys.stderr, "Populating feature alphabet...             ",
        self.unlock_alphabet()
        if self._stream_type == "generator":
            for i, gen_inst in enumerate(self._stream):  # read stream directly
                sys.stderr.write("%s" % "\b" * len(str(i)) + str(i))
                featvals = gen_inst.get_featvals()
                for (f, _) in featvals:
                    self._feature_alphabet.add(f)
        else:
            try:
                for tag, feats in self._parse():
                    for f, _ in feats:
                        self._feature_alphabet.add(f)
            except ValueError:
                sys.exit(
                    "Datasource error: make sure you use the right data format."
                )
            # rewind stream
        try:
            self.rewind()
        except TypeError:
            sys.exit("TypeError: make sure rewind() is used only on files.")
        print >> sys.stderr, " done."
        print >> sys.stderr, "Number of features: %s" % self._feature_alphabet.size(
        )
        return

    def unlock_alphabet(self):
        self._feature_alphabet.unlock()
        return

    def lock_alphabet(self):
        self._feature_alphabet.lock()
        return

    def set_alphabet(self, feature_alphabet):
        self._feature_alphabet = feature_alphabet
        return

    def get_alphabet(self):
        return self._feature_alphabet

    def get_input(self):
        for label, feats in self._parse():
            yield label, feats

    def __iter__(self):
        """ instance generator """
        feature_alphabet = self._feature_alphabet
        assert not (feature_alphabet.empty() and
                    feature_alphabet.locked()), "Feature alphabet is empty!"
        if self._stream_type in ["file", "list"]:
            for idx, (label, feats) in enumerate(self._parse()):
                if not feature_alphabet.locked():  # dynamic feature alphabet
                    for (f, _) in feats:
                        feature_alphabet.add(f)
                instance = self._Instance(idx, label, feats, feature_alphabet)
                yield instance
        elif self._stream_type == "generator":
            for idx, gen_inst in enumerate(
                    self._stream):  # read stream directly
                featvals = gen_inst.get_featvals()
                label = gen_inst.get_label()
                if not feature_alphabet.locked():  # dynamic feature alphabet
                    for (f, _) in featvals:
                        feature_alphabet.add(f)
                instance = self._Instance(idx, label, featvals,
                                          feature_alphabet)
                yield instance

    def size(self):
        s = len(list(self._stream))
        self.rewind()
        return s
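
A brief usage sketch for BinarySource, assuming the Source base class accepts a file path (the file name here is made up; Source, BinaryClassificationInstance, POS_LAB and NEG_LAB come from the surrounding module):

# Hypothetical usage: the constructor populates and locks the feature
# alphabet by default (alphabet_pop=True, alphabet_lock=True).
src = BinarySource("train.dat", bias=True)
print >> sys.stderr, "features:", src.get_alphabet().size()
for instance in src:  # yields one BinaryClassificationInstance per line
    pass  # hand each instance to a learner here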
Exemple #25
0
def create_alphabets(alphabet_directory,
                     data_paths,
                     max_vocabulary_size,
                     normalize_digits=True):
    logger = utils.get_logger("Create Alphabets")
    word_alphabet = Alphabet('word')
    pos_alphabet = Alphabet('pos')
    type_alphabet = Alphabet('type')
    if not gfile.Exists(alphabet_directory):
        logger.info("Creating Alphabets: %s" % alphabet_directory)

        pos_alphabet.add(ROOT_POS)
        type_alphabet.add(ROOT_TYPE)

        pos_alphabet.add(PAD_POS)
        type_alphabet.add(PAD_TYPE)

        vocab = dict()
        for data_path in data_paths:
            logger.info("Processing data: %s" % data_path)
            with gfile.GFile(data_path, mode="r") as data_file:
                for line in data_file:
                    line = line.decode('utf-8')
                    line = line.strip()
                    if len(line) == 0:
                        continue

                    tokens = line.split()
                    word = DIGIT_RE.sub(
                        b"0", tokens[1]) if normalize_digits else tokens[1]
                    pos = tokens[4]
                    dep_type = tokens[7]  # renamed to avoid shadowing builtin 'type'

                    pos_alphabet.add(pos)
                    type_alphabet.add(dep_type)

                    if word in vocab:
                        vocab[word] += 1
                    else:
                        vocab[word] = 1

        vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
        logger.info("Total Vocabulary Size: %d" % len(vocab_list))
        logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
        logger.info("Type Alphabet Size: %d" % type_alphabet.size())

        if len(vocab_list) > max_vocabulary_size:
            vocab_list = vocab_list[:max_vocabulary_size]
        for word in vocab_list:
            word_alphabet.add(word)

        word_alphabet.save(alphabet_directory)
        pos_alphabet.save(alphabet_directory)
        type_alphabet.save(alphabet_directory)

    else:
        word_alphabet.load(alphabet_directory)
        pos_alphabet.load(alphabet_directory)
        type_alphabet.load(alphabet_directory)

    word_alphabet.close()
    pos_alphabet.close()
    type_alphabet.close()
    return word_alphabet, pos_alphabet, type_alphabet
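
A hedged sketch of calling create_alphabets; the directory and file paths are assumptions:

# Hypothetical call: builds (or reloads) word/pos/type alphabets from
# CoNLL-style files and caches them under alphabets/.
word_alphabet, pos_alphabet, type_alphabet = create_alphabets(
    "alphabets/", ["data/train.conll", "data/dev.conll"],
    max_vocabulary_size=50000)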
Exemple #26
0
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = False
        self.norm_char_emb = False
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')

        self.feature_name = []
        self.feature_alphabets = []
        self.feature_num = len(self.feature_alphabets)
        self.feat_config = None
        self.label_alphabet = {0: Alphabet('label', True)}
        self.tagScheme = "NoSeg"  ## BMES/BIO

        self.seg = True

        ### I/O
        self.train_dir = None
        self.dev_dir = None
        self.test_dir = None
        self.raw_dir = None

        self.decode_dir = None
        self.dset_dir = None  ## data vocabulary related file
        self.model_dir = None  ## model save  file
        self.load_model_dir = None  ## model load file

        self.word_emb_dir = None
        self.char_emb_dir = None
        self.feature_emb_dirs = []

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []

        self.pretrain_word_embedding = None
        self.pretrain_char_embedding = None
        self.pretrain_feature_embeddings = []

        #Added for pretraining
        self.PRETRAINED_ALL = "all"
        self.PRETRAINED_LSTMS = "lstms"
        self.pretrained_model = None
        self.pretrained_part = None

        self.label_size = 0
        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        #self.label_alphabet_size = 0
        self.label_alphabet_sizes = {0: 0}
        self.feature_alphabet_sizes = []
        self.feature_emb_dims = []
        self.norm_feature_embs = []
        self.word_emb_dim = 50
        self.char_emb_dim = 30

        ###Networks
        self.word_feature_extractor = "LSTM"  ## "LSTM"/"CNN"/"GRU"/
        self.use_char = True
        self.char_feature_extractor = "CNN"  ## "LSTM"/"CNN"/"GRU"/None
        self.use_crf = True
        self.nbest = None

        ## Training
        self.average_batch_loss = False
        self.optimizer = "SGD"  ## "SGD"/"AdaGrad"/"AdaDelta"/"RMSProp"/"Adam"
        self.status = "train"
        ### Hyperparameters
        self.HP_cnn_layer = 4
        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 200
        self.HP_feature_default_size = 20
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True

        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = None
        self.HP_momentum = 0
        self.HP_l2 = 1e-8

        #D: The number of tasks to be solved
        self.HP_tasks = 1
        self.HP_main_tasks = self.HP_tasks
        self.HP_tasks_weights = [1]

        self.optimize_with_evalb = False
        self.optimize_with_las = False
        self.offset = False
        self.choice_of_best_model = "avg"
        self.language = "English"
        #   self.HP_tasks_inputs = [self.LSTMOUT]

        #Policy Gradient
        self.No_samples = 8
        self.pg_variance_reduce = True
        self.variance_reduce_burn_in = 999
        self.pg_valsteps = 1000
        self.entropy_regularisation = True
        self.entropy_reg_coeff = 0.01

        #Hyper-parameters for disjoint training
        self.train_task_ids = []
        self.dev_task_ids = []
        self.test_task_ids = []
        self.raw_task_ids = []
        self.disjoint = True
        self.datasets = {}
        self.tasks_metrics = {}
        self.HP_tasks_weight_decays = [0]

    def show_data_summary(self):
        print("++" * 50)
        print("DATA SUMMARY START:")
        print(" I/O:")
        print("     Tag          scheme: %s" % (self.tagScheme))
        print("     MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print("     MAX   WORD   LENGTH: %s" % (self.MAX_WORD_LENGTH))
        print("     Number   normalized: %s" % (self.number_normalized))
        print("     Word  alphabet size: %s" % (self.word_alphabet_size))
        print("     Char  alphabet size: %s" % (self.char_alphabet_size))
        for idtask in self.label_alphabet:
            print("     Label alphabet size for task %s: %s" %
                  (idtask, self.label_alphabet_sizes[idtask]))
        #print("     Label alphabet size: %s"%(self.label_alphabet_size))
        print("     Word embedding  dir: %s" % (self.word_emb_dir))
        print("     Char embedding  dir: %s" % (self.char_emb_dir))
        print("     Word embedding size: %s" % (self.word_emb_dim))
        print("     Char embedding size: %s" % (self.char_emb_dim))
        print("     Norm   word     emb: %s" % (self.norm_word_emb))
        print("     Norm   char     emb: %s" % (self.norm_char_emb))
        print("     Train  file directory: %s" % (self.train_dir))
        print("     Dev    file directory: %s" % (self.dev_dir))
        print("     Test   file directory: %s" % (self.test_dir))
        print("     Raw    file directory: %s" % (self.raw_dir))
        print("     Dset   file directory: %s" % (self.dset_dir))
        print("     Model  file directory: %s" % (self.model_dir))
        print("     Pretrained model     : %s" % (self.pretrained_model))
        print("     Pretrained part      : %s" % (self.pretrained_part))
        print("     Loadmodel   directory: %s" % (self.load_model_dir))
        print("     Decode file directory: %s" % (self.decode_dir))
        print("     Train instance number: %s" % (len(self.train_texts)))
        print("     Dev   instance number: %s" % (len(self.dev_texts)))
        print("     Test  instance number: %s" % (len(self.test_texts)))
        print("     Raw   instance number: %s" % (len(self.raw_texts)))
        print("     FEATURE num: %s" % (self.feature_num))
        for idx in range(self.feature_num):
            print("         Fe: %s  alphabet  size: %s" %
                  (self.feature_alphabets[idx].name,
                   self.feature_alphabet_sizes[idx]))
            print(
                "         Fe: %s  embedding  dir: %s" %
                (self.feature_alphabets[idx].name, self.feature_emb_dirs[idx]))
            print(
                "         Fe: %s  embedding size: %s" %
                (self.feature_alphabets[idx].name, self.feature_emb_dims[idx]))
            print("         Fe: %s  norm       emb: %s" %
                  (self.feature_alphabets[idx].name,
                   self.norm_feature_embs[idx]))
        print(" " + "++" * 20)
        print(" Model Network:")
        print("     Model        use_crf: %s" % (self.use_crf))
        print("     Model word extractor: %s" % (self.word_feature_extractor))
        print("     Model       use_char: %s" % (self.use_char))
        if self.use_char:
            print("     Model char extractor: %s" %
                  (self.char_feature_extractor))
            print("     Model char_hidden_dim: %s" % (self.HP_char_hidden_dim))
        print(" " + "++" * 20)
        print(" Training:")
        print("     Optimizer: %s" % (self.optimizer))
        print("     Iteration: %s" % (self.HP_iteration))
        print("     BatchSize: %s" % (self.HP_batch_size))
        print("     Average  batch   loss: %s" % (self.average_batch_loss))

        print(" " + "++" * 20)
        print(" Hyperparameters:")

        print("     Hyper              lr: %s" % (self.HP_lr))
        print("     Hyper        lr_decay: %s" % (self.HP_lr_decay))
        print("     Hyper         HP_clip: %s" % (self.HP_clip))
        print("     Hyper        momentum: %s" % (self.HP_momentum))
        print("     Hyper              l2: %s" % (self.HP_l2))
        print("     Hyper      hidden_dim: %s" % (self.HP_hidden_dim))
        print("     Hyper         dropout: %s" % (self.HP_dropout))
        print("     Hyper      lstm_layer: %s" % (self.HP_lstm_layer))
        print("     Hyper          bilstm: %s" % (self.HP_bilstm))
        print("     Hyper             GPU: %s" % (self.HP_gpu))
        print("     Hyper number of tasks: %s" % (self.HP_tasks))

        print("DATA SUMMARY END.")
        print("++" * 50)
        sys.stdout.flush()

    def initial_feature_alphabets(self):
        for l in open(self.train_dir, 'r').readlines():
            if not l.startswith("#") and not l.startswith("-BOS-"):
                items = l.strip("\n").split()
                break

        total_column = len(items)
        if total_column > 2:
            for idx in range(1, total_column - 1):
                feature_prefix = items[idx].split(']', 1)[0] + "]"
                self.feature_alphabets.append(Alphabet(feature_prefix))
                self.feature_name.append(feature_prefix)
                print "Find feature: ", feature_prefix
        self.feature_num = len(self.feature_alphabets)

        self.pretrain_feature_embeddings = [None] * self.feature_num
        self.feature_emb_dims = [self.HP_feature_default_size
                                 ] * self.feature_num
        #self.feature_emb_dims = [20]*self.feature_num
        self.feature_emb_dirs = [None] * self.feature_num
        self.norm_feature_embs = [False] * self.feature_num
        self.feature_alphabet_sizes = [0] * self.feature_num
        if self.feat_config:
            for idx in range(self.feature_num):
                if self.feature_name[idx] in self.feat_config:
                    self.feature_emb_dims[idx] = self.feat_config[
                        self.feature_name[idx]]['emb_size']
                    self.feature_emb_dirs[idx] = self.feat_config[
                        self.feature_name[idx]]['emb_dir']
                    self.norm_feature_embs[idx] = self.feat_config[
                        self.feature_name[idx]]['emb_norm']

    def build_alphabet(self, input_file):
        sample_corpus = None
        in_lines = open(input_file, 'r').readlines()
        for line in in_lines:
            # Check which treebank this sentence comes from
            if line.upper().startswith(TREEBANK_LINE):
                sample_corpus = "[" + line.upper().replace(TREEBANK_LINE,
                                                           "").strip() + "]"

            elif len(line) > 2:
                pairs = line.strip().split()
                word = pairs[0].decode('utf-8')
                if self.number_normalized:
                    word = normalize_word(word)
                label = pairs[-1]

                if self.HP_tasks > 1 or not self.disjoint:  #self.task_config[sample_corpus]["nb_tasks"] > 1:
                    label = parse_multitask_label(label)
                else:
                    label = [label]

                if len(label) != len(
                        self.label_alphabet) and not self.disjoint:
                    raise ValueError(
                        "The number of tasks and the number of labels in the output column do not match"
                    )

                init_label_alp_index = 0 if not self.disjoint else self.task_config[
                    sample_corpus]["idstask"]
                for idtask, l in enumerate(label, init_label_alp_index):
                    #for idtask, l in enumerate(label):
                    self.label_alphabet[idtask].add(l)
                self.word_alphabet.add(word)
                for idx in range(self.feature_num):
                    feat_idx = pairs[idx + 1].split(']', 1)[-1]
                    self.feature_alphabets[idx].add(feat_idx)

                for char in word:
                    self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()

        for idtask in self.label_alphabet:
            self.label_alphabet_sizes[idtask] = self.label_alphabet[
                idtask].size()

        for idx in range(self.feature_num):
            self.feature_alphabet_sizes[idx] = self.feature_alphabets[
                idx].size()

        for idtask in self.label_alphabet:
            startS = False
            startB = False

            for label, _ in self.label_alphabet[idtask].iteritems():
                if "S-" in label.upper():
                    startS = True
                elif "B-" in label.upper():
                    startB = True
            if startB:
                if startS:
                    self.tagScheme = "BMES"
                else:
                    self.tagScheme = "BIO"

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()

        for idtask in self.label_alphabet:
            self.label_alphabet[idtask].close()
        for idx in range(self.feature_num):
            self.feature_alphabets[idx].close()

    def build_pretrain_emb(self):
        if self.word_emb_dir:
            print("Load pretrained word embedding, norm: %s, dir: %s" %
                  (self.norm_word_emb, self.word_emb_dir))
            self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
                self.word_emb_dir, self.word_alphabet, self.word_emb_dim,
                self.norm_word_emb)
        if self.char_emb_dir:
            print("Load pretrained char embedding, norm: %s, dir: %s" %
                  (self.norm_char_emb, self.char_emb_dir))
            self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(
                self.char_emb_dir, self.char_alphabet, self.char_emb_dim,
                self.norm_char_emb)
        for idx in range(self.feature_num):
            if self.feature_emb_dirs[idx]:
                print(
                    "Load pretrained feature %s embedding:, norm: %s, dir: %s"
                    % (self.feature_name[idx], self.norm_feature_embs[idx],
                       self.feature_emb_dirs[idx]))
                self.pretrain_feature_embeddings[idx], self.feature_emb_dims[
                    idx] = build_pretrain_embedding(
                        self.feature_emb_dirs[idx],
                        self.feature_alphabets[idx],
                        self.feature_emb_dims[idx],
                        self.norm_feature_embs[idx])

    def generate_instance(self, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(
                self.train_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.task_config if self.disjoint else None)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                self.dev_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.task_config if self.disjoint else None)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                self.test_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.task_config if self.disjoint else None)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance(
                self.raw_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.task_config if self.disjoint else None)
        else:
            print(
                "Error: you can only generate train/dev/test instance! Illegal input:%s"
                % (name))

    def write_decoded_results(self, predict_results, name, indexes=None):
        fout = open(self.decode_dir, 'w')
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print(
                "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
            )
        for task_predict_results in predict_results:
            sent_num = len(task_predict_results)
            assert (sent_num == len(content_list))

        for idx in range(sent_num):

            if indexes is not None and idx not in indexes:
                continue

            # Use task 0 to get the length of the input sentence
            sent_length = len(predict_results[0][idx])
            for idy in range(sent_length):
                ## content_list[idx] is a list with [word, char, label]
                inputs = []
                for id_input in range(len(content_list[idx]) - 2):
                    if content_list[idx][id_input][0] != []:
                        if isinstance(content_list[idx][id_input][idy], list):
                            for feature in content_list[idx][id_input][idy]:
                                inputs.append(feature.encode('utf-8'))
                        else:
                            inputs.append(
                                content_list[idx][id_input][idy].encode('utf-8'))

                outputs = []
                for task in predict_results:
                    outputs.append(task[idx][idy])

                fout.write("\t".join(inputs) + "\t" + "{}".join(outputs) +
                           '\n')
            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file. %s" %
              (name, self.decode_dir))

    def load(self, data_file):
        f = open(data_file, 'rb')
        tmp_dict = pickle.load(f)
        f.close()
        self.__dict__.update(tmp_dict)

    def save(self, save_file):
        f = open(save_file, 'wb')
        pickle.dump(self.__dict__, f, 2)
        f.close()

    def write_nbest_decoded_results(self, predict_results, pred_scores, name):
        fout = open(self.decode_dir, 'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print(
                "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
            )

        for idtask_predict_results, task_predict_results in enumerate(
                predict_results):
            sent_num = len(task_predict_results)
            assert (sent_num == len(content_list))

        for idx in range(sent_num):
            score_string = "# "

            for idtask_predict_results, task_predict_results in enumerate(
                    predict_results):
                sent_length = len(task_predict_results[idx][0])
                nbest = len(task_predict_results[0])

                #Printing the probabilities
                for idz in range(nbest):
                    score_string += format(
                        pred_scores[idtask_predict_results][idx][idz],
                        '.4f') + " "
            fout.write(score_string.strip() + "\t")
            fout.write("\n")

            for idy in range(sent_length):

                label_string = content_list[idx][0][idy].encode('utf-8') + "\t"
                for ifeat in range(len(content_list[idx][1][idy])):
                    label_string += content_list[idx][1][idy][ifeat].encode(
                        'utf-8') + "\t"

                for idtask_predict_results, task_predict_results in enumerate(
                        predict_results):
                    for idz in range(nbest):
                        label_string += task_predict_results[idx][idz][
                            idy] + ","
                    label_string = label_string.strip().strip(",") + "{}"
                fout.write(label_string)
                fout.write('\n')
            fout.write('\n')
        fout.close()
        print("Predict %s %s-best result has been written into file. %s" %
              (name, nbest, self.decode_dir))

    def read_config(self, config_file):
        config = config_file_to_dict(config_file)
        ## read data:
        the_item = 'train_dir'
        if the_item in config:
            self.train_dir = config[the_item]
        the_item = 'dev_dir'
        if the_item in config:
            self.dev_dir = config[the_item]
        the_item = 'test_dir'
        if the_item in config:
            self.test_dir = config[the_item]
        the_item = 'raw_dir'
        if the_item in config:
            self.raw_dir = config[the_item]
        the_item = 'decode_dir'
        if the_item in config:
            self.decode_dir = config[the_item]
        the_item = 'dset_dir'
        if the_item in config:
            self.dset_dir = config[the_item]
        the_item = 'model_dir'
        if the_item in config:
            self.model_dir = config[the_item]
        the_item = 'load_model_dir'
        if the_item in config:
            self.load_model_dir = config[the_item]
        the_item = 'word_emb_dir'
        if the_item in config:
            self.word_emb_dir = config[the_item]
        the_item = 'char_emb_dir'
        if the_item in config:
            self.char_emb_dir = config[the_item]

        the_item = 'MAX_SENTENCE_LENGTH'
        if the_item in config:
            self.MAX_SENTENCE_LENGTH = int(config[the_item])
        the_item = 'MAX_WORD_LENGTH'
        if the_item in config:
            self.MAX_WORD_LENGTH = int(config[the_item])

        the_item = 'norm_word_emb'
        if the_item in config:
            self.norm_word_emb = str2bool(config[the_item])
        the_item = 'norm_char_emb'
        if the_item in config:
            self.norm_char_emb = str2bool(config[the_item])
        the_item = 'number_normalized'
        if the_item in config:
            self.number_normalized = str2bool(config[the_item])

        the_item = 'seg'
        if the_item in config:
            self.seg = str2bool(config[the_item])
        the_item = 'word_emb_dim'
        if the_item in config:
            self.word_emb_dim = int(config[the_item])
        the_item = 'char_emb_dim'
        if the_item in config:
            self.char_emb_dim = int(config[the_item])

        ## read network:
        the_item = 'use_crf'
        if the_item in config:
            self.use_crf = str2bool(config[the_item])
        the_item = 'use_char'
        if the_item in config:
            self.use_char = str2bool(config[the_item])
        the_item = 'word_seq_feature'
        if the_item in config:
            self.word_feature_extractor = config[the_item]
        the_item = 'char_seq_feature'
        if the_item in config:
            self.char_feature_extractor = config[the_item]
        the_item = 'nbest'
        if the_item in config:
            self.nbest = int(config[the_item])

        the_item = 'feature'
        if the_item in config:
            self.feat_config = config[the_item]  ## feat_config is a dict

        the_item = 'feature_default_size'
        if the_item in config:
            self.HP_feature_default_size = int(config[the_item])

        ## read training setting:
        the_item = 'optimizer'
        if the_item in config:
            self.optimizer = config[the_item]
        the_item = 'ave_batch_loss'
        if the_item in config:
            self.average_batch_loss = str2bool(config[the_item])
        the_item = 'status'
        if the_item in config:
            self.status = config[the_item]

        ## read Hyperparameters:
        the_item = 'cnn_layer'
        if the_item in config:
            self.HP_cnn_layer = int(config[the_item])
        the_item = 'iteration'
        if the_item in config:
            self.HP_iteration = int(config[the_item])
        the_item = 'batch_size'
        if the_item in config:
            self.HP_batch_size = int(config[the_item])

        the_item = 'char_hidden_dim'
        if the_item in config:
            self.HP_char_hidden_dim = int(config[the_item])
        the_item = 'hidden_dim'
        if the_item in config:
            self.HP_hidden_dim = int(config[the_item])
        the_item = 'dropout'
        if the_item in config:
            self.HP_dropout = float(config[the_item])
        the_item = 'lstm_layer'
        if the_item in config:
            self.HP_lstm_layer = int(config[the_item])
        the_item = 'bilstm'
        if the_item in config:
            self.HP_bilstm = str2bool(config[the_item])

        the_item = 'gpu'
        if the_item in config:
            self.HP_gpu = str2bool(config[the_item])
        the_item = 'learning_rate'
        if the_item in config:
            self.HP_lr = float(config[the_item])
        the_item = 'lr_decay'
        if the_item in config:
            self.HP_lr_decay = float(config[the_item])
        the_item = 'clip'
        if the_item in config:
            self.HP_clip = float(config[the_item])
        the_item = 'momentum'
        if the_item in config:
            self.HP_momentum = float(config[the_item])
        the_item = 'l2'
        if the_item in config:
            self.HP_l2 = float(config[the_item])

        #Hyperparameters for auxiliary tasks over the same treebank

        the_item = 'disjoint'
        if the_item in config:
            self.disjoint = str2bool(config[the_item])

        if not self.disjoint:

            the_item = 'tasks'
            if the_item in config:
                self.HP_tasks = int(config[the_item])
                if self.HP_tasks > 1:
                    self.label_alphabet = {
                        idtask: Alphabet('label', True)
                        for idtask in range(self.HP_tasks)
                    }
                    self.label_alphabet_sizes = {
                        idtask: self.label_alphabet[idtask].size()
                        for idtask in range(self.HP_tasks)
                    }

            the_item = "main_tasks"
            if the_item in config:
                self.HP_main_tasks = int(config[the_item])
                print "main_tasks: %s, tasks: %s" % (self.HP_main_tasks, self.HP_tasks)
                if self.HP_main_tasks > self.HP_tasks:
                    raise ValueError(
                        "HP_main_tasks cannot be greater than HP_tasks")

            the_item = 'tasks_weights'
            if the_item in config:
                self.HP_tasks_weights = map(float, config[the_item].split("|"))

        else:
            #Hyperparameters for auxiliary tasks over a different treebank
            the_item = 'dataset'
            if the_item in config:
                self.task_config = config[the_item]  ## task_config is a dict
                self.HP_tasks = sum([
                    self.task_config[idtask]["nb_tasks"]
                    for idtask in self.task_config
                ])

                self.HP_main_tasks = sum([
                    self.task_config[idtask]["nb_tasks"]
                    for idtask in self.task_config
                    if self.task_config[idtask]["main"]
                ])

                self.label_alphabet = {
                    idtask: Alphabet('label', True)
                    for idtask in range(self.HP_tasks)
                }
                self.label_alphabet_sizes = {
                    idtask: self.label_alphabet[idtask].size()
                    for idtask in range(self.HP_tasks)
                }

                self.HP_tasks_weights = []
                self.HP_tasks_weight_decays = []
                for idtask in self.task_config:
                    for weight in self.task_config[idtask]["weight"]:
                        self.HP_tasks_weights.append(weight)

                    if "weight_decay" in self.task_config[idtask]:
                        for weight_decay in self.task_config[idtask][
                                "weight_decay"]:
                            self.HP_tasks_weight_decays.append(weight_decay)
                    else:
                        for j in range(self.task_config[idtask]["nb_tasks"]):
                            self.HP_tasks_weight_decays.append(0)

                self.dataset_ids = {
                    treebank: range(
                        self.task_config[treebank]["idstask"],
                        self.task_config[treebank]["idstask"] +
                        self.task_config[treebank]["nb_tasks"])
                    for id, treebank in enumerate(self.task_config)
                }

                self.ignore_after_epoch = {
                    treebank: self.task_config[treebank]["ignore_after_epoch"]
                    if "ignore_after_epoch" in self.task_config[treebank] else
                    self.HP_iteration + 1
                    for treebank in self.task_config
                }

                self.inv_dataset_ids = {}
                for tb in self.dataset_ids:
                    for subtask in self.dataset_ids[tb]:
                        self.inv_dataset_ids[subtask] = tb

                self.task_metric = {}
                for dataset in self.task_config:
                    for i in range(
                            self.task_config[dataset]["idstask"],
                            self.task_config[dataset]["idstask"] +
                            self.task_config[dataset]["nb_tasks"]):

                        if "metric" in self.task_config[dataset]:
                            self.task_metric[i] = self.task_config[dataset][
                                "metric"]

        the_item = 'evaluate'
        if the_item in config:
            self.evaluate = config[the_item]

        the_item = "gold_dev_trees"
        if the_item in config:
            self.gold_dev_trees = config[the_item]

        the_item = "gold_dev_dep"
        if the_item in config:
            self.gold_dev_dep = config[the_item]

        the_item = "combine_dependency_offset"
        if the_item in config:
            self.offset = str2bool(config[the_item])

        the_item = "pretrained_model"
        if the_item in config:
            self.pretrained_model = config[the_item]

        the_item = "pretrained_part"
        if the_item in config:
            if config[the_item].lower() not in [
                    self.PRETRAINED_ALL, self.PRETRAINED_LSTMS
            ]:
                raise ValueError(
                    "Invalid value for pretrained_part (must be 'all' or 'lstms')"
                )
            self.pretrained_part = config[the_item]

        the_item = "optimize_with_las"
        if the_item in config:
            self.optimize_with_las = str2bool(config[the_item])

        the_item = "gold_train_trees"
        if the_item in config:
            self.gold_train_trees = config[the_item]
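
A hedged sketch of the configuration format read_config() appears to consume, assuming config_file_to_dict parses one key=value pair per line (all paths and values below are illustrative):

train_dir=data/train.seq
dev_dir=data/dev.seq
test_dir=data/test.seq
word_emb_dir=embeddings/glove.6B.50d.txt
use_crf=True
use_char=True
optimizer=SGD
learning_rate=0.015
iteration=100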
Exemple #27
0
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 230
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = False
        self.norm_word_emb = True
        self.norm_biword_emb = True
        self.norm_gaz_emb = False
        self.word_alphabet = Alphabet('word')
        self.biword_alphabet = Alphabet('biword')
        self.char_alphabet = Alphabet('character')
        # self.word_alphabet.add(START)
        # self.word_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(START)
        # self.char_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(PADDING)
        self.label_alphabet = Alphabet('label', True)
        self.gaz_lower = False
        self.gaz = Gazetteer(self.gaz_lower)
        self.gaz_alphabet = Alphabet('gaz')
        self.HP_fix_gaz_emb = False
        self.HP_use_gaz = True

        self.tagScheme = "BMES"
        self.char_features = "LSTM"

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []
        self.use_bigram = False
        self.word_emb_dim = 50
        self.biword_emb_dim = 50
        self.char_emb_dim = 50
        self.gaz_emb_dim = 50
        self.gaz_dropout = 0.5
        self.pretrain_word_embedding = None
        self.pretrain_biword_embedding = None
        self.pretrain_gaz_embedding = None
        self.label_size = 0
        self.word_alphabet_size = 0
        self.biword_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        # hyperparameters
        self.HP_iteration = 100
        self.HP_batch_size = 1
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_use_char = True
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = 5.0
        self.HP_momentum = 0

    def show_data_summary(self):
        print("DATA SUMMARY START:")
        print("     Tag          scheme: %s" % (self.tagScheme))
        print("     MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print("     MAX   WORD   LENGTH: %s" % (self.MAX_WORD_LENGTH))
        print("     Number   normalized: %s" % (self.number_normalized))
        print("     Use          bigram: %s" % (self.use_bigram))
        print("     Word  alphabet size: %s" % (self.word_alphabet_size))
        print("     Biword alphabet size: %s" % (self.biword_alphabet_size))
        print("     Char  alphabet size: %s" % (self.char_alphabet_size))
        print("     Gaz   alphabet size: %s" % (self.gaz_alphabet.size()))
        print("     Label alphabet size: %s" % (self.label_alphabet_size))
        print("     Word embedding size: %s" % (self.word_emb_dim))
        print("     Biword embedding size: %s" % (self.biword_emb_dim))
        print("     Char embedding size: %s" % (self.char_emb_dim))
        print("     Gaz embedding size: %s" % (self.gaz_emb_dim))
        print("     Norm     word   emb: %s" % (self.norm_word_emb))
        print("     Norm     biword emb: %s" % (self.norm_biword_emb))
        print("     Norm     gaz    emb: %s" % (self.norm_gaz_emb))
        print("     Norm   gaz  dropout: %s" % (self.gaz_dropout))
        print("     Train instance number: %s" % (len(self.train_texts)))
        print("     Dev   instance number: %s" % (len(self.dev_texts)))
        print("     Test  instance number: %s" % (len(self.test_texts)))
        print("     Raw   instance number: %s" % (len(self.raw_texts)))
        print("     Hyperpara  iteration: %s" % (self.HP_iteration))
        print("     Hyperpara  batch size: %s" % (self.HP_batch_size))
        print("     Hyperpara          lr: %s" % (self.HP_lr))
        print("     Hyperpara    lr_decay: %s" % (self.HP_lr_decay))
        print("     Hyperpara     HP_clip: %s" % (self.HP_clip))
        print("     Hyperpara    momentum: %s" % (self.HP_momentum))
        print("     Hyperpara  hidden_dim: %s" % (self.HP_hidden_dim))
        print("     Hyperpara     dropout: %s" % (self.HP_dropout))
        print("     Hyperpara  lstm_layer: %s" % (self.HP_lstm_layer))
        print("     Hyperpara      bilstm: %s" % (self.HP_bilstm))
        print("     Hyperpara         GPU: %s" % (self.HP_gpu))
        print("     Hyperpara     use_gaz: %s" % (self.HP_use_gaz))
        print("     Hyperpara fix gaz emb: %s" % (self.HP_fix_gaz_emb))
        print("     Hyperpara    use_char: %s" % (self.HP_use_char))
        if self.HP_use_char:
            print("             Char_features: %s" % (self.char_features))
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def refresh_label_alphabet(self, input_file):
        old_size = self.label_alphabet_size
        self.label_alphabet.clear(True)
        in_lines = open(input_file, 'r').readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                label = pairs[-1]
                self.label_alphabet.add(label)
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"
        self.fix_alphabet()
        print("Refresh label alphabet finished: old:%s -> new:%s" %
              (old_size, self.label_alphabet_size))

    def build_alphabet(self, input_file):
        in_lines = open(input_file, 'r').readlines()
        for idx in xrange(len(in_lines)):
            line = in_lines[idx]
            if len(line) > 2:
                pairs = line.strip().split()
                word = pairs[0].decode('utf-8')
                if self.number_normalized:
                    word = normalize_word(word)
                # get the label
                label = pairs[-1]
                # add entries in order of appearance
                self.label_alphabet.add(label)
                self.word_alphabet.add(word)
                if idx < len(in_lines) - 1 and len(in_lines[idx + 1]) > 2:
                    biword = word + in_lines[
                        idx + 1].strip().split()[0].decode('utf-8')
                else:
                    biword = word + NULLKEY
                self.biword_alphabet.add(biword)
                for char in word:
                    self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.biword_alphabet_size = self.biword_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        # determine whether the tag scheme is BIO, BMES, or BIOES
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                # S- labels present: the scheme is BMES (or BIOES)
                self.tagScheme = "BMES"
            else:
                # no S- labels: the scheme is BIO
                self.tagScheme = "BIO"

    def build_gaz_file(self, gaz_file):
        # Build the gazetteer trie from the first column of the gaz embedding file
        if gaz_file:
            fins = open(gaz_file, 'r').readlines()
            for fin in fins:
                fin = fin.strip().split()[0].decode('utf-8')
                if fin:
                    self.gaz.insert(fin, "one_source")
            print "Load gaz file: ", gaz_file, " total size:", self.gaz.size()
        else:
            print "Gaz file is None, load nothing"

    def build_gaz_alphabet(self, input_file):
        in_lines = open(input_file, 'r').readlines()
        word_list = []
        for line in in_lines:
            if len(line) > 3:
                word = line.split()[0].decode('utf-8')
                if self.number_normalized:
                    word = normalize_word(word)
                word_list.append(word)
            else:
                w_length = len(word_list)
                for idx in range(w_length):
                    matched_entity = self.gaz.enumerateMatchList(
                        word_list[idx:])
                    for entity in matched_entity:
                        # print entity, self.gaz.searchId(entity),self.gaz.searchType(entity)
                        self.gaz_alphabet.add(entity)
                word_list = []
        print "gaz alphabet size:", self.gaz_alphabet.size()

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.biword_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()
        self.gaz_alphabet.close()

    def build_word_pretrain_emb(self, emb_path):
        print "build word pretrain emb..."
        self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
            emb_path, self.word_alphabet, self.word_emb_dim,
            self.norm_word_emb)

    def build_radical_pretrain_emb(self, emb_path):
        print "build radical pretrain emb..."
        self.pretrain_word_embedding, self.word_emb_dim = build_radical_pretrain_embedding(
            emb_path, self.word_alphabet, self.word_emb_dim,
            self.norm_word_emb)

    def build_biword_pretrain_emb(self, emb_path):
        print "build biword pretrain emb..."
        self.pretrain_biword_embedding, self.biword_emb_dim = build_pretrain_embedding(
            emb_path, self.biword_alphabet, self.biword_emb_dim,
            self.norm_biword_emb)

    def build_gaz_pretrain_emb(self, emb_path):
        print "build gaz pretrain emb..."
        self.pretrain_gaz_embedding, self.gaz_emb_dim = build_pretrain_embedding(
            emb_path, self.gaz_alphabet, self.gaz_emb_dim, self.norm_gaz_emb)

    def generate_instance(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        else:
            print(
                "Error: you can only generate train/dev/test instance! Illegal input:%s"
                % (name))

    def generate_instance_with_gaz(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        else:
            print(
                "Error: you can only generate train/dev/test instance! Illegal input:%s"
                % (name))

    def write_decoded_results(self, output_file, predict_results, name):
        fout = open(output_file, 'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print(
                "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
            )
        assert (sent_num == len(content_list))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                # content_list[idx] is a list with [word, char, label]
                fout.write(content_list[idx][0][idy].encode('utf-8') + " " +
                           predict_results[idx][idy] + '\n')

            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file. %s" %
              (name, output_file))
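
A hedged end-to-end sketch of driving this Data class; all file paths are assumptions:

# Hypothetical pipeline (paths assumed, character-level BMES data):
data = Data()
data.build_alphabet("data/train.char.bmes")
data.build_gaz_file("data/gaz_embeddings.vec")
data.build_gaz_alphabet("data/train.char.bmes")
data.fix_alphabet()
data.generate_instance_with_gaz("data/train.char.bmes", "train")
data.show_data_summary()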
Exemple #28
0
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = False
        self.norm_char_emb = False
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')

        self.feature_name = []
        self.feature_alphabets = []
        self.feature_num = len(self.feature_alphabets)
        self.feat_config = None


        self.label_alphabet = Alphabet('label',True)
        self.tagScheme = "NoSeg" ## BMES/BIO
        
        self.seg = True

        ### I/O
        self.train_dir = None 
        self.dev_dir = None 
        self.test_dir = None 
        self.raw_dir = None

        self.decode_dir = None
        self.dset_dir = None ## data vocabulary related file
        self.model_dir = None ## model save  file
        self.load_model_dir = None ## model load file

        self.word_emb_dir = None 
        self.char_emb_dir = None
        self.feature_emb_dirs = []

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []

        self.pretrain_word_embedding = None
        self.pretrain_char_embedding = None
        self.pretrain_feature_embeddings = []

        self.label_size = 0
        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        self.feature_alphabet_sizes = []
        self.feature_emb_dims = []
        self.norm_feature_embs = []
        self.word_emb_dim = 50
        self.char_emb_dim = 30

        ###Networks
        self.word_feature_extractor = "LSTM" ## "LSTM"/"CNN"/"GRU"/
        self.use_char = True
        self.char_feature_extractor = "CNN" ## "LSTM"/"CNN"/"GRU"/None
        self.use_crf = True
        self.nbest = None
        
        ## Training
        self.average_batch_loss = False
        self.optimizer = "SGD" ## "SGD"/"AdaGrad"/"AdaDelta"/"RMSProp"/"Adam"
        self.status = "train"
        ### Hyperparameters
        self.HP_cnn_layer = 4
        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = None
        self.HP_momentum = 0
        self.HP_l2 = 1e-8
        
    def show_data_summary(self):
        print("++"*50)
        print("DATA SUMMARY START:")
        print(" I/O:")
        print("     Tag          scheme: %s"%(self.tagScheme))
        print("     MAX SENTENCE LENGTH: %s"%(self.MAX_SENTENCE_LENGTH))
        print("     MAX   WORD   LENGTH: %s"%(self.MAX_WORD_LENGTH))
        print("     Number   normalized: %s"%(self.number_normalized))
        print("     Word  alphabet size: %s"%(self.word_alphabet_size))
        print("     Char  alphabet size: %s"%(self.char_alphabet_size))
        print("     Label alphabet size: %s"%(self.label_alphabet_size))
        print("     Word embedding  dir: %s"%(self.word_emb_dir))
        print("     Char embedding  dir: %s"%(self.char_emb_dir))
        print("     Word embedding size: %s"%(self.word_emb_dim))
        print("     Char embedding size: %s"%(self.char_emb_dim))
        print("     Norm   word     emb: %s"%(self.norm_word_emb))
        print("     Norm   char     emb: %s"%(self.norm_char_emb))
        print("     Train  file directory: %s"%(self.train_dir))
        print("     Dev    file directory: %s"%(self.dev_dir))
        print("     Test   file directory: %s"%(self.test_dir))
        print("     Raw    file directory: %s"%(self.raw_dir))
        print("     Dset   file directory: %s"%(self.dset_dir))
        print("     Model  file directory: %s"%(self.model_dir))
        print("     Loadmodel   directory: %s"%(self.load_model_dir))
        print("     Decode file directory: %s"%(self.decode_dir))
        print("     Train instance number: %s"%(len(self.train_texts)))
        print("     Dev   instance number: %s"%(len(self.dev_texts)))
        print("     Test  instance number: %s"%(len(self.test_texts)))
        print("     Raw   instance number: %s"%(len(self.raw_texts)))
        print("     FEATURE num: %s"%(self.feature_num))
        for idx in range(self.feature_num):
            print("         Fe: %s  alphabet  size: %s"%(self.feature_alphabets[idx].name, self.feature_alphabet_sizes[idx]))
            print("         Fe: %s  embedding  dir: %s"%(self.feature_alphabets[idx].name, self.feature_emb_dirs[idx]))
            print("         Fe: %s  embedding size: %s"%(self.feature_alphabets[idx].name, self.feature_emb_dims[idx]))
            print("         Fe: %s  norm       emb: %s"%(self.feature_alphabets[idx].name, self.norm_feature_embs[idx]))
        print(" "+"++"*20)
        print(" Model Network:")
        print("     Model        use_crf: %s"%(self.use_crf))
        print("     Model word extractor: %s"%(self.word_feature_extractor))
        print("     Model       use_char: %s"%(self.use_char))
        if self.use_char:
            print("     Model char extractor: %s"%(self.char_feature_extractor))
            print("     Model char_hidden_dim: %s"%(self.HP_char_hidden_dim))
        print(" "+"++"*20)
        print(" Training:")
        print("     Optimizer: %s"%(self.optimizer))
        print("     Iteration: %s"%(self.HP_iteration))
        print("     BatchSize: %s"%(self.HP_batch_size))
        print("     Average  batch   loss: %s"%(self.average_batch_loss))

        print(" "+"++"*20)
        print(" Hyperparameters:")
        
        print("     Hyper              lr: %s"%(self.HP_lr))
        print("     Hyper        lr_decay: %s"%(self.HP_lr_decay))
        print("     Hyper         HP_clip: %s"%(self.HP_clip))
        print("     Hyper        momentum: %s"%(self.HP_momentum))
        print("     Hyper              l2: %s"%(self.HP_l2))
        print("     Hyper      hidden_dim: %s"%(self.HP_hidden_dim))
        print("     Hyper         dropout: %s"%(self.HP_dropout))
        print("     Hyper      lstm_layer: %s"%(self.HP_lstm_layer))
        print("     Hyper          bilstm: %s"%(self.HP_bilstm))
        print("     Hyper             GPU: %s"%(self.HP_gpu))   
        print("DATA SUMMARY END.")
        print("++"*50)
        sys.stdout.flush()


    def initial_feature_alphabets(self):
        with open(self.train_dir, 'r') as fin:
            items = fin.readline().strip('\n').split()
        total_column = len(items)
        if total_column > 2:
            for idx in range(1, total_column-1):
                feature_prefix = items[idx].split(']',1)[0]+"]"
                self.feature_alphabets.append(Alphabet(feature_prefix))
                self.feature_name.append(feature_prefix)
                print "Find feature: ", feature_prefix 
        self.feature_num = len(self.feature_alphabets)
        self.pretrain_feature_embeddings = [None]*self.feature_num
        self.feature_emb_dims = [20]*self.feature_num
        self.feature_emb_dirs = [None]*self.feature_num 
        self.norm_feature_embs = [False]*self.feature_num
        self.feature_alphabet_sizes = [0]*self.feature_num
        if self.feat_config:
            for idx in range(self.feature_num):
                if self.feature_name[idx] in self.feat_config:
                    self.feature_emb_dims[idx] = self.feat_config[self.feature_name[idx]]['emb_size']
                    self.feature_emb_dirs[idx] = self.feat_config[self.feature_name[idx]]['emb_dir']
                    self.norm_feature_embs[idx] = self.feat_config[self.feature_name[idx]]['emb_norm']


    def build_alphabet(self, input_file):
        with open(input_file, 'r') as fin:
            in_lines = fin.readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                word = pairs[0].decode('utf-8')
                if self.number_normalized:
                    word = normalize_word(word)
                label = pairs[-1]
                self.label_alphabet.add(label)
                self.word_alphabet.add(word)
                ## build feature alphabet 
                for idx in range(self.feature_num):
                    feat_idx = pairs[idx+1].split(']',1)[-1]
                    self.feature_alphabets[idx].add(feat_idx)
                for char in word:
                    self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        for idx in range(self.feature_num):
            self.feature_alphabet_sizes[idx] = self.feature_alphabets[idx].size()
        startS = False
        startB = False
        for label,_ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

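    ## A hypothetical input line for build_alphabet, matching the parsing above
    ## (column 0 = word, middle columns = "[FEAT]value" features, last column =
    ## label); the feature names here are illustrative only:
    ##     McCain [POS]NNP [Cap]1 B-PER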

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close() 
        for idx in range(self.feature_num):
            self.feature_alphabets[idx].close()      


    def build_pretrain_emb(self):
        if self.word_emb_dir:
            print("Load pretrained word embedding, norm: %s, dir: %s"%(self.norm_word_emb, self.word_emb_dir))
            self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(self.word_emb_dir, self.word_alphabet, self.word_emb_dim, self.norm_word_emb)
        if self.char_emb_dir:
            print("Load pretrained char embedding, norm: %s, dir: %s"%(self.norm_char_emb, self.char_emb_dir))
            self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(self.char_emb_dir, self.char_alphabet, self.char_emb_dim, self.norm_char_emb)
        for idx in range(self.feature_num):
            if self.feature_emb_dirs[idx]:
                print("Load pretrained feature %s embedding:, norm: %s, dir: %s"%(self.feature_name[idx], self.norm_feature_embs[idx], self.feature_emb_dirs[idx]))
                self.pretrain_feature_embeddings[idx], self.feature_emb_dims[idx] = build_pretrain_embedding(self.feature_emb_dirs[idx], self.feature_alphabets[idx], self.feature_emb_dims[idx], self.norm_feature_embs[idx])


    def generate_instance(self, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(self.train_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(self.dev_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(self.test_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance(self.raw_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH)
        else:
            print("Error: you can only generate train/dev/test instance! Illegal input:%s"%(name))


    def write_decoded_results(self, predict_results, name):
        fout = open(self.decode_dir,'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print("Error: illegal name during writing predict result, name should be within train/dev/test/raw !")
        assert(sent_num == len(content_list))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                ## content_list[idx] is a list with [word, char, label]
                fout.write(content_list[idx][0][idy].encode('utf-8') + " " + predict_results[idx][idy] + '\n')
            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file. %s"%(name, self.decode_dir))


    def load(self,data_file):
        f = open(data_file, 'rb')
        tmp_dict = pickle.load(f)
        f.close()
        self.__dict__.update(tmp_dict)

    def save(self,save_file):
        f = open(save_file, 'wb')
        pickle.dump(self.__dict__, f, 2)
        f.close()



    def write_nbest_decoded_results(self, predict_results, pred_scores, name):
        ## predict_results : [whole_sent_num, nbest, each_sent_length]
        ## pred_scores: [whole_sent_num, nbest]
        fout = open(self.decode_dir,'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print("Error: illegal name during writing predict result, name should be within train/dev/test/raw !")
        assert(sent_num == len(content_list))
        assert(sent_num == len(pred_scores))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx][0])
            nbest = len(predict_results[idx])
            score_string = "# "
            for idz in range(nbest):
                score_string += format(pred_scores[idx][idz], '.4f')+" "
            fout.write(score_string.strip() + "\n")

            for idy in range(sent_length):
                label_string = content_list[idx][0][idy].encode('utf-8') + " "
                for idz in range(nbest):
                    label_string += predict_results[idx][idz][idy]+" "
                label_string = label_string.strip() + "\n"
                fout.write(label_string)
            fout.write('\n')
        fout.close()
        print("Predict %s %s-best result has been written into file. %s"%(name,nbest, self.decode_dir))

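    ## Sketch of the n-best decode file written above, assuming nbest = 2
    ## (a "# "-prefixed line of .4f scores, then one line per token with its
    ## candidate labels); words, labels and scores are illustrative:
    ##     # -0.1034 -2.5671
    ##     McCain B-PER B-ORG
    ##     visited O O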

    def read_config(self,config_file):
        config = config_file_to_dict(config_file)
        ## read data:
        the_item = 'train_dir'
        if the_item in config:
            self.train_dir = config[the_item]
        the_item = 'dev_dir'
        if the_item in config:
            self.dev_dir = config[the_item]
        the_item = 'test_dir'
        if the_item in config:
            self.test_dir = config[the_item]
        the_item = 'raw_dir'
        if the_item in config:
            self.raw_dir = config[the_item]
        the_item = 'decode_dir'
        if the_item in config:
            self.decode_dir = config[the_item]
        the_item = 'dset_dir'
        if the_item in config:
            self.dset_dir = config[the_item]
        the_item = 'model_dir'
        if the_item in config:
            self.model_dir = config[the_item]
        the_item = 'load_model_dir'
        if the_item in config:
            self.load_model_dir = config[the_item]

        the_item = 'word_emb_dir'
        if the_item in config:
            self.word_emb_dir = config[the_item]
        the_item = 'char_emb_dir'
        if the_item in config:
            self.char_emb_dir = config[the_item]


        the_item = 'MAX_SENTENCE_LENGTH'
        if the_item in config:
            self.MAX_SENTENCE_LENGTH = int(config[the_item])
        the_item = 'MAX_WORD_LENGTH'
        if the_item in config:
            self.MAX_WORD_LENGTH = int(config[the_item])

        the_item = 'norm_word_emb'
        if the_item in config:
            self.norm_word_emb = str2bool(config[the_item])
        the_item = 'norm_char_emb'
        if the_item in config:
            self.norm_char_emb = str2bool(config[the_item])
        the_item = 'number_normalized'
        if the_item in config:
            self.number_normalized = str2bool(config[the_item])


        the_item = 'seg'
        if the_item in config:
            self.seg = str2bool(config[the_item])
        the_item = 'word_emb_dim'
        if the_item in config:
            self.word_emb_dim = int(config[the_item])
        the_item = 'char_emb_dim'
        if the_item in config:
            self.char_emb_dim = int(config[the_item])

        ## read network:
        the_item = 'use_crf'
        if the_item in config:
            self.use_crf = str2bool(config[the_item])
        the_item = 'use_char'
        if the_item in config:
            self.use_char = str2bool(config[the_item])
        the_item = 'word_seq_feature'
        if the_item in config:
            self.word_feature_extractor = config[the_item]
        the_item = 'char_seq_feature'
        if the_item in config:
            self.char_feature_extractor = config[the_item]
        the_item = 'nbest'
        if the_item in config:
            self.nbest = int(config[the_item])

        the_item = 'feature'
        if the_item in config:
            self.feat_config = config[the_item] ## feat_config is a dict 

        ## read training setting:
        the_item = 'optimizer'
        if the_item in config:
            self.optimizer = config[the_item]
        the_item = 'ave_batch_loss'
        if the_item in config:
            self.average_batch_loss = str2bool(config[the_item])
        the_item = 'status'
        if the_item in config:
            self.status = config[the_item]

        ## read Hyperparameters:
        the_item = 'cnn_layer'
        if the_item in config:
            self.HP_cnn_layer = int(config[the_item])
        the_item = 'iteration'
        if the_item in config:
            self.HP_iteration = int(config[the_item])
        the_item = 'batch_size'
        if the_item in config:
            self.HP_batch_size = int(config[the_item])

        the_item = 'char_hidden_dim'
        if the_item in config:
            self.HP_char_hidden_dim = int(config[the_item])
        the_item = 'hidden_dim'
        if the_item in config:
            self.HP_hidden_dim = int(config[the_item])
        the_item = 'dropout'
        if the_item in config:
            self.HP_dropout = float(config[the_item])
        the_item = 'lstm_layer'
        if the_item in config:
            self.HP_lstm_layer = int(config[the_item])
        the_item = 'bilstm'
        if the_item in config:
            self.HP_bilstm = str2bool(config[the_item])

        the_item = 'gpu'
        if the_item in config:
            self.HP_gpu = str2bool(config[the_item])
        the_item = 'learning_rate'
        if the_item in config:
            self.HP_lr = float(config[the_item])
        the_item = 'lr_decay'
        if the_item in config:
            self.HP_lr_decay = float(config[the_item])
        the_item = 'clip'
        if the_item in config:
            self.HP_clip = float(config[the_item])
        the_item = 'momentum'
        if the_item in config:
            self.HP_momentum = float(config[the_item])
        the_item = 'l2'
        if the_item in config:
            self.HP_l2 = float(config[the_item])
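
## A minimal driver sketch for the Data class above (hypothetical config
## name and file paths; config_file_to_dict is assumed to read plain
## "key=value" lines for the keys handled in read_config):
if __name__ == '__main__':
    data = Data()
    data.read_config('demo.train.config')   # hypothetical config file
    data.initial_feature_alphabets()
    for part in [data.train_dir, data.dev_dir, data.test_dir]:
        data.build_alphabet(part)
    data.fix_alphabet()
    data.build_pretrain_emb()
    for name in ['train', 'dev', 'test']:
        data.generate_instance(name)
    data.show_data_summary()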
Exemple #29
0
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 512
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = False
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')
        self.word_alphabet.add(START)
        self.word_alphabet.add(UNKNOWN)
        self.char_alphabet.add(START)
        self.char_alphabet.add(UNKNOWN)
        self.char_alphabet.add(PADDING)
        self.label_alphabet = Alphabet('label')
        self.tagScheme = "NoSeg"

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []

        self.word_emb_dim = 50
        self.pretrain_word_embedding = None
        self.label_size = 0
        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        ### hyperparameters
        self.HP_batch_size = 10
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_use_char = True
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0
        self.HP_clip = 5.0
        self.HP_momentum = 0

    def show_data_summary(self):
        print("DATA SUMMARY START:")
        print("     Tag          scheme: %s" % (self.tagScheme))
        print("     MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print("     MAX   WORD   LENGTH: %s" % (self.MAX_WORD_LENGTH))
        print("     Number   normalized: %s" % (self.number_normalized))
        print("     Word  alphabet size: %s" % (self.word_alphabet_size))
        print("     Char  alphabet size: %s" % (self.char_alphabet_size))
        print("     Label alphabet size: %s" % (self.label_alphabet_size))
        print("     Word embedding size: %s" % (self.word_emb_dim))
        print("     Train instance number: %s" % (len(self.train_texts)))
        print("     Dev   instance number: %s" % (len(self.dev_texts)))
        print("     Test  instance number: %s" % (len(self.test_texts)))
        print("     Raw   instance number: %s" % (len(self.raw_texts)))
        print("     Hyperpara  batch size: %s" % (self.HP_batch_size))
        print("     Hyperpara          lr: %s" % (self.HP_lr))
        print("     Hyperpara    lr_decay: %s" % (self.HP_lr_decay))
        print("     Hyperpara     HP_clip: %s" % (self.HP_clip))
        print("     Hyperpara    momentum: %s" % (self.HP_momentum))
        print("     Hyperpara  hidden_dim: %s" % (self.HP_hidden_dim))
        print("     Hyperpara     dropout: %s" % (self.HP_dropout))
        print("     Hyperpara  lstm_layer: %s" % (self.HP_lstm_layer))
        print("     Hyperpara      bilstm: %s" % (self.HP_bilstm))
        print("     Hyperpara    use_char: %s" % (self.HP_use_char))
        print("     Hyperpara         GPU: %s" % (self.HP_gpu))
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def build_alphabet(self, input_file):
        with open(input_file, 'r') as fin:
            in_lines = fin.readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                word = pairs[0]
                if self.number_normalized:
                    word = normalize_word(word)
                label = pairs[-1]
                self.label_alphabet.add(label)
                self.word_alphabet.add(word)
                for char in word:
                    self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()

    def build_word_pretrain_emb(self, emb_path, norm=False):
        self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
            emb_path, self.word_alphabet, self.word_emb_dim, norm)

    def generate_instance(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        else:
            print("Error: you can only generate train/dev/test instance! Illegal input: %s" % (name))
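
## A comparable driver sketch for this lighter Data variant (file paths are
## placeholders; read_instance and build_pretrain_embedding are assumed to
## be in scope as in the surrounding code):
if __name__ == '__main__':
    data = Data()
    for part in ['data/train.bmes', 'data/dev.bmes', 'data/test.bmes']:
        data.build_alphabet(part)
    data.fix_alphabet()
    data.build_word_pretrain_emb('data/glove.50d.txt', norm=False)
    data.generate_instance('data/train.bmes', 'train')
    data.generate_instance('data/dev.bmes', 'dev')
    data.generate_instance('data/test.bmes', 'test')
    data.show_data_summary()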