def read_data(self, source, train_data=True):
        df = pd.read_csv(source, sep='\t', header=None)

        df.columns = [
            "polarity", "aspect_category", "target_term", "character_offset",
            "sentence"
        ]
        df["label"] = df["polarity"].apply(lambda x: 1 if x == "positive" else
                                           (0 if x == "neutral" else -1))

        # Formatting output
        label = to_categorical(df['label'] + 1)

        # Remove the target term from each sentence using its character offset
        sentence_red = [""] * len(df)
        for i in range(len(df)):
            start, end = (int(x) for x in df["character_offset"][i].split(":"))
            sentence_red[i] = df["sentence"][i][:start] + df["sentence"][i][end:]

        df["sentence_red"] = sentence_red
        # remove stopwords
        # df["sentence_red"] = df["sentence_red"].apply(lambda x: self.remove_stopwords(x))
        # word2vec embeddings
        PATH_TO_DATA = Path('../data')
        en_embeddings_path = PATH_TO_DATA / 'cc.en.300.vec.gz'
        print(en_embeddings_path)
        if not en_embeddings_path.exists():
            urlretrieve(
                'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz',
                en_embeddings_path)

        w2vec = word2vec.Word2Vec(en_embeddings_path, vocab_size=50000)
        sentence2vec = word2vec.BagOfWords(w2vec)
        # Each word in a sentence is encoded with its corresponding vector (size 300) in the embedding layer,
        # so a sentence of N words yields N vectors of size 300. To give every sentence the same input size,
        # we pad the sequence of words with zero vectors up to 100 words; sentences longer than 100 words
        # are truncated at 100 (meaning we may lose information).
        # If needed, this limit can be set higher than 100.

        sentences = [
            sentence2vec.encode(df["sentence"][i],
                                ag_sentence=False,
                                padding=100)
            for i in range(len(df["sentence"]))
        ]

        # Stack into a single array
        sentences = np.stack(sentences)

        # Encode categories (one-hot encoding):
        if train_data:
            self.enc = OneHotEncoder(handle_unknown='ignore')
            self.enc.fit(df['aspect_category'].values.reshape(-1, 1))
        categories = self.enc.transform(df['aspect_category'].values.reshape(
            -1, 1))

        return (sentences, categories, label)
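The comments above describe a fixed-size encoding: pad with zero vectors up to 100 words, truncate beyond. A minimal NumPy sketch of that scheme, assuming 300-dimensional word vectors (pad_or_truncate is an illustrative helper, not part of the snippet):

import numpy as np

def pad_or_truncate(vectors, max_len=100, dim=300):
    # Fixed-shape output: max_len rows of dim-sized zero vectors.
    out = np.zeros((max_len, dim), dtype=np.float32)
    vectors = vectors[:max_len]  # truncate sentences longer than max_len
    if len(vectors) > 0:
        out[:len(vectors)] = np.stack(vectors)  # copy the real word vectors
    return out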
Example #2
    def read_data(self, source, train_data=True):
        df = pd.read_csv(source, sep='\t', header=None)

        df.columns = [
            "polarity", "aspect_category", "target_term", "character_offset",
            "sentence"
        ]
        df["label"] = df["polarity"].apply(lambda x: 1 if x == "positive" else
                                           (0 if x == "neutral" else -1))

        # Formatting output
        label = to_categorical(df['label'] + 1)

        # Remove target term from sentences
        sentence_red = [""] * len(df)
        for i in range(len(df)):
            start, end = (int(x) for x in df["character_offset"][i].split(":"))
            sentence_red[i] = df["sentence"][i][:start] + df["sentence"][i][end:]

        df["sentence_red"] = sentence_red

        # Remove stopwords from sentences
        df["sentence_red"] = df["sentence_red"].apply(
            lambda x: self.remove_stopwords(x))

        # word2vec embeddings
        PATH_TO_RESOURCES = Path('../resources')
        en_embeddings_path = PATH_TO_RESOURCES / 'cc.en.300.vec.gz'
        print(en_embeddings_path)
        if not en_embeddings_path.exists():
            urlretrieve(
                'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz',
                en_embeddings_path)

        w2vec = word2vec.Word2Vec(en_embeddings_path, vocab_size=50000)
        sentence2vec = word2vec.BagOfWords(w2vec)

        sentences = [
            sentence2vec.encode(df["sentence"][i],
                                ag_sentence=False,
                                padding=100)
            for i in range(len(df["sentence"]))
        ]

        # Stack into a single array
        sentences = np.stack(sentences)

        # Encode categories (one-hot encoding):
        if train_data:
            self.enc = OneHotEncoder(handle_unknown='ignore')
            self.enc.fit(df['aspect_category'].values.reshape(-1, 1))
        categories = self.enc.transform(df['aspect_category'].values.reshape(
            -1, 1))

        return (sentences, categories, label)
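A note on handle_unknown='ignore': the encoder is fitted on the training categories only, so an aspect category that first appears at test time encodes as an all-zero row instead of raising an error. A standalone sketch with made-up category names:

import numpy as np
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(np.array(['ambience', 'food', 'service']).reshape(-1, 1))

# 'price' was never seen during fit, so it maps to the all-zero row.
print(enc.transform(np.array([['service'], ['price']])).toarray())
# [[0. 0. 1.]
#  [0. 0. 0.]]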
Example #3
def train_word2vec(pat_corpus, corpus='patents', seed=1, embed_dim=300):
    """Train the word2vec model"""
    # train the skipgram model; default window=5
    model = word2vec.Word2Vec(pat_corpus,
                              mtype='sg',
                              hs=1,
                              neg=13,
                              embed_dim=embed_dim,
                              seed=seed)
    # delete the huge stupid table again
    model.table = None
    # pickle the entire model to disk, so we can load&resume training later
    saven = "%s_sg_%i_hs0_neg13_seed%i.model" % (corpus, embed_dim, seed)
    print "saving model"
    pkl.dump(model, open("human_eval/models/%s" % saven, 'wb'), -1)
    return model
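Because the whole model object is pickled, resuming training later is just a load. A sketch, assuming the file name produced by the default arguments above:

import pickle as pkl

# Protocol -1 above means "highest available protocol", so the file must be
# read with a Python at least as new as the one that wrote it.
with open("human_eval/models/patents_sg_300_hs0_neg13_seed1.model", "rb") as f:
    model = pkl.load(f)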
Example #4
def train_model_word2vec():
    file_name = './data/zh_classicalwiki_extracted_word_seg_result.txt'
    with open(file_name, 'r') as fp:
        words = utils.read_file(fp)
    w2v = word2vec.Word2Vec(
        batch_size=128,  # mini-batch size
        skip_window=3,  # one-sided context window size
        num_skips=2,  # (center, context) pairs drawn per window
        embedding_size=200,  # length of each word vector
        vocabulary_size=50000,  # vocabulary size
        num_sampled=64,  # number of negative samples
        learning_rate=1e-2,
        n_steps=100001,  # number of training steps
        logdir='./model/word2vec/tmp_word2vec')

    w2v.train_model(words)
    w2v.save_model("./model/word2vec")
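skip_window and num_skips follow the TensorFlow word2vec tutorial convention: skip_window is the one-sided context size, and num_skips is how many (center, context) training pairs are drawn per center word. A standalone sketch of that sampling (the helper is illustrative):

import random

def skipgram_pairs(words, skip_window=3, num_skips=2):
    pairs = []
    for i in range(skip_window, len(words) - skip_window):
        # All words within skip_window positions of the center word.
        context = words[i - skip_window:i] + words[i + 1:i + skip_window + 1]
        # Draw num_skips distinct context words for this center word.
        for target in random.sample(context, num_skips):
            pairs.append((words[i], target))
    return pairs

print(skipgram_pairs("the quick brown fox jumps over dog".split()))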
Example #5
def main():

    embedding_dir = args.embedding + args.language
    print >> sys.stderr, "Read Embedding from %s ..." % embedding_dir
    embedding_dimention = 50
    if args.language == "cn":
        embedding_dimention = 64
    w2v = word2vec.Word2Vec(embedding_dir, embedding_dimention)

    #train_docs,dev_docs,test_docs = get_doc_data()
    train_docs = doc_data_generater("train")
    dev_docs = doc_data_generater("dev")
    test_docs = doc_data_generater("test")

    train_doc_mention_arrays, train_doc_pair_arrays, train_doc_gold_chains = array_generater(train_docs, "train", w2v)
    test_doc_mention_arrays, test_doc_pair_arrays, test_doc_gold_chains = array_generater(test_docs, "test", w2v)
    dev_doc_mention_arrays, dev_doc_pair_arrays, dev_doc_gold_chains = array_generater(dev_docs, "dev", w2v)
Example #6
def callback():
    wsdl_path = e1.get()
    k = e2.get()
    start = time.time()
    print(start)
    # Create output directories
    wsdl_father_path = wsdl_path[0:wsdl_path.rfind('/')]
    service_name_path = wsdl_father_path + '/serviceName'
    service_name_stemmed_path = wsdl_father_path + '/serviceNameStemmed'
    if not os.path.exists(service_name_path):
        os.mkdir(service_name_path)
    if not os.path.exists(service_name_stemmed_path):
        os.mkdir(service_name_stemmed_path)

    subprocess.call(['java', '-jar', 'ServiceNameParsing.jar', wsdl_path])
    service_num = WordProc.WordProc(service_name_path)
    word2vec.Word2Vec(service_name_stemmed_path)
    service_name_sim.service_name_sim(service_name_stemmed_path, service_num)
    clustering_result = spectral_clustering.spectral_clustering(k, service_num)
    print(clustering_result)
    end = time.time()
    lb0.insert(END, "Number of services: " + str(service_num))
    lb0.insert(END, "Number of clusters: " + str(k))
    lb0.insert(END, "Time elapsed: " + str(end - start) + " s")
    for j in range(0, int(k)):
        lb0.insert(
            END,
            "Number of services in cluster " + str(j + 1) + ": " +
            str(clustering_result.count(j)))

    lb1.insert(END, "Service       Cluster")
    for i in range(0, service_num):
        if i < 9:
            lb1.insert(
                END,
                str(i + 1) + "              " + str(clustering_result[i] + 1))
        if i >= 9:
            lb1.insert(
                END,
                str(i + 1) + "            " + str(clustering_result[i] + 1))

    for i in range(0, int(k)):
        lb2.insert(END, "Services in cluster " + str(i + 1))
        re = [j for j, a in enumerate(clustering_result) if a == i]
        # use idx rather than reusing k, which holds the cluster count
        for idx in range(0, len(re)):
            lb2.insert(END, "  " + str(re[idx] + 1))
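A design note: clustering_result.count(j) rescans the whole list once per cluster, while collections.Counter computes every cluster size in a single pass. A sketch over the same data shape (a flat list of cluster labels):

from collections import Counter

clustering_result = [0, 2, 1, 0, 2, 2]  # illustrative labels
sizes = Counter(clustering_result)  # {cluster label: number of services}
for label in sorted(sizes):
    print("cluster", label + 1, "contains", sizes[label], "services")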
Example #7
def main():

    embedding_dir = args.embedding + args.language
    print >> sys.stderr, "Read Embedding from %s ..." % embedding_dir
    embedding_dimention = 50
    if args.language == "cn":
        embedding_dimention = 64
    w2v = word2vec.Word2Vec(embedding_dir, embedding_dimention)

    #train_docs,dev_docs,test_docs = get_doc_data()
    train_docs = doc_data_generater("train")
    dev_docs = doc_data_generater("dev")
    test_docs = doc_data_generater("test")

    for cases, gold_chain in case_generater(train_docs, "train", w2v):
        print len(cases)
        for m_self, index_self, m_array, m_index in cases[1:]:
            print len(m_self[0]), len(index_self[0])
            print len(m_array[0])
        print gold_chain
Example #8
def run():
    """Runs evaluation in a loop, and logs summaries to TensorBoard."""
    # Create the evaluation directory if it doesn't exist.
    eval_dir = FLAGS.eval_dir
    if not tf.gfile.IsDirectory(eval_dir):
        tf.logging.info("Creating eval directory: %s", eval_dir)
        tf.gfile.MakeDirs(eval_dir)

    # generate eval dump file
    dump_file = open(os.path.join(eval_dir, 'evaluation.json'), 'a')

    g = tf.Graph()
    with g.as_default():
        # Build the model for evaluation.
        model_config = configuration.ModelConfig()
        model = word2vec.Word2Vec(model_config, mode="eval")
        model.build()

        # Create the Saver to restore model Variables.
        saver = tf.train.Saver()

        # Create the summary operation and the summary writer.
        summary_op = tf.merge_all_summaries()
        summary_writer = tf.train.SummaryWriter(eval_dir)

        g.finalize()

        # Run a new evaluation run every eval_interval_secs.
        try:
            while True:
                start = time.time()
                tf.logging.info(
                    "Starting evaluation at " +
                    time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime()))
                run_once(model, saver, summary_writer, summary_op, dump_file)
                time_to_next_eval = start + FLAGS.eval_interval_secs - time.time()
                if time_to_next_eval > 0:
                    time.sleep(time_to_next_eval)
        except KeyboardInterrupt:
            dump_file.close()
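The arithmetic at the bottom of the loop keeps evaluation runs eval_interval_secs apart regardless of how long each run takes. The scheduling pattern in isolation (run_once here is a stub standing in for the evaluation call above, and the interval is illustrative):

import time

EVAL_INTERVAL_SECS = 600  # illustrative interval

def run_once():
    pass  # stand-in for one evaluation pass

while True:
    start = time.time()
    run_once()
    # Sleep only for whatever is left of the interval after the run itself.
    time_to_next_eval = start + EVAL_INTERVAL_SECS - time.time()
    if time_to_next_eval > 0:
        time.sleep(time_to_next_eval)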
Example #9
def tocsv(trainword_file, testword_file, sg, hs, window, size, model_name,
          traincsv, testcsv, b, flag, iter, spmodel):
    if spmodel:
        print("loading model")
        model = gensim.models.KeyedVectors.load_word2vec_format(model_name,
                                                                binary=False)
    else:
        sentences = word2vec.LineSentence(trainword_file)
        model = word2vec.Word2Vec(sentences,
                                  sg=sg,
                                  hs=hs,
                                  min_count=1,
                                  window=window,
                                  size=size,
                                  iter=iter)
        model.wv.save_word2vec_format(model_name, binary=False)

    save_csv(trainword_file, model, traincsv, b)

    if flag:
        save_csv(testword_file, model, testcsv, b)
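The keyword arguments above follow the gensim 3.x API. In gensim 4.x, size was renamed to vector_size and iter to epochs; a minimal sketch of the same call against the newer API (file names illustrative):

from gensim.models import word2vec

sentences = word2vec.LineSentence("train_words.txt")
model = word2vec.Word2Vec(sentences, sg=1, hs=0, min_count=1,
                          window=5, vector_size=100, epochs=5)
model.wv.save_word2vec_format("model.vec", binary=False)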
Example #10
    def __init__(self, mes, trainable=True, truncated=False):
        self.mes = mes
        self.col_name = mes.train_col
        self.trainable = trainable
        self.truncated = truncated
        self.lang = self.mes.config['LANG']
        self.fids = self.mes.config['DG_FIDS']
        self.sentence_sz = self.mes.config['DG_SENTENCE_SZ']
        self.label_num = self.mes.config['LABEL_NUM']
        self.batch_sz = self.mes.config['DG_BATCH_SZ']
        self.test_batch_sz = self.mes.config['DG_TEST_BATCH_SZ']
        self.rnum = self.mes.config['DG_RNUM']
        self.w2v = word2vec.Word2Vec(self.mes, trainable=False)
        print("Train mode:", trainable)
        if trainable and self.col_name is not None:
            self.fold_num = self.mes.config['DG_FOLD_NUM']
            self.fold_test_id = self.mes.config['DG_FOLD_TEST_ID']
            self.fold_valid_id = self.mes.config['DG_FOLD_VALID_ID']
            self.docs = utils.get_docs(self.col_name)
            records = self.docs.find()
            records = [record for record in records]
            self.test_data, self.test_labels = DataGenerator.get_data_by_fold_ids(
                records, [self.fold_test_id])
            self.valid_data, self.valid_labels = DataGenerator.get_data_by_fold_ids(
                records, [self.fold_valid_id])
            self.train_data, self.train_labels = \
                DataGenerator.get_data_by_fold_ids(
                    records, [i for i in range(self.fold_num)
                              if i != self.fold_test_id and i != self.fold_valid_id])

            self.test_sz = len(self.test_data)
            self.valid_sz = len(self.valid_data)
            self.train_sz = len(self.train_data)
            self.test_inds = [0, 0, 0]
            self.valid_inds = [0, 0, 0]
            self.train_inds = [0, 0, self.rnum]
        elif not trainable:
            self.cutter = text_extractor.parser_holder.get_parser()
Example #11
    def __init__(self):
        self.parser = parser.Parser()
        self.w2v = word2vec.Word2Vec()
Example #12
def main():
    args = parse_args()
    config = configparser.ConfigParser()
    """ARGS DETAIL"""
    config_file = args.config_file
    batch_size = args.batch
    n_epoch = args.epoch
    pretrain_epoch = args.pretrain_epoch
    gpu_id = args.gpu
    model_type = args.model
    vocab_type = args.vocab
    pretrain_w2v = args.pretrain_w2v
    data_path = args.data_path
    load_model = args.load_model
    """DIR PREPARE"""
    config.read(config_file)
    vocab_size = int(config['Parameter']['vocab_size'])
    coefficient = float(config['Parameter']['coefficient'])

    vocab_name = vocab_type
    if pretrain_w2v:
        vocab_name = 'p' + vocab_name

    if model_type == 'multi':
        base_dir = './{}_{}{}_{}_c{}/'.format(model_type, vocab_name,
                                              vocab_size, data_path[0],
                                              coefficient)
    else:
        base_dir = './{}_{}{}_{}/'.format(model_type, vocab_name, vocab_size,
                                          data_path[0])
    model_save_dir = base_dir

    if not os.path.exists(base_dir):
        os.mkdir(base_dir)
        shutil.copyfile(config_file, base_dir + config_file)
    config_file = base_dir + config_file
    config.read(config_file)

    if load_model is not None:
        model_save_dir = base_dir + load_model.replace('.npz', '') + '/'
        if not os.path.exists(model_save_dir):
            os.mkdir(model_save_dir)
    """PARAMATER"""
    embed_size = int(config['Parameter']['embed_size'])
    hidden_size = int(config['Parameter']['hidden_size'])
    class_size = int(config['Parameter']['class_size'])
    dropout_ratio = float(config['Parameter']['dropout'])
    weight_decay = float(config['Parameter']['weight_decay'])
    gradclip = float(config['Parameter']['gradclip'])
    vocab_size = int(config['Parameter']['vocab_size'])
    coefficient = float(config['Parameter']['coefficient'])
    valid_num = int(config['Parameter']['valid_num'])
    """LOGGER"""
    log_file = model_save_dir + 'log.txt'
    logger = dataset.prepare_logger(log_file)

    logger.info(args)  # log the arguments
    logger.info('[Training start] logging to {}'.format(log_file))
    """DATASET"""
    train_src_file = config[data_path]['train_src_file']
    train_trg_file = config[data_path]['train_trg_file']
    valid_src_file = config[data_path]['valid_src_file']
    valid_trg_file = config[data_path]['valid_trg_file']
    test_src_file = config[data_path]['single_src_file']
    test_trg_file = config[data_path]['single_trg_file']
    src_w2v_file = config[data_path]['src_w2v_file']
    trg_w2v_file = config[data_path]['trg_w2v_file']

    train_data_size = dataset.data_size(train_src_file)
    valid_data_size = dataset.data_size(valid_src_file)
    logger.info('train size: {}, valid size: {}'.format(
        train_data_size, valid_data_size))
    """VOCABULARY"""
    src_vocab, trg_vocab, sos, eos = dataset.prepare_vocab(
        base_dir, vocab_type, train_src_file, train_trg_file, vocab_size,
        gpu_id)
    src_vocab_size = len(src_vocab.vocab)
    trg_vocab_size = len(trg_vocab.vocab)

    src_initialW = None
    trg_initialW = None

    if pretrain_w2v:
        w2v = word2vec.Word2Vec()
        src_initialW, vector_size, src_match_word_count = w2v.make_initialW(
            src_vocab.vocab, src_w2v_file)
        trg_initialW, vector_size, trg_match_word_count = w2v.make_initialW(
            trg_vocab.vocab, trg_w2v_file)
        logger.info(
            'Initialize w2v embedding. Match: src {}/{}, trg {}/{}'.format(
                src_match_word_count, src_vocab_size, trg_match_word_count,
                trg_vocab_size))

    logger.info('src_vocab size: {}, trg_vocab size: {}'.format(
        src_vocab_size, trg_vocab_size))
    """ITERATOR"""
    _, src_label, src_text, _ = dataset.load_binary_score_file(train_src_file)
    trg_text = dataset.load(train_trg_file)
    train_iter = dataset.Iterator(src_text,
                                  src_label,
                                  trg_text,
                                  src_vocab,
                                  trg_vocab,
                                  batch_size,
                                  gpu_id,
                                  sort=True,
                                  shuffle=True)
    # train_iter = dataset.Iterator(src_text, src_label, trg_text, src_vocab, trg_vocab, batch_size, gpu_id, sort=False, shuffle=False)

    _, src_label, src_text, _ = dataset.load_binary_score_file(valid_src_file)
    trg_text = dataset.load(valid_trg_file)
    valid_iter = dataset.Iterator(src_text,
                                  src_label,
                                  trg_text,
                                  src_vocab,
                                  trg_vocab,
                                  batch_size,
                                  gpu_id,
                                  sort=False,
                                  shuffle=False)

    correct_label, correct_binary_label, correct_text, correct_index = dataset.load_binary_score_file(
        test_src_file)
    trg_text = dataset.load(test_trg_file)
    test_iter = dataset.Iterator(correct_text,
                                 correct_binary_label,
                                 trg_text,
                                 src_vocab,
                                 trg_vocab,
                                 batch_size,
                                 gpu_id,
                                 sort=False,
                                 shuffle=False)
    """MODEL"""
    if model_type == 'multi':
        model = model.Multi(src_vocab_size, trg_vocab_size, embed_size,
                            hidden_size, class_size, dropout_ratio,
                            coefficient, src_initialW, trg_initialW)
    elif model_type in ['label', 'pretrain']:
        model = model.Label(src_vocab_size, trg_vocab_size, embed_size,
                            hidden_size, class_size, dropout_ratio,
                            src_initialW, trg_initialW)
    else:
        model = model.EncoderDecoder(src_vocab_size, trg_vocab_size,
                                     embed_size, hidden_size, dropout_ratio,
                                     src_initialW, trg_initialW)

    gridsearcher = gridsearch.GridSearch(valid_num)
    """OPTIMIZER"""
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(gradclip))
    optimizer.add_hook(chainer.optimizer.WeightDecay(weight_decay))
    """GPU"""
    if gpu_id >= 0:
        logger.info('Use GPU')
        chainer.cuda.get_device_from_id(gpu_id).use()
        model.to_gpu()
    """PRETRAIN"""
    if model_type == 'pretrain' and load_model is None:
        logger.info('Pre-train start')
        pretrain_loss_dic = {}
        for epoch in range(1, pretrain_epoch + 1):
            train_loss = 0
            for i, batch in enumerate(train_iter.generate(), start=1):
                try:
                    loss = model.pretrain(*batch)
                    train_loss += loss.data
                    optimizer.target.cleargrads()
                    loss.backward()
                    optimizer.update()

                except Exception as e:
                    logger.info('P{} ## train iter: {}, {}'.format(
                        epoch, i, e))
                    # with open(model_dir + 'error_log.txt', 'a')as f:
                    #     f.write('P{} ## train iter {}\n'.format(epoch, i))
                    #     f.write(traceback.format_exc())
                    #     f.write('P{} ## [batch detail]\n'.format(epoch))
                    #     for b in batch[0]:
                    #         [f.write(src_vocab.id2word(chainer.cuda.to_cpu(bb)) + '\n') for bb in b]
            chainer.serializers.save_npz(
                model_save_dir + 'p_model_epoch_{}.npz'.format(epoch), model)
            """EVALUATE"""
            valid_loss = 0
            for batch in valid_iter.generate():
                with chainer.no_backprop_mode(), chainer.using_config(
                        'train', False):
                    valid_loss += model.pretrain(*batch).data
            logger.info('P{} ## train loss: {}, val loss:{}'.format(
                epoch, train_loss, valid_loss))
            pretrain_loss_dic[epoch] = valid_loss
        """MODEL SAVE & LOAD"""
        best_epoch = min(pretrain_loss_dic,
                         key=(lambda x: pretrain_loss_dic[x]))
        logger.info('best_epoch:{}, val loss: {}'.format(
            best_epoch, pretrain_loss_dic[best_epoch]))
        shutil.copyfile(
            model_save_dir + 'p_model_epoch_{}.npz'.format(best_epoch),
            model_save_dir + 'p_best_model.npz')
        logger.info('Pre-train finish')

    if load_model:
        logger.info('load model: {}'.format(load_model))
        chainer.serializers.load_npz(model_save_dir + load_model, model)
    """TRAIN"""
    accuracy_dic = {}
    for epoch in range(1, n_epoch + 1):
        train_loss = 0
        for i, batch in enumerate(train_iter.generate(), start=1):
            try:
                loss = optimizer.target(*batch)
                train_loss += loss.data
                optimizer.target.cleargrads()
                loss.backward()
                optimizer.update()

            except Exception as e:
                logger.info('E{} ## train iter: {}, {}'.format(epoch, i, e))
                # with open(model_dir + 'error_log.txt', 'a')as f:
                #     f.write('E{} ## train iter: {}\n'.format(epoch, i))
                #     f.write(traceback.format_exc())
                #     f.write('E{} ## [batch detail]\n'.format(epoch))
                #     for b in batch[0]:
                #         [f.write(src_vocab.id2word(chainer.cuda.to_cpu(bb)) + '\n') for bb in b]
        chainer.serializers.save_npz(
            model_save_dir + 'model_epoch_{}.npz'.format(epoch), model)
        """DEV & TEST"""
        outputs = []
        labels = []
        alignments = []
        for i, batch in enumerate(test_iter.generate(), start=1):
            try:
                with chainer.no_backprop_mode(), chainer.using_config(
                        'train', False):
                    output, label, align = model.predict(batch[0], sos, eos)
            except Exception as e:
                logger.info('E{} ## test iter: {}, {}'.format(epoch, i, e))
                # with open(model_dir + 'error_log.txt', 'a')as f:
                #     f.write('E{} ## test iter: {}\n'.format(epoch, i))
                #     f.write(traceback.format_exc())
                #     f.write('E{} ## [batch detail]\n'.format(epoch))
                #     for b in batch[0]:
                #         [f.write(src_vocab.id2word(chainer.cuda.to_cpu(bb)) + '\n') for bb in b]

            if model_type == 'multi':
                for o, l, a in zip(output, label, align):
                    outputs.append(trg_vocab.id2word(chainer.cuda.to_cpu(o)))
                    labels.append(chainer.cuda.to_cpu(l))
                    alignments.append(chainer.cuda.to_cpu(a))
            elif model_type in ['label', 'pretrain']:
                for l in label:
                    labels.append(chainer.cuda.to_cpu(l))
            else:
                for o, a in zip(output, align):
                    outputs.append(trg_vocab.id2word(chainer.cuda.to_cpu(o)))
                    alignments.append(chainer.cuda.to_cpu(a))

        if model_type in ['multi', 'label', 'pretrain']:
            dev_score, test_score, param_list, test_score_list, s_result_list = gridsearcher.gridsearch(
                correct_label, correct_index, labels, alignments)
        else:
            dev_score, test_score, param_list, test_score_list, s_result_list = gridsearcher.gridsearch(
                correct_label, correct_index, alignments, [])

        accuracy_dic[epoch] = [dev_score, test_score]

        # save the log
        logger.info('E{} ## loss:{}, dev: {}, test: {}'.format(
            epoch, train_loss, dev_score, test_score))
        logger.info('E{} ## {}'.format(
            epoch, ' '.join(dataset.float_to_str(test_score_list[-1]))))
        for i, (l, p) in enumerate(zip(test_score_list[:-1], param_list),
                                   start=1):
            logger.info('E{} ##   {}: {}\t{}'.format(
                epoch, i, p, ' '.join(dataset.float_to_str(l))))

        # save the results
        dataset.save_output(model_save_dir, epoch, labels, alignments, outputs,
                            s_result_list)
    """MODEL SAVE"""
    best_epoch = max(accuracy_dic, key=(lambda x: accuracy_dic[x][0]))
    logger.info('best_epoch:{}, dev: {}, test: {}, {}'.format(
        best_epoch, accuracy_dic[best_epoch][0], accuracy_dic[best_epoch][1],
        model_save_dir))
    shutil.copyfile(model_save_dir + 'model_epoch_{}.npz'.format(best_epoch),
                    model_save_dir + 'best_model.npz')
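The model-selection rule buried in the loop above is worth stating on its own: every epoch's checkpoint is saved, accuracy_dic maps each epoch to [dev_score, test_score], and the checkpoint with the best dev score is copied to best_model.npz. In isolation (scores illustrative):

accuracy_dic = {1: [0.71, 0.69], 2: [0.74, 0.70], 3: [0.73, 0.72]}

# Select by dev score only; the test score is merely reported.
best_epoch = max(accuracy_dic, key=lambda e: accuracy_dic[e][0])
print(best_epoch, accuracy_dic[best_epoch])  # -> 2 [0.74, 0.7]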
Example #13
def main():
    args = parse_args()
    config = configparser.ConfigParser()
    """ARGS DETAIL"""
    config_file = args.config_file
    batch_size = args.batch
    n_epoch = args.epoch
    pretrain_epoch = args.pretrain_epoch
    gpu_id = args.gpu
    model_type = args.model
    pretrain_w2v = args.pretrain_w2v
    data_path = args.data_path
    load_model = args.load_model
    """DIR PREPARE"""
    config.read(config_file)
    vocab_size = int(config['Parameter']['vocab_size'])
    coefficient = float(config['Parameter']['coefficient'])
    shuffle_data = config['Parameter'].getboolean('shuffle')  # bool() of a non-empty string is always True

    if pretrain_w2v:
        vocab_size = 'p' + str(vocab_size)

    if model_type == 'multi':
        if shuffle_data:
            base_dir = './pseudo_{}_{}_{}_c{}_shuffle/'.format(
                model_type, vocab_size, data_path[0], coefficient)
        else:
            base_dir = './pseudo_{}_{}_{}_c{}/'.format(model_type, vocab_size,
                                                       data_path[0],
                                                       coefficient)
    else:
        if shuffle_data:
            base_dir = './pseudo_{}_{}_{}_shuffle/'.format(
                model_type, vocab_size, data_path[0])
        else:
            base_dir = './pseudo_{}_{}_{}/'.format(model_type, vocab_size,
                                                   data_path[0])
    model_save_dir = base_dir

    if not os.path.exists(base_dir):
        os.mkdir(base_dir)
        shutil.copyfile(config_file, base_dir + config_file)
    config_file = base_dir + config_file
    config.read(config_file)
    """PARAMATER"""
    embed_size = int(config['Parameter']['embed_size'])
    hidden_size = int(config['Parameter']['hidden_size'])
    class_size = int(config['Parameter']['class_size'])
    dropout_ratio = float(config['Parameter']['dropout'])
    weight_decay = float(config['Parameter']['weight_decay'])
    gradclip = float(config['Parameter']['gradclip'])
    vocab_size = int(config['Parameter']['vocab_size'])
    valid_num = int(config['Parameter']['valid_num'])
    shuffle_data = config['Parameter'].getboolean('shuffle')
    """LOGGER"""
    log_file = model_save_dir + 'log.txt'
    logger = dataset.prepare_logger(log_file)
    logger.info(args)  # log the arguments
    logger.info('[Training start] logging to {}'.format(log_file))
    """DATASET"""
    train_src_file = config[data_path]['train_src_file']
    train_trg_file = config[data_path]['train_trg_file']
    valid_src_file = config[data_path]['valid_src_file']
    valid_trg_file = config[data_path]['valid_trg_file']
    test_src_file = config[data_path]['single_src_file']
    test_trg_file = config[data_path]['single_trg_file']
    src_w2v_file = config[data_path]['src_w2v_file']
    trg_w2v_file = config[data_path]['trg_w2v_file']

    train_data = dataset.load_label_corpus_file(train_src_file, train_trg_file)
    qa_data_sub_lit = dataset.split_valid_data(train_data, valid_num)
    valid_data = dataset.load_label_corpus_file(valid_src_file, valid_trg_file)
    test_data = dataset.load_label_corpus_file(test_src_file, test_trg_file)
    test_data_sub_lit = dataset.split_valid_data(test_data, valid_num)
    """VOCABULARY"""
    src_vocab, trg_vocab, sos, eos = dataset.prepare_vocab(
        base_dir, train_data, vocab_size, gpu_id)
    src_vocab_size = len(src_vocab.vocab)
    trg_vocab_size = len(trg_vocab.vocab)

    src_initialW, trg_initialW = None, None
    if pretrain_w2v:
        w2v = word2vec.Word2Vec()
        src_initialW, vector_size, src_match_word_count = w2v.make_initialW(
            src_vocab.vocab, src_w2v_file)
        trg_initialW, vector_size, trg_match_word_count = w2v.make_initialW(
            trg_vocab.vocab, trg_w2v_file)
        logger.info(
            'Initialize w2v embedding. Match: src {}/{}, trg {}/{}'.format(
                src_match_word_count, src_vocab_size, trg_match_word_count,
                trg_vocab_size))

    logger.info('src_vocab size: {}, trg_vocab size: {}'.format(
        src_vocab_size, trg_vocab_size))

    evaluater = evaluate.Evaluate()
    """GPU"""
    if gpu_id >= 0:
        logger.info('Use GPU')
        chainer.cuda.get_device_from_id(gpu_id).use()

    cross_valid_result = []
    for ite in range(1, valid_num + 1):
        model_valid_dir = base_dir + 'valid{}/'.format(ite)
        if not os.path.exists(model_valid_dir):
            os.mkdir(model_valid_dir)

        qa_train_data, qa_dev_data, qa_test_data = dataset.separate_train_dev_test(
            qa_data_sub_lit, ite)
        train_data, dev_data, test_data = dataset.separate_train_dev_test(
            test_data_sub_lit, ite)
        test_data_id = [t['id'] for t in test_data]

        qa_iter = dataset.Iterator(qa_train_data,
                                   src_vocab,
                                   trg_vocab,
                                   batch_size,
                                   gpu_id,
                                   sort=True,
                                   shuffle=True)
        valid_iter = dataset.Iterator(valid_data,
                                      src_vocab,
                                      trg_vocab,
                                      batch_size,
                                      gpu_id,
                                      sort=False,
                                      shuffle=False)
        train_iter = dataset.Iterator(train_data,
                                      src_vocab,
                                      trg_vocab,
                                      batch_size,
                                      gpu_id,
                                      sort=True,
                                      shuffle=True)
        dev_iter = dataset.Iterator(dev_data,
                                    src_vocab,
                                    trg_vocab,
                                    batch_size,
                                    gpu_id,
                                    sort=False,
                                    shuffle=False)
        test_iter = dataset.Iterator(test_data,
                                     src_vocab,
                                     trg_vocab,
                                     batch_size,
                                     gpu_id,
                                     sort=False,
                                     shuffle=False)

        qa_size = len(qa_train_data)
        train_size = len(train_data)
        logger.info('V{} ## QA:{}, train:{}, dev:{} ,test:{}'.format(
            ite, qa_size, train_size, len(dev_data), len(test_data)))
        """MODEL"""
        if model_type == 'multi':
            model = model.Multi(src_vocab_size, trg_vocab_size, embed_size,
                                hidden_size, class_size, dropout_ratio,
                                coefficient, src_initialW, trg_initialW)
        elif model_type in ['label', 'pretrain']:
            model = model.Label(src_vocab_size, trg_vocab_size, embed_size,
                                hidden_size, class_size, dropout_ratio,
                                src_initialW, trg_initialW)
        else:
            model = model.EncoderDecoder(src_vocab_size, trg_vocab_size,
                                         embed_size, hidden_size,
                                         dropout_ratio, src_initialW,
                                         trg_initialW)

        if gpu_id >= 0:
            model.to_gpu()
        """OPTIMIZER"""
        optimizer = chainer.optimizers.Adam()
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.GradientClipping(gradclip))
        optimizer.add_hook(chainer.optimizer.WeightDecay(weight_decay))
        """PRETRAIN"""
        if model_type == 'pretrain' and load_model is None:
            logger.info('Pre-train start')
            pretrain_loss_dic = {}
            for epoch in range(1, pretrain_epoch + 1):
                train_loss = 0
                for i, batch in enumerate(train_iter.generate(), start=1):
                    try:
                        loss = model.pretrain(*batch)
                        train_loss += loss.data
                        optimizer.target.cleargrads()
                        loss.backward()
                        optimizer.update()

                    except Exception as e:
                        logger.info('P{} ## train iter: {}, {}'.format(
                            epoch, i, e))
                chainer.serializers.save_npz(
                    model_save_dir + 'p_model_epoch_{}.npz'.format(epoch),
                    model)
                """EVALUATE"""
                valid_loss = 0
                for batch in valid_iter.generate():
                    with chainer.no_backprop_mode(), chainer.using_config(
                            'train', False):
                        valid_loss += model.pretrain(*batch).data
                logger.info('P{} ## train loss: {}, val loss:{}'.format(
                    epoch, train_loss, valid_loss))
                pretrain_loss_dic[epoch] = valid_loss
            """MODEL SAVE & LOAD"""
            best_epoch = min(pretrain_loss_dic,
                             key=(lambda x: pretrain_loss_dic[x]))
            logger.info('best_epoch:{}, val loss: {}'.format(
                best_epoch, pretrain_loss_dic[best_epoch]))
            shutil.copyfile(
                model_save_dir + 'p_model_epoch_{}.npz'.format(best_epoch),
                model_save_dir + 'p_best_model.npz')
            logger.info('Pre-train finish')

        if load_model:
            logger.info('load model: {}'.format(load_model))
            chainer.serializers.load_npz(base_dir + load_model, model)
        """TRAIN"""
        epoch_info = {}
        for epoch in range(1, n_epoch + 1):
            train_loss = 0
            mix_train_iter = dataset.MixIterator(qa_iter,
                                                 train_iter,
                                                 seed=0,
                                                 shuffle=shuffle_data)
            for i, batch in enumerate(mix_train_iter.generate(), start=1):
                try:
                    loss = optimizer.target(*batch[0])
                    train_loss += loss.data
                    optimizer.target.cleargrads()
                    loss.backward()
                    optimizer.update()

                except Exception as e:
                    logger.info('V{} ## E{} ## train iter: {}, {}'.format(
                        ite, epoch, i, e))
            chainer.serializers.save_npz(
                model_valid_dir + 'model_epoch_{}.npz'.format(epoch), model)
            """DEV"""
            labels, alignments = [], []
            for i, batch in enumerate(dev_iter.generate(), start=1):
                try:
                    with chainer.no_backprop_mode(), chainer.using_config(
                            'train', False):
                        _, label, align = model.predict(batch[0], sos, eos)
                except Exception as e:
                    logger.info('V{} ## E{} ## dev iter: {}, {}'.format(
                        ite, epoch, i, e))

                if model_type == 'multi':
                    for l, a in zip(label, align):
                        labels.append(chainer.cuda.to_cpu(l))
                        alignments.append(chainer.cuda.to_cpu(a))
                elif model_type in ['label', 'pretrain']:
                    for l in label:
                        labels.append(chainer.cuda.to_cpu(l))
                else:
                    for a in align:
                        alignments.append(chainer.cuda.to_cpu(a))

            best_param_dic = evaluater.param_search(labels, alignments,
                                                    dev_data)
            param = max(best_param_dic,
                        key=lambda x: best_param_dic[x]['macro'])
            init, mix = evaluate.key_to_param(param)
            dev_score = round(best_param_dic[param]['macro'], 3)
            """TEST"""
            outputs, labels, alignments = [], [], []
            for i, batch in enumerate(test_iter.generate(), start=1):
                try:
                    with chainer.no_backprop_mode(), chainer.using_config(
                            'train', False):
                        output, label, align = model.predict(
                            batch[0], sos, eos)
                except Exception as e:
                    logger.info('V{} ## E{} ## test iter: {}, {}'.format(
                        ite, epoch, i, e))

                if model_type == 'multi':
                    for l, a in zip(label, align):
                        labels.append(chainer.cuda.to_cpu(l))
                        alignments.append(chainer.cuda.to_cpu(a))
                elif model_type in ['label', 'pretrain']:
                    for l in label:
                        labels.append(chainer.cuda.to_cpu(l))
                else:
                    for a in align:
                        alignments.append(chainer.cuda.to_cpu(a))

            rate, count, tf_lit, macro, micro = evaluater.eval_param(
                labels, alignments, test_data, init, mix)
            test_macro_score = round(macro, 3)
            test_micro_score = round(micro, 3)
            logger.info(
                'V{} ## E{} ## loss: {}, dev: {}, param: {}, micro: {}, macro: {}'
                .format(ite, epoch, train_loss, dev_score, param,
                        test_micro_score, test_macro_score))

            epoch_info[epoch] = {
                'id': test_data_id,
                'label': labels,
                'align': alignments,
                'hypo': outputs,
                'epoch': epoch,
                'dev_score': dev_score,
                'param': param,
                'rate': rate,
                'count': count,
                'tf': tf_lit,
                'macro': test_macro_score,
                'micro': test_micro_score
            }
            dataset.save_output(model_valid_dir, epoch_info[epoch])
        """MODEL SAVE"""
        best_epoch = max(epoch_info,
                         key=(lambda x: epoch_info[x]['dev_score']))
        cross_valid_result.append(epoch_info[best_epoch])
        logger.info(
            'V{} ## best_epoch: {}, dev: {}, micro: {}, macro: {}'.format(
                ite, best_epoch, epoch_info[best_epoch]['dev_score'],
                epoch_info[best_epoch]['micro'],
                epoch_info[best_epoch]['macro']))
        shutil.copyfile(
            model_valid_dir + 'model_epoch_{}.npz'.format(best_epoch),
            model_valid_dir + 'best_model.npz')

        logger.info('')

    ave_dev_score, ave_macro_score, ave_micro_score = 0, 0, 0
    ave_test_score = [0 for _ in range(len(cross_valid_result[0]['rate']))]
    id_total, label_total, align_total, tf_total = [], [], [], []

    for v, r in enumerate(cross_valid_result, start=1):
        ave_dev_score += r['dev_score']
        ave_macro_score += r['macro']
        ave_micro_score += r['micro']
        for i, rate in enumerate(r['rate']):
            ave_test_score[i] += rate
        logger.info('   {}: e{}, {}\tdev: {}, micro: {}, macro: {} {}'.format(
            v, r['epoch'], r['param'], r['dev_score'], r['micro'],
            dataset.float_to_str(r['rate']), r['macro']))

        id_total.extend(r['id'])
        label_total.extend(r['label'])
        align_total.extend(r['align'])
        tf_total.extend(r['tf'])
    ave_dev_score = round(ave_dev_score / valid_num, 3)
    ave_macro_score = round(ave_macro_score / valid_num, 3)
    ave_micro_score = round(ave_micro_score / valid_num, 3)
    ave_test_score = [
        ave_test_score[i] / valid_num for i in range(len(ave_test_score))
    ]
    logger.info('dev: {}, micro: {}, macro: {} {}'.format(
        ave_dev_score, ave_micro_score, dataset.float_to_str(ave_test_score),
        ave_macro_score))

    label, align, tf = dataset.sort_multi_list(id_total, label_total,
                                               align_total, tf_total)
    dataset.save_list(base_dir + 'label.txt', label)
    dataset.save_list(base_dir + 'align.txt', align)
    dataset.save_list(base_dir + 'tf.txt', tf)
Example #14
def main():

    embedding_dir = args.embedding + args.language

    print >> sys.stderr, "Read Embedding from %s ..." % embedding_dir
    embedding_dimention = 50
    if args.language == "cn":
        embedding_dimention = 64
    w2v = word2vec.Word2Vec(embedding_dir, embedding_dimention)

    #network_model
    net_dir = "./model/pretrain_ana/network_model_pretrain.cn.3"
    #net_dir = "./model/pretrain/network_model_pretrain.cn.10"
    #net_dir = "./model/nets/network_model.cn.1"
    #net_dir = './model/network_model.cn'
        #read_f = file('./model/network_model_pretrain.'+args.language, 'rb')
    print >> sys.stderr, "Read model from %s" % net_dir
    read_f = file(net_dir, 'rb')
    network_model = cPickle.load(read_f)
    #network_model = network.NetWork(1738,855,1000)

    train_docs = DataGenerate.doc_data_generater("train")
    dev_docs = DataGenerate.doc_data_generater("dev")
    test_docs = DataGenerate.doc_data_generater("test")
    
    MAX=5

    train4test = [] # add 5 items for testing the training performance
    ## test performance after pretraining
    dev_docs_for_test = []
    num = 0
    for cases,gold_chain in DataGenerate.case_generater(train_docs,"train",w2v):
    #for cases,gold_chain in DataGenerate.case_generater(dev_docs,"dev",w2v):
        ev_doc = policy_network.generate_policy_test(cases,gold_chain,network_model)
        dev_docs_for_test.append(ev_doc)
        train4test.append((cases,gold_chain))
        num += 1
        if num >= MAX:
            break

    print "Performance on DATA after PreTRAINING"
    mp,mr,mf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.muc)
    print "MUC: recall: %f precision: %f  f1: %f"%(mr,mp,mf)
    bp,br,bf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.b_cubed)
    print "BCUBED: recall: %f precision: %f  f1: %f"%(br,bp,bf)
    cp,cr,cf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.ceafe)
    print "CEAF: recall: %f precision: %f  f1: %f"%(cr,cp,cf)
    print "#################################################" 
    sys.stdout.flush()
    print >> sys.stderr,"Pre Train done"

    ##train
    add2train = True

    ran_p = 0.0
    l2_lambda = 0.0000003
    #l2_lambda = 0.0001
    lr = 0.0002
    #lr = 0.0
    #lr = 0.0001
    #ce_lmbda = 0.1
    ce_lmbda = 0.0

    for echo in range(50):
        start_time = timeit.default_timer()
        cost_this_turn = 0.0
        average_reward = 0.0
        done_case_num = 0

        #for cases,gold_chain in DataGenerate.case_generater_trick(train_docs,"train",w2v):
        for cases,gold_chain in DataGenerate.case_generater(train_docs,"train",w2v):
            #for single_mention_array,train_list,lable_list in pretrain.generate_pretrain_case(cases,gold_chain,network_model):
            #    print lable_list


            this_reward = 0.0

            reward_baseline = []
    
            zero_num = 0

            for single, train, action, reward in policy_network.generate_policy_case(cases,gold_chain,network_model,ran_p):
            #for single, train, action, reward , acp in policy_network.generate_policy_case_trick(cases,gold_chain,network_model,ran_p):


                reward_b = 0 if len(reward_baseline) < 1 else float(sum(reward_baseline))/float(len(reward_baseline))

                norm_reward = reward - reward_b if reward > reward_b else 0.00001

                this_reward = reward

                this_cost = network_model.train_step(single,train,action,reward,lr,l2_lambda,ce_lmbda,0.0)[0]
                #this_cost = network_model.train_step(single,train,action,norm_reward,lr,l2_lambda,ce_lmbda)[0]
                #print reward,this_cost
                cost_this_turn += this_cost

                #print this_cost,acp,reward
                #print this_cost
                reward_baseline.append(this_reward)
                if len(reward_baseline) >= 32:
                    reward_baseline = reward_baseline[1:]

            average_reward += this_reward
            done_case_num += 1

            if done_case_num >= MAX:
                break

        print network_model.get_weight_sum()
        end_time = timeit.default_timer()
        print >> sys.stderr, "Total cost:",cost_this_turn
        print >> sys.stderr, "Average Reward:",average_reward/float(done_case_num)
        print >> sys.stderr, "TRAINING Use %.3f seconds"%(end_time-start_time)
        ran_p = ran_p*0.5
        ## test training performance
        train_docs_for_test = []
        start_time = timeit.default_timer()

        for train_cases,train_doc_gold_chain in train4test:
            ev_doc = policy_network.generate_policy_test(train_cases,train_doc_gold_chain,network_model)
            train_docs_for_test.append(ev_doc)
        print "** Echo: %d **"%echo
        print "TRAIN"
        mp,mr,mf = evaluation.evaluate_documents(train_docs_for_test,evaluation.muc)
        print "MUC: recall: %f precision: %f  f1: %f"%(mr,mp,mf)
        bp,br,bf = evaluation.evaluate_documents(train_docs_for_test,evaluation.b_cubed)
        print "BCUBED: recall: %f precision: %f  f1: %f"%(br,bp,bf)
        cp,cr,cf = evaluation.evaluate_documents(train_docs_for_test,evaluation.ceafe)
        print "CEAF: recall: %f precision: %f  f1: %f"%(cr,cp,cf)
        print
        sys.stdout.flush()

Example #15
def main():

    embedding_dir = args.embedding + args.language

    print >> sys.stderr, "Read Embedding from %s ..." % embedding_dir
    embedding_dimention = 50
    if args.language == "cn":
        embedding_dimention = 64
    w2v = word2vec.Word2Vec(embedding_dir, embedding_dimention)

    #network_model
    #net_dir = "./model/pretrain/network_model_pretrain.cn.19"
    #net_dir = "./model/pretrain_manu_dropout/network_model_pretrain.cn.10"
    if os.path.isfile("./model/network_model." + args.language):
        read_f = file('./model/network_model.' + args.language, 'rb')
        #read_f = file('./model/network_model_pretrain.'+args.language, 'rb')
        #read_f = file('./model/network_model_pretrain.cn.best', 'rb')
        #read_f = file(net_dir, 'rb')
        network_model = cPickle.load(read_f)
        print >> sys.stderr, "Read model from ./model/network_model." + args.language
    else:
        inpt_dimention = 1738
        single_dimention = 855
        if args.language == "en":
            inpt_dimention = 1374
            single_dimention = 673

        network_model = network.NetWork(inpt_dimention, single_dimention, 1000)
        print >> sys.stderr, "save model ..."
        save_f = file('./model/network_model.' + args.language, 'wb')
        cPickle.dump(network_model, save_f, protocol=cPickle.HIGHEST_PROTOCOL)
        save_f.close()

    train_docs = DataGenerate.doc_data_generater("train")
    dev_docs = DataGenerate.doc_data_generater("dev")
    test_docs = DataGenerate.doc_data_generater("test")

    #pretrain
    l2_lambda = 0.0000001
    lr = 0.03
    ce_lambda = 0.0001
    dropout_rate = 0.2

    print "Weight Sum", network_model.get_weight_sum()

    times = 0
    #for echo in range(11,40):
    for echo in range(10):

        start_time = timeit.default_timer()
        print "Pretrain ECHO:", echo
        cost_this_turn = 0.0
        #print >> sys.stderr, network_model.get_weight_sum()
        done_num = 0
        pos_num = 0
        neg_num = 0
        for cases, gold_chain in DataGenerate.case_generater(
                train_docs, "train", w2v):
            if len(cases) >= 700:
                continue
            for single_mention_array, train_list, lable_list in pretrain.generate_pretrain_case(
                    cases, gold_chain, network_model):

                #cost_this_turn += network_model.pre_train_step(single_mention_array,train_list,lable_list,lr,l2_lambda,dropout_rate)[0]

                if lable_list[0] == 1:
                    neg_num += 1
                    ana_cost, ana_result = network_model.ana_train_step(
                        single_mention_array, 1, lr, l2_lambda, dropout_rate)
                else:
                    pos_num += 1
                    ana_cost, ana_result = network_model.ana_train_step(
                        single_mention_array, 0, lr, l2_lambda, dropout_rate)
                for intance, lable in zip(train_list, lable_list):
                    mention_cost, mention_result = network_model.mention_train_step(
                        intance, lable, lr, l2_lambda, dropout_rate)

            done_num += 1
            if done_num == 10:
                break
        lr = lr * 0.99

        save_f = file(
            './model/pretrain_manu_new/network_model_pretrain_pair.%s.%d' %
            (args.language, echo), 'wb')
        cPickle.dump(network_model, save_f, protocol=cPickle.HIGHEST_PROTOCOL)
        save_f.close()

        end_time = timeit.default_timer()
        print >> sys.stderr, "PreTrain", echo, "Total cost:", cost_this_turn
        print >> sys.stderr, "POS:NEG", pos_num, neg_num
        print >> sys.stderr, "lr", lr
        print >> sys.stderr, "PreTRAINING Use %.3f seconds" % (end_time -
                                                               start_time)
        print "Weight Sum", network_model.get_weight_sum()

        ## test performance after pretraining
        dev_docs_for_test = []
        num = 0
        for cases, gold_chain in DataGenerate.case_generater(
                dev_docs, "dev", w2v):
            ev_doc = policy_network.generate_policy_test(
                cases, gold_chain, network_model)
            dev_docs_for_test.append(ev_doc)
            num += 1
            if num == 10:
                break
        print "Performance on DEV after PreTRAINING"
        mp, mr, mf = evaluation.evaluate_documents(dev_docs_for_test,
                                                   evaluation.muc)
        print "MUC: recall: %f precision: %f  f1: %f" % (mr, mp, mf)
        bp, br, bf = evaluation.evaluate_documents(dev_docs_for_test,
                                                   evaluation.b_cubed)
        print "BCUBED: recall: %f precision: %f  f1: %f" % (br, bp, bf)
        cp, cr, cf = evaluation.evaluate_documents(dev_docs_for_test,
                                                   evaluation.ceafe)
        print "CEAF: recall: %f precision: %f  f1: %f" % (cr, cp, cf)
        print "#################################################"
        sys.stdout.flush()

    print >> sys.stderr, "Begin Normal Training"
    for echo in range(30):

        start_time = timeit.default_timer()
        print "Pretrain ECHO:", echo
        cost_this_turn = 0.0
        #print >> sys.stderr, network_model.get_weight_sum()
        done_num = 0
        pos_num = 0
        neg_num = 0
        for cases, gold_chain in DataGenerate.case_generater(
                train_docs, "train", w2v):
            if len(cases) >= 700:
                continue
            for single_mention_array, train_list, lable_list in pretrain.generate_pretrain_case(
                    cases, gold_chain, network_model):
                cost_this_turn += network_model.pre_train_step(
                    single_mention_array, train_list, lable_list, lr,
                    l2_lambda, dropout_rate)[0]
                #cost_this_turn += network_model.pre_top_train_step(single_mention_array,train_list,lable_list,lr,l2_lambda)[0]

                if lable_list[0] == 1:
                    neg_num += 1
                else:
                    pos_num += 1

            done_num += 1
            #if done_num == 10:
            #    break
        lr = lr * 0.99

        save_f = file(
            './model/pretrain_manu_new/network_model_pretrain.%s.%d' %
            (args.language, echo), 'wb')
        cPickle.dump(network_model, save_f, protocol=cPickle.HIGHEST_PROTOCOL)
        save_f.close()

        end_time = timeit.default_timer()
        print >> sys.stderr, "Train", echo, "Total cost:", cost_this_turn
        print >> sys.stderr, "POS:NEG", pos_num, neg_num
        print >> sys.stderr, "lr", lr
        print >> sys.stderr, "PreTRAINING Use %.3f seconds" % (end_time -
                                                               start_time)
        print "Weight Sum", network_model.get_weight_sum()

        ## test performance after pretraining
        dev_docs_for_test = []
        num = 0
        for cases, gold_chain in DataGenerate.case_generater(
                dev_docs, "dev", w2v):
            ev_doc = policy_network.generate_policy_test(
                cases, gold_chain, network_model)
            dev_docs_for_test.append(ev_doc)
            num += 1
            if num == 10:
                break
        print "Performance on DEV after PreTRAINING"
        mp, mr, mf = evaluation.evaluate_documents(dev_docs_for_test,
                                                   evaluation.muc)
        print "MUC: recall: %f precision: %f  f1: %f" % (mr, mp, mf)
        bp, br, bf = evaluation.evaluate_documents(dev_docs_for_test,
                                                   evaluation.b_cubed)
        print "BCUBED: recall: %f precision: %f  f1: %f" % (br, bp, bf)
        cp, cr, cf = evaluation.evaluate_documents(dev_docs_for_test,
                                                   evaluation.ceafe)
        print "CEAF: recall: %f precision: %f  f1: %f" % (cr, cp, cf)
        print "#################################################"
        sys.stdout.flush()

    return

    for echo in range(30, 50):
        start_time = timeit.default_timer()
        cost_this_turn = 0.0
        for cases, gold_chain in DataGenerate.case_generater(
                train_docs, "train", w2v):
            if len(cases) >= 700:
                continue
            for single_mention_array, train_list, lable_list in pretrain.generate_pretrain_case(
                    cases, gold_chain, network_model):
                cost_this_turn += network_model.pre_ce_train_step(
                    single_mention_array, train_list, lable_list, lr,
                    l2_lambda, ce_lambda)[0]

        end_time = timeit.default_timer()
        print >> sys.stderr, "PreTrain", echo, "Total cost:", cost_this_turn
        print >> sys.stderr, "PreTRAINING Use %.3f seconds" % (end_time -
                                                               start_time)
        print "Weight Sum", network_model.get_weight_sum()

        ## test performance after pretraining
        dev_docs_for_test = []
        num = 0
        for cases, gold_chain in DataGenerate.case_generater(
                dev_docs, "dev", w2v):
            ev_doc = policy_network.generate_policy_test(
                cases, gold_chain, network_model)
            dev_docs_for_test.append(ev_doc)
        print "Performance on DEV after PreTRAINING"
        mp, mr, mf = evaluation.evaluate_documents(dev_docs_for_test,
                                                   evaluation.muc)
        print "MUC: recall: %f precision: %f  f1: %f" % (mr, mp, mf)
        bp, br, bf = evaluation.evaluate_documents(dev_docs_for_test,
                                                   evaluation.b_cubed)
        print "BCUBED: recall: %f precision: %f  f1: %f" % (br, bp, bf)
        cp, cr, cf = evaluation.evaluate_documents(dev_docs_for_test,
                                                   evaluation.ceafe)
        print "CEAF: recall: %f precision: %f  f1: %f" % (cr, cp, cf)
        print "#################################################"
        sys.stdout.flush()

        save_f = file(
            './model/pretrain_manu_new/network_model_pretrain.%s.%d' %
            (args.language, echo), 'wb')
        cPickle.dump(network_model, save_f, protocol=cPickle.HIGHEST_PROTOCOL)
        save_f.close()

    ## test performance after pretraining
    print >> sys.stderr, "Begin test on DEV after pertraining"
    dev_docs_for_test = []
    num = 0
    #for dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain in DataGenerate.array_generater(dev_docs,"dev",w2v):
    #ev_doc = policy_network.generate_policy_test(dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain,network_model)
    for cases, gold_chain in DataGenerate.case_generater(dev_docs, "dev", w2v):
        ev_doc = policy_network.generate_policy_test(cases, gold_chain,
                                                     network_model)
        dev_docs_for_test.append(ev_doc)
    print "Performance on DEV after PreTRAINING"
    mp, mr, mf = evaluation.evaluate_documents(dev_docs_for_test,
                                               evaluation.muc)
    print "MUC: recall: %f precision: %f  f1: %f" % (mr, mp, mf)
    bp, br, bf = evaluation.evaluate_documents(dev_docs_for_test,
                                               evaluation.b_cubed)
    print "BCUBED: recall: %f precision: %f  f1: %f" % (br, bp, bf)
    cp, cr, cf = evaluation.evaluate_documents(dev_docs_for_test,
                                               evaluation.ceafe)
    print "CEAF: recall: %f precision: %f  f1: %f" % (cr, cp, cf)
    print "#################################################"
    sys.stdout.flush()
    print >> sys.stderr, "Pre Train done"
Example No. 16
def main():

    embedding_dir = args.embedding+args.language

    print >> sys.stderr,"Read Embedding from %s ..."%embedding_dir
    embedding_dimention = 50
    if args.language == "cn":
        embedding_dimention = 64
    w2v = word2vec.Word2Vec(embedding_dir,embedding_dimention)

    #network_model
    net_dir = "./model/pretrain_batch/network_model_pretrain.cn.9"
    if os.path.isfile("./model/network_model_batch."+args.language):
        #read_f = file('./model/network_model_batch.'+args.language, 'rb')
        #read_f = file('./model/network_model_pretrain.'+args.language, 'rb')
        #read_f = file('./model/network_model_pretrain.cn.best', 'rb')
        read_f = file(net_dir, 'rb')
        network_model = cPickle.load(read_f)
        print >> sys.stderr,"Read model from ./model/network_model_batch."+args.language
    else:
        inpt_dimention = 1738
        single_dimention = 855
        if args.language == "en":
            inpt_dimention = 1374
            single_dimention = 673

        network_model = network.NetWork(inpt_dimention,single_dimention,1000)
        print >> sys.stderr,"save model ..."
        save_f = file('./model/network_model_batch.'+args.language, 'wb')
        cPickle.dump(network_model, save_f, protocol=cPickle.HIGHEST_PROTOCOL)
        save_f.close()

    train_docs = DataGenerate.doc_data_generater("train")
    dev_docs = DataGenerate.doc_data_generater("dev")
    test_docs = DataGenerate.doc_data_generater("test")


    #pretrain
    l2_lambda = 0.0000003
    lr = 0.00002
    ce_lambda = 0.005

    times = 0
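    # NOTE: both pretraining loops below iterate over range(0), so they are
    # disabled and never run.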
    for echo in range(0):

        start_time = timeit.default_timer()
        print "Pretrain ECHO:",echo
        cost_this_turn = 0.0
        #print >> sys.stderr, network_model.get_weight_sum()
        for cases,gold_chain in DataGenerate.case_generater(train_docs,"train",w2v):
            if len(cases) >= 700:
                continue
            for train_list,single_mention_array,mask_list,lable_list in pretrain.generate_pretrain_case_batch(cases,gold_chain,network_model):
                cost_this_turn += network_model.pre_train_step(single_mention_array,train_list,mask_list,lable_list,lr,l2_lambda)[0]
        end_time = timeit.default_timer()
        print >> sys.stderr, "PreTrain",echo,"Total cost:",cost_this_turn
        print >> sys.stderr, "PreTRAINING Use %.3f seconds"%(end_time-start_time)

        save_f = file('./model/pretrain_batch/network_model_pretrain_noNorm.%s.%d'%(args.language,echo), 'wb')
        cPickle.dump(network_model, save_f, protocol=cPickle.HIGHEST_PROTOCOL)
        save_f.close()


    for echo in range(0):
        start_time = timeit.default_timer()
        cost_this_turn = 0.0
        for cases,gold_chain in DataGenerate.case_generater(train_docs,"train",w2v):
            if len(cases) >= 700:
                continue
            for train_list,single_mention_array,mask_list,lable_list in pretrain.generate_pretrain_case_batch(cases,gold_chain,network_model):
                cost_this_turn += network_model.pre_ce_train_step(single_mention_array,train_list,mask_list,lable_list,lr,l2_lambda,ce_lambda)[0]

        save_f = file('./model/pretrain_batch/network_model_pretrain.%s.%d'%(args.language,echo), 'wb')
        cPickle.dump(network_model, save_f, protocol=cPickle.HIGHEST_PROTOCOL)
        save_f.close()

        end_time = timeit.default_timer()
        print >> sys.stderr, "PreTrain",echo,"Total cost:",cost_this_turn
        print >> sys.stderr, "PreTRAINING Use %.3f seconds"%(end_time-start_time)

    print >> sys.stderr,"Begin test on DEV after pertraining"
    
    ## test performance after pretraining
    dev_docs_for_test = []
    num = 0
    #for dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain in DataGenerate.array_generater(dev_docs,"dev",w2v):
        #ev_doc = policy_network.generate_policy_test(dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain,network_model)
    for cases,gold_chain in DataGenerate.case_generater(dev_docs,"dev",w2v):
        ev_doc = policy_network.generate_policy_test(cases,gold_chain,network_model)
        dev_docs_for_test.append(ev_doc)
    print "Performance on DEV after PreTRAINING"
    mp,mr,mf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.muc)
    print "MUC: recall: %f precision: %f  f1: %f"%(mr,mp,mf)
    bp,br,bf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.b_cubed)
    print "BCUBED: recall: %f precision: %f  f1: %f"%(br,bp,bf)
    cp,cr,cf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.ceafe)
    print "CEAF: recall: %f precision: %f  f1: %f"%(cr,cp,cf)
    print "#################################################" 
    sys.stdout.flush()
    print >> sys.stderr,"Pre Train done"
    ##train
    train4test = [] # hold out up to 20 random training docs for testing the training performance
    add2train = True

    for echo in range(10):
        start_time = timeit.default_timer()
        reward_baseline = []
        cost_this_turn = 0.0
        average_reward = 0.0
        done_case_num = 0

        l2_lambda = 0.000003
        lr = 0.000002
        ce_lambda = 0.0

        for cases,gold_chain in DataGenerate.case_generater(train_docs,"train",w2v):
            if len(cases) >= 700:
                continue

            if add2train:
                if random.randint(1,200) == 10:
                #if not random.randint(1,200) == 10:
                    #train4test.append((train_doc_mention_array,train_doc_pair_array,train_doc_gold_chain))
                    train4test.append((cases,gold_chain))
                    if len(train4test) == 20:
                        add2train = False

            this_reward = 0.0
            reward_b = 0 if len(reward_baseline) < 1 else float(sum(reward_baseline))/float(len(reward_baseline))
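            # REINFORCE baseline: the mean of previously recorded episode
            # rewards; subtracting it from the current reward is meant to
            # reduce the variance of the policy-gradient update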

            for train, single, mask, action, reward in policy_network.generate_policy_case(cases,gold_chain,network_model):

                if len(train) <= 1:
                    continue
            #for single, train, action, reward , acp in policy_network.generate_policy_case_trick(cases,gold_chain,network_model):

                norm_reward = reward - reward_b

                this_reward = reward
                
                this_cost = network_model.train_step(single,train,mask,action,reward*100,lr,l2_lambda,ce_lambda)[0]
                #print this_cost,acp,reward
                cost_this_turn += this_cost
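                # NOTE: norm_reward is computed above but never used; the
                # update is driven by the raw reward scaled by 100.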

            average_reward += this_reward
            done_case_num += 1

            #if done_case_num >= 1:
            #    break
        print network_model.get_weight_sum()
        end_time = timeit.default_timer()
        print >> sys.stderr, "Total cost:",cost_this_turn
        print >> sys.stderr, "Average Reward:",average_reward/float(done_case_num)
        print >> sys.stderr, "TRAINING Use %.3f seconds"%(end_time-start_time)

        reward_baseline.append(this_reward)
        if len(reward_baseline) >= 64:
            reward_baseline = reward_baseline[1:]  # keep a sliding window of recent rewards

        ## test training performance
        train_docs_for_test = []
        start_time = timeit.default_timer()

        for train_cases,train_doc_gold_chain in train4test:
            ev_doc = policy_network.generate_policy_test(train_cases,train_doc_gold_chain,network_model)
            train_docs_for_test.append(ev_doc)
        print "** Echo: %d **"%echo
        print "TRAIN"
        mp,mr,mf = evaluation.evaluate_documents(train_docs_for_test,evaluation.muc)
        print "MUC: recall: %f precision: %f  f1: %f"%(mr,mp,mf)
        bp,br,bf = evaluation.evaluate_documents(train_docs_for_test,evaluation.b_cubed)
        print "BCUBED: recall: %f precision: %f  f1: %f"%(br,bp,bf)
        cp,cr,cf = evaluation.evaluate_documents(train_docs_for_test,evaluation.ceafe)
        print "CEAF: recall: %f precision: %f  f1: %f"%(cr,cp,cf)
        print

        ## dev
        dev_docs_for_test = []
        start_time = timeit.default_timer()
        for dev_cases,dev_doc_gold_chain in DataGenerate.case_generater(dev_docs,"dev",w2v):
            ev_doc = policy_network.generate_policy_test(dev_cases,dev_doc_gold_chain,network_model)
            dev_docs_for_test.append(ev_doc)
        print "DEV"
        mp,mr,mf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.muc)
        print "MUC: recall: %f precision: %f  f1: %f"%(mr,mp,mf)
        bp,br,bf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.b_cubed)
        print "BCUBED: recall: %f precision: %f  f1: %f"%(br,bp,bf)
        cp,cr,cf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.ceafe)
        print "CEAF: recall: %f precision: %f  f1: %f"%(cr,cp,cf)
        print 

        end_time = timeit.default_timer()
        print >> sys.stderr, "DEV Use %.3f seconds"%(end_time-start_time)
        sys.stdout.flush()
   
        ## test
        test_docs_for_test = []
        start_time = timeit.default_timer()
        #for test_doc_mention_array,test_doc_pair_array,test_doc_gold_chain in DataGenerate.array_generater(test_docs,"test",w2v):
        for test_cases,test_doc_gold_chain in DataGenerate.case_generater(test_docs,"test",w2v):
            ev_doc = policy_network.generate_policy_test(test_cases,test_doc_gold_chain,network_model)
            test_docs_for_test.append(ev_doc)
        print "TEST"
        mp,mr,mf = evaluation.evaluate_documents(test_docs_for_test,evaluation.muc)
        print "MUC: recall: %f precision: %f  f1: %f"%(mr,mp,mf)
        bp,br,bf = evaluation.evaluate_documents(test_docs_for_test,evaluation.b_cubed)
        print "BCUBED: recall: %f precision: %f  f1: %f"%(br,bp,bf)
        cp,cr,cf = evaluation.evaluate_documents(test_docs_for_test,evaluation.ceafe)
        print "CEAF: recall: %f precision: %f  f1: %f"%(cr,cp,cf)
        print 

        end_time = timeit.default_timer()
        print >> sys.stderr, "TEST Use %.3f seconds"%(end_time-start_time)
        sys.stdout.flush()

        save_f = file('./model/nets/network_model_batch.%s.%d'%(args.language,echo), 'wb')
        cPickle.dump(network_model, save_f, protocol=cPickle.HIGHEST_PROTOCOL)
        save_f.close()
Example No. 17
def main():

    embedding_dir = args.embedding + args.language

    print >> sys.stderr, "Read Embedding from %s ..." % embedding_dir
    embedding_dimention = 50
    if args.language == "cn":
        embedding_dimention = 64
    w2v = word2vec.Word2Vec(embedding_dir + ".filtered", embedding_dimention)

    #network_model
    if os.path.isfile("./model/network_model_index." + args.language):
        read_f = file('./model/network_model_index.' + args.language, 'rb')
        #read_f = file('./model/network_model_pretrain.'+args.language, 'rb')
        network_model = cPickle.load(read_f)
        print >> sys.stderr, "Read model from ./model/network_model_index." + args.language
    else:
        inpt_dimention = 1738
        single_dimention = 855
        if args.language == "en":
            inpt_dimention = 1374
            single_dimention = 673

        network_model = network.NetWork(inpt_dimention, single_dimention, 1000,
                                        embedding_dir + ".filtered",
                                        embedding_dimention)
        print >> sys.stderr, "save model ..."
        save_f = file('./model/network_model_index.' + args.language, 'wb')
        cPickle.dump(network_model, save_f, protocol=cPickle.HIGHEST_PROTOCOL)
        save_f.close()

    train_docs = DataGenerate.doc_data_generater("train")
    dev_docs = DataGenerate.doc_data_generater("dev")
    test_docs = DataGenerate.doc_data_generater("test")

    #pretrain
    times = 0
    for echo in range(20):
        start_time = timeit.default_timer()
        print "Pretrain ECHO:", echo
        cost_this_turn = 0.0
        for cases, gold_chain in DataGenerate.case_generater(
                train_docs, "train", w2v):
            if len(cases) >= 700:
                continue
            for single_mention_array, single_index, train_list, train_index, lable_list in pretrain.generate_pretrain_case(
                    cases, gold_chain, network_model):
                print network_model.fff(train_list, train_index)  # debug output
                cost_this_turn += network_model.pre_train_step(
                    single_mention_array, single_index, train_list,
                    train_index, lable_list, 0.0001)[0]

        end_time = timeit.default_timer()
        print >> sys.stderr, "PreTrain", echo, "Total cost:", cost_this_turn
        print >> sys.stderr, "PreTRAINING Use %.3f seconds" % (end_time -
                                                               start_time)

        if echo % 4 == 0:
            save_f = file(
                './model/network_model_pretrain_index.' + args.language, 'wb')
            cPickle.dump(network_model,
                         save_f,
                         protocol=cPickle.HIGHEST_PROTOCOL)
            save_f.close()

    save_f = file('./model/network_model_pretrain_index.' + args.language,
                  'wb')
    cPickle.dump(network_model, save_f, protocol=cPickle.HIGHEST_PROTOCOL)
    save_f.close()

    print >> sys.stderr, "Begin test on DEV after pertraining"

    ## test performance after pretraining
    dev_docs_for_test = []
    num = 0
    for cases, gold_chain in DataGenerate.case_generater(dev_docs, "dev", w2v):
        ev_doc = policy_network.generate_policy_test(cases, gold_chain,
                                                     network_model)
        dev_docs_for_test.append(ev_doc)
    print "Performance on DEV after PreTRAINING"
    mp, mr, mf = evaluation.evaluate_documents(dev_docs_for_test,
                                               evaluation.muc)
    print "MUC: recall: %f precision: %f  f1: %f" % (mr, mp, mf)
    bp, br, bf = evaluation.evaluate_documents(dev_docs_for_test,
                                               evaluation.b_cubed)
    print "BCUBED: recall: %f precision: %f  f1: %f" % (br, bp, bf)
    cp, cr, cf = evaluation.evaluate_documents(dev_docs_for_test,
                                               evaluation.ceafe)
    print "CEAF: recall: %f precision: %f  f1: %f" % (cr, cp, cf)
    print "##################################################"
    sys.stdout.flush()
    print >> sys.stderr, "Pre Train done"

    ##train
    train4test = []  # add 5 items for testing the training performance
    add2train = True

    for echo in range(20):
        start_time = timeit.default_timer()
        reward_baseline = []
        cost_this_turn = 0.0

        for cases, gold_chain in DataGenerate.case_generater(
                train_docs, "train", w2v):

            if add2train:
                if random.randint(1, 200) == 10:
                    #train4test.append((train_doc_mention_array,train_doc_pair_array,train_doc_gold_chain))
                    train4test.append((cases, gold_chain))
                    if len(train4test) == 5:
                        add2train = False

            this_reward = 0.0

            for single, sindex, train, tindex, action, reward in policy_network.generate_policy_case(
                    cases, gold_chain, network_model):
                cost_this_turn += network_model.train_step(
                    single, sindex, train, tindex, action, reward, 0.0001)[0]

        end_time = timeit.default_timer()
        print >> sys.stderr, "Total cost:", cost_this_turn
        print >> sys.stderr, "TRAINING Use %.3f seconds" % (end_time -
                                                            start_time)

        ## test training performance
        train_docs_for_test = []
        start_time = timeit.default_timer()

        for train_cases, train_doc_gold_chain in train4test:
            ev_doc = policy_network.generate_policy_test(
                train_cases, train_doc_gold_chain, network_model)
            train_docs_for_test.append(ev_doc)
        print "** Echo: %d **" % echo
        print "TRAIN"
        mp, mr, mf = evaluation.evaluate_documents(train_docs_for_test,
                                                   evaluation.muc)
        print "MUC: recall: %f precision: %f  f1: %f" % (mr, mp, mf)
        bp, br, bf = evaluation.evaluate_documents(train_docs_for_test,
                                                   evaluation.b_cubed)
        print "BCUBED: recall: %f precision: %f  f1: %f" % (br, bp, bf)
        cp, cr, cf = evaluation.evaluate_documents(train_docs_for_test,
                                                   evaluation.ceafe)
        print "CEAF: recall: %f precision: %f  f1: %f" % (cr, cp, cf)
        print

        ## dev
        dev_docs_for_test = []
        start_time = timeit.default_timer()
        #for dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain in DataGenerate.array_generater(dev_docs,"dev",w2v):
        #ev_doc = policy_network.generate_policy_test(dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain,network_model)
        for dev_cases, dev_doc_gold_chain in DataGenerate.case_generater(
                dev_docs, "dev", w2v):
            ev_doc = policy_network.generate_policy_test(
                dev_cases, dev_doc_gold_chain, network_model)
            dev_docs_for_test.append(ev_doc)
        print "DEV"
        mp, mr, mf = evaluation.evaluate_documents(dev_docs_for_test,
                                                   evaluation.muc)
        print "MUC: recall: %f precision: %f  f1: %f" % (mr, mp, mf)
        bp, br, bf = evaluation.evaluate_documents(dev_docs_for_test,
                                                   evaluation.b_cubed)
        print "BCUBED: recall: %f precision: %f  f1: %f" % (br, bp, bf)
        cp, cr, cf = evaluation.evaluate_documents(dev_docs_for_test,
                                                   evaluation.ceafe)
        print "CEAF: recall: %f precision: %f  f1: %f" % (cr, cp, cf)
        print

        end_time = timeit.default_timer()
        print >> sys.stderr, "DEV Use %.3f seconds" % (end_time - start_time)
        sys.stdout.flush()

        ## test
        test_docs_for_test = []
        start_time = timeit.default_timer()
        #for test_doc_mention_array,test_doc_pair_array,test_doc_gold_chain in DataGenerate.array_generater(test_docs,"test",w2v):
        for test_cases, test_doc_gold_chain in DataGenerate.case_generater(
                test_docs, "test", w2v):
            ev_doc = policy_network.generate_policy_test(
                test_cases, test_doc_gold_chain, network_model)
            test_docs_for_test.append(ev_doc)
        print "TEST"
        mp, mr, mf = evaluation.evaluate_documents(test_docs_for_test,
                                                   evaluation.muc)
        print "MUC: recall: %f precision: %f  f1: %f" % (mr, mp, mf)
        bp, br, bf = evaluation.evaluate_documents(test_docs_for_test,
                                                   evaluation.b_cubed)
        print "BCUBED: recall: %f precision: %f  f1: %f" % (br, bp, bf)
        cp, cr, cf = evaluation.evaluate_documents(test_docs_for_test,
                                                   evaluation.ceafe)
        print "CEAF: recall: %f precision: %f  f1: %f" % (cr, cp, cf)
        print

        end_time = timeit.default_timer()
        print >> sys.stderr, "TEST Use %.3f seconds" % (end_time - start_time)
        sys.stdout.flush()
Example No. 18
import numpy

import pyximport
pyximport.install(inplace=True,
                  setup_args={"include_dirs": numpy.get_include()})

# import test_sdot

import word2vec
import itertools

sentences = list(
    itertools.islice(
        word2vec.Text8Corpus('/Users/kofola/workspace/word2vec/text8'), 100))
model = word2vec.Word2Vec(sentences[:1], size=10, min_count=0)
print model.syn0
Example No. 19
def main():

    embedding_dir = args.embedding+args.language

    print >> sys.stderr,"Read Embedding from %s ..."%embedding_dir
    embedding_dimention = 50
    if args.language == "cn":
        embedding_dimention = 64
    w2v = word2vec.Word2Vec(embedding_dir,embedding_dimention)

    #network_model
    if os.path.isfile("./model/network_model."+args.language):
        read_f = file('./model/network_model.'+args.language, 'rb')
        #read_f = file('./model/network_model_pretrain.'+args.language, 'rb')
        network_model = cPickle.load(read_f)
        print >> sys.stderr,"Read model from ./model/network_model."+args.language
    else:
        inpt_dimention = 1738
        single_dimention = 855
        if args.language == "en":
            inpt_dimention = 1374
            single_dimention = 673

        network_model = network.NetWork(inpt_dimention,single_dimention,1000)
        print >> sys.stderr,"save model ..."
        save_f = file('./model/network_model.'+args.language, 'wb')
        cPickle.dump(network_model, save_f, protocol=cPickle.HIGHEST_PROTOCOL)
        save_f.close()

    train_docs = DataGenerate.doc_data_generater("train")
    dev_docs = DataGenerate.doc_data_generater("dev")
    test_docs = DataGenerate.doc_data_generater("test")

    most_time = 100
    most_time_test = 50
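    # caps: visit at most most_time training documents per pretrain epoch and
    # most_time_test documents in each quick evaluation below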

    #pretrain
    for echo in range(10):
        start_time = timeit.default_timer()
        print "Pretrain ECHO:",echo
        cost_this_turn = 0.0
        num = most_time
        #print >> sys.stderr, network_model.get_weight_sum()
        for train_doc_mention_array,train_doc_pair_array,train_doc_gold_chain in DataGenerate.array_generater(train_docs,"train",w2v):
            num -= 1
            if num <= 0:
                break
            for single_mention_array,train_list,lable_list in pretrain.generate_pretrain_case(train_doc_mention_array,train_doc_pair_array,train_doc_gold_chain,network_model):
                #print single_mention_array
                cost_this_turn += network_model.pre_train_step(single_mention_array,train_list,lable_list,0.0003)[0]

        for dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain in DataGenerate.array_generater(train_docs,"train",w2v):
            ev_doc = policy_network.generate_policy_test(dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain,network_model)
            break
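        # (the loop above runs the policy test on a single training document
        # per epoch as a smoke test; its dev_* names actually hold train data)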
 
        print network_model.get_weight_sum()
        end_time = timeit.default_timer()
        print >> sys.stderr, "PreTrain",echo,"Total cost:",cost_this_turn
        print >> sys.stderr, "PreTRAINING Use %.3f seconds"%(end_time-start_time)

    save_f = file('./model/network_model_pretrain.'+args.language, 'wb')
    cPickle.dump(network_model, save_f, protocol=cPickle.HIGHEST_PROTOCOL)
    save_f.close()
    print >> sys.stderr,"Begin test on DEV after pertraining"
    
    ## test performance on train after pretraining
    dev_docs_for_test = []
    num = most_time_test
    #for dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain in DataGenerate.array_generater(dev_docs,"dev",w2v):
    for dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain in DataGenerate.array_generater(train_docs,"train",w2v):
        num -= 1
        if num <= 0:
            break
        ev_doc = policy_network.generate_policy_test(dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain,network_model)
        dev_docs_for_test.append(ev_doc)
    print "Performance on TRAIN after PreTRAINING"
    mp,mr,mf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.muc)
    print "MUC: recall: %f precision: %f  f1: %f"%(mr,mp,mf)
    bp,br,bf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.b_cubed)
    print "BCUBED: recall: %f precision: %f  f1: %f"%(br,bp,bf)
    cp,cr,cf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.ceafe)
    print "CEAF: recall: %f precision: %f  f1: %f"%(cr,cp,cf)
    print "##################################################" 
    sys.stdout.flush()
    print >> sys.stderr,"Pre Train done"

    ## test performance on dev after pretraining
    dev_docs_for_test = []
    num = most_time_test
    for dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain in DataGenerate.array_generater(dev_docs,"dev",w2v):
        num -= 1
        if num <= 0:
            break
        ev_doc = policy_network.generate_policy_test(dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain,network_model)
        dev_docs_for_test.append(ev_doc)
    print "Performance on DEV after PreTRAINING"
    mp,mr,mf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.muc)
    print "MUC: recall: %f precision: %f  f1: %f"%(mr,mp,mf)
    bp,br,bf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.b_cubed)
    print "BCUBED: recall: %f precision: %f  f1: %f"%(br,bp,bf)
    cp,cr,cf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.ceafe)
    print "CEAF: recall: %f precision: %f  f1: %f"%(cr,cp,cf)
    print "##################################################" 
    sys.stdout.flush()
    print >> sys.stderr,"Pre Train done"
    return
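
    # NOTE: the return above makes the policy-gradient training loop below
    # unreachable dead code.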



    ##train
    train4test = [] # add 5 items for testing the training performance
    add2train = True

    for echo in range(20):
        start_time = timeit.default_timer()
        reward_baseline = []
        cost_this_turn = 0.0

        trick_num = 0
        for train_doc_mention_array,train_doc_pair_array,train_doc_gold_chain in DataGenerate.array_generater(train_docs,"train",w2v):
            
            #trick_num += 1
            #if trick_num < 80:
            #    continue
        
            if add2train:
                if random.randint(1,200) == 100:
                    train4test.append((train_doc_mention_array,train_doc_pair_array,train_doc_gold_chain))
                    if len(train4test) == 5:
                        add2train = False

            this_reward = 0.0

            #for train_batch, mask_batch, action_batch, reward_batch in policy_network.generate_policy_case(train_doc_mention_array,train_doc_pair_array,train_doc_gold_chain,network_model):
            for single, train, action, reward in policy_network.generate_policy_case(train_doc_mention_array,train_doc_pair_array,train_doc_gold_chain,network_model):
                #this_reward = reward_batch

                #reward_b = 0 if len(reward_baseline) < 1 else float(sum(reward_baseline))/float(len(reward_baseline))
                #norm_reward = numpy.array(reward_batch) - reward_b

                #cost_this_turn += network_model.train_step(train_batch,mask_batch,action_batch,norm_reward,0.0001)[0]
                cost_this_turn += network_model.train_step(single,train,action,reward,0.0001)[0]
        end_time = timeit.default_timer()
        print >> sys.stderr, "Total cost:",cost_this_turn
        print >> sys.stderr, "TRAINING Use %.3f seconds"%(end_time-start_time)
        
        #reward_baseline.append(this_reward)
        #if len(reward_baseline) >= 32:
        #    reward_baselin = reward_baseline[1:]

        ## test training performance
        train_docs_for_test = []
        start_time = timeit.default_timer()
        for train_doc_mention_array,train_doc_pair_array,train_doc_gold_chain in train4test:
            ev_doc = policy_network.generate_policy_test(train_doc_mention_array,train_doc_pair_array,train_doc_gold_chain,network_model)
            train_docs_for_test.append(ev_doc)
        print "** Echo: %d **"%echo
        print "TRAIN"
        mp,mr,mf = evaluation.evaluate_documents(train_docs_for_test,evaluation.muc)
        print "MUC: recall: %f precision: %f  f1: %f"%(mr,mp,mf)
        bp,br,bf = evaluation.evaluate_documents(train_docs_for_test,evaluation.b_cubed)
        print "BCUBED: recall: %f precision: %f  f1: %f"%(br,bp,bf)
        cp,cr,cf = evaluation.evaluate_documents(train_docs_for_test,evaluation.ceafe)
        print "CEAF: recall: %f precision: %f  f1: %f"%(cr,cp,cf)
        print

        ## dev
        dev_docs_for_test = []
        start_time = timeit.default_timer()
        for dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain in DataGenerate.array_generater(dev_docs,"dev",w2v):
            ev_doc = policy_network.generate_policy_test(dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain,network_model)
            dev_docs_for_test.append(ev_doc)
        print "DEV"
        mp,mr,mf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.muc)
        print "MUC: recall: %f precision: %f  f1: %f"%(mr,mp,mf)
        bp,br,bf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.b_cubed)
        print "BCUBED: recall: %f precision: %f  f1: %f"%(br,bp,bf)
        cp,cr,cf = evaluation.evaluate_documents(dev_docs_for_test,evaluation.ceafe)
        print "CEAF: recall: %f precision: %f  f1: %f"%(cr,cp,cf)
        print 

        end_time = timeit.default_timer()
        print >> sys.stderr, "DEV Use %.3f seconds"%(end_time-start_time)
        sys.stdout.flush()
    
        ## test
        test_docs_for_test = []
        start_time = timeit.default_timer()
        for test_doc_mention_array,test_doc_pair_array,test_doc_gold_chain in DataGenerate.array_generater(test_docs,"test",w2v):
            ev_doc = policy_network.generate_policy_test(test_doc_mention_array,test_doc_pair_array,test_doc_gold_chain,network_model)
            test_docs_for_test.append(ev_doc)
        print "TEST"
        mp,mr,mf = evaluation.evaluate_documents(test_docs_for_test,evaluation.muc)
        print "MUC: recall: %f precision: %f  f1: %f"%(mr,mp,mf)
        bp,br,bf = evaluation.evaluate_documents(test_docs_for_test,evaluation.b_cubed)
        print "BCUBED: recall: %f precision: %f  f1: %f"%(br,bp,bf)
        cp,cr,cf = evaluation.evaluate_documents(test_docs_for_test,evaluation.ceafe)
        print "CEAF: recall: %f precision: %f  f1: %f"%(cr,cp,cf)
        print 

        end_time = timeit.default_timer()
        print >> sys.stderr, "TEST Use %.3f seconds"%(end_time-start_time)
        sys.stdout.flush()

        save_f = file('./model/nets/network_model.%s.%d'%(args.language,echo), 'wb')
        cPickle.dump(network_model, save_f, protocol=cPickle.HIGHEST_PROTOCOL)
        save_f.close()
Example No. 20
def main():
    args = parse_args()
    config = configparser.ConfigParser()
    """ARGS DETAIL"""
    config_file = args.config_file
    batch_size = args.batch
    n_epoch = args.epoch
    pretrain_epoch = args.pretrain_epoch
    gpu_id = args.gpu
    model_type = args.model
    vocab_type = args.vocab
    pretrain_w2v = args.pretrain_w2v
    data_path = args.data_path
    """DIR PREPARE"""
    config.read(config_file)
    vocab_size = int(config['Parameter']['vocab_size'])
    coefficient = float(config['Parameter']['coefficient'])

    vocab_name = vocab_type
    if pretrain_w2v:
        vocab_name = 'p' + vocab_name

    if model_type == 'multi':
        model_dir = './super_{}_{}{}_{}_c{}/'.format(model_type, vocab_name,
                                                     vocab_size, data_path[0],
                                                     coefficient)
    else:
        model_dir = './super_{}_{}{}_{}/'.format(model_type, vocab_name,
                                                 vocab_size, data_path[0])

    if not os.path.exists(model_dir):
        os.mkdir(model_dir)
        shutil.copyfile(config_file, model_dir + config_file)
    config_file = model_dir + config_file
    config.read(config_file)
    """PARAMATER"""
    embed_size = int(config['Parameter']['embed_size'])
    hidden_size = int(config['Parameter']['hidden_size'])
    class_size = int(config['Parameter']['class_size'])
    dropout_ratio = float(config['Parameter']['dropout'])
    weight_decay = float(config['Parameter']['weight_decay'])
    gradclip = float(config['Parameter']['gradclip'])
    vocab_size = int(config['Parameter']['vocab_size'])
    coefficient = float(config['Parameter']['coefficient'])
    valid_num = int(config['Parameter']['valid_num'])
    """LOGGER"""
    log_file = model_dir + 'log.txt'
    logger = dataset.prepare_logger(log_file)

    logger.info(args)  # log the command-line arguments
    logger.info('[Training start] logging to {}'.format(log_file))
    """DATASET"""
    train_src_file = config[data_path]['train_src_file']
    train_trg_file = config[data_path]['train_trg_file']
    valid_src_file = config[data_path]['valid_src_file']
    valid_trg_file = config[data_path]['valid_trg_file']
    test_src_file = config[data_path]['single_src_file']
    test_trg_file = config[data_path]['single_trg_file']
    src_w2v_file = config[data_path]['src_w2v_file']
    trg_w2v_file = config[data_path]['trg_w2v_file']

    correct_label, src_label, src_text, correct_index = dataset.load_binary_score_file(
        test_src_file)
    trg_text = dataset.load(test_trg_file)
    slice_size = len(correct_label) // valid_num
    correct_label, src_label, src_text, trg_text, correct_index = gridsearch.shuffle_list(
        correct_label, src_label, src_text, trg_text, correct_index)

    correct_label = gridsearch.slice_list(correct_label, slice_size)
    src_label = gridsearch.slice_list(src_label, slice_size)
    src_text = gridsearch.slice_list(src_text, slice_size)
    trg_text = gridsearch.slice_list(trg_text, slice_size)
    correct_index = gridsearch.slice_list(correct_index, slice_size)
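    # shuffle once, then slice every parallel list into valid_num folds of
    # slice_size items each for cross-validation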

    evaluater = evaluate.Evaluate()

    cross_valid_result = []
    for ite in range(1, valid_num + 1):
        model_valid_dir = model_dir + 'valid{}/'.format(ite)
        if not os.path.exists(model_valid_dir):
            os.mkdir(model_valid_dir)

        index = ite - 1
        c_label_train, c_label_dev, c_label_test = gridsearch.split_train_dev_test(
            correct_label, index)
        label_train, label_dev, label_test = gridsearch.split_train_dev_test(
            src_label, index)
        src_train, src_dev, src_test = gridsearch.split_train_dev_test(
            src_text, index)
        trg_train, trg_dev, trg_test = gridsearch.split_train_dev_test(
            trg_text, index)
        c_index_train, c_index_dev, c_index_test = gridsearch.split_train_dev_test(
            correct_index, index)
        """VOCABULARY"""
        src_vocab, trg_vocab, sos, eos = dataset.prepare_vocab(
            model_valid_dir, vocab_type, src_train, trg_train, vocab_size,
            gpu_id)
        src_vocab_size = len(src_vocab.vocab)
        trg_vocab_size = len(trg_vocab.vocab)

        src_initialW = None
        trg_initialW = None

        if pretrain_w2v:
            w2v = word2vec.Word2Vec()
            src_initialW, vector_size, src_match_word_count = w2v.make_initialW(
                src_vocab.vocab, src_w2v_file)
            trg_initialW, vector_size, trg_match_word_count = w2v.make_initialW(
                trg_vocab.vocab, trg_w2v_file)
            logger.info(
                'Initialize w2v embedding. Match: src {}/{}, trg {}/{}'.format(
                    src_match_word_count, src_vocab_size, trg_match_word_count,
                    trg_vocab_size))
        """ITERATOR"""
        train_iter = dataset.Iterator(src_train,
                                      label_train,
                                      trg_train,
                                      src_vocab,
                                      trg_vocab,
                                      batch_size,
                                      gpu_id,
                                      sort=True,
                                      shuffle=True)
        # train_iter = dataset.Iterator(src_train, label_train, trg_train, src_vocab, trg_vocab, batch_size, gpu_id, sort=False, shuffle=False)
        dev_iter = dataset.Iterator(src_dev,
                                    label_dev,
                                    trg_dev,
                                    src_vocab,
                                    trg_vocab,
                                    batch_size,
                                    gpu_id,
                                    sort=False,
                                    shuffle=False)
        test_iter = dataset.Iterator(src_test,
                                     label_test,
                                     trg_test,
                                     src_vocab,
                                     trg_vocab,
                                     batch_size,
                                     gpu_id,
                                     sort=False,
                                     shuffle=False)

        logger.info(
            'V{} ## train:{}, dev:{}, test:{}, src_vocab:{}, trg_vocab:{}'.
            format(ite, len(label_train), len(label_dev), len(label_test),
                   src_vocab_size, trg_vocab_size))
        """MODEL"""
        if model_type == 'multi':
            model = model.Multi(src_vocab_size, trg_vocab_size, embed_size,
                                hidden_size, class_size, dropout_ratio,
                                coefficient, src_initialW, trg_initialW)
        elif model_type in ['label', 'pretrain']:
            model = model.Label(src_vocab_size, trg_vocab_size, embed_size,
                                hidden_size, class_size, dropout_ratio,
                                src_initialW, trg_initialW)
        else:
            model = model.EncoderDecoder(src_vocab_size, trg_vocab_size,
                                         embed_size, hidden_size,
                                         dropout_ratio, src_initialW,
                                         trg_initialW)
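        # NOTE: rebinding the name `model` (the imported module) to an instance
        # shadows the module, so model.Multi / model.Label would fail on the
        # next cross-validation fold.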
        """OPTIMIZER"""
        optimizer = chainer.optimizers.Adam()
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.GradientClipping(gradclip))
        optimizer.add_hook(chainer.optimizer.WeightDecay(weight_decay))
        """GPU"""
        if gpu_id >= 0:
            chainer.cuda.get_device_from_id(gpu_id).use()
            model.to_gpu()
        """PRETRAIN"""
        if model_type == 'pretrain':
            logger.info('Pre-train start')
            logger.info('train size: {}, valid size: {}'.format(
                len(label_train), len(label_dev)))
            pretrain_loss_dic = {}
            for epoch in range(1, pretrain_epoch + 1):
                train_loss = 0
                for i, batch in enumerate(train_iter.generate(), start=1):
                    try:
                        loss = model.pretrain(*batch)
                        train_loss += loss.data
                        optimizer.target.cleargrads()
                        loss.backward()
                        optimizer.update()

                    except Exception as e:
                        logger.info('V{} ## P{} ## train iter: {}, {}'.format(
                            ite, epoch, i, e))

                chainer.serializers.save_npz(
                    model_valid_dir + 'p_model_epoch_{}.npz'.format(epoch),
                    model)
                """EVALUATE"""
                valid_loss = 0
                for batch in dev_iter.generate():
                    with chainer.no_backprop_mode(), chainer.using_config(
                            'train', False):
                        valid_loss += model.pretrain(*batch).data
                logger.info('V{} ## P{} ## train loss: {}, val loss:{}'.format(
                    ite, epoch, train_loss, valid_loss))
                pretrain_loss_dic[epoch] = valid_loss
            """MODEL SAVE"""
            best_epoch = min(pretrain_loss_dic,
                             key=(lambda x: pretrain_loss_dic[x]))
            logger.info('best_epoch:{}, val loss: {}'.format(
                best_epoch, pretrain_loss_dic[best_epoch]))
            shutil.copyfile(
                model_valid_dir + 'p_model_epoch_{}.npz'.format(best_epoch),
                model_valid_dir + 'p_best_model.npz')
            logger.info('Pre-train finish')
        """TRAIN"""
        accuracy_dic = {}
        for epoch in range(1, n_epoch + 1):
            train_loss = 0
            for i, batch in enumerate(train_iter.generate(), start=1):
                try:
                    loss = optimizer.target(*batch)
                    train_loss += loss.data
                    optimizer.target.cleargrads()
                    loss.backward()
                    optimizer.update()

                except Exception as e:
                    logger.info('V{} ## E{} ## train iter: {}, {}'.format(
                        ite, epoch, i, e))
            chainer.serializers.save_npz(
                model_valid_dir + 'model_epoch_{}.npz'.format(epoch), model)
            """DEV"""
            outputs = []
            labels = []
            alignments = []
            for i, batch in enumerate(dev_iter.generate(), start=1):
                try:
                    with chainer.no_backprop_mode(), chainer.using_config(
                            'train', False):
                        output, label, align = model.predict(
                            batch[0], sos, eos)
                except Exception as e:
                    logger.info('V{} ## E{} ## dev iter: {}, {}'.format(
                        ite, epoch, i, e))

                if model_type == 'multi':
                    for o, l, a in zip(output, label, align):
                        outputs.append(
                            trg_vocab.id2word(chainer.cuda.to_cpu(o)))
                        labels.append(chainer.cuda.to_cpu(l))
                        alignments.append(chainer.cuda.to_cpu(a))
                elif model_type in ['label', 'pretrain']:
                    for l in label:
                        labels.append(chainer.cuda.to_cpu(l))
                else:
                    for o, a in zip(output, align):
                        outputs.append(
                            trg_vocab.id2word(chainer.cuda.to_cpu(o)))
                        alignments.append(chainer.cuda.to_cpu(a))

            if model_type == 'encdec':
                best_param_dic = evaluater.param_search(
                    alignments, [], c_label_dev)
            else:
                best_param_dic = evaluater.param_search(
                    labels, alignments, c_label_dev)
            param = max(best_param_dic, key=lambda x: best_param_dic[x])
            init, mix = evaluate.key_to_param(param)
            dev_score = round(best_param_dic[param], 3)
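            # the decoding parameters (init, mix) chosen on dev are reused
            # unchanged for the test evaluation below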
            """TEST"""
            outputs = []
            labels = []
            alignments = []
            for i, batch in enumerate(test_iter.generate(), start=1):
                try:
                    with chainer.no_backprop_mode(), chainer.using_config(
                            'train', False):
                        output, label, align = model.predict(
                            batch[0], sos, eos)
                except Exception as e:
                    logger.info('V{} ## E{} ## test iter: {}, {}'.format(
                        ite, epoch, i, e))
                if model_type == 'multi':
                    for o, l, a in zip(output, label, align):
                        outputs.append(
                            trg_vocab.id2word(chainer.cuda.to_cpu(o)))
                        labels.append(chainer.cuda.to_cpu(l))
                        alignments.append(chainer.cuda.to_cpu(a))
                elif model_type in ['label', 'pretrain']:
                    for l in label:
                        labels.append(chainer.cuda.to_cpu(l))
                else:
                    for o, a in zip(output, align):
                        outputs.append(
                            trg_vocab.id2word(chainer.cuda.to_cpu(o)))
                        alignments.append(chainer.cuda.to_cpu(a))

            if model_type in ['multi', 'label', 'pretrain']:
                s_rate, s_count, _, _, s_result = evaluater.eval_param(
                    labels, alignments, c_label_test, c_index_test, init, mix)
            else:
                s_rate, s_count, _, _, s_result = evaluater.eval_param(
                    alignments, [], c_label_test, c_index_test, init, mix)
            test_score = round(s_rate[-1], 3)
            logger.info('V{} ## E{} ## loss:{}, dev: {}, test: {}'.format(
                ite, epoch, train_loss, dev_score, test_score))

            dataset.save_output(model_valid_dir, epoch, labels, alignments,
                                outputs, s_result)
            accuracy_dic[epoch] = [
                epoch, dev_score, test_score, param, s_rate, s_result
            ]
        """MODEL SAVE"""
        best_epoch = max(accuracy_dic, key=(lambda x: accuracy_dic[x][1]))
        cross_valid_result.append(accuracy_dic[best_epoch])
        logger.info('V{} ## best_epoch:{}, dev:{}, test:{}'.format(
            ite, best_epoch, accuracy_dic[best_epoch][1],
            accuracy_dic[best_epoch][2]))
        shutil.copyfile(
            model_valid_dir + 'model_epoch_{}.npz'.format(best_epoch),
            model_valid_dir + 'best_model.npz')

        logger.info('')

    average_dev_score = 0
    average_test_score = [0 for _ in range(len(cross_valid_result[0][4]))]
    s_result_total = []
    for i, r in enumerate(cross_valid_result, start=1):
        epoch = r[0]
        dev_score = r[1]
        param = r[3]
        test_score_list = [round(rr, 3) for rr in r[4]]
        s_result = r[5]

        average_dev_score += dev_score
        average_test_score = [
            average_test_score[i] + test_score_list[i]
            for i in range(len(average_test_score))
        ]
        logger.info('   {}: epoch{}, {}\t{}'.format(
            i, epoch, param, ' '.join(dataset.float_to_str(test_score_list))))
        s_result_total.extend(s_result)
    average_dev_score = round(average_dev_score / len(cross_valid_result), 3)
    average_test_score = [
        round(average_test_score[i] / len(cross_valid_result), 3)
        for i in range(len(average_test_score))
    ]
    logger.info('dev: {}, test: {}'.format(
        average_dev_score, ' '.join(dataset.float_to_str(average_test_score))))

    with open(model_dir + 's_res.txt', 'w') as f:
        for l in sorted(s_result_total, key=lambda x: x[0]):
            f.write('{}\n'.format(l[1]))
Example No. 21
    def parseFile(filepath):
        output = []

        fp = open(filepath, 'r')
        lines = fp.readlines()

        #discard headers
        lines = lines[1:]
        #Engagements/Followers at Posting/Created/Type/Description

        data = []
        expected = []

        captions = []
        joined = True
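        # rows whose caption contains embedded newlines split into fewer than
        # 5 fields; the loop below merges each such fragment back into the
        # previous line and rescans until every row parses cleanly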
        while joined:
            joined = False
            for li in range(0, len(lines)):
                splitline = lines[li].split(',', 4)
                if len(splitline) < 5:
                    lines[li - 1:li + 1] = [''.join(lines[li - 1:li + 1])]
                    joined = True
                    break

        for line in lines:
            splitline = line.split(',', 4)
            caption = splitline[4]
            captions = captions + caption.split()

        w2v = word2vec.Word2Vec(captions, True)

        for line in lines:
            splitline = line.split(',', 4)
            if len(splitline) == 5:
                parsedline = []

                expected.append(int(splitline[0]))
                #engagements - add this to training input

                parsedline.append(int(splitline[1]))
                #followers

                parsedline = parsedline + dateparser.DateParser.getDateTime(
                    splitline[2])
                #created time

                parsedline = parsedline + posttypeparser.PostTypeParser.getPostType(
                    splitline[3])
                #post type

                parsedline = parsedline + w2v.getVector(splitline[4])
                #description

                data.append(parsedline)
                #add parsed line to data set
        fp.close()
        output = [data, expected]
        return output


#of = open("output.txt", 'w')
#o = FileParser.parseFile("data/business/training_set.csv")

#writestring = ""
#for entry in o[0]:
#    writestring += str(entry) + '\n'
#of.write(writestring)
Example No. 22
    else:
        return numpy.array([0.5] * (n - 4) + p_list)


if args.type == "nn_train":

    if os.path.isfile("./model/save_data"):
        print >> sys.stderr, "Read from file ./model/save_data"
        read_f = file('./model/save_data', 'rb')
        training_instances = cPickle.load(read_f)
        anaphorics_result = cPickle.load(read_f)
        test_instances = cPickle.load(read_f)
        read_f.close()
    else:
        print >> sys.stderr, "Read W2V"
        w2v = word2vec.Word2Vec(args.embedding)

        ### Training ####
        path = args.data
        training_instances = generate_instance.generate_training_instances(
            path, w2v)

        ####  Test process  ####

        path = args.test_data
        test_instances, anaphorics_result = generate_instance.generate_test_instances(
            path, w2v)

        w2v = None  # release the memory held by the embeddings
        print >> sys.stderr, "Save file ./model/save_data"
Example No. 23
def main():

    embedding_dir = args.embedding + args.language

    print >> sys.stderr, "Read Embedding from %s ..." % embedding_dir
    embedding_dimention = 50
    if args.language == "cn":
        embedding_dimention = 64
    w2v = word2vec.Word2Vec(embedding_dir, embedding_dimention)

    #network_model_manager
    if os.path.isfile("./model/network_model_manager." + args.language):
        #read_f = file('./model/network_model_manager.'+args.language, 'rb')
        read_f = file(
            './model/network_model_pretrain_manager.' + args.language, 'rb')
        network_manager = cPickle.load(read_f)
        print >> sys.stderr, "Read model from ./model/network_model_manager." + args.language
    else:
        inpt_dimention = 1738
        single_dimention = 855
        cluster_dimention = 855
        if args.language == "en":
            inpt_dimention = 1374
            single_dimention = 673
            cluster_dimention = 855

        network_manager = network.Manager(inpt_dimention, single_dimention,
                                          1000)
        print >> sys.stderr, "save model network_manager..."
        save_f = file('./model/network_model_manager.' + args.language, 'wb')
        cPickle.dump(network_manager,
                     save_f,
                     protocol=cPickle.HIGHEST_PROTOCOL)
        save_f.close()

    #network_model_worker
    if os.path.isfile("./model/network_model_worker." + args.language):
        read_f = file('./model/network_model_worker.' + args.language, 'rb')
        #read_f = file('./model/network_model_pretrain_worker.'+args.language, 'rb')
        network_worker = cPickle.load(read_f)
        print >> sys.stderr, "Read model from ./model/network_model_worker." + args.language
    else:
        inpt_dimention = 1738
        single_dimention = 855
        cluster_dimention = 855
        if args.language == "en":
            inpt_dimention = 1374
            single_dimention = 673
            cluster_dimention = 855

        network_worker = network.Worker(inpt_dimention, single_dimention,
                                        cluster_dimention, 1000)
        print >> sys.stderr, "save model network_worker..."
        save_f = file('./model/network_model_worker.' + args.language, 'wb')
        cPickle.dump(network_worker, save_f, protocol=cPickle.HIGHEST_PROTOCOL)
        save_f.close()

    train_docs = DataGenerate.doc_data_generater("train")
    dev_docs = DataGenerate.doc_data_generater("dev")
    test_docs = DataGenerate.doc_data_generater("test")

    #pretrain_manager
    times = 0
    best_cost = 99999999
    step = 0
    lr = 0.00009
    for echo in range(10):
        start_time = timeit.default_timer()
        print "Pretrain ECHO:", echo
        cost_this_turn = 0.0
        #print >> sys.stderr, network_model.get_weight_sum()
        for cases, gold_chain in DataGenerate.case_generater(
                train_docs, "train", w2v):
            if len(cases) >= 700:
                continue
            for single_mention_array, train_list, lable_list in pretrain.generate_pretrain_case(
                    cases, gold_chain):
                #cost_this_turn += network_manager.pre_train_step(single_mention_array,train_list,lable_list,0.0001)[0]
                cost_this_turn += network_manager.pre_train_step(
                    single_mention_array, train_list, lable_list, lr)[0]
            step += 1
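            # step decay: shrink the learning rate by 1% after every 128
            # training documents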
            if step % 128 == 0:
                lr = lr * 0.99

        end_time = timeit.default_timer()
        print >> sys.stderr, "PreTrain for Manager", echo, "Total cost:", cost_this_turn
        print >> sys.stderr, "PreTraining for Manager Use %.3f seconds" % (
            end_time - start_time)
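        # keep a separate "best" checkpoint whenever this epoch's total cost
        # improves on the best seen so far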

        if cost_this_turn <= best_cost:
            save_f = file(
                './model/network_model_pretrain_manager_best.' + args.language,
                'wb')
            cPickle.dump(network_manager,
                         save_f,
                         protocol=cPickle.HIGHEST_PROTOCOL)
            save_f.close()
            best_cost = cost_this_turn

        save_f = file(
            './model/network_model_pretrain_manager.' + args.language, 'wb')
        cPickle.dump(network_manager,
                     save_f,
                     protocol=cPickle.HIGHEST_PROTOCOL)
        save_f.close()

    ## test performance after pretraining
    print >> sys.stderr, "Begin test on DEV after Manager pertraining"
    dev_docs_for_test = []
    num = 0
    for cases, gold_chain in DataGenerate.case_generater(dev_docs, "dev", w2v):
        ev_doc = pretrain.generate_pretrain_test(cases, gold_chain,
                                                 network_manager)
        dev_docs_for_test.append(ev_doc)
    print "Performance on DEV after Manager PreTRAINING"
    mp, mr, mf = evaluation.evaluate_documents(dev_docs_for_test,
                                               evaluation.muc)
    print "MUC: recall: %f precision: %f  f1: %f" % (mr, mp, mf)
    bp, br, bf = evaluation.evaluate_documents(dev_docs_for_test,
                                               evaluation.b_cubed)
    print "BCUBED: recall: %f precision: %f  f1: %f" % (br, bp, bf)
    cp, cr, cf = evaluation.evaluate_documents(dev_docs_for_test,
                                               evaluation.ceafe)
    print "CEAF: recall: %f precision: %f  f1: %f" % (cr, cp, cf)
    print "##################################################"
    sys.stdout.flush()
    print >> sys.stderr, "Manager Pre Train done"

    # NOTE: this early return stops the run after manager pretraining; the
    # worker pretraining and policy training below are unreachable until it
    # is removed.
    return

    #pretrain_worker
    # NOTE: the original block below duplicated the manager pretraining code
    # (it trained and saved the manager under the worker heading and passed
    # the undefined name `network_model`); the worker network and worker file
    # names used here are the assumed intent.
    times = 0
    best_cost = 99999999
    for echo in range(20):
        start_time = timeit.default_timer()
        print "Pretrain EPOCH:", echo
        cost_this_turn = 0.0
        for cases, gold_chain in DataGenerate.case_generater(
                train_docs, "train", w2v):
            # Skip very large documents (700+ cases) to bound memory and time.
            if len(cases) >= 700:
                continue
            for single_mention_array, train_list, label_list in pretrain.generate_pretrain_case(
                    cases, gold_chain, network_worker):
                cost_this_turn += network_worker.pre_train_step(
                    single_mention_array, train_list, label_list, 0.0001)[0]

        end_time = timeit.default_timer()
        print >> sys.stderr, "PreTrain4Manager", echo, "Total cost:", cost_this_turn
        print >> sys.stderr, "PreTraining4Manager Use %.3f seconds" % (
            end_time - start_time)

        if cost_this_turn <= best_cost:
            save_f = open(
                './model/network_model_pretrain_worker_best.' + args.language,
                'wb')
            cPickle.dump(network_worker,
                         save_f,
                         protocol=cPickle.HIGHEST_PROTOCOL)
            save_f.close()
            best_cost = cost_this_turn

        save_f = open(
            './model/network_model_pretrain_worker.' + args.language, 'wb')
        cPickle.dump(network_worker,
                     save_f,
                     protocol=cPickle.HIGHEST_PROTOCOL)
        save_f.close()

    ## test performance after pretraining
    print >> sys.stderr, "Begin test on DEV after Worker pretraining"
    dev_docs_for_test = []
    for cases, gold_chain in DataGenerate.case_generater(dev_docs, "dev", w2v):
        ev_doc = policy_network.generate_policy_test(cases, gold_chain,
                                                     network_worker)
        dev_docs_for_test.append(ev_doc)
    print "Performance on DEV after Worker Pretraining"
    mp, mr, mf = evaluation.evaluate_documents(dev_docs_for_test,
                                               evaluation.muc)
    print "MUC: recall: %f precision: %f  f1: %f" % (mr, mp, mf)
    bp, br, bf = evaluation.evaluate_documents(dev_docs_for_test,
                                               evaluation.b_cubed)
    print "BCUBED: recall: %f precision: %f  f1: %f" % (br, bp, bf)
    cp, cr, cf = evaluation.evaluate_documents(dev_docs_for_test,
                                               evaluation.ceafe)
    print "CEAF: recall: %f precision: %f  f1: %f" % (cr, cp, cf)
    print "##################################################"
    sys.stdout.flush()
    print >> sys.stderr, "Manager Pre Train done"

    ##train
    # NOTE: the policy-training code below references the undefined name
    # `network_model` (likely a leftover from an earlier single-network
    # version of this script); it is kept as-is because the intended network
    # is unclear.
    train4test = []  # hold out 5 training documents to monitor training performance
    add2train = True

    for echo in range(20):
        start_time = timeit.default_timer()
        reward_baseline = []
        cost_this_turn = 0.0

        #for train_doc_mention_array,train_doc_pair_array,train_doc_gold_chain in DataGenerate.array_generater(train_docs,"train",w2v):
        for cases, gold_chain in DataGenerate.case_generater(
                train_docs, "train", w2v):

            if add2train:
                # Each document has a 1/200 chance of being sampled until 5
                # monitoring documents have been collected.
                if random.randint(1, 200) == 10:
                    train4test.append((cases, gold_chain))
                    if len(train4test) == 5:
                        add2train = False

            this_reward = 0.0

            #for single, train, action, reward in policy_network.generate_policy_case(train_doc_mention_array,train_doc_pair_array,train_doc_gold_chain,network_model):
            for single, train, action, reward in policy_network.generate_policy_case(
                    cases, gold_chain, network_model):
                #reward_b = 0 if len(reward_baseline) < 1 else float(sum(reward_baseline))/float(len(reward_baseline))
                #norm_reward = numpy.array(reward_batch) - reward_b

                cost_this_turn += network_model.train_step(
                    single, train, action, reward, 0.0001)[0]
        end_time = timeit.default_timer()
        print >> sys.stderr, "Total cost:", cost_this_turn
        print >> sys.stderr, "TRAINING Use %.3f seconds" % (end_time -
                                                            start_time)

        #reward_baseline.append(this_reward)
        #if len(reward_baseline) >= 32:
        #    reward_baseline = reward_baseline[1:]

        ## test training performance
        train_docs_for_test = []
        start_time = timeit.default_timer()

        for train_cases, train_doc_gold_chain in train4test:
            ev_doc = policy_network.generate_policy_test(
                train_cases, train_doc_gold_chain, network_model)
            train_docs_for_test.append(ev_doc)
        print "** Echo: %d **" % echo
        print "TRAIN"
        mp, mr, mf = evaluation.evaluate_documents(train_docs_for_test,
                                                   evaluation.muc)
        print "MUC: recall: %f precision: %f  f1: %f" % (mr, mp, mf)
        bp, br, bf = evaluation.evaluate_documents(train_docs_for_test,
                                                   evaluation.b_cubed)
        print "BCUBED: recall: %f precision: %f  f1: %f" % (br, bp, bf)
        cp, cr, cf = evaluation.evaluate_documents(train_docs_for_test,
                                                   evaluation.ceafe)
        print "CEAF: recall: %f precision: %f  f1: %f" % (cr, cp, cf)
        print

        ## dev
        dev_docs_for_test = []
        start_time = timeit.default_timer()
        #for dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain in DataGenerate.array_generater(dev_docs,"dev",w2v):
        #ev_doc = policy_network.generate_policy_test(dev_doc_mention_array,dev_doc_pair_array,dev_doc_gold_chain,network_model)
        for dev_cases, dev_doc_gold_chain in DataGenerate.case_generater(
                dev_docs, "dev", w2v):
            ev_doc = policy_network.generate_policy_test(
                dev_cases, dev_doc_gold_chain, network_model)
            dev_docs_for_test.append(ev_doc)
        print "DEV"
        mp, mr, mf = evaluation.evaluate_documents(dev_docs_for_test,
                                                   evaluation.muc)
        print "MUC: recall: %f precision: %f  f1: %f" % (mr, mp, mf)
        bp, br, bf = evaluation.evaluate_documents(dev_docs_for_test,
                                                   evaluation.b_cubed)
        print "BCUBED: recall: %f precision: %f  f1: %f" % (br, bp, bf)
        cp, cr, cf = evaluation.evaluate_documents(dev_docs_for_test,
                                                   evaluation.ceafe)
        print "CEAF: recall: %f precision: %f  f1: %f" % (cr, cp, cf)
        print

        end_time = timeit.default_timer()
        print >> sys.stderr, "DEV Use %.3f seconds" % (end_time - start_time)
        sys.stdout.flush()

        ## test
        test_docs_for_test = []
        start_time = timeit.default_timer()
        #for test_doc_mention_array,test_doc_pair_array,test_doc_gold_chain in DataGenerate.array_generater(test_docs,"test",w2v):
        for test_cases, test_doc_gold_chain in DataGenerate.case_generater(
                test_docs, "test", w2v):
            ev_doc = policy_network.generate_policy_test(
                test_cases, test_doc_gold_chain, network_model)
            test_docs_for_test.append(ev_doc)
        print "TEST"
        mp, mr, mf = evaluation.evaluate_documents(test_docs_for_test,
                                                   evaluation.muc)
        print "MUC: recall: %f precision: %f  f1: %f" % (mr, mp, mf)
        bp, br, bf = evaluation.evaluate_documents(test_docs_for_test,
                                                   evaluation.b_cubed)
        print "BCUBED: recall: %f precision: %f  f1: %f" % (br, bp, bf)
        cp, cr, cf = evaluation.evaluate_documents(test_docs_for_test,
                                                   evaluation.ceafe)
        print "CEAF: recall: %f precision: %f  f1: %f" % (cr, cp, cf)
        print

        end_time = timeit.default_timer()
        print >> sys.stderr, "TEST Use %.3f seconds" % (end_time - start_time)
        sys.stdout.flush()

        save_f = open(
            './model/nets/network_model.%s.%d' % (args.language, echo), 'wb')
        cPickle.dump(network_model, save_f, protocol=cPickle.HIGHEST_PROTOCOL)
        save_f.close()
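
        # Editor's note: a per-epoch checkpoint saved above can be restored the
        # same way the manager/worker models are loaded near the top of this
        # file, e.g. (a sketch; the epoch number 5 is illustrative):
        #   read_f = open('./model/nets/network_model.%s.%d' % (args.language, 5), 'rb')
        #   network_model = cPickle.load(read_f)
        #   read_f.close()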