Esempio n. 1
0
def __pre_train_nrdj(word_vecs_file,
                     tok_texts_file,
                     aspect_terms_file,
                     opinion_terms_file,
                     dst_model_file,
                     task,
                     load_model_file=None):
    init_logging('log/nrdj-pre-{}.log'.format(str_today),
                 mode='a',
                 to_stdout=True)

    # n_train = 1000
    n_train = -1
    label_opinions = True
    # label_opinions = False
    n_tags = 5 if label_opinions else 3
    # n_tags = 5 if task == 'train' else 3
    batch_size = 20
    lr = 0.001
    share_lstm = False

    print('loading data ...')
    with open(word_vecs_file, 'rb') as f:
        vocab, word_vecs_matrix = pickle.load(f)

    # train_data_src1, valid_data_src1 = __get_data_amazon(vocab, config.AMAZON_TERMS_TRUE1_FILE)
    # train_data_src1, valid_data_src1 = __get_data_amazon(vocab, config.AMAZON_TERMS_TRUE3_FILE)
    train_data_src1, valid_data_src1 = datautils.get_data_amazon(
        vocab, aspect_terms_file, tok_texts_file, 'aspect')
    train_data_src2, valid_data_src2 = datautils.get_data_amazon(
        vocab, opinion_terms_file, tok_texts_file, 'opinion')
    print('done')

    nrdj = NeuRuleDoubleJoint(n_tags,
                              word_vecs_matrix,
                              share_lstm,
                              hidden_size_lstm=hidden_size_lstm,
                              model_file=load_model_file,
                              batch_size=batch_size)

    nrj_train_data_src1 = nrj_train_data_src2 = None
    # if train_mode != 'target-only':
    # nrj_train_data_src1 = NeuRuleDoubleJoint.TrainData(
    #     train_data_src1.word_idxs_list, train_data_src1.labels_list, valid_data_src1.word_idxs_list,
    #     valid_data_src1.labels_list, valid_data_src1.tok_texts, valid_data_src1.aspects_true_list, None
    # )
    # nrj_train_data_src2 = NeuRuleDoubleJoint.TrainData(
    #     train_data_src2.word_idxs_list, train_data_src2.labels_list, valid_data_src2.word_idxs_list,
    #     valid_data_src2.labels_list, valid_data_src2.tok_texts, None,
    #     valid_data_src2.opinions_true_list
    # )

    nrdj.pre_train(train_data_src1,
                   valid_data_src1,
                   train_data_src2,
                   valid_data_src2,
                   vocab,
                   n_epochs=30,
                   lr=lr,
                   save_file=dst_model_file)
Esempio n. 2
0
def __train_nrdj(word_vecs_file, train_tok_texts_file, train_sents_file,
                 train_valid_split_file, test_tok_texts_file, test_sents_file,
                 load_model_file, task):
    init_logging('log/{}-train-{}-{}.log'.format(
        os.path.splitext(os.path.basename(__file__))[0],
        utils.get_machine_name(), str_today),
                 mode='a',
                 to_stdout=True)

    dst_aspects_file = '/home/hldai/data/aspect/semeval14/nrdj-aspects.txt'
    dst_opinions_file = '/home/hldai/data/aspect/semeval14/nrdj-opinions.txt'
    # dst_aspects_file, dst_opinions_file = None, None

    # n_train = 1000
    n_train = -1
    label_opinions = True
    # label_opinions = False
    n_tags = 5 if label_opinions else 3
    # n_tags = 5 if task == 'train' else 3
    batch_size = 20
    lr = 0.001
    share_lstm = False

    logging.info(word_vecs_file)
    logging.info('load model {}'.format(load_model_file))
    logging.info(test_sents_file)

    print('loading data ...')
    vocab, word_vecs_matrix = __load_word_vecs(word_vecs_file)

    logging.info('word vec dim: {}, n_words={}'.format(
        word_vecs_matrix.shape[1], word_vecs_matrix.shape[0]))
    train_data, valid_data, test_data = datautils.get_data_semeval(
        train_sents_file, train_tok_texts_file, train_valid_split_file,
        test_sents_file, test_tok_texts_file, vocab, n_train, label_opinions)
    print('done')

    nrdj = NeuRuleDoubleJoint(n_tags,
                              word_vecs_matrix,
                              share_lstm,
                              hidden_size_lstm=hidden_size_lstm,
                              model_file=load_model_file,
                              batch_size=batch_size)
    nrdj.train(train_data,
               valid_data,
               test_data,
               vocab,
               n_epochs=n_epochs,
               lr=lr,
               dst_aspects_file=dst_aspects_file,
               dst_opinions_file=dst_opinions_file)
Esempio n. 3
0
def __train_nrdj(word_vecs_file, train_tok_texts_file, train_sents_file,
                 train_valid_split_file, test_tok_texts_file, test_sents_file):
    init_logging('log/nrdj-jtrain-{}.log'.format(str_today),
                 mode='a',
                 to_stdout=True)

    # n_train = 1000
    n_train = -1
    label_opinions = True
    # label_opinions = False
    n_tags = 5 if label_opinions else 3
    # n_tags = 5 if task == 'train' else 3
    batch_size = 10
    lr = 0.001
    share_lstm = False

    print('loading data ...')
    with open(word_vecs_file, 'rb') as f:
        vocab, word_vecs_matrix = pickle.load(f)
    logging.info('word vec dim: {}, n_words={}'.format(
        word_vecs_matrix.shape[1], word_vecs_matrix.shape[0]))

    train_data_src1, valid_data_src1 = datautils.get_data_amazon(
        vocab, pre_aspect_terms_file, pre_tok_texts_file, 'aspect')
    train_data_src2, valid_data_src2 = datautils.get_data_amazon(
        vocab, pre_opinion_terms_file, pre_tok_texts_file, 'opinion')

    train_data, valid_data, test_data = datautils.get_data_semeval(
        train_sents_file, train_tok_texts_file, train_valid_split_file,
        test_sents_file, test_tok_texts_file, vocab, n_train, label_opinions)
    print('done')

    nrdj = NeuRuleDoubleJoint(n_tags,
                              word_vecs_matrix,
                              share_lstm,
                              hidden_size_lstm=hidden_size_lstm,
                              model_file=None,
                              batch_size=batch_size)

    nrdj.join_train(train_data_src1,
                    valid_data_src1,
                    train_data_src2,
                    valid_data_src2,
                    train_data,
                    valid_data,
                    test_data,
                    n_epochs=n_epochs,
                    lr=lr)
Esempio n. 4
0
def __pre_train_nrdj(word_vecs_file,
                     tok_texts_file,
                     aspect_terms_file,
                     opinion_terms_file,
                     dst_model_file,
                     task,
                     lamb,
                     lstm_l2,
                     train_word_embeddings=False,
                     load_model_file=None):
    init_logging('log/{}-pre-{}-{}.log'.format(
        os.path.splitext(os.path.basename(__file__))[0],
        utils.get_machine_name(), str_today),
                 mode='a',
                 to_stdout=True)

    # n_train = 1000
    n_train = -1
    label_opinions = True
    # label_opinions = False
    n_tags = 5 if label_opinions else 3
    # n_tags = 5 if task == 'train' else 3
    batch_size = 32
    lr = 0.001
    share_lstm = False

    logging.info(word_vecs_file)
    logging.info(aspect_terms_file)
    logging.info(opinion_terms_file)
    logging.info('dst: {}'.format(dst_model_file))

    print('loading data ...')
    vocab, word_vecs_matrix = __load_word_vecs(word_vecs_file)

    # train_data_src1, valid_data_src1 = __get_data_amazon(vocab, config.AMAZON_TERMS_TRUE1_FILE)
    # train_data_src1, valid_data_src1 = __get_data_amazon(vocab, config.AMAZON_TERMS_TRUE3_FILE)
    train_data_src1, valid_data_src1 = datautils.get_data_amazon(
        vocab, aspect_terms_file, tok_texts_file, 'aspect')
    train_data_src2, valid_data_src2 = datautils.get_data_amazon(
        vocab, opinion_terms_file, tok_texts_file, 'opinion')
    print('done')
    logging.info('train_word_embeddings={} lstm_l2={}'.format(
        train_word_embeddings, lstm_l2))

    nrdj = NeuRuleDoubleJoint(n_tags,
                              word_vecs_matrix,
                              share_lstm,
                              hidden_size_lstm=hidden_size_lstm,
                              train_word_embeddings=train_word_embeddings,
                              lamb=lamb,
                              lstm_l2_src=lstm_l2,
                              model_file=load_model_file,
                              batch_size=batch_size)

    nrdj.pre_train(train_data_src1,
                   valid_data_src1,
                   train_data_src2,
                   valid_data_src2,
                   vocab,
                   n_epochs=50,
                   lr=lr,
                   save_file=dst_model_file)
Esempio n. 5
0
def __train_neurule_double_joint():
    init_logging('log/nrdj-{}.log'.format(str_today), mode='a', to_stdout=True)

    # n_train = 1000
    n_train = -1
    # task = 'pretrain'
    task = 'train'
    label_opinions = True
    n_tags = 5 if label_opinions else 3
    # n_tags = 5 if task == 'train' else 3
    batch_size = 20
    lr = 0.001
    share_lstm = False
    train_mode = 'target-only'

    print('loading data ...')
    with open(config.SE14_LAPTOP_GLOVE_WORD_VEC_FILE, 'rb') as f:
        vocab, word_vecs_matrix = pickle.load(f)
    train_data_tar, valid_data_tar = datautils.get_data_semeval(
        config.SE14_LAPTOP_TRAIN_SENTS_FILE, config.SE14_LAPTOP_TRAIN_TOK_TEXTS_FILE,
        config.SE14_LAPTOP_TEST_SENTS_FILE, config.SE14_LAPTOP_TEST_TOK_TEXTS_FILE,
        vocab, n_train, label_opinions)
    # train_data_src1, valid_data_src1 = __get_data_amazon(vocab, config.AMAZON_TERMS_TRUE1_FILE)
    # train_data_src1, valid_data_src1 = __get_data_amazon(vocab, config.AMAZON_TERMS_TRUE3_FILE)
    train_data_src1, valid_data_src1 = datautils.get_data_amazon(
        vocab, config.AMAZON_TERMS_TRUE2_FILE, config.AMAZON_TOK_TEXTS_FILE, 'aspect')
    train_data_src2, valid_data_src2 = datautils.get_data_amazon(
        vocab, config.AMAZON_TERMS_TRUE4_FILE, config.AMAZON_TOK_TEXTS_FILE, 'opinion')
    rule_model_file = config.LAPTOP_NRDJ_RULE_MODEL_FILE if task == 'train' else None
    # rule_model_file = None
    pretrain_model_file = config.LAPTOP_NRDJ_RULE_MODEL_FILE
    save_model_file = config.LAPTOP_NRDJ_RULE_MODEL_FILE
    print('done')

    nrdj = NeuRuleDoubleJoint(n_tags, word_vecs_matrix, share_lstm,
                              hidden_size_lstm=hidden_size_lstm,
                              model_file=rule_model_file)

    nrj_train_data_src1 = nrj_train_data_src2 = None
    # if train_mode != 'target-only':
    nrj_train_data_src1 = NeuRuleDoubleJoint.TrainData(
        train_data_src1.word_idxs_list, train_data_src1.labels_list, valid_data_src1.word_idxs_list,
        valid_data_src1.labels_list, valid_data_src1.tok_texts, valid_data_src1.aspects_true_list, None
    )
    nrj_train_data_src2 = NeuRuleDoubleJoint.TrainData(
        train_data_src2.word_idxs_list, train_data_src2.labels_list, valid_data_src2.word_idxs_list,
        valid_data_src2.labels_list, valid_data_src2.tok_texts, None,
        valid_data_src2.opinions_true_list
    )

    nrj_train_data_tar = NeuRuleDoubleJoint.TrainData(
        train_data_tar.word_idxs_list, train_data_tar.labels_list, valid_data_tar.word_idxs_list,
        valid_data_tar.labels_list, valid_data_tar.tok_texts, valid_data_tar.aspects_true_list,
        valid_data_tar.opinions_true_list
    )

    if task == 'pretrain':
        nrdj.pre_train(nrj_train_data_src1, nrj_train_data_src2, vocab, n_epochs=n_epochs, lr=lr,
                       save_file=pretrain_model_file)
    if task == 'train':
        nrdj.train(nrj_train_data_src1, nrj_train_data_src2, nrj_train_data_tar, vocab, train_mode,
                   n_epochs=n_epochs, lr=lr)
Esempio n. 6
0
def __train_nrdj_restaurant_pr():
    init_logging('log/nrdj-restaurant-{}.log'.format(str_today),
                 mode='a',
                 to_stdout=True)

    # n_train = 1000
    n_train = -1
    # task = 'pretrain'
    task = 'train'
    label_task = 'aspect'
    n_tags = 5 if label_task == 'both' else 3
    # n_tags = 5 if task == 'train' else 3
    batch_size = 20
    hidden_size_lstm = 100
    n_epochs = 200
    lr = 0.001
    share_lstm = False
    # load_pretrained_model = True
    load_pretrained_model = False
    # train_mode = 'target-only'
    train_mode = 'all'
    src1_label, src2_label = 'aspect', 'opinion'

    aspect_terms_p_file = 'd:/data/aspect/semeval14/restaurant/yelp-aspect-rule-result-p.txt'
    aspect_terms_r_file = 'd:/data/aspect/semeval14/restaurant/yelp-aspect-rule-result-r.txt'
    # opinion_terms_file = 'd:/data/aspect/semeval14/restaurant/yelp-opinion-rule-result.txt'
    yelp_tok_texts_file = 'd:/data/res/yelp-review-eng-tok-sents-round-9.txt'
    rule_model_file = 'd:/data/aspect/semeval14/tf-model/drest/yelp-nrdj.ckpl'
    # rule_model_file = None

    load_model_file = None
    if task == 'train' and load_pretrained_model:
        load_model_file = rule_model_file
    # save_model_file = None if task == 'train' else rule_model_file
    save_model_file = rule_model_file if task == 'pretrain' else None

    print('loading data ...')
    with open(config.SE14_REST_GLOVE_WORD_VEC_FILE, 'rb') as f:
        vocab, word_vecs_matrix = pickle.load(f)
    train_data_tar, valid_data_tar = datautils.get_data_semeval(
        config.SE14_REST_TRAIN_SENTS_FILE,
        config.SE14_REST_TRAIN_TOK_TEXTS_FILE,
        config.SE14_REST_TEST_SENTS_FILE, config.SE14_REST_TEST_TOK_TEXTS_FILE,
        vocab, n_train, label_task)
    # train_data_src1, valid_data_src1 = __get_data_amazon(vocab, config.AMAZON_TERMS_TRUE1_FILE)
    # train_data_src1, valid_data_src1 = __get_data_amazon(vocab, config.AMAZON_TERMS_TRUE3_FILE)
    train_data_src1, valid_data_src1 = datautils.get_data_amazon(
        vocab, aspect_terms_p_file, yelp_tok_texts_file, src1_label)
    # train_data_src2, valid_data_src2 = datautils.get_data_amazon(
    #     vocab, aspect_terms_r_file, yelp_tok_texts_file, 'opinion')
    train_data_src2, valid_data_src2 = datautils.get_data_amazon(
        vocab, aspect_terms_r_file, yelp_tok_texts_file, src2_label)
    # train_data_src2, valid_data_src2 = datautils.get_data_amazon(
    #     vocab, opinion_terms_file, yelp_tok_texts_file, 'opinion')
    print('done')

    nrdj = NeuRuleDoubleJoint(n_tags,
                              word_vecs_matrix,
                              share_lstm,
                              hidden_size_lstm=hidden_size_lstm,
                              model_file=load_model_file)

    nrj_train_data_src1 = nrj_train_data_src2 = None
    # if train_mode != 'target-only':
    nrj_train_data_src1 = NeuRuleDoubleJoint.TrainData(
        train_data_src1.word_idxs_list, train_data_src1.labels_list,
        valid_data_src1.word_idxs_list, valid_data_src1.labels_list,
        valid_data_src1.tok_texts, valid_data_src1.aspects_true_list, None)
    nrj_train_data_src2 = NeuRuleDoubleJoint.TrainData(
        train_data_src2.word_idxs_list, train_data_src2.labels_list,
        valid_data_src2.word_idxs_list, valid_data_src2.labels_list,
        valid_data_src2.tok_texts, None, valid_data_src2.opinions_true_list)

    nrj_train_data_tar = NeuRuleDoubleJoint.TrainData(
        train_data_tar.word_idxs_list, train_data_tar.labels_list,
        valid_data_tar.word_idxs_list, valid_data_tar.labels_list,
        valid_data_tar.tok_texts, valid_data_tar.aspects_true_list,
        valid_data_tar.opinions_true_list)

    if task == 'pretrain':
        nrdj.pre_train(nrj_train_data_src1,
                       nrj_train_data_src2,
                       vocab,
                       n_epochs=n_epochs,
                       lr=lr,
                       save_file=save_model_file)
    if task == 'train':
        nrdj.train(nrj_train_data_src1,
                   nrj_train_data_src2,
                   nrj_train_data_tar,
                   vocab,
                   train_mode,
                   n_epochs=n_epochs,
                   lr=lr)