import sys
import cPickle

import data_load

# training_part_size, sentence_len, sentence_pad_len and entity_rep_len are
# assumed to be module-level globals defined elsewhere in the original file.


def dump_debug_data():
    train_data_file_name = '/media/dhl/Data/el/vec_rep/wiki_train_word_vec_indices_wiki50.td'
    entity_rep_file_name = '/media/dhl/Data/el/vec_rep/' + \
                           'wid_entity_rep_wiki50_indices_with_keywords_fixed_len_10kw.bin'
    # entity_rep_file_name = '/media/dhl/Data/el/vec_rep/' + \
    #                        'wid_entity_rep_wiki50_indices.bin'

    # wid_idx_dict, entity_vecs = data_load.load_entities_indices(
    #     entity_rep_file_name, max_num_entity_words, entity_pad_len)
    global entity_rep_len
    wid_idx_dict, entity_vecs, entity_rep_len = data_load.load_index_vec_of_entities_fixed_len(
        entity_rep_file_name)
    f_train = open(train_data_file_name, 'rb')
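    # Fast-forward past the first 143 blocks of 50000 samples each so the
    # dump starts deeper into the training file.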
    for i in xrange(143):
        data_load.skip_training_sample(f_train, 50000)
        if i % 40 == 0:
            print i
    print 'skipped'

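    # Load one part (training_part_size samples) of the training data and
    # pickle it for later inspection.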
    cur_train_contexts, cur_train_indices = data_load.load_training_samples(f_train,
                                                                            training_part_size,
                                                                            wid_idx_dict,
                                                                            sentence_len,
                                                                            sentence_pad_len)
    f_train.close()
    f_debug = open('debug_data_vlen.bin', 'wb')
    cPickle.dump([cur_train_contexts, cur_train_indices], f_debug)
    f_debug.close()
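
# A minimal sketch (not part of the original file) of reading the debug dump
# written above back in; assumes debug_data_vlen.bin sits in the working
# directory.
def load_debug_data(file_name='debug_data_vlen.bin'):
    f_debug = open(file_name, 'rb')
    # The dump above stores [cur_train_contexts, cur_train_indices].
    contexts, indices = cPickle.load(f_debug)
    f_debug.close()
    return contexts, indices
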
def main():
    if len(sys.argv) < 2:
        print 'need params file'
        return

    params = load_params(sys.argv[1])

    entity_side_cnn = params['entity_side_cnn'] == '1'
    word_vec_file_name = params['word_vec_file']

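    # With entity_side_cnn the entities are loaded as fixed-length word-index
    # sequences for a CNN; otherwise precomputed entity vectors are used.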
    if entity_side_cnn:
        entity_rep_file_name = params['entity_rep_indices_file']
    else:
        entity_rep_file_name = params['entity_rep_vec_file']

    train_data_file_name = params['train_data_file']
    val_data_file_name = params['val_data_file']
    test_data_file_name = params['test_data_file']

    training_part_size = int(params['training_part_size'])
    sentence_len = int(params['context_sentence_len'])

    _, word_vecs = data_load.load_word_vectors(word_vec_file_name)
    word_vec_len = len(word_vecs[0])

    if entity_side_cnn:
        print 'entity use cnn'
        global entity_rep_len
        wid_idx_dict, entity_vecs, entity_rep_len = data_load.load_index_vec_of_entities_fixed_len(
            entity_rep_file_name)
    else:
        wid_idx_dict, entity_vecs = data_load.load_entities(
            entity_rep_file_name,
            False)

    # wid_idx_dict, entity_vecs = data_load.load_entities_indices(
    #     entity_rep_file_name, max_num_entity_words, entity_pad_len)

    num_val_candidates = 30
    num_test_candidates = 30
    skipwidth_loading = 0
    train_cnn_for_el(train_data_file_name,
                     val_data_file_name,
                     num_val_candidates,
                     test_data_file_name,
                     num_test_candidates,
                     sentence_len, word_vec_len,
                     word_vecs,
                     wid_idx_dict,
                     entity_vecs,
                     entity_side_cnn=entity_side_cnn,
                     gold_as_first_candidate=False,
                     skip_width_loading=skipwidth_loading,
                     n_epochs=1,
                     training_part_size=training_part_size)
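
# load_params is not defined in this file. A minimal sketch under the
# assumption that the params file holds one key=value pair per line
# (keys used above: entity_side_cnn, word_vec_file, train_data_file, ...):
def load_params(file_name):
    params = dict()
    f = open(file_name)
    for line in f:
        line = line.strip()
        # Skip blanks, comments and malformed lines.
        if not line or line.startswith('#') or '=' not in line:
            continue
        key, value = line.split('=', 1)
        params[key.strip()] = value.strip()
    f.close()
    return params
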
# Variant of main() that hard-codes the data paths instead of reading them
# from a params file; pass '0' as the first command-line argument to switch
# from the local paths to the server paths.
def main():
    local_flg = True
    if len(sys.argv) > 1:
        if sys.argv[1] == '0':
            local_flg = False

    if local_flg:
        word_vec_file_name = '/media/dhl/Data/el/word2vec/wiki_vectors.jbin'
        entity_rep_file_name = '/media/dhl/Data/el/vec_rep/' + \
                               'wid_entity_rep_wiki50_indices_with_keywords_fixed_len.bin'
        # entity_rep_file_name = '/media/dhl/Data/el/vec_rep/' + \
        #                        'wid_entity_rep_wiki50_indices.bin'
        train_data_file_name = '/media/dhl/Data/el/vec_rep/wiki_train_word_vec_indices_wiki50.td'
        val_data_file_name = '/media/dhl/Data/el/vec_rep/tac_2014_training.bin'
        test_data_file_name = '/media/dhl/Data/el/vec_rep/tac_2014_eval.bin'
    else:
        word_vec_file_name = '/home/dhl/data/word_vec/wiki_vectors.jbin'
        entity_rep_file_name = '/home/dhl/data/vec_rep/wid_entity_rep_wiki50_indices_with_keywords_fixed_len_0kw.bin'
        train_data_file_name = '/home/dhl/data/vec_rep/wiki_train_word_vec_indices_wiki50.td'
        val_data_file_name = '/home/dhl/data/vec_rep/tac_2014_training.bin'
        test_data_file_name = '/home/dhl/data/vec_rep/tac_2014_eval.bin'

    _, word_vecs = data_load.load_word_vectors(word_vec_file_name)
    word_vec_len = len(word_vecs[0])

    # wid_idx_dict, entity_vecs = data_load.load_entities(
    #     '/media/dhl/Data/el/vec_rep/wid_entity_rep_wiki50_cat.bin',
    #     False)

    # wid_idx_dict, entity_vecs = data_load.load_entities_indices(
    #     entity_rep_file_name, max_num_entity_words, entity_pad_len)

    global entity_rep_len
    wid_idx_dict, entity_vecs, entity_rep_len = data_load.load_index_vec_of_entities_fixed_len(
        entity_rep_file_name)

    num_val_candidates = 30
    num_test_candidates = 30
    skipwidth_loading = 0
    img_h = sentence_len + 2 * sentence_pad_len
    train_cnn_for_el(train_data_file_name,
                     val_data_file_name,
                     num_val_candidates,
                     test_data_file_name,
                     num_test_candidates,
                     img_h,
                     word_vec_len,
                     word_vecs,
                     wid_idx_dict,
                     entity_vecs,
                     gold_as_first_candidate=False,
                     skip_width_loading=skipwidth_loading,
                     n_epochs=1)
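
# Usage, as suggested by the argument handling above: no argument (or any
# first argument other than '0') keeps the local /media/dhl/... paths, while
# '0' switches to the /home/dhl/... server paths, e.g.
#     python this_script.py 0
# (this_script.py is a placeholder; the original file name is not shown).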