Example #1
def main(argv=None):
    args = commandLineParser.parse_args()
    if not os.path.isdir('CMDs'):
        os.mkdir('CMDs')
    with open('CMDs/get_bert_dists.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')

    # Get the document of all training (unshuffled) responses.
    # Convert each word to an ID.
    # Initialise a np array sized to the largest possible word ID - call it word_freqs.
    # Initialise each element in the array to 0.
    # Loop through the list of all response word IDs and increment the corresponding
    # position in word_freqs by 1 for each ID encountered.
    # Loop through word_freqs: if an element is > 0, set it to 1/element; if it is 0, set it
    # to 1.5 (this is the value used by many real algorithms).
    # Save word_freqs as a numpy array that can be loaded by step_train_simGrid.py, converted
    # to a tf tensor and looked up with tf.gather for a list of word IDs.
    
    response_path = '/home/alta/relevance/vr311/data_vatsal/BULATS/content_words/responses.txt'
    wlist_path = '/home/alta/relevance/vr311/data_vatsal/BULATS/content_words/tfrecords_train/input.wlist.index'
    data, _ = text_to_array(response_path, wlist_path, strip_start_end=False)
    
    # Note: input.wlist.index starts word IDs at 0, but all word IDs in 'data' start at 1
    # (i.e. they correspond to line numbers in input.wlist.index), so a word ID of 0 is
    # impossible.

    # 62415 is the largest possible word ID in input.wlist.index; index 0 is never used (IDs start at 1)
    word_freqs = np.zeros(62415 + 1)
    print('GOT HERE')

    print(data[:2])
    

    for w_id in np.nditer(data):
        word_freqs[w_id] += 1

    print('WOW, HERE NOW')
    
    for w in np.nditer(word_freqs, op_flags=['readwrite']):
        if w > 0:
            w[...] = 1/w
        else:
            w[...] = 1.5

    print('PHEW')
    # A word ID of 0 is impossible, so mark index 0 with a sentinel value
    word_freqs[0] = -1

    np.savetxt('/home/alta/relevance/vr311/data_vatsal/BULATS/content_words/idf.txt', word_freqs) 
    
    # Debug check: for each of the ten smallest weights (the most frequent words, plus the
    # -1 sentinel at index 0), print the word IDs whose weight is at most that value.
    sort_word_freqs = np.sort(word_freqs)
    top_ten = sort_word_freqs[:10]
    for val in np.nditer(top_ten):
        print(np.where(word_freqs <= val))
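The comment block at the top of this example says the saved idf.txt is later loaded by step_train_simGrid.py, turned into a tf tensor and indexed with tf.gather. Below is a minimal sketch of that lookup in TensorFlow 1.x (the API family the later examples use, e.g. tf.train.AdamOptimizer); word_ids is a hypothetical placeholder standing in for a batch of response word IDs.

import numpy as np
import tensorflow as tf

idf_path = '/home/alta/relevance/vr311/data_vatsal/BULATS/content_words/idf.txt'
idf = tf.constant(np.loadtxt(idf_path, dtype=np.float32))   # shape [62416]

# Hypothetical input: word_ids is not defined in the original scripts shown here
word_ids = tf.placeholder(tf.int32, shape=[None, None])     # [batch, time] word IDs (>= 1)
idf_weights = tf.gather(idf, word_ids)                      # [batch, time] per-token IDF weights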
Example #2
def main(argv=None):
    args = commandLineParser.parse_args()
    if not os.path.isdir('CMDs'):
        os.mkdir('CMDs')
    with open('CMDs/get_bert_dists.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')

    resp_path = '/home/alta/relevance/vr311/data_vatsal/BULATS/responses.txt'
    wlist_path = '/home/alta/relevance/vr311/data_vatsal/input.wlist.index'
    resps, resp_lens = text_to_array(resp_path, wlist_path)
    #print(resps.shape)

    save_path = '/home/alta/relevance/vr311/models_min_data/baseline/ATM'
    path = os.path.join(save_path, 'sorted_resps.txt')
    np.savetxt(path, resps)
    path = os.path.join(save_path, 'sorted_resp_lens.txt')
    np.savetxt(path, resp_lens)
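text_to_array is defined elsewhere in the repository; the snippets only show how it is called. The sketch below is an illustrative stand-in, assuming it returns a zero-padded matrix of word IDs plus per-response lengths, with IDs taken as 1-based line numbers of input.wlist.index as noted in Example #1 (out-of-vocabulary handling is omitted because the real behaviour is unknown). It also shows reloading the arrays saved above, since np.savetxt writes them back out as text floats.

import numpy as np

def text_to_array_sketch(text_path, wlist_path):
    # Illustrative stand-in for the repository's text_to_array, not its actual code.
    # Word IDs are taken to be 1-based line numbers in input.wlist.index.
    with open(wlist_path) as f:
        word_to_id = {line.split()[0]: i for i, line in enumerate(f, start=1) if line.strip()}

    id_seqs = []
    with open(text_path) as f:
        for line in f:
            # Unknown words are simply skipped here; the real function may differ
            id_seqs.append([word_to_id[w] for w in line.split() if w in word_to_id])

    lens = np.array([len(s) for s in id_seqs], dtype=np.int32)
    data = np.zeros((len(id_seqs), lens.max()), dtype=np.int32)
    for i, seq in enumerate(id_seqs):
        data[i, :len(seq)] = seq
    return data, lens

# Reloading what Example #2 saved (np.savetxt stores everything as floats):
# resps = np.loadtxt(path, dtype=np.float32).astype(np.int32)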
Example #3
def main(argv=None):
    args = commandLineParser.parse_args()
    if not os.path.isdir('CMDs'):
        os.mkdir('CMDs')
    with open('CMDs/step_compute_prompt_embeddings.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')

    prompts, prompt_lens = text_to_array(args.prompt_path,
                                         input_index=args.wlist_path)
    # Initialize and Run the Model

    atm = AttentionTopicModel(network_architecture=None,
                              load_path=args.load_path,
                              debug_mode=args.debug,
                              epoch=args.epoch)

    atm.get_prompt_embeddings(prompts, prompt_lens,
                              os.path.join(args.load_path, 'model'))
Example #4
def main(argv=None):
    args = commandLineParser.parse_args()
    if not os.path.isdir('CMDs'):
        os.mkdir('CMDs')
    with open('CMDs/step_compute_prompt_embeddings.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')

    if args.strip_start_end:
        print("Stripping the first and last word (should correspond to <s> and </s> marks) from the input prompts. Should only be used with legacy dataset formatting")


    prompts, prompt_lens = text_to_array(args.prompt_path, args.wlist_path, strip_start_end=args.strip_start_end)
    # Initialize and Run the Model
    atm = AttentionTopicModel(network_architecture=None,
                              load_path=args.load_path,
                              debug_mode=args.debug,
                              epoch=args.epoch)

    atm.get_prompt_embeddings(prompts, prompt_lens, args.save_path)
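The --strip_start_end behaviour referred to above presumably just drops the first and last token of each prompt (the <s> and </s> markers mentioned in the message). A purely illustrative sketch of that assumption on one tokenised prompt:

def strip_start_end_tokens(tokens):
    # Assumed behaviour: drop the leading <s> and trailing </s> markers of the legacy format
    return tokens[1:-1] if len(tokens) >= 2 else tokens

# strip_start_end_tokens(['<s>', 'describe', 'the', 'picture', '</s>'])
# -> ['describe', 'the', 'picture']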
Example #5
def main(argv=None):
    args = commandLineParser.parse_args()
    if not os.path.isdir('CMDs'):
        os.mkdir('CMDs')
    with open('CMDs/step_train_attention_grader.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')
        f.write('--------------------------------\n')

    topics, topic_lens = text_to_array(args.topic_path,
                                       input_index=args.wlist_path)

    atm = HierarchicialAttentionTopicModel(network_architecture=None,
                                           seed=args.seed,
                                           name=args.name,
                                           save_path='./',
                                           load_path=args.load_path,
                                           debug_mode=args.debug,
                                           epoch=args.epoch)

    atm.fit(train_data=args.train_data,
            valid_data=args.valid_data,
            load_path=args.init,
            topics=topics,
            topic_lens=topic_lens,
            unigram_path=args.topic_count_path,
            train_size=args.train_size,
            learning_rate=args.learning_rate,
            lr_decay=args.lr_decay,
            dropout=args.dropout,
            attention_dropout=args.attention_dropout,
            distortion=args.distortion,
            batch_size=args.batch_size,
            optimizer=tf.train.AdamOptimizer,
            optimizer_params={},
            n_epochs=args.n_epochs,
            n_samples=args.n_samples,
            epoch=0)

    atm.save()
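Note that fit receives the optimizer as a class (tf.train.AdamOptimizer) together with an optimizer_params dict rather than an instance. Inside the model this is presumably turned into a training op along the following lines; this is an assumed pattern, not the repository's actual code, and loss / learning_rate stand for tensors built elsewhere in the model graph.

import tensorflow as tf

def build_train_op(loss, learning_rate, optimizer=tf.train.AdamOptimizer, optimizer_params=None):
    # Assumed pattern: instantiate the optimizer class handed to fit() and minimise the model loss
    optimizer_params = optimizer_params or {}
    return optimizer(learning_rate=learning_rate, **optimizer_params).minimize(loss)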
Example #6
def main(argv=None):
    args = commandLineParser.parse_args()
    if not os.path.isdir('CMDs'):
        os.mkdir('CMDs')
    with open('CMDs/step_train_attention_grader.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')
        f.write('--------------------------------\n')

    train_size = get_train_size_from_meta(args.meta_data_path)

    topics, topic_lens = text_to_array(args.topic_path,
                                       args.wlist_path,
                                       strip_start_end=False)
    # Augmented data
    aug_topics, aug_topic_lens = text_to_array(
        "/home/alta/relevance/vr311/data_vatsal/BULATS/translate/ar1.txt",
        args.wlist_path,
        strip_start_end=False)
    aug_topics2, aug_topic_lens2 = text_to_array(
        "/home/alta/relevance/vr311/data_vatsal/BULATS/translate/de1.txt",
        args.wlist_path,
        strip_start_end=False)
    aug_topics3, aug_topic_lens3 = text_to_array(
        "/home/alta/relevance/vr311/data_vatsal/BULATS/translate/fr1.txt",
        args.wlist_path,
        strip_start_end=False)
    aug_topics4, aug_topic_lens4 = text_to_array(
        "/home/alta/relevance/vr311/data_vatsal/BULATS/translate/greek1.txt",
        args.wlist_path,
        strip_start_end=False)
    aug_topics5, aug_topic_lens5 = text_to_array(
        "/home/alta/relevance/vr311/data_vatsal/BULATS/translate/hebrew1.txt",
        args.wlist_path,
        strip_start_end=False)
    aug_topics6, aug_topic_lens6 = text_to_array(
        "/home/alta/relevance/vr311/data_vatsal/BULATS/translate/hi1.txt",
        args.wlist_path,
        strip_start_end=False)
    aug_topics7, aug_topic_lens7 = text_to_array(
        "/home/alta/relevance/vr311/data_vatsal/BULATS/translate/ja1.txt",
        args.wlist_path,
        strip_start_end=False)
    aug_topics8, aug_topic_lens8 = text_to_array(
        "/home/alta/relevance/vr311/data_vatsal/BULATS/translate/ko1.txt",
        args.wlist_path,
        strip_start_end=False)
    aug_topics9, aug_topic_lens9 = text_to_array(
        "/home/alta/relevance/vr311/data_vatsal/BULATS/translate/ru1.txt",
        args.wlist_path,
        strip_start_end=False)
    aug_topics10, aug_topic_lens10 = text_to_array(
        "/home/alta/relevance/vr311/data_vatsal/BULATS/translate/af1.txt",
        args.wlist_path,
        strip_start_end=False)
    aug_topics11, aug_topic_lens11 = text_to_array(
        "/home/alta/relevance/vr311/data_vatsal/BULATS/eda/data/one_aug.txt",
        args.wlist_path,
        strip_start_end=False)
    aug_topics12, aug_topic_lens12 = text_to_array(
        "/home/alta/relevance/vr311/data_vatsal/BULATS/eda/data/one_aug2.txt",
        args.wlist_path,
        strip_start_end=False)
    aug_topics13, aug_topic_lens13 = text_to_array(
        "/home/alta/relevance/vr311/data_vatsal/BULATS/eda/data/one_aug3.txt",
        args.wlist_path,
        strip_start_end=False)
    aug_topics14, aug_topic_lens14 = text_to_array(
        "/home/alta/relevance/vr311/data_vatsal/BULATS/eda/data/one_aug4.txt",
        args.wlist_path,
        strip_start_end=False)
    aug_topics15, aug_topic_lens15 = text_to_array(
        "/home/alta/relevance/vr311/data_vatsal/BULATS/eda/data/one_aug5.txt",
        args.wlist_path,
        strip_start_end=False)
    aug_topics16, aug_topic_lens16 = text_to_array(
        "/home/alta/relevance/vr311/data_vatsal/BULATS/eda/data/one_aug6.txt",
        args.wlist_path,
        strip_start_end=False)
    aug_topics17, aug_topic_lens17 = text_to_array(
        "/home/alta/relevance/vr311/data_vatsal/BULATS/eda/data/one_aug7.txt",
        args.wlist_path,
        strip_start_end=False)
    aug_topics18, aug_topic_lens18 = text_to_array(
        "/home/alta/relevance/vr311/data_vatsal/BULATS/eda/data/one_aug8.txt",
        args.wlist_path,
        strip_start_end=False)
    aug_topics19, aug_topic_lens19 = text_to_array(
        "/home/alta/relevance/vr311/data_vatsal/BULATS/eda/data/one_aug9.txt",
        args.wlist_path,
        strip_start_end=False)

    # if args.strip_start_end:
    #    print("Stripping the first and last word (should correspond to <s> and </s> marks) from the input prompts. Should only be used with legacy dataset formatting")

    bert_dists = np.loadtxt(
        "/home/alta/relevance/vr311/models_min_data/baseline/ATM/bert_dists.txt",
        dtype=np.float32)
    sbert_weights = np.loadtxt(
        "/home/alta/relevance/vr311/models_min_data/baseline/ATM/sbert_weights.txt",
        dtype=np.float32)
    arr_unigrams = np.loadtxt(
        "/home/alta/relevance/vr311/models_min_data/baseline/ATM/arr_unigrams.txt",
        dtype=np.float32)

    atm = HierarchicialAttentionTopicModel(network_architecture=None,
                                           seed=args.seed,
                                           name=args.name,
                                           save_path='./',
                                           load_path=args.load_path,
                                           debug_mode=args.debug,
                                           epoch=args.epoch)

    atm.fit(train_data=args.train_data,
            valid_data=args.valid_data,
            load_path=args.init,
            topics=topics,
            topic_lens=topic_lens,
            aug_topics=aug_topics,
            aug_topic_lens=aug_topic_lens,
            aug_topics2=aug_topics2,
            aug_topic_lens2=aug_topic_lens2,
            aug_topics3=aug_topics3,
            aug_topic_lens3=aug_topic_lens3,
            aug_topics4=aug_topics4,
            aug_topic_lens4=aug_topic_lens4,
            aug_topics5=aug_topics5,
            aug_topic_lens5=aug_topic_lens5,
            aug_topics6=aug_topics6,
            aug_topic_lens6=aug_topic_lens6,
            aug_topics7=aug_topics7,
            aug_topic_lens7=aug_topic_lens7,
            aug_topics8=aug_topics8,
            aug_topic_lens8=aug_topic_lens8,
            aug_topics9=aug_topics9,
            aug_topic_lens9=aug_topic_lens9,
            aug_topics10=aug_topics10,
            aug_topic_lens10=aug_topic_lens10,
            aug_topics11=aug_topics11,
            aug_topic_lens11=aug_topic_lens11,
            aug_topics12=aug_topics12,
            aug_topic_lens12=aug_topic_lens12,
            aug_topics13=aug_topics13,
            aug_topic_lens13=aug_topic_lens13,
            aug_topics14=aug_topics14,
            aug_topic_lens14=aug_topic_lens14,
            aug_topics15=aug_topics15,
            aug_topic_lens15=aug_topic_lens15,
            aug_topics16=aug_topics16,
            aug_topic_lens16=aug_topic_lens16,
            aug_topics17=aug_topics17,
            aug_topic_lens17=aug_topic_lens17,
            aug_topics18=aug_topics18,
            aug_topic_lens18=aug_topic_lens18,
            aug_topics19=aug_topics19,
            aug_topic_lens19=aug_topic_lens19,
            bert_dists=bert_dists,
            bert_weights=sbert_weights,
            arr_unigrams=arr_unigrams,
            unigram_path=args.topic_count_path,
            train_size=train_size,
            learning_rate=args.learning_rate,
            lr_decay=args.lr_decay,
            dropout=args.dropout,
            distortion=args.distortion,
            batch_size=args.batch_size,
            optimizer=tf.train.AdamOptimizer,
            optimizer_params={},
            n_epochs=args.n_epochs,
            n_samples=args.n_samples,
            epoch=0)

    atm.save()
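The nineteen near-identical text_to_array calls and aug_topics*/aug_topic_lens* keyword arguments above can be collapsed into a loop. This is only an equivalent-restructuring sketch that reuses text_to_array, args and the file paths from the example; the resulting dict would be passed as atm.fit(..., **aug_kwargs).

import os

# Restructuring sketch; text_to_array and args are reused from the example above
base_dir = '/home/alta/relevance/vr311/data_vatsal/BULATS'
aug_files = ['translate/%s1.txt' % lang
             for lang in ['ar', 'de', 'fr', 'greek', 'hebrew', 'hi', 'ja', 'ko', 'ru', 'af']]
aug_files += ['eda/data/one_aug.txt'] + ['eda/data/one_aug%d.txt' % i for i in range(2, 10)]

aug_kwargs = {}
for i, rel_path in enumerate(aug_files):
    suffix = '' if i == 0 else str(i + 1)  # aug_topics, aug_topics2, ..., aug_topics19
    topics_i, lens_i = text_to_array(os.path.join(base_dir, rel_path),
                                     args.wlist_path, strip_start_end=False)
    aug_kwargs['aug_topics' + suffix] = topics_i
    aug_kwargs['aug_topic_lens' + suffix] = lens_i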
Example #7
def main(args):
    if not os.path.isdir('CMDs'):
        os.mkdir('CMDs')
    with open('CMDs/step_train_attention_grader.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')
        f.write('--------------------------------\n')

    tfrecords_dir_in_domain = os.path.join(args.data_dir_in_domain, 'tfrecords')

    topic_path_in_domain = os.path.join(tfrecords_dir_in_domain, args.topic_file)

    wlist_path = os.path.join(tfrecords_dir_in_domain, args.wlist_file)

    topic_count_path_in_domain = os.path.join(tfrecords_dir_in_domain, args.topic_count_file)

    train_data_in_domain = os.path.join(tfrecords_dir_in_domain, args.train_file)

    valid_data = os.path.join(tfrecords_dir_in_domain, args.valid_file)

    dataset_meta_path_in_domain = os.path.join(tfrecords_dir_in_domain, args.meta_file)

    train_size = get_train_size_from_meta(dataset_meta_path_in_domain)

    topics_in_domain, topic_lens_in_domain = text_to_array(topic_path_in_domain, wlist_path,
                                                           strip_start_end=args.strip_start_end)

    if args.which_training_cost != 'conflictive':
        # Out-of-domain data is required for any training cost other than 'conflictive'
        assert args.data_dir_out_domain is not None
        tfrecords_dir_out_domain = os.path.join(args.data_dir_out_domain, 'tfrecords')
        train_data_out_domain = os.path.join(tfrecords_dir_out_domain, args.train_file)

    if args.strip_start_end:
        print("Stripping the first and last word (should correspond to <s> and </s> marks) "
              "from the input prompts. Should only be used with legacy dataset formatting")

    atm = ATMPriorNetwork(network_architecture=None,
                          seed=args.seed,
                          name=args.name,
                          save_path='./',
                          load_path=args.load_path,
                          debug_mode=args.debug,
                          epoch=args.epoch)

    if args.which_training_cost == 'conflictive':
        atm.fit(train_data=train_data_in_domain,
                valid_data=valid_data,
                load_path=args.init,
                topics=topics_in_domain,
                topic_lens=topic_lens_in_domain,
                unigram_path=topic_count_path_in_domain,
                train_size=train_size,
                learning_rate=args.learning_rate,
                lr_decay=args.lr_decay,
                dropout=args.dropout,
                distortion=args.distortion,
                presample_batch_size=args.batch_size,
                optimizer=tf.train.AdamOptimizer,
                optimizer_params={},
                n_epochs=args.n_epochs,
                epoch=0,
                which_trn_cost=args.which_training_cost,
                loss_regularisation_weight=args.conflictive_weight)
    else:
        atm.fit_with_ood(train_data_in_domain=train_data_in_domain,
                         train_data_out_domain=train_data_out_domain,
                         valid_data=valid_data,
                         load_path=args.init,
                         topics_in_domain=topics_in_domain,
                         topic_lens_in_domain=topic_lens_in_domain,
                         unigram_path_in_domain=topic_count_path_in_domain,
                         train_size=train_size,
                         learning_rate=args.learning_rate,
                         lr_decay=args.lr_decay,
                         dropout=args.dropout,
                         distortion=args.distortion,
                         presample_batch_size=args.batch_size,
                         optimizer=tf.train.AdamOptimizer,
                         optimizer_params={},
                         n_epochs=args.n_epochs,
                         epoch=0,
                         which_trn_cost=args.which_training_cost,
                         out_of_domain_weight=args.out_of_domain_weight)

    atm.save()
Example #8
def main(args):
    if not os.path.isdir('CMDs'):
        os.mkdir('CMDs')
    with open('CMDs/step_train_attention_grader.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')
        f.write('--------------------------------\n')

    if args.strip_start_end:
        print(
            "Stripping the first and last word (should correspond to <s> and </s> marks) from the input prompts. Should only be used with legacy dataset formatting"
        )

    # Whether to train a prior network or standard ATM
    if args.train_prior_network:
        atm_class = ATMPriorNetworkStudent
    else:
        atm_class = AttentionTopicModelStudent

    for epoch in range(0, args.n_epochs):

        atm_student = atm_class(network_architecture=None,
                                seed=args.seed,
                                name=args.name,
                                save_path='./',
                                load_path=args.load_path,
                                debug_mode=args.debug,
                                epoch=args.load_epoch,
                                num_teachers=args.num_teachers)

        # Get the paths to all the relevant files for this epoch
        if args.reuse_epoch_dataset:
            epoch_tfrecords_dir = os.path.join(args.teacher_data_dir,
                                               'tfrecords')
        else:
            if args.loop_epochs:
                epoch_data_num = epoch % args.loop_epochs
            else:
                epoch_data_num = epoch
            epoch_tfrecords_dir = os.path.join(
                args.teacher_data_dir, 'epoch' + str(epoch_data_num + 1),
                'tfrecords')
        topic_path = os.path.join(epoch_tfrecords_dir, args.topic_file)
        wlist_path = os.path.join(epoch_tfrecords_dir, args.wlist_file)

        topic_count_path = os.path.join(epoch_tfrecords_dir,
                                        args.topic_count_file)

        train_data = os.path.join(epoch_tfrecords_dir, args.train_file)
        valid_data = os.path.join(epoch_tfrecords_dir, args.valid_file)

        dataset_meta_path = os.path.join(epoch_tfrecords_dir, args.meta_file)
        train_size = get_train_size_from_meta(dataset_meta_path)

        topics, topic_lens = text_to_array(
            topic_path, wlist_path, strip_start_end=args.strip_start_end)

        if epoch == 0:
            init = args.init
        else:
            init = None
        atm_student.fit_student(train_data=train_data,
                                valid_data=valid_data,
                                load_path=init,
                                topics=topics,
                                topic_lens=topic_lens,
                                unigram_path=topic_count_path,
                                train_size=train_size,
                                learning_rate=args.learning_rate,
                                lr_decay=args.lr_decay,
                                dropout=args.dropout,
                                distortion=args.distortion,
                                batch_size=args.batch_size,
                                optimizer=tf.train.AdamOptimizer,
                                optimizer_params={},
                                n_epochs=1,
                                epoch=epoch,
                                use_teacher_stat=(not args.match_samples))

        atm_student.save()

        # Reset the graph so that the model can be reloaded for the next epoch (not the nicest way to do it, I know)
        tf.reset_default_graph()
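The closing comment concedes that calling tf.reset_default_graph() between epochs is not ideal. A common TF1 alternative is to build each epoch's model inside its own explicit tf.Graph so no global state has to be reset; a minimal sketch of that shape, with the model construction from the loop above elided and n_epochs standing in for args.n_epochs:

import tensorflow as tf

n_epochs = 3  # stands in for args.n_epochs
for epoch in range(n_epochs):
    graph = tf.Graph()
    with graph.as_default():
        # Build atm_student and call fit_student(...) here exactly as above;
        # every op and variable created in this block lives only on `graph`.
        pass
    # No tf.reset_default_graph() needed: the graph is simply discarded each iteration.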