Example 1

# Standard-library imports used by this example; FileHandler, load_data,
# bm25_fit, get_query_docs and KeyWordSettings are project-specific helpers
# whose import paths are assumed from the surrounding package.
import datetime
import json
import os
import time

def fit_models(args):
    if not os.path.exists(args.log):
        os.mkdir(args.log)

    curr_date = datetime.datetime.now().timestamp()  # seconds
    # folder to store all output files of a run
    secondary_log_folder = os.path.join(args.log,
                                        "log_results_%s" % (int(curr_date)))
    if not os.path.exists(secondary_log_folder):
        os.mkdir(secondary_log_folder)

    logfolder_result = os.path.join(secondary_log_folder,
                                    "%s_result.txt" % int(curr_date))
    FileHandler.init_log_files(logfolder_result)
    settings = json.dumps(vars(args), sort_keys=True, indent=2)
    FileHandler.myprint("Running script " + str(os.path.realpath(__file__)))
    FileHandler.myprint(settings)

    root = args.path
    train_pack = load_data.load_data2(root, 'train', prefix=args.dataset)
    valid_pack = load_data.load_data2(root, 'dev', prefix=args.dataset)
    predict_pack = load_data.load_data2(root, 'test', prefix=args.dataset)

    # print(train_pack.left)

    # query token-length statistics across the three splits
    query_lens = [
        pack["text_left"].str.lower().str.split().apply(len)
        for pack in (train_pack, valid_pack, predict_pack)
    ]
    max_query_length = max(s.max() for s in query_lens)
    min_query_length = min(s.min() for s in query_lens)

    # document token-length statistics across the three splits
    doc_lens = [
        pack["text_right"].str.lower().str.split().apply(len)
        for pack in (train_pack, valid_pack, predict_pack)
    ]
    max_doc_length = max(s.max() for s in doc_lens)
    min_doc_length = min(s.min() for s in doc_lens)

    FileHandler.myprint("Min query length: " + str(min_query_length) +
                        ", min doc length: " + str(min_doc_length))
    FileHandler.myprint("Max query length: " + str(max_query_length) +
                        ", max doc length: " + str(max_doc_length))
    t1 = time.time()
    # get_query_docs(train_pack)
    dev_queries = get_query_docs(valid_pack)
    test_queries = get_query_docs(predict_pack)

    additional_data = {}
    if args.reranking:
        predict2_hard_pack = load_data.load_data2(root,
                                                  'test2_hard',
                                                  prefix=args.dataset)
        predict3_hard_pack = load_data.load_data2(root,
                                                  'test3_hard',
                                                  prefix=args.dataset)
        test2_queries = get_query_docs(predict2_hard_pack)
        test3_queries = get_query_docs(predict3_hard_pack)
        additional_data[KeyWordSettings.Test2Hard] = test2_queries
        additional_data[KeyWordSettings.Test3Hard] = test3_queries

    FileHandler.myprint('done extracting')
    t2 = time.time()
    FileHandler.myprint('loading data time: %d (seconds)' % (t2 - t1))
    params = {"b": args.b, "k1": args.k1}
    """ Many other things"""

    FileHandler.myprint("Fitting Model")
    fit_model = bm25_fit.BM25Fitter(params)

    try:
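        # BM25 is not a learned model, so there are no training interactions
        # to pass (hence the None below); only dev/test query sets are scored.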
        fit_model.fit(None,
                      verbose=True,
                      topN=args.topk,
                      val_queries=dev_queries,
                      test_queries=test_queries,
                      **additional_data)
    except KeyboardInterrupt:
        FileHandler.myprint('Exiting from training early')
    t10 = time.time()
    FileHandler.myprint('Total time:  %d (seconds)' % (t10 - t1))
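
A minimal driver sketch for this variant, assuming fit_models is importable as defined above. The flag names mirror the attributes the function reads (args.log, args.path, args.dataset, args.b, args.k1, args.topk, args.reranking); the parser itself and every default value are illustrative, not the project's actual CLI.

import argparse

# Hypothetical command-line front end; defaults are illustrative only.
parser = argparse.ArgumentParser(description="Fit BM25 on a query/document dataset")
parser.add_argument("--log", default="logs", help="root folder for run outputs")
parser.add_argument("--path", default="data", help="root folder of the dataset")
parser.add_argument("--dataset", default="mydata", help="dataset file prefix")
parser.add_argument("--b", type=float, default=0.75, help="BM25 length-normalisation b")
parser.add_argument("--k1", type=float, default=1.2, help="BM25 term-frequency k1")
parser.add_argument("--topk", type=int, default=10, help="cut-off for top-N evaluation")
parser.add_argument("--reranking", action="store_true",
                    help="also evaluate the hard test2/test3 re-ranking sets")

if __name__ == "__main__":
    fit_models(parser.parse_args())
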
Example 2

# Imports used by this example; FileHandler, load_data, torch_utils, fitter,
# contextualized_fitter, multimodal_attention_network and
# MatchInteractionVisual are project-specific modules (paths assumed), and mz
# is assumed to be a MatchZoo-style package providing ElmoPreprocessor.
import datetime
import json
import os
import random
import time

import matchzoo as mz
import numpy as np
import torch

def fit_models(args):
    if not os.path.exists(args.log):
        os.mkdir(args.log)

    curr_date = datetime.datetime.now().timestamp()  # seconds
    # folder to store all output files of a run
    secondary_log_folder = os.path.join(args.log,
                                        "log_results_%s" % (int(curr_date)))
    if not os.path.exists(secondary_log_folder):
        os.mkdir(secondary_log_folder)

    logfolder_result = os.path.join(secondary_log_folder,
                                    "%s_result.txt" % int(curr_date))
    FileHandler.init_log_files(logfolder_result)
    settings = json.dumps(vars(args), sort_keys=True, indent=2)
    FileHandler.myprint("Running script " + str(os.path.realpath(__file__)))
    FileHandler.myprint(settings)
    FileHandler.myprint("Setting seed to " + str(args.seed))

    seed = args.seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = False

    if args.cuda:
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    # invert the saved id mappings: mapped index -> original query/article id
    with open(args.query_mapped) as f:
        index2queries = {v: k for k, v in json.load(f).items()}
    with open(args.article_mapped) as f:
        index2docs = {v: k for k, v in json.load(f).items()}
    root = args.path
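    # the hard re-ranking test sets are only loaded when the data path
    # mentions "reranking"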
    use_reranking = "reranking" in root
    t1 = time.time()

    elmo_queries_path = os.path.join(args.elmo_feats, "queries_feats.pth")
    elmo_docs_path = os.path.join(args.elmo_feats, "articles_feats.pth")
    elmo_loader = load_data.ElmoLoader(elmo_queries_path, elmo_docs_path,
                                       args.fixed_length_left,
                                       args.fixed_length_right)
    load_data_func = elmo_loader.elmo_load_data

    train_pack = load_data_func(root, 'train', prefix=args.dataset)
    valid_pack = load_data_func(root, 'dev', prefix=args.dataset)
    predict_pack = load_data_func(root, 'test', prefix=args.dataset)
    if use_reranking:
        FileHandler.myprint("Using Re-Ranking Dataset..........")
        predict2_hard_pack = load_data_func(root,
                                            'test2_hard',
                                            prefix=args.dataset)

    # query token-length statistics across the three splits
    query_lens = [
        pack.left["text_left"].str.lower().str.split().apply(len)
        for pack in (train_pack, valid_pack, predict_pack)
    ]
    max_query_length = max(s.max() for s in query_lens)
    min_query_length = min(s.min() for s in query_lens)

    # document token-length statistics across the three splits
    doc_lens = [
        pack.right["text_right"].str.lower().str.split().apply(len)
        for pack in (train_pack, valid_pack, predict_pack)
    ]
    max_doc_length = max(s.max() for s in doc_lens)
    min_doc_length = min(s.min() for s in doc_lens)

    FileHandler.myprint("Min query length: " + str(min_query_length) +
                        ", min doc length: " + str(min_doc_length))
    FileHandler.myprint("Max query length: " + str(max_query_length) +
                        ", max doc length: " + str(max_doc_length))

    if args.use_visual:
        image_loader = load_data.ImagesLoader(
            left_pth_file=args.left_images_features,
            max_num_left_images=args.n_img_in_query,
            right_pth_file=args.right_images_features,
            max_num_right_images=args.n_img_in_doc,
            use_cuda=args.cuda)
        data_packs = [train_pack, valid_pack, predict_pack]
        if use_reranking:
            data_packs.append(predict2_hard_pack)

        image_loader.fit(data_packs)  # memory-intensive (~10 GB of RAM)
        train_pack = image_loader.transform(train_pack)
        valid_pack = image_loader.transform(valid_pack)
        predict_pack = image_loader.transform(predict_pack)
        if use_reranking:
            predict2_hard_pack = image_loader.transform(predict2_hard_pack)

        print(image_loader.left_tensor.size(),
              image_loader.right_tensor.size())

    preprocessor = mz.preprocessors.ElmoPreprocessor(args.fixed_length_left,
                                                     args.fixed_length_right)
    print('parsing data')
    train_processed = preprocessor.fit_transform(
        train_pack)  # This is a DataPack
    valid_processed = preprocessor.transform(valid_pack)
    predict_processed = preprocessor.transform(predict_pack)

    train_interactions = MatchInteractionVisual(train_processed)
    valid_interactions = MatchInteractionVisual(valid_processed)
    test_interactions = MatchInteractionVisual(predict_processed)
    if use_reranking:
        predict2_processed = preprocessor.transform(predict2_hard_pack)
        predict2_interactions = MatchInteractionVisual(predict2_processed)

    FileHandler.myprint('done extracting')
    t2 = time.time()
    FileHandler.myprint('loading data time: %d (seconds)' % (t2 - t1))
    FileHandler.myprint("Building model")

    print("Loading word embeddings......")
    t1_emb = time.time()
    term_index = preprocessor.context['vocab_unit'].state['term_index']
    glove_embedding = mz.datasets.embeddings.load_glove_embedding(
        dimension=args.word_embedding_size, term_index=term_index)

    embedding_matrix = glove_embedding.build_matrix(term_index)
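    # L2-normalise each word vector; this assumes no all-zero rows, which
    # would otherwise divide by zero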
    l2_norm = np.sqrt((embedding_matrix * embedding_matrix).sum(axis=1))
    embedding_matrix = embedding_matrix / l2_norm[:, np.newaxis]
    t2_emb = time.time()
    print("Time to load word embeddings......", (t2_emb - t1_emb))

    match_params = {}
    match_params['embedding'] = embedding_matrix
    match_params["embedding_freeze"] = True  # freezing word embeddings
    match_params["fixed_length_left"] = args.fixed_length_left
    match_params["fixed_length_right"] = args.fixed_length_right
    match_params['dropout'] = 0.1
    match_params['filters'] = args.filters
    match_params["conv_layers"] = args.conv_layers
    match_params["filters_count_pacrr"] = args.filters_count_pacrr
    match_params["n_s"] = args.n_s
    match_params["max_ngram"] = args.max_ngram
    match_params["head_cnn_type"] = args.head_cnn_type
    match_params["use_visual"] = args.use_visual
    match_params[
        "use_average_dcompositional_att"] = args.use_average_dcompositional_att
    match_params["attention_type"] = args.attention_type
    # contextualized part
    match_params["left_elmo_tensor"] = elmo_loader.left_tensor_feats
    match_params["right_elmo_tensor"] = elmo_loader.right_tensor_feats
    match_params["elmo_vec_size"] = 1024

    if args.use_visual:
        match_params["visual_feature_size"] = image_loader.visual_features_size
        image_loader.left_tensor = torch_utils.gpu(image_loader.left_tensor,
                                                   args.cuda)
        image_loader.right_tensor = torch_utils.gpu(image_loader.right_tensor,
                                                    args.cuda)
        match_params["full_left_images_tensor"] = image_loader.left_tensor
        match_params["full_right_images_tensor"] = image_loader.right_tensor

    match_model = multimodal_attention_network.MultiModalAttentionNetwork(
        match_params)
    FileHandler.myprint("Fitting Model")
    if args.use_visual:
        FileHandler.myprint("Using both Textual and Visual features.......")
        fit_model = fitter.VisualFitter(net=match_model,
                                        loss=args.loss_type,
                                        n_iter=args.epochs,
                                        batch_size=args.batch_size,
                                        learning_rate=args.lr,
                                        early_stopping=args.early_stopping,
                                        use_cuda=args.cuda,
                                        num_negative_samples=args.num_neg,
                                        logfolder=secondary_log_folder,
                                        curr_date=curr_date,
                                        use_visual=args.use_visual,
                                        image_loader=image_loader,
                                        index2queries=index2queries,
                                        index2docs=index2docs)
    else:
        FileHandler.myprint("Using Textual content only....")
        fit_model = contextualized_fitter.ContextualizedFitter(
            net=match_model,
            loss=args.loss_type,
            n_iter=args.epochs,
            batch_size=args.batch_size,
            learning_rate=args.lr,
            early_stopping=args.early_stopping,
            use_cuda=args.cuda,
            num_negative_samples=args.num_neg,
            logfolder=secondary_log_folder,
            curr_date=curr_date)

    try:
        fit_model.fit(train_interactions,
                      verbose=True,
                      topN=args.topk,
                      val_interactions=valid_interactions,
                      test_interactions=test_interactions)
        fit_model.load_best_model(valid_interactions,
                                  test_interactions,
                                  topN=args.topk)
        if use_reranking:
            fit_model.load_best_model_test2_test3(predict2_interactions,
                                                  None,
                                                  topN=args.topk)

    except KeyboardInterrupt:
        FileHandler.myprint('Exiting from training early')
    t10 = time.time()
    FileHandler.myprint('Total time:  %d (seconds)' % (t10 - t1))
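
The seeding block at the top of this variant is self-contained; a sketch of the same calls factored into a reusable helper (set_seed is a name introduced here, not part of the project):

import random

import numpy as np
import torch

def set_seed(seed: int, cuda: bool = False) -> None:
    """Make a PyTorch run reproducible, mirroring the block above."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # trade speed for determinism in cuDNN
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = False
    if cuda:
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
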
Example 3

# Imports used by this example; FileHandler, load_data, basic_fitter,
# fcrg_model and MatchInteraction are project-specific modules (paths
# assumed), and mz is assumed to be a MatchZoo-style package providing
# SplitPreprocessor and load_default_embedding.
import datetime
import json
import os
import random
import time

import matchzoo as mz
import numpy as np
import torch

def fit_models(args):
    if not os.path.exists(args.log):
        os.mkdir(args.log)

    curr_date = datetime.datetime.now().timestamp()  # seconds
    # folder to store all output files of a run
    secondary_log_folder = os.path.join(args.log,
                                        "log_results_%s" % (int(curr_date)))
    if not os.path.exists(secondary_log_folder):
        os.mkdir(secondary_log_folder)

    logfolder_result = os.path.join(secondary_log_folder,
                                    "%s_result.txt" % int(curr_date))
    FileHandler.init_log_files(logfolder_result)
    settings = json.dumps(vars(args), sort_keys=True, indent=2)
    FileHandler.myprint("Running script " + str(os.path.realpath(__file__)))
    FileHandler.myprint(settings)
    FileHandler.myprint("Setting seed to " + str(args.seed))

    seed = args.seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = False

    if args.cuda:
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    root = args.path
    t1 = time.time()

    train_pack = load_data.load_data2(root, 'train', prefix=args.dataset)
    valid_pack = load_data.load_data2(root, 'dev', prefix=args.dataset)
    predict_pack = load_data.load_data2(root, 'test', prefix=args.dataset)

    # query token-length statistics across the three splits
    query_lens = [
        pack.left["text_left"].str.lower().str.split().apply(len)
        for pack in (train_pack, valid_pack, predict_pack)
    ]
    max_query_length = max(s.max() for s in query_lens)
    min_query_length = min(s.min() for s in query_lens)

    # document token-length statistics across the three splits
    doc_lens = [
        pack.right["text_right"].str.lower().str.split().apply(len)
        for pack in (train_pack, valid_pack, predict_pack)
    ]
    max_doc_length = max(s.max() for s in doc_lens)
    min_doc_length = min(s.min() for s in doc_lens)

    FileHandler.myprint("Min query length: " + str(min_query_length) +
                        ", min doc length: " + str(min_doc_length))
    FileHandler.myprint("Max query length: " + str(max_query_length) +
                        ", max doc length: " + str(max_doc_length))

    preprocessor = mz.preprocessors.SplitPreprocessor(args.fixed_length_left,
                                                      args.fixed_length_right,
                                                      vocab_file=os.path.join(
                                                          args.path,
                                                          "vocab.json"))
    print('parsing data')
    train_processed = preprocessor.fit_transform(
        train_pack)  # This is a DataPack
    valid_processed = preprocessor.transform(valid_pack)
    predict_processed = preprocessor.transform(predict_pack)

    train_interactions = MatchInteraction(train_processed)
    valid_interactions = MatchInteraction(valid_processed)
    test_interactions = MatchInteraction(predict_processed)

    FileHandler.myprint('done extracting')
    t2 = time.time()
    FileHandler.myprint('loading data time: %d (seconds)' % (t2 - t1))
    FileHandler.myprint("Building model")

    print("Loading word embeddings......")
    t1_emb = time.time()
    term_index = preprocessor.context['vocab_unit'].state['term_index']
    default_embeddings = mz.datasets.embeddings.load_default_embedding(
        dimension=args.word_embedding_size, term_index=term_index)
    embedding_matrix = default_embeddings.build_matrix(
        term_index, initializer=lambda: np.random.normal(0, 1))
    t2_emb = time.time()
    print("Time to load word embeddings......", (t2_emb - t1_emb))

    params = dict()
    params['embedding'] = embedding_matrix
    params["embedding_freeze"] = False  # trainable word embeddings
    params["fixed_length_left"] = args.fixed_length_left
    params["fixed_length_right"] = args.fixed_length_right
    params["embedding_output_dim"] = args.word_embedding_size
    params["embedding_dropout"] = args.embedding_dropout
    params["attention_type"] = args.attention_type
    params["hidden_size"] = args.hidden_size
    params["output_target_size"] = args.output_target_size
    params["bidirectional"] = False
    params["use_label"] = False
    params["use_input_feeding"] = args.use_input_feeding
    params["nlayers"] = 1

    generative_model = fcrg_model.FCRGModel(params)
    FileHandler.myprint("Fitting Model")

    fit_model = basic_fitter.BasicFitter(
        net=generative_model,
        loss=args.loss_type,
        n_iter=args.epochs,
        batch_size=args.batch_size,
        learning_rate=args.lr,
        early_stopping=args.early_stopping,
        use_cuda=args.cuda,
        clip=args.clip,
        logfolder=secondary_log_folder,
        curr_date=curr_date,
        vocab=preprocessor.context['vocab_unit'])

    try:
        fit_model.fit(train_interactions,
                      verbose=True,
                      val_interactions=valid_interactions,
                      test_interactions=test_interactions)
        fit_model.load_best_model(valid_interactions, test_interactions)

    except KeyboardInterrupt:
        FileHandler.myprint('Exiting from training early')
    t10 = time.time()
    FileHandler.myprint('Total time:  %d (seconds)' % (t10 - t1))
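
All three variants create the timestamped log folder the same way; a compact sketch of that setup, assuming os.makedirs is acceptable in place of the exists-then-mkdir pattern (make_run_folder is a hypothetical helper name):

import datetime
import os

def make_run_folder(log_root: str) -> str:
    """Create log_root/log_results_<unix-ts>/ and return its path."""
    ts = int(datetime.datetime.now().timestamp())  # seconds since epoch
    run_folder = os.path.join(log_root, "log_results_%s" % ts)
    os.makedirs(run_folder, exist_ok=True)  # creates parents, idempotent
    return run_folder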