Ejemplo n.º 1
0
def summarize_score(info: Dict, prediction_file_path: str,
                    f_handler: FormatHandler, combine_score: Callable,
                    score_type) -> Dict[Tuple[str, str], float]:
    """Reduce segment-level predictions to a single score per pair id.

    Predictions from ``prediction_file_path`` are joined with ``info``,
    grouped with ``f_handler.get_pair_id``, converted to one score per entry
    according to ``score_type`` and merged per group by ``combine_score``.

    Supported ``score_type`` values:
        "softmax": element 1 of the softmax over the entry's logits
        "raw":     element 0 of the logits
        "scalar":  the logits value itself
        "tuple":   element 1 of the logits
    Any other value fails an assertion when the first entry is scored.
    """
    key_logit = "logits"
    joined: List[Dict] = join_prediction_with_info(
        prediction_file_path, info, ["data_id", key_logit])

    def get_score(entry):
        # Guard-clause form: each supported score type returns immediately.
        if score_type == "softmax":
            return scipy.special.softmax(entry[key_logit])[1]
        if score_type == "raw":
            return entry[key_logit][0]
        if score_type == "scalar":
            return entry[key_logit]
        if score_type == "tuple":
            return entry[key_logit][1]
        assert False

    groups: Dict[Tuple[str, str], List[Dict]] = group_by(
        joined, f_handler.get_pair_id)
    tprint("Group size:", len(groups))
    out_d = {
        pair_id: combine_score(lmap(get_score, members))
        for pair_id, members in groups.items()
    }

    mean_group_size = average(lmap(len, groups.values()))
    tprint("Num items per group : ", mean_group_size)
    return out_d
Ejemplo n.º 2
0
def main():
    """Build a language model per query from its positively judged documents.

    Each document's term counts are normalised to relative frequencies, and
    the models of a query's positive documents are merged with
    ``average_counters``.
    """
    tprint("loading counter dict")
    counter_dict: Dict[str, Counter] = load_counter_dict()

    def get_doc_lm(doc_id) -> Counter:
        # Normalise raw term counts into relative frequencies.
        term_counts = counter_dict[doc_id]
        total = sum(term_counts.values())
        return Counter({term: n / total for term, n in term_counts.items()})

    qrel = load_robust_qrel()

    def get_pos_docs(query_id):
        # Lazily yield the ids of documents whose judgement is truthy.
        if query_id in qrel:
            for doc_id, score in qrel[query_id].items():
                if score:
                    yield doc_id

    tprint("build query lm dict")
    query_lm_dict = {}
    for query_id in list(qrel.keys()):
        positive_ids: Iterable[str] = get_pos_docs(query_id)
        positive_lms: List[Counter] = lmap(get_doc_lm, positive_ids)
        query_lm_dict[query_id] = average_counters(positive_lms)
Ejemplo n.º 3
0
    def generate_selected_training_data_ablation_only_pos(info, key, max_seq_length, save_dir, score_dir):
        """Write the best segments for ``key`` as padded training records.

        Segments come from ``enum_best_segments`` over the prediction file
        ``score_dir/key``. Each one is decompressed, zero-padded to
        ``max_seq_length`` and written to ``save_dir/key``. Every instance
        gets ``data_id`` 0; the id manager is handed to ``save_info`` without
        any ids being assigned in this function.
        """
        data_id_manager = DataIDManager(0, 1000000)
        key_name = str(key)
        out_path = os.path.join(save_dir, key_name)
        pred_path = os.path.join(score_dir, key_name)
        tprint("data gen")

        def to_inst(compressed_entry) -> InstAsInputIds:
            # Decompress one entry and pad both id sequences in place.
            entry = decompress_seg_ids_entry(compressed_entry)
            assert len(entry['input_ids']) == len(entry['seg_ids'])
            entry['input_ids'] = pad0(entry['input_ids'], max_seq_length)
            entry['seg_ids'] = pad0(entry['seg_ids'], max_seq_length)
            return InstAsInputIds(entry['input_ids'], entry['seg_ids'],
                                  entry['label'], 0)

        insts = [to_inst(e) for e in enum_best_segments(pred_path, info)]

        def encode_fn(inst: InstAsInputIds) -> collections.OrderedDict:
            return encode_inst_as_input_ids(max_seq_length, inst)

        tprint("writing")
        write_records_w_encode_fn(out_path, encode_fn, insts, len(insts))
        save_info(save_dir, data_id_manager, key_name + ".info")
Ejemplo n.º 4
0
def do_predict(
    bert_hp,
    train_config,
    data,
    lms_config,
    modeling_option,
    init_fn,
):
    """Run the LMS model over the training batches in 100 chunks of 100.

    ``data`` is unpacked as ``(train_batches, dev_batches)``; only the
    training batches are used. The output of ``predict_fn`` for chunk ``i``
    is pickled to ``at_output_dir("lms_scores", str(i))``.
    """
    train_batches, _ = data

    lms_model = LMSModel(modeling_option, bert_hp, lms_config,
                         train_config.num_gpu)
    sess = init_session()
    sess.run(tf.global_variables_initializer())
    init_fn(sess)

    step_size = 100
    num_chunks = 100
    for chunk_idx in range(num_chunks):
        st = chunk_idx * step_size
        ed = st + step_size
        tprint(st, ed)
        chunk = train_batches[st:ed]
        output_d = predict_fn(sess, chunk, lms_model.logits,
                              lms_model.loss_tensor, lms_model.ex_score_tensor,
                              lms_model.per_layer_logit_tensor,
                              lms_model.batch2feed_dict)
        save_to_pickle(output_d, at_output_dir("lms_scores", str(chunk_idx)))
Ejemplo n.º 5
0
def summarize_score_wo_merge(info: Dict, prediction_file_path: str,
                             f_handler: FormatHandler,
                             score_type) -> Dict[Tuple[str, str], float]:
    """Score every segment separately, without merging per pair.

    Predictions are joined with ``info`` and grouped by
    ``f_handler.get_pair_id``. The i-th entry of group ``(query_id, doc_id)``
    is stored under ``(query_id, "<doc_id>_<i>")``.

    ``score_type`` is one of "softmax", "raw", "scalar" or "tuple"; any
    other value fails an assertion when the first entry is scored.
    """
    key_logit = "logits"
    joined: List[Dict] = join_prediction_with_info(
        prediction_file_path, info, ["data_id", key_logit])

    def get_score(entry):
        # Guard-clause form: each supported score type returns immediately.
        if score_type == "softmax":
            return scipy.special.softmax(entry[key_logit])[1]
        if score_type == "raw":
            return entry[key_logit][0]
        if score_type == "scalar":
            return entry[key_logit]
        if score_type == "tuple":
            return entry[key_logit][1]
        assert False

    groups: Dict[Tuple[str, str], List[Dict]] = group_by(
        joined, f_handler.get_pair_id)
    tprint("Group size:", len(groups))
    out_d = {}
    for (query_id, doc_id), members in groups.items():
        for idx, score in enumerate(lmap(get_score, members)):
            out_d[(query_id, "{}_{}".format(doc_id, idx))] = score

    return out_d
Ejemplo n.º 6
0
def qk_candidate_gen(q_res_path: str, doc_score_path, split,
                     config) -> List[Tuple[QCKQuery, List[KDP]]]:
    """Pair each query of ``split`` with parts of its top-scored documents.

    Job ids are the positions of query ids in the sorted ranked-list keys.
    For each query, the scored documents of its job are sorted by score
    (descending, in place) and the first ``config['top_n']`` are preloaded
    and split into parts using ``config['window_size']`` and
    ``config['step_size']``.
    """
    queries: List[QCKQuery] = get_qck_queries(split)
    score_d = load_doc_scores(doc_score_path, d_n_claims_per_split2[split])

    tprint("loading ranked list")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    query_ids = sorted(ranked_list.keys())
    print("num queries", len(query_ids))
    q_id_to_job_id = {q_id: idx for idx, q_id in enumerate(query_ids)}
    print("Pre loading docs")
    top_n = config['top_n']
    out_qk: List[Tuple[QCKQuery, List[KnowledgeDocumentPart]]] = []

    all_doc_parts = 0
    ticker = TimeEstimator(len(queries))
    for query in queries:
        scored_docs: List = score_d[q_id_to_job_id[query.query_id]]
        scored_docs.sort(key=get_second, reverse=True)
        doc_ids = left(scored_docs)[:top_n]
        preload_man.preload(TokenizedCluewebDoc, doc_ids)
        doc_part_list: List[KDP] = iterate_document_parts(
            iterate_docs(doc_ids), config['window_size'],
            config['step_size'], 20)

        all_doc_parts += len(doc_part_list)
        out_qk.append((query, doc_part_list))
        ticker.tick()
    return out_qk
Ejemplo n.º 7
0
    def generate_instances(self, job_id, data_id_man):
        """Yield query/passage instances for the top-ranked docs of a job.

        The query of ``job_id`` is tokenized and paired with at most ten
        passages from each of its first 1000 ranked documents. A ``KeyError``
        raised while handling a document counts that document as not found;
        the total is printed once the generator is exhausted.
        """
        q_id = self.job_id_to_q_id[job_id]
        query_tokens = self.tokenizer.tokenize(self.query_d[int(q_id)])
        top_entries = self.ranked_list[q_id][:1000]
        doc_ids = [entry.doc_id for entry in top_entries]
        tprint("Loading documents start")
        docs_d: Dict[str, List[List[str]]] = load_multiple(BertTokenizedCluewebDoc, doc_ids, True)
        tprint("Loading documents done")
        # Three positions are held back from the passage budget
        # (presumably for special tokens - confirm against the encoder).
        avail_seq_length = self.max_seq_length - len(query_tokens) - 3

        label_dummy = 0
        max_passages_per_doc = 10
        not_found = 0
        for doc_id in doc_ids:
            try:
                passages: Iterable[List[str]] = enum_passages(docs_d[doc_id], avail_seq_length)
                for passage_idx, passage in enumerate(passages):
                    if passage_idx >= max_passages_per_doc:
                        break
                    passage_info = {
                        'query_id': q_id,
                        'doc_id': doc_id,
                        'passage_idx': passage_idx
                    }
                    data_id = data_id_man.assign(passage_info)
                    yield Instance(query_tokens, passage, label_dummy, data_id)
            except KeyError:
                not_found += 1
        print("{} of {} docs not found".format(not_found, len(doc_ids)))
Ejemplo n.º 8
0
def main():
    """Train the model described by the JSON config at ``args.config_path``.

    Reads the hyper-parameters, builds the model and data, then fits for up
    to 100 epochs, evaluating on the validation set after each epoch. Early
    stopping on the training loss is added when ``param['early_stop']`` is
    truthy.
    """
    args = parse_arg()
    # Context manager so the config file handle is closed right after parsing.
    with open(args.config_path, "r") as config_f:
        param = json.load(config_f)
    model, train_processed, valid_processed = prepare_model_and_data(param)

    # NOTE(review): `embedding_matrix` is not defined in this function, so it
    # must exist at module level or this line raises NameError - confirm.
    hist_callback = mz.data_generator.callbacks.Histogram(embedding_matrix,
                                                          bin_size=30,
                                                          hist_mode='LCH')
    tprint("defining generator")
    train_generator = mz.DataGenerator(train_processed,
                                       batch_size=param['batch_size'],
                                       shuffle=True,
                                       callbacks=[hist_callback])

    valid_x, valid_y = valid_processed.unpack()
    # The whole validation set is evaluated as one batch.
    evaluate = mz.callbacks.EvaluateAllMetrics(model,
                                               x=valid_x,
                                               y=valid_y,
                                               batch_size=len(valid_x))
    early_stop = tf.keras.callbacks.EarlyStopping(monitor='loss',
                                                  patience=1,
                                                  verbose=1,
                                                  mode='min')
    tprint("fitting")
    callbacks = [evaluate]

    if param['early_stop']:
        callbacks.append(early_stop)
    model.fit_generator(train_generator,
                        epochs=100,
                        callbacks=callbacks,
                        workers=5,
                        use_multiprocessing=False)
Ejemplo n.º 9
0
def main():
    """Wait for prediction jobs and merge their outputs into a ranked list.

    ``sys.argv[1]`` is the path of a JSON run-info file providing
    ``job_info_list``, ``job_group_name``, ``save_dir``, ``data_st``,
    ``data_ed``, ``sh_format_path`` and ``model_sub_path``. When the optional
    ``rerun_jobs`` entry is truthy, the jobs are resubmitted first and the
    job list returned by ``rerun_jobs`` is waited on instead.
    """
    info_path = sys.argv[1]

    # Context manager so the run-info file handle is closed after parsing.
    with open(info_path, "r") as info_f:
        run_info = json.load(info_f)
    job_info_list = run_info['job_info_list']
    job_group_name = run_info['job_group_name']
    save_dir = run_info['save_dir']
    data_st = run_info['data_st']
    data_ed = run_info['data_ed']
    sh_format_path = run_info['sh_format_path']
    model_sub_path = run_info['model_sub_path']
    if run_info.get('rerun_jobs'):
        job_info_list = rerun_jobs(sh_format_path, model_sub_path,
                                   save_dir, job_group_name, job_info_list)

    print("len(job_info_list)", len(job_info_list))
    tprint("Waiting files")
    wait_files(job_info_list)
    tprint("Make ranked list")
    make_ranked_list_from_multiple_files(job_group_name, save_dir, data_st,
                                         data_ed)
Ejemplo n.º 10
0
def label_predict(hparam, data, model_path) -> List[np.array]:
    """Return softmax outputs of the restored model over ``data``.

    Builds a ``transformer_logit`` graph, loads weights from ``model_path``
    and feeds ``data`` in batches of ``hparam.batch_size``. Each batch must
    unpack into exactly three input arrays. The per-batch softmax outputs
    are concatenated.
    """
    tprint("building model")
    voca_size = 30522
    task = transformer_logit(hparam, 2, voca_size, False)
    enc_payload: List[Tuple[np.array, np.array, np.array]] = data

    sout = tf.nn.softmax(task.logits, axis=-1)
    sess = init_session()
    sess.run(tf.global_variables_initializer())

    tprint("loading model")
    load_model(sess, model_path)

    def forward_run(inputs):
        batches = get_batches_ex(inputs, hparam.batch_size, 3)
        outputs = []
        ticker = TimeEstimator(len(batches))
        for x0, x1, x2 in batches:
            feed = {
                task.x_list[0]: x0,
                task.x_list[1]: x1,
                task.x_list[2]: x2,
            }
            (soft_out,) = sess.run([sout], feed_dict=feed)
            outputs.append(soft_out)
            ticker.tick()
        return np.concatenate(outputs)

    return forward_run(enc_payload)
Ejemplo n.º 11
0
    def summarize(self):
        """Print the ten highest-scoring training sentences of every topic.

        For each UKP topic, the training sentences are tokenized, TextRank is
        run over the tokens, and sentences are ranked by a position-discounted
        sum of their tokens' TextRank scores. Topics with fewer than ten
        training sentences print all of them.
        """
        # The loader is built from the first topic but indexed by every topic.
        first_topic = data_generator.argmining.ukp_header.all_topics[0]
        data_loader = ukp.DataLoader(first_topic)
        stopwords = load_stopwords()

        def tokenize(x):
            return tokenizer.tokenize(x, stopwords)

        def sent_score(token_sent, bow_score):
            # Each successive token contributes half the weight of the
            # previous one, so leading tokens dominate the score.
            score = 0
            factor = 1
            for t in token_sent:
                score += bow_score[t] * factor
                factor *= 0.5
            return score

        def is_argument(entry):
            return entry['annotation'] == "Argument_for" or entry[
                'annotation'] == "Argument_against"

        for topic in data_generator.argmining.ukp_header.all_topics:
            entries = data_loader.all_data[topic]
            raw_sents = [e['sentence'] for e in entries if e['set'] == 'train']
            token_sents = list(map(tokenize, raw_sents))
            tprint("Runing TextRank")
            text_rank = TextRank(token_sents)
            tr_score = Counter(text_rank.run(flatten(token_sents)))
            tprint("claim_gen.generate")

            raw_sents.sort(key=lambda x: sent_score(tokenize(x), tr_score),
                           reverse=True)
            # Slicing avoids IndexError when there are fewer than 10 sentences.
            for sent in raw_sents[:10]:
                print(sent)
Ejemplo n.º 12
0
def check_wait_tasks(active_proc_list):
    """Block until the number of alive tasks is no longer above ``max_task``.

    The alive count is refreshed through ``active_proc_list.update_alive()``
    once up front and then again after every 60-second sleep.
    """
    n_alive = active_proc_list.update_alive()
    tprint("Number of active task : ", n_alive)

    while True:
        if not n_alive > max_task:
            return
        tprint("Waiting for tasks to be done")
        time.sleep(60)
        n_alive = active_proc_list.update_alive()
Ejemplo n.º 13
0
def execute(job_id):
    """Launch the shell script of ``job_id`` and return its process handle.

    The child's stdout and stderr are both redirected to the job's log file.
    """
    # The child receives its own copies of the descriptor when it is spawned,
    # so the parent's handle can be closed straight away rather than leaking
    # one open file per launched job.
    with open(get_log_path(job_id), "w") as log_f:
        p = psutil.Popen(["/bin/bash", get_sh_path_for_job_id(job_id)],
                         stdout=log_f,
                         stderr=log_f,
                         preexec_fn=preexec_function)
    tprint("Executed job {} .  pid={}".format(job_id, p.pid))
    return p
Ejemplo n.º 14
0
def main():
    """Compute IDF keyword scores for a dataset and save them to a file.

    The dataset name is read from ``sys.argv[1]``; the scores are written to
    ``<data_name>_idf.txt`` under the genex run save directory.
    """
    data_name = sys.argv[1]
    tprint("Loading idf scores")
    get_idf = load_idf_fn_for(data_name)
    problems: List[QueryDoc] = load_as_tokens(data_name)
    save_path = os.path.join(get_genex_run_save_dir(),
                             "{}_idf.txt".format(data_name))
    scores_list: Iterable[Counter] = get_idf_keyword_score(problems, get_idf)
    save_score_to_file(scores_list, save_path)
Ejemplo n.º 15
0
def load_info_from_compressed(pickle_path):
    """Load a pickled info dict and decompress each of its entries.

    Returns a new dict mapping every data id to the result of
    ``decompress_seg_ids_entry`` on its stored value.
    """
    tprint("loading info pickle")
    compressed = load_pickle_from(pickle_path)
    tprint("decompressing...")
    return {
        data_id: decompress_seg_ids_entry(value_d)
        for data_id, value_d in compressed.items()
    }
Ejemplo n.º 16
0
def get_score_d(pred_file_path: str, info: Dict, f_handler: FormatHandler,
                combine_strategy: str, score_type: str):
    """Load predictions and combine segment scores per ``combine_strategy``.

    Strategies fall into three families:
      * segment scores reduced per pair id ("top_k", "avg", "max",
        "non_tail_max", "first4_max", "first");
      * the same reduction followed by ``get_max_score_from_doc_parts``
        ("avg_then_doc_max", "max_then_doc_max");
      * no reduction, one score per segment ("no_merge").

    Raises:
        AssertionError: if ``combine_strategy`` is not one of the above.
    """
    tprint("Reading from :", pred_file_path)
    DOC_SEG_COMBINE = 0
    DOC_PART_SEG_COMBINE = 1
    NO_COMBINE = 2

    # strategy name -> (combine family, segment reducer, optional log line)
    strategies = {
        "top_k": (DOC_SEG_COMBINE, top_k_average, "using top k"),
        "avg": (DOC_SEG_COMBINE, average, "using avg"),
        "max": (DOC_SEG_COMBINE, max, "using max"),
        "non_tail_max": (DOC_SEG_COMBINE, non_tail_max, None),
        "first4_max": (DOC_SEG_COMBINE, first4_max, None),
        "first": (DOC_SEG_COMBINE, select_first, None),
        "avg_then_doc_max": (DOC_PART_SEG_COMBINE, average,
                             "using avg then max"),
        # This strategy used to log "using avg then max" (copy-paste slip).
        "max_then_doc_max": (DOC_PART_SEG_COMBINE, max,
                             "using max then max"),
        "no_merge": (NO_COMBINE, None, None),
    }
    if combine_strategy not in strategies:
        # Raised explicitly (not via `assert`) so the check survives `-O`.
        raise AssertionError(
            "Unknown combine_strategy: {}".format(combine_strategy))

    combine_type, combine_score, message = strategies[combine_strategy]
    if message is not None:
        print(message)

    if combine_type == NO_COMBINE:
        return summarize_score_wo_merge(info, pred_file_path, f_handler,
                                        score_type)

    score_d = summarize_score(info, pred_file_path, f_handler,
                              combine_score, score_type)
    if combine_type == DOC_PART_SEG_COMBINE:
        score_d = get_max_score_from_doc_parts(score_d)
    return score_d
Ejemplo n.º 17
0
def main():
    """Start one QCK data-generation job per split in ``splits``.

    For each split the QK candidates and the pickled base resource
    (candidate dict and correctness dict) are loaded and passed to
    ``start_job``.
    """
    print("Process started")
    for split in splits:
        tprint("Loading pickles")
        job_name = "argu_debate_qck_datagen_{}".format(split)
        qk_candidate: List[QKUnit] = load_qk(split)
        resource_name = job_name + "_base_resource"
        candidate_dict, correct_d = load_from_pickle(resource_name)
        tprint("Starting job")
        start_job(job_name, split, candidate_dict, correct_d, qk_candidate)
Ejemplo n.º 18
0
def get_lm_for_claim(all_ranked_list, cid, unigrams):
    """Build a language-model classifier from a claim's ranked documents.

    The ranked list is looked up under ``str(cid)``; the first element of
    each ranked entry is taken as the document id. Documents and their term
    frequencies are preloaded before the documents are formatted and passed
    to ``build_lm`` together with ``unigrams``.
    """
    entries = all_ranked_list.get(str(cid))
    doc_ids = [entry[0] for entry in entries]
    tprint("Loading document")
    preload_docs(doc_ids)
    preload_tf(doc_ids)
    formatted_docs = lmap(load_and_format_doc, doc_ids)
    tprint("building clm document")
    return build_lm(formatted_docs, unigrams)
Ejemplo n.º 19
0
def baseline_predict(hparam, nli_setting, data, method_name,
                     model_path) -> List[np.array]:
    """Explain ``data`` with the baseline method named ``method_name``.

    A ``transformer_logit`` model is restored from ``model_path`` and its
    softmax forward pass is handed to the selected explanation method,
    always with the explain tag "mismatch". Raises ``KeyError`` for an
    unknown ``method_name``.
    """
    tprint("building model")
    voca_size = 30522
    task = transformer_logit(hparam, 2, voca_size, False)
    enc_payload: List[Tuple[np.array, np.array, np.array]] = data

    sout = tf.nn.softmax(task.logits, axis=-1)
    sess = init_session()
    sess.run(tf.global_variables_initializer())

    tprint("loading model")
    load_model(sess, model_path)

    def forward_run(inputs):
        batches = get_batches_ex(inputs, hparam.batch_size, 3)
        outputs = []
        ticker = TimeEstimator(len(batches))
        for x0, x1, x2 in batches:
            feed = {
                task.x_list[0]: x0,
                task.x_list[1]: x1,
                task.x_list[2]: x2,
            }
            (soft_out,) = sess.run([sout], feed_dict=feed)
            outputs.append(soft_out)
            ticker.tick()
        return np.concatenate(outputs)

    def idf_explain(payload, tag, run_fn):
        # The IDF scorer is built from the NLI training batches.
        train_batches, dev_batches = get_nli_data(hparam, nli_setting)
        return IdfScorer(train_batches).explain(payload, tag, run_fn)

    method_dict = {
        'deletion_seq': explain_by_seq_deletion,
        'replace_token': explain_by_replace,
        'term_deletion': explain_by_term_deletion,
        'term_replace': explain_by_term_replace,
        'random': explain_by_random,
        'idf': idf_explain,
        'deletion': explain_by_deletion,
        'LIME': explain_by_lime,
    }
    method = method_dict[method_name]
    explain_tag = "mismatch"
    explains: List[np.array] = method(enc_payload, explain_tag, forward_run)
    return explains
Ejemplo n.º 20
0
def run_and_save():
    """Train an SVM on the AAWD training split and score the F5 texts.

    The (text, score) pairs are pickled under "f5_svm_aawd_prediction".
    The dev portion of the AAWD data is loaded but not used.
    """
    texts: List[str] = list(enum_f5_data())
    train_x, train_y, _, _ = get_aawd_binary_train_dev()
    tprint("training...")
    classifier = SVMWrap(train_x, train_y)

    tprint("predicting...")
    predictions = classifier.predict(texts)

    output: List[Tuple[str, float]] = list(zip(texts, predictions))
    save_to_pickle(output, "f5_svm_aawd_prediction")
Ejemplo n.º 21
0
def run(args):
    """Run prediction workers over the 696 "pc_tfrecord_ex" jobs.

    Each worker reads from ``args.input_dir``, loads the model at
    ``args.model_path`` and writes to the output directory supplied by the
    job runner.
    """
    tprint("msmarco run")
    hp = Hyperparam()
    nli_setting = ExTrainConfig()

    def worker_factory(out_dir):
        # Build a worker with the model already loaded.
        predict_worker = PredictWorker(args.input_dir, out_dir)
        predict_worker.load_model(hp, nli_setting, args.model_path, "co")
        return predict_worker

    JobRunner(args.save_dir, 696, "pc_tfrecord_ex",
              worker_factory).auto_runner()
Ejemplo n.º 22
0
def main():
    """Train an n-gram SVM on the pointwise argument data and report metrics.

    Prints accuracy and average precision of the dev-set predictions.
    """
    train_x, train_y, dev_x, dev_y = get_argu_pointwise_data()
    tprint("training and testing")
    use_char_ngram = False
    print("Use char ngram", use_char_ngram)
    feature = svm.NGramFeature(use_char_ngram, 4)
    predictions = svm.train_svm_and_test(feature, train_x, train_y, dev_x)
    acc = accuracy(predictions, dev_y)
    ap = get_ap(dev_y, predictions)
    print("acc:", acc)
    print("ap:", ap)
Ejemplo n.º 23
0
def load_robust_meta(docs_dir, only_one_seg=False):
    """Merge the TREC metadata of the files found under ``docs_dir``.

    Walks ``docs_dir`` recursively and updates one dict with the result of
    ``load_trec_meta`` for each file. With ``only_one_seg`` set, only the
    first file of each visited directory is loaded: the ``break`` leaves the
    per-directory file loop, not the directory walk.
    """
    merged_meta = {}
    for dirpath, _dirnames, filenames in os.walk(docs_dir):
        for name in filenames:
            filepath = os.path.join(dirpath, name)
            tprint(filepath)
            file_meta = load_trec_meta(filepath)
            print(len(file_meta))
            merged_meta.update(file_meta)
            if only_one_seg:
                break
    return merged_meta
Ejemplo n.º 24
0
def generate_selected_training_data_for_many_runs(
        target_data_idx, info_dir, max_seq_length, score_and_save_dir: List,
        generate_selected_training_data_fn):
    """Generate training data for one query interval across several runs.

    The interval key is the ``target_data_idx``-th start value of
    ``robust_query_intervals``. Its info pickle is loaded once and reused for
    every ``(score_dir, save_dir)`` pair in ``score_and_save_dir``; each
    ``save_dir`` is created if missing.
    """
    key = left(robust_query_intervals)[target_data_idx]
    info_path = os.path.join(info_dir, str(key))
    tprint("loading info: " + info_path)
    info = load_pickle_from(info_path)
    for score_dir, save_dir in score_and_save_dir:
        exist_or_mkdir(save_dir)
        tprint(save_dir)
        generate_selected_training_data_fn(
            info, key, max_seq_length, save_dir, score_dir)
Ejemplo n.º 25
0
def demo_score(info, max_seq_length):
    """Print a BM25 score for every query/document segment in ``info``.

    Each entry's ``input_ids`` is split at its first two ``[SEP]`` tokens
    into a query (skipping the leading token) and a document segment. Both
    are regrouped into words (tuples of subword ids) and scored with
    ``BM25_2`` using the pickled subword document frequencies.

    ``max_seq_length`` is kept for interface compatibility and is not used.
    """
    tprint("data gen")
    df_d = collections.Counter(load_from_pickle("subword_df_robust_train"))
    tokenizer = get_tokenizer()
    sep_id = tokenizer.convert_tokens_to_ids(["[SEP]"])[0]
    sbc = SubwordConvertor()
    collection_size = 1139368311 + 10
    avdl = 446

    def to_str(input_ids):
        return " ".join(tokenizer.convert_ids_to_tokens(input_ids))

    def get_score(input_ids):
        sep_idx1 = input_ids.index(sep_id)
        sep_idx2 = input_ids.index(sep_id, sep_idx1 + 1)

        query = input_ids[1:sep_idx1]
        doc_content = input_ids[sep_idx1 + 1:sep_idx2]

        q_terms: List[Tuple[int]] = list(sbc.get_word_as_subtoken_tuple(query))
        d_terms: List[Tuple[int]] = list(
            sbc.get_word_as_subtoken_tuple(doc_content))

        # Set lookup replaces a list scan per document term.
        q_term_set = set(q_terms)
        term_freq = collections.Counter(
            d_term for d_term in d_terms if d_term in q_term_set)

        doc_len = len(d_terms)
        # Repeated query terms contribute once per occurrence.
        score = sum(
            BM25_2(term_freq[q_term], df_d[q_term], collection_size,
                   doc_len, avdl)
            for q_term in q_terms)

        print("query", to_str(query))
        print("doc", to_str(doc_content))
        print('score:', score)

    grouped = group_by(info.values(), lambda e: (e['query_id'], e['doc_id']))
    for sub_entries in grouped.values():
        for e in sub_entries:
            get_score(e['input_ids'])
Ejemplo n.º 26
0
    def work(self, job_id):
        """Generate and write the instances of one job plus its info file.

        The job is given the data-id range
        ``[job_id * 100000, (job_id + 1) * 100000)``. Instances for the job's
        query ids are written to ``<out_dir>/<job_id>`` and the id-to-info
        mapping is dumped as JSON to ``<info_dir>/<job_id>.info``.
        """
        qids = self.query_group[job_id]
        data_bin = 100000
        data_id_st = job_id * data_bin
        data_id_ed = data_id_st + data_bin
        data_id_manager = DataIDManager(data_id_st, data_id_ed)
        tprint("generating instances")
        insts = self.generator.generate(data_id_manager, qids)
        out_path = os.path.join(self.out_dir, str(job_id))
        self.generator.write(insts, out_path)

        info_path = os.path.join(self.info_dir, "{}.info".format(job_id))
        # The context manager closes and flushes the file; a bare open()
        # passed to json.dump relies on garbage collection to do so.
        with open(info_path, "w") as info_f:
            json.dump(data_id_manager.id_to_info, info_f)
Ejemplo n.º 27
0
def generate_selected_training_data_loop(split_no, score_dir, info_dir,
                                         max_seq_length, save_dir,
                                         generate_selected_training_data_fn):
    """Generate selected training data for each training key of a split.

    The training keys come from ``get_robust_splits(split_no)``; the held-out
    part is ignored. For every key, the info pickle at ``info_dir/<key>`` is
    loaded and passed to ``generate_selected_training_data_fn``.
    """
    train_items, _ = get_robust_splits(split_no)
    print(train_items)
    exist_or_mkdir(save_dir)
    for key in train_items:
        info_path = os.path.join(info_dir, str(key))
        tprint("loading info: " + info_path)
        generate_selected_training_data_fn(
            load_pickle_from(info_path), key, max_seq_length, save_dir,
            score_dir)
Ejemplo n.º 28
0
def prepare_model_and_data(param):
    """Build and compile the DRMM model and return it with its datasets.

    Entries of ``param`` whose keys already exist in the model's params
    override the defaults. The optimizer is Adam with learning rate
    ``param['lr']``, wrapped in an exponential decay schedule when
    ``param['lr_decay']`` is present and truthy.

    Returns:
        ``(model, train_processed, valid_processed)``
    """
    tprint("Loading data")
    preprocessor, train_processed, valid_processed = drmm_processed()
    print(train_processed)

    tprint("Defining task")
    classification_task = mz.tasks.classification.Classification()
    classification_task.metrics = ['accuracy']
    output_dim = 300
    tprint('output_dim : {}'.format(output_dim))

    tprint("building model")
    # The returned embedding is not used below; the call is kept as-is.
    glove_embedding = mz.datasets.embeddings.load_glove_embedding(
        dimension=output_dim)

    model = get_drmm_model(preprocessor, classification_task, output_dim)
    for name, value in param.items():
        if name in model.params:
            model.params[name] = value

    model.guess_and_fill_missing_params(verbose=1)

    step_per_epoch = 423 * 128
    num_max_steps = 100 * step_per_epoch

    use_decay = 'lr_decay' in param and param['lr_decay']
    if not use_decay:
        lr = param['lr']
    else:
        lr = tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=param['lr'],
            decay_steps=num_max_steps / 20,
            decay_rate=0.9)
    model.params['optimizer'] = tf.keras.optimizers.Adam(learning_rate=lr)

    model.build()
    model.compile()

    tprint("processing embedding")
    term_index = preprocessor.context['vocab_unit'].state['term_index']
    model.load_embedding_matrix(prepare_embedding(output_dim, term_index))
    return model, train_processed, valid_processed
Ejemplo n.º 29
0
def collect_doc_per_query(split, target_qid):
    """Write the candidate and judged documents of one query to a file.

    Scans the top-100 file of ``split`` for the first line starting with
    ``target_qid``, collects the document ids of that topic's consecutive
    lines, adds the topic's qrel documents and writes every document's
    content (ordered by document offset) to the per-query path.

    NOTE(review): matching uses ``line.startswith(target_qid)``, so a query
    id that is a prefix of another id can select the wrong topic - confirm
    ids cannot collide this way.
    """
    ms_reader = MSMarcoDataReader(split)

    def pop(query_id, cur_doc_ids: Set):
        # Add the judged documents, then dump everything in offset order.
        cur_doc_ids.update(ms_reader.qrel[query_id])
        todo = [(doc_id, ms_reader.doc_offset[doc_id])
                for doc_id in cur_doc_ids]
        todo.sort(key=get_second)
        print("{} docs".format(len(cur_doc_ids)))

        exist_or_mkdir(per_query_root)
        save_path = get_per_query_doc_path(query_id)
        with open(save_path, "w") as out_f:
            for doc_id, _ in todo:
                content: str = ms_reader.get_content(doc_id)
                out_f.write(content + "\n")

    total_line = 36701116
    skip = True
    with open_top100(split) as top100f:
        last_topic_id = None
        cur_doc_ids = set()
        for line_no, line in enumerate(top100f):
            if skip:
                if not line.startswith(target_qid):
                    continue
                tprint("skip done")
                remain_lines = total_line - line_no
                ticker = TimeEstimator(remain_lines, "reading", 1000)
                skip = False

            # Each line must have exactly six whitespace-separated fields.
            topic_id, _, doc_id, _, _, _ = line.split()
            if last_topic_id is None:
                last_topic_id = topic_id
            elif last_topic_id != topic_id:
                # The next topic starts here; the single pop() after the loop
                # writes the collected topic. A pop() here as well would write
                # the same file twice.
                break

            ticker.tick()
            cur_doc_ids.add(doc_id)
        # NOTE(review): if no line matched, last_topic_id is still None here.
        pop(last_topic_id, cur_doc_ids)
Ejemplo n.º 30
0
def load_scores(info_file_path, prediction_file_path):
    """Load "qc" predictions joined with their info, grouped by pair id.

    The info JSONs are combined using the format handler's mapping and
    ``drop_kdp`` setting, joined with the predictions' "data_id" and
    "logits" fields, and grouped with ``f_handler.get_pair_id``.
    """
    f_handler = get_format_handler("qc")
    tprint("Loading json info")
    info: Dict = load_combine_info_jsons(
        info_file_path, f_handler.get_mapping(), f_handler.drop_kdp())
    tprint("Reading predictions...")
    joined: List[Dict] = join_prediction_with_info(
        prediction_file_path, info, ["data_id", "logits"])
    grouped: Dict[Tuple[str, str], List[Dict]] = group_by(
        joined, f_handler.get_pair_id)
    print("number of groups:", len(grouped))
    return grouped