Example 1
def select_doc_per_query_top50(split):
    ms_reader = MSMarcoDataReader(split)
    save_path = os.path.join(root_dir, "train_docs_top50_{}.tsv".format(split))
    out_f = open(save_path, "w")

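    # Write one row per query: the relevant doc ids from the qrels followed by
    # the non-relevant candidates with rank below 50.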
    def pop(query_id, cur_doc_ids: List[Tuple[str, int]]):
        pos_docs = ms_reader.qrel[query_id]
        neg_docs = []
        for doc_id, rank in cur_doc_ids:
            if doc_id not in pos_docs and rank < 50:
                neg_docs.append(doc_id)
        doc_needed = pos_docs + neg_docs
        row = [query_id] + doc_needed
        out_f.write("\t".join(row) + "\n")

    total_line = 36701116
    ticker = TimeEstimator(total_line, "reading", 1000)
    with open_top100(split) as top100f:
        last_topic_id = None
        cur_doc_ids = []
        for line_no, line in enumerate(top100f):
            [topic_id, _, doc_id, rank, _, _] = line.split()
            if last_topic_id is None:
                last_topic_id = topic_id
            elif last_topic_id != topic_id:
                pop(last_topic_id, cur_doc_ids)
                last_topic_id = topic_id
                cur_doc_ids = []

            ticker.tick()
            cur_doc_ids.append((doc_id, int(rank)))

        # The loop above only pops a query when the next one starts,
        # so flush the final query group here.
        if last_topic_id is not None:
            pop(last_topic_id, cur_doc_ids)
    out_f.close()
Example 2
def select_doc_per_query(split):
    ms_reader = MSMarcoDataReader(split)
    save_path = os.path.join(root_dir,
                             "train_docs_10times_{}.tsv".format(split))
    out_f = open(save_path, "w")

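    # Write one row per query: the relevant doc ids followed by randomly sampled
    # negatives (ten per positive).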
    def pop(query_id, cur_doc_ids: Set):
        pos_docs = ms_reader.qrel[query_id]
        neg_docs = list(
            [doc_id for doc_id in cur_doc_ids if doc_id not in pos_docs])
        if pos_docs:
            # random.sample raises if fewer negatives than requested are available,
            # so cap the sample size at the number of candidates.
            num_neg_docs = min(10 * len(pos_docs), len(neg_docs))
            sel_docs = random.sample(neg_docs, num_neg_docs)
            doc_needed = pos_docs + sel_docs
            row = [query_id] + doc_needed
            out_f.write("\t".join(row) + "\n")

    total_line = 36701116
    ticker = TimeEstimator(total_line, "reading", 1000)
    with open_top100(split) as top100f:
        last_topic_id = None
        cur_doc_ids = set()
        for line_no, line in enumerate(top100f):
            [topic_id, _, doc_id, rank, _, _] = line.split()
            if last_topic_id is None:
                last_topic_id = topic_id
            elif last_topic_id != topic_id:
                pop(last_topic_id, cur_doc_ids)
                last_topic_id = topic_id
                cur_doc_ids = set()

            ticker.tick()
            cur_doc_ids.add(doc_id)

        # Flush the final query group (the loop only pops on a query change).
        if last_topic_id is not None:
            pop(last_topic_id, cur_doc_ids)
    out_f.close()
Example 3
def group_average_per_query(outputs) -> Dict[str, Dict[WordAsID, np.ndarray]]:
    tokenizer = get_tokenizer()

    def collect_by_word_fn(entry: QKTokenLevelOutEntry):
        return collect_by_words(tokenizer, entry)

    print("Grouping entries")
    grouped: Dict[str, List[QKTokenLevelOutEntry]] = group_by(outputs, lambda x: x.query.query_id)

    def average_scores(out_entries: List[QKTokenLevelOutEntry]) -> Dict[WordAsID, np.ndarray]:
        items: List[Iterable[Tuple[WordAsID, TokenScore]]] = lmap(collect_by_word_fn, out_entries)
        d: Dict[WordAsID, List] = defaultdict(list)
        for item in items:
            item: Iterable[Tuple[WordAsID, TokenScore]] = item
            for word, probs in item:
                d[word].append(probs)

        def average_per_dim(probs_list) -> np.ndarray:
            return np.mean(np.array(probs_list), axis=0)

        out_d: Dict[WordAsID, np.ndarray] = dict_value_map(average_per_dim, d)
        return out_d

    print("Collecting token level scores")
    per_query_infos: Dict[str, Dict[WordAsID, np.ndarray]] = {}
    ticker = TimeEstimator(len(grouped))
    for key, value in grouped.items():
        per_query_infos[key] = average_scores(value)
        ticker.tick()

    return per_query_infos
Example 4
def collect_pc_rel_score(prediction_file, info: Dict):
    data = EstimatorPredictionViewer(prediction_file)

    print("Num data ", data.data_len)
    group_by_key = {}
    num_append = 0
    last_claim = None
    ticker = TimeEstimator(data.data_len)
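    # Entries arrive grouped: a claim ('cid') record is followed by its perspective
    # ('pid') records; pair each perspective's logits with the latest claim's logits.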
    for entry in data:
        ticker.tick()
        logits = entry.get_vector("logits")
        data_id = entry.get_vector("data_id")[0]
        try:
            cur_info = info[data_id]
            if 'cid' in cur_info:
                cid = cur_info['cid']
                last_claim = cid, logits
            elif 'pid' in cur_info:
                pid = cur_info['pid']
                cid, c_logits = last_claim
                key = cid, pid
                if key not in group_by_key:
                    group_by_key[key] = []
                group_by_key[key].append((c_logits, logits))
                num_append += 1
            else:
                assert False
        except KeyError as e:
            print(e)
            pass
    print(num_append)
    return group_by_key
Example 5
def work(job_id, pm):
    rng = random.Random(0)
    max_num_tokens = 256
    masked_lm_prob = 0.15
    short_seq_prob = 0.1
    problem_per_job = 100 * 1000

    in_path = in_path_format.format(job_id)
    out_path = os.path.join(working_path, "problems", "{}".format(job_id))
    query_out_path = os.path.join(working_path, "query", "{}".format(job_id))
    in_data = pickle.load(open(in_path, "rb"))
    out_data = []
    queries = []

    ticker = TimeEstimator(len(in_data))

    for idx, inst in enumerate(in_data):
        mask_inst = pm.generate_mask(inst, max_num_tokens, masked_lm_prob,
                                     short_seq_prob, rng)
        query = pm.generate_query(mask_inst)
        qid = job_id * problem_per_job + idx
        queries.append((qid, query))
        out_data.append(mask_inst)
        ticker.tick()

    write_query_json(queries, query_out_path)
    pickle.dump(out_data, open(out_path, 'wb'))
Example 6
def sentence_payload_gen(q_res_path: str, top_n, data_id_man: DataIDManager):
    print("loading ranked list")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    qid_list = list(ranked_list.keys())
    qid_list = qid_list[:10]
    ranked_list = {k: ranked_list[k] for k in qid_list}
    print("Pre loading docs")
    preload_docs(ranked_list, top_n)
    entries: List[Tuple[str, bool, int]] = []

    def enum_sentence(tokens) -> Iterator[str]:
        text = " ".join(tokens)
        sents = sent_tokenize(text)
        yield from sents

    ticker = TimeEstimator(len(ranked_list))
    for qid in ranked_list:
        q_res: List[SimpleRankedListEntry] = ranked_list[qid]
        docs = iterate_docs(q_res, top_n)

        for doc in docs:
            for sent_idx, sent in enumerate(enum_sentence(doc.tokens)):
                info = {
                    'doc_id': doc.doc_id,
                    'sent_idx': sent_idx,
                    'sentence': sent
                }
                data_id = data_id_man.assign(info)
                e = sent, True, data_id
                entries.append(e)

        ticker.tick()
    return entries
Example 7
    def work(self, job_id):
        data_to_save = {}
        group_id, doc_ids = self.todo[job_id]
        cur_targets = set(doc_ids)
        dir_helper = get_sydney_clueweb09_corpus_helper()
        print(group_id, len(cur_targets))
        ticker = TimeEstimator(len(cur_targets))
        group_done = len(cur_targets) == 0
        for file_path in dir_helper.iter_gz_files_for_group(group_id):
            if group_done:
                break
            for doc_id, content in iter_docs(file_path):
                if doc_id in cur_targets:
                    data_to_save[doc_id] = content
                    cur_targets.remove(doc_id)
                    ticker.tick()

                if len(cur_targets) == 0:
                    group_done = True
                    break

        if cur_targets:
            print(len(cur_targets), "not found")

        pickle.dump(data_to_save,
                    open(os.path.join(self.out_dir, str(job_id)), "wb"))
Example 8
def count_n_gram_from_docs(docs, n, config, exclude_fn):
    count = Counter()
    tick = TimeEstimator(len(docs))

    top_k = 10000

    after_pruning = False
    for doc_idx, doc in enumerate(docs):
        if doc_idx % 10000 == 0:
            print(doc_idx)
        tick.tick()
        for segment in doc:
            if MERGE_SUBWORD in config:
                segment = merge_subword(segment)
            assert type(segment) == list
            for ngram_item in ngrams(segment, n):
                if after_pruning and ngram_item in selected_ngram:
                    continue
                elif exclude_fn(ngram_item):
                    pass
                else:
                    count[ngram_item] += 1

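        # Once more than a million distinct n-grams have been counted, remember the
        # current top-k and skip counting those items from then on.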
        if len(count) > 1000 * 1000 and not after_pruning:
            print("Performing pruning")
            tf_cnt = list(count.items())
            tf_cnt.sort(key=lambda x: x[1], reverse=True)
            selected_ngram = set(left(tf_cnt[:top_k]))
            after_pruning = True

    return count
Example 9
def get_idf_keyword_score(problems: List[QueryDoc],
                          get_idf) -> Iterable[Counter]:
    stemmer = CacheStemmer()
    ticker = TimeEstimator(len(problems))
    for p in problems:
        tokens = p.doc
        tf = Counter()
        reverse_map = {}  # Stemmed -> raw
        tokens = [t for t in tokens if t not in [".", ",", "!"]]
        for raw_t in tokens:
            stem_t = stemmer.stem(raw_t)
            reverse_map[stem_t] = raw_t
            tf[stem_t] += 1

        score_d = Counter()
        for term, cnt in tf.items():

            score = math.log(1 + cnt) * get_idf(term)
            assert type(score) == float
            score_d[term] = score

        score_d_surface_form: Counter = Counter(
            dict_key_map(lambda x: reverse_map[x], score_d))
        ticker.tick()
        yield score_d_surface_form
Example 10
def segment_per_doc_index(task_id):
    token_reader = get_token_reader()
    stemmer = CacheStemmer()
    stopword = load_stopwords()

    p = os.path.join(cpath.data_path, "adhoc", "robust_seg_info.pickle")
    seg_info = pickle.load(open(p, "rb"))

    def get_doc_posting_list(doc_id):
        doc_posting = defaultdict(list)
        # Retrieve and stem the document once rather than once per interval.
        tokens = token_reader.retrieve(doc_id)
        st_tokens = list([stemmer.stem(t) for t in tokens])
        for interval in seg_info[doc_id]:
            (loc, loc_ed), (_, _) = interval
            ct = Counter(st_tokens[loc:loc_ed])
            for term, cnt in ct.items():
                if term in stopword:
                    continue
                doc_posting[term].append((loc, cnt))

        return doc_posting

    doc_id_list = get_doc_task(task_id)
    ticker = TimeEstimator(len(doc_id_list))
    doc_posting_d = {}
    for doc_id in doc_id_list:
        doc_posting_d[doc_id] = get_doc_posting_list(doc_id)
        ticker.tick()

    save_path = os.path.join(cpath.data_path, "adhoc",
                             "per_doc_posting_{}.pickle".format(task_id))
    pickle.dump(doc_posting_d, open(save_path, "wb"))
Example 11
    def run_B(self, job_id):
        if self.pr.doc_posting is None:
            self.pr.doc_posting = per_doc_posting_server.load_dict()

        output_A = self.load_output_A(job_id)
        candi_docs = self.load_candidate_docs(job_id)
        feature_str_list = []
        seg_candi_list = []
        ticker = TimeEstimator(self.inst_per_job)
        for i in range(self.inst_per_job):
            problem, qid = output_A[i]
            qid_str = str(qid)
            if qid_str in candi_docs:
                doc_candi = candi_docs[qid_str]
                seg_candi, features = self.process_B(problem, doc_candi)
                fstr = "\n".join([libsvm_str(qid, 0, f) for f in features])
                feature_str_list.append(fstr)
                seg_candi_list.append(seg_candi)
            else:
                feature_str_list.append([])
                seg_candi_list.append([])
            ticker.tick()

            if i % 100 == 3:
                self.code_tick.print()

        self.save("seg_candi_list", job_id, seg_candi_list)
        self.save_ltr(job_id, feature_str_list)
Example 12
def create_training_instances(input_files, tokenizer, max_seq_length, dupe_factor,
                              short_seq_prob, masked_lm_prob,
                              max_predictions_per_seq, rng):
    """Create `TrainingInstance`s from raw text."""
    all_documents = [[]]

    # Input file format:
    # (1) One sentence per line. These should ideally be actual sentences, not
    # entire paragraphs or arbitrary spans of text. (Because we use the
    # sentence boundaries for the "next sentence prediction" task).
    # (2) Blank lines between documents. Document boundaries are needed so
    # that the "next sentence prediction" task doesn't span between documents.
    for input_file in input_files:
        obj = pickle.load(open(input_file, "rb"))
        # obj = List[Document]
        # Document = List[(Tokens, Probs)]
        all_documents.extend(obj)

    # Remove empty documents
    all_documents = [x for x in all_documents if x]
    rng.shuffle(all_documents)

    vocab_words = list(tokenizer.vocab.values())
    instances = []
    ticker = TimeEstimator(dupe_factor * len(all_documents))
    for _ in range(dupe_factor):
        for document_index in range(len(all_documents)):
            instances.extend(
                create_instances_from_document(
                    all_documents, document_index, max_seq_length, short_seq_prob,
                    masked_lm_prob, max_predictions_per_seq, vocab_words, rng))
            ticker.tick()
    rng.shuffle(instances)
    return instances
Example 13
def combine_pc_rel_with_cpid(prediction_file, info: Dict) \
        -> Dict[DataID, Tuple[CPIDPair, Logits, Logits]]:
    data = EstimatorPredictionViewer(prediction_file)
    print("Num data ", data.data_len)
    out_d: Dict[DataID, Tuple[CPIDPair, Logits, Logits]] = {}
    num_append = 0
    last_claim = None
    prev_data_id = None
    ticker = TimeEstimator(data.data_len)
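    # A claim ('cid') record precedes its perspective ('pid') records; the resulting
    # (CPIDPair, claim logits, perspective logits) triple is stored under both the
    # perspective's data_id and the preceding claim's data_id.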
    for entry in data:
        ticker.tick()
        logits = entry.get_vector("logits")
        data_id = entry.get_vector("data_id")[0]
        try:
            cur_info = info[data_id]
            if 'cid' in cur_info:
                cid = cur_info['cid']
                last_claim = cid, logits
                prev_data_id = data_id
            elif 'pid' in cur_info:
                pid = cur_info['pid']
                cid, c_logits = last_claim
                cpid = CPIDPair((cid, pid))
                out_d[data_id] = (cpid, c_logits, logits)
                out_d[prev_data_id] = (cpid, c_logits, logits)
                num_append += 1
            else:
                assert False
        except KeyError as e:
            print(e)
            pass
    return out_d
Example 14
def main():
    dir_path = sys.argv[1]
    tokenizer = get_tokenizer()
    averager = Averager()
    sbc = SubwordConvertor()
    df = Counter()
    collection_size = 0
    ticker = TimeEstimator(485393)
    for file_path in get_dir_files(dir_path):
        for idx, record in enumerate(
                tf.compat.v1.python_io.tf_record_iterator(file_path)):
            example = tf.train.Example()
            example.ParseFromString(record)
            feature = example.features.feature
            input_ids = feature["input_ids"].int64_list.value
            tokens = tokenizer.convert_ids_to_tokens(input_ids)
            sep_idx1 = tokens.index("[SEP]")
            sep_idx2 = tokens.index("[SEP]", sep_idx1 + 1)
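            # tokens[sep_idx1:sep_idx2] is the document segment, from the first [SEP]
            # up to (but not including) the second [SEP].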
            doc_tokens = tokens[sep_idx1:sep_idx2]
            words = lmap(tuple, sbc.get_word_as_subtoken_tuple(doc_tokens))
            dl = len(words)
            collection_size += dl
            averager.append(dl)
            for word in set(words):
                df[word] += 1
            ticker.tick()

    print("collection length", collection_size)
    print("average dl", averager.get_average())
    save_to_pickle(df, "subword_df_robust_train")
Example 15
def rank_with_query_lm(query_lms: Dict[str, Counter],
                       candidate_dict: Dict[str, List[QCKCandidateI]],
                       num_query=100,
                       alpha=0.5) -> Dict[str, List[TrecRankedListEntry]]:
    run_name = "run_name"
    scorer = LMScorer(query_lms, alpha)
    out_d = {}
    print("Start scoring")
    keys = list(candidate_dict.keys())
    keys = keys[:num_query]
    ticker = TimeEstimator(len(keys))
    for query_id in keys:
        candidates = candidate_dict[query_id]

        def get_score(c: QCKCandidateI) -> float:
            text = c.text
            assert text
            score = scorer.score_text(query_id, text)
            return score

        candidates.sort(key=get_score, reverse=True)
        l: List[TrecRankedListEntry] = []
        for rank, c in enumerate(candidates):
            l.append(
                TrecRankedListEntry(query_id, c.id, rank, get_score(c),
                                    run_name))
        out_d[query_id] = l
        ticker.tick()
    return out_d
Example 16
def qk_candidate_gen(q_res_path: str, doc_score_path, split,
                     config) -> List[Tuple[QCKQuery, List[KDP]]]:
    queries: List[QCKQuery] = get_qck_queries(split)
    num_jobs = d_n_claims_per_split2[split]
    score_d = load_doc_scores(doc_score_path, num_jobs)

    tprint("loading ranked list")
    ranked_list: Dict[
        str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    query_ids = list(ranked_list.keys())
    query_ids.sort()
    print("num queries", len(query_ids))
    q_id_to_job_id = {q_id: job_id for job_id, q_id in enumerate(query_ids)}
    print("Pre loading docs")
    top_n = config['top_n']
    out_qk: List[Tuple[QCKQuery, List[KnowledgeDocumentPart]]] = []

    all_doc_parts = 0
    ticker = TimeEstimator(len(queries))
    for q in queries:
        job_id: int = q_id_to_job_id[q.query_id]
        entries: List = score_d[job_id]
        entries.sort(key=get_second, reverse=True)
        doc_ids = left(entries)
        doc_ids = doc_ids[:top_n]
        preload_man.preload(TokenizedCluewebDoc, doc_ids)
        docs = iterate_docs(doc_ids)
        doc_part_list: List[KDP] = iterate_document_parts(
            docs, config['window_size'], config['step_size'], 20)

        all_doc_parts += len(doc_part_list)
        out_qk.append((q, doc_part_list))
        ticker.tick()
    return out_qk
Example 17
def subtoken_split(task_id):
    #robust_tokens = load_robust_token()
    token_reader = get_token_reader()

    doc_id_list = get_doc_task(task_id)
    num_doc = len(doc_id_list)

    vocab_file = os.path.join(cpath.data_path, "bert_voca.txt")
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=True)

    window_size = 256 - 3

    skip = int(window_size / 2)
    ticker = TimeEstimator(num_doc)

    doc_seg_info = {}
    for key in doc_id_list:
        tokens = token_reader.retrieve(key)
        fn = tokenizer.wordpiece_tokenizer.tokenize
        sub_tokens = list([fn(t) for t in tokens])

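        # Advance about skip sub-tokens from (loc, loc_sub), then step back to the
        # start of the current token so that a segment never ends mid-word.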
        def move(loc, loc_sub, skip):
            loc_idx = loc
            num_passed_sw = 0
            loc_sub_idx = loc_sub
            for i in range(skip):
                num_passed_sw += 1
                loc_sub_idx += 1
                if num_passed_sw == len(sub_tokens[loc_idx]):
                    loc_idx += 1
                    num_passed_sw = 0

                if loc_idx >= len(sub_tokens):
                    break
            # only move in token level
            if num_passed_sw > 0:
                loc_sub_idx -= num_passed_sw
            return loc_idx, loc_sub_idx

        loc = 0
        loc_sub = 0

        interval_list = []

        while loc < len(tokens):
            loc_ed, loc_sub_ed = move(loc, loc_sub, skip)
            e = (loc, loc_ed), (loc_sub, loc_sub_ed)
            interval_list.append(e)
            loc = loc_ed
            loc_sub = loc_sub_ed

        doc_seg_info[key] = interval_list
        ticker.tick()

    p = os.path.join(cpath.data_path, "adhoc",
                     "robust_seg_info_{}.pickle".format(task_id))
    pickle.dump(doc_seg_info, open(p, "wb"))
Example 18
def select_paragraph(
    docs: Dict[str, List[List[str]]],
    clue12_13_df,
    claim_list: List[Dict],
    strategy="topk",
) -> List[Tuple[str, List[List[str]]]]:

    claim_id_to_text: Dict[int,
                           str] = {c['cId']: c['text']
                                   for c in claim_list}

    cdf = 50 * 1000 * 1000
    top_k = 100
    not_found_set = set()

    def idf(term: str):
        if term not in clue12_13_df:
            if term in string.printable:
                return 0
            not_found_set.add(term)

        return math.log((cdf + 0.5) / (clue12_13_df[term] + 0.5))

    r: List[Tuple[str, List[List[str]]]] = []
    ticker = TimeEstimator(len(docs))
    for claim_id, doc_list in docs.items():
        claim_text = claim_id_to_text[int(claim_id)]
        q_terms = set(re_tokenize(nltk.tokenize.word_tokenize(claim_text)))

        def scorer(para: List[str]) -> float:
            return paragraph_scorer(idf, q_terms, para)

        max_score = sum(lmap(idf, q_terms))

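        # Score every paragraph of a document and keep only its single best paragraph.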
        def get_best_per_doc(doc: List[str]) -> List[Tuple[List[str], float]]:
            paragraph_list: Iterable[List[str]] = enum_paragraph([doc])
            paragraph_scored_list: List[Tuple[List[str],
                                              float]] = lmap_pairing(
                                                  scorer, paragraph_list)
            paragraph_scored_list.sort(key=lambda x: x[1], reverse=True)
            return paragraph_scored_list[:1]

        selected: List[Tuple[List[str], float]] = list(
            flatten(lmap(get_best_per_doc, doc_list)))

        # if strategy == "topk":
        #     selected: List[Tuple[List[str], float]] = paragraph_scored_list[:top_k]
        # elif strategy == "cutoff":
        #     cut_off = max_score * 0.6
        #     selected: List[Tuple[List[str], float]] = lfilter(lambda x: x[1] > cut_off, paragraph_scored_list)
        # else:
        #     assert False

        e = claim_id, left(selected)
        r.append(e)
        ticker.tick()

    return r
Example 19
    def nli_data_indexing(self, data):
        data_info = {}
        ticker = TimeEstimator(len(data), "nli indexing", 100)
        for data_idx, e in enumerate(data):
            input_ids, input_mask, segment_ids, y = e
            tokens = self.tokenizer.convert_ids_to_tokens(input_ids)
            data_info[data_idx] = self.index(tokens)
            ticker.tick()
        return data_info
Example 20
def build_co_occur_from_pc_feature(data: Dict[str, List[List[str]]]) \
        -> List[Tuple[str, Counter]]:
    window_size = 10
    stemmer = CacheStemmer()
    r = []
    ticker = TimeEstimator(len(data))
    for cid, tokens_list in data.items():
        ticker.tick()
        counter = build_co_occurrence(tokens_list, window_size, stemmer)
        r.append((cid, counter))
    return r
Example 21
def do(data_id):
    working_dir = os.environ["TF_WORKING_DIR"]
    tokenizer = get_tokenizer()
    name1 = os.path.join(working_dir, "bert_loss", "{}.pickle".format(data_id))
    name2 = os.path.join(working_dir, "bfn_loss", "{}.pickle".format(data_id))

    tf_logging.debug("Loading " + name1)
    output1 = PredictionOutput(name1)
    tf_logging.debug("Loading " + name2)
    output2 = PredictionOutput(name2)

    assert len(output1.input_ids) == len(output2.input_ids)

    out_path = os.path.join(working_dir,
                            "loss_pred_train_data/{}".format(data_id))
    record_writer = RecordWriterWrap(out_path)
    n_inst = len(output1.input_ids)
    sep_id = tokenizer.vocab["[SEP]"]
    tf_logging.debug("Iterating")
    ticker = TimeEstimator(n_inst, "", 1000)
    for i in range(n_inst):
        if i % 1000 == 0:
            assert_input_equal(output1.input_ids[i], output2.input_ids[i])
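        # Derive segment ids / masks from the [SEP] positions in input_ids; if that
        # fails (e.g. a [SEP] token was masked out), recover the [SEP] positions from
        # the masked LM ids and positions before giving up and dumping diagnostics.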
        try:
            features = get_segment_and_mask(output1.input_ids[i], sep_id)
        except:
            try:
                sep_indice = get_sep_considering_masking(
                    output1.input_ids[i], sep_id, output1.masked_lm_ids[i],
                    output1.masked_lm_positions[i])
                features = get_segment_and_mask_inner(output1.input_ids[i],
                                                      sep_indice)
            except:
                tokens = tokenizer.convert_ids_to_tokens(output1.input_ids[i])
                print(tokenization.pretty_tokens(tokens))
                print(output1.masked_lm_ids[i])
                print(output1.masked_lm_positions[i])
                raise

        features["next_sentence_labels"] = create_int_feature([0])
        features["masked_lm_positions"] = create_int_feature(
            output1.masked_lm_positions[i])
        features["masked_lm_ids"] = create_int_feature(
            output1.masked_lm_ids[i])
        features["masked_lm_weights"] = create_float_feature(
            output1.masked_lm_weights[i])
        features["loss_base"] = create_float_feature(
            output1.masked_lm_example_loss[i])
        features["loss_target"] = create_float_feature(
            output2.masked_lm_example_loss[i])
        record_writer.write_feature(features)
        ticker.tick()

    record_writer.close()
Example 22
def merge_counter(file_prefix, st, ed):
    count = Counter()
    ticker = TimeEstimator(ed - st)
    for i in range(st, ed):
        path = file_prefix + str(i)
        d = pickle.load(open(path, "rb"))

        for key in d:
            count[key] += d[key]
        ticker.tick()
    out_path = file_prefix + "_merged"
    pickle.dump(count, open(out_path, "wb"))
Example 23
    def work(self, job_id):
        qid_list = self.query_group[job_id]
        ticker = TimeEstimator(len(qid_list))
        missing_rel_cnt = 0
        missing_nrel_cnt = 0

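        # Passed to load_per_query_docs for documents that come back empty; counts
        # how many of those are relevant vs. non-relevant for this query.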
        def empty_doc_fn(query_id, doc_id):
            rel_docs = self.ms_reader.qrel[query_id]
            nonlocal missing_rel_cnt
            nonlocal missing_nrel_cnt
            if doc_id in rel_docs:
                missing_rel_cnt += 1
            else:
                missing_nrel_cnt += 1

        for qid in qid_list:
            if qid not in self.candidate_docs_d:
                continue

            docs: List[MSMarcoDoc] = load_per_query_docs(qid, empty_doc_fn)
            ticker.tick()

            target_docs = self.candidate_docs_d[qid]
            text_d = {}
            bert_tokens_d = {}
            stemmed_tokens_d = {}

            for d in docs:
                if d.doc_id in target_docs:
                    title = d.title
                    title = crop_to_space(title, self.max_title_length)

                    body_sents = sent_tokenize(d.body)
                    new_body_sents = self.resplit_body_sents(body_sents)
                    text_d[d.doc_id] = title, new_body_sents

                    for tokenize_fn, save_dict in [
                        (self.bert_tokenizer.tokenize, bert_tokens_d),
                        (self.stem_tokenizer.tokenize_stem, stemmed_tokens_d)
                    ]:
                        title_tokens = tokenize_fn(title)
                        body_tokens_list = lmap(tokenize_fn, new_body_sents)
                        save_dict[d.doc_id] = (title_tokens, body_tokens_list)

            todo = [
                (text_d, self.text_dir_name),
                (bert_tokens_d, self.bert_tokens_dir_name),
                (stemmed_tokens_d, self.stemmed_tokens_dir_name),
            ]

            for tokens_d, dir_name in todo:
                save_path = os.path.join(self.out_dir, dir_name, str(qid))
                pickle.dump(tokens_d, open(save_path, "wb"))
Example 24
    def tokenize_docs(self, doc_id_list):
        tokenizer = get_tokenizer()
        token_d = {}
        ticker = TimeEstimator(len(doc_id_list))
        for doc_id in doc_id_list:
            text = self.data[doc_id]
            text = re.sub(r"<\s*[^>]*>", " ", text)
            # tokenize text
            tokens = tokenizer.tokenize(text)
            token_d[doc_id] = tokens
            ticker.tick()

        return token_d
Example 25
def build_co_occur_from_pc_feature(
        data: Dict[str, List[ScoreParagraph]]) -> List[Tuple[str, Counter]]:
    window_size = 10
    stemmer = CacheStemmer()
    r = []

    ticker = TimeEstimator(len(data))
    for cid, para_list in data.items():
        ticker.tick()
        tokens_list: List[List[str]] = [e.paragraph.tokens for e in para_list]
        counter = build_co_occurrence(tokens_list, window_size, stemmer)
        r.append((cid, counter))
    return r
Example 26
def main():
    num_inst = 1000 * 1000 * 100
    path_format = "/mnt/nfs/work3/youngwookim/data/tlm/enwiki_seg_galago/train.{}.trectext"
    text_path = list([path_format.format(i) for i in range(10)])

    ts = TextSampler(text_path)
    sp = StreamPickler("wiki_segments3_", 1000 * 100)
    ticker = TimeEstimator(num_inst)
    for i in range(num_inst):
        inst = ts.sample()
        sp.add(inst)
        ticker.tick()

    sp.flush()
Example 27
def retrieve_urls(disk_id, doc_ids):
    f = open(get_url_dict_path(disk_id), "r")
    loc = 0
    d = {}
    ticker = TimeEstimator(len(doc_ids))
    for doc_id in doc_ids:
        line, found_loc = find_line_start_with(f, doc_id, loc)
        loc = found_loc
        doc_id_s, url = line.split()
        assert doc_id_s[-1] == ","
        assert doc_id == doc_id_s[:-1]
        d[doc_id] = url
        ticker.tick()
    return d
Example 28
def save_doc_len():
    collection = trec.load_robust(trec.robust_path)
    print("writing...")
    ticker = TimeEstimator(len(collection))

    doc_len = dict()
    for doc_id in collection:
        content = collection[doc_id]
        tokens = nltk.tokenize.wordpunct_tokenize(content)
        doc_len[doc_id] = len(tokens)
        ticker.tick()

    save_path = os.path.join(cpath.data_path, "adhoc", "doc_len.pickle")
    pickle.dump(doc_len, open(save_path, "wb"))
Example 29
    def run_A(self, job_id):
        output = []
        queries = []
        ticker = TimeEstimator(self.inst_per_job)
        for i in range(self.inst_per_job):
            p, q = self.process_A()
            qid = job_id * self.inst_per_job + i

            output.append((p, qid))
            queries.append((qid, q))
            ticker.tick()

        self.save_query(job_id, queries)
        self.save_output_A(job_id, output)
Example 30
def get_qk_candidate(config, q_res_path, qck_queries: List[QCKQuery]) -> List[QKUnit]:
    top_n = config['top_n']
    worker = QKWorker(q_res_path, config, top_n)
    all_candidate: List[QKUnit] = []
    ticker = TimeEstimator(len(qck_queries))
    for q in qck_queries:
        ticker.tick()
        try:
            doc_part_list: List[KDP] = worker.work(q)
            e: QKUnit = q, doc_part_list
            all_candidate.append(e)
        except KeyError as e:
            print(e)
    return all_candidate