Example #1
def load_from_db_or_from_galago(table_name, key, galago_fn):
    if has_key(table_name, key):
        return load(table_name, key)

    r = galago_fn()
    if not has_key(table_name, key):
        save(table_name, key, r)
        flush()
    return r
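
A minimal usage sketch for the helper above; the table name and the Galago search call are assumptions for illustration only:

def search_with_cache(query_id, query_text):
    # galago_fn is only invoked when the key is missing from the cache table
    return load_from_db_or_from_galago(
        "galago_results",                      # assumed table name
        query_id,
        lambda: run_galago_query(query_text))  # hypothetical search function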
Example #2
    def get_doc_list(query_id: str):
        q_res_id: str = "{}_{}".format(query_id, q_config_id)
        ticker.tick()
        if has_key(QueryResult, q_res_id):
            r: List[SimpleRankedListEntry] = load(QueryResult, q_res_id)

            for entry in r:
                doc_id, rank, score = entry
                doc_list.add(doc_id)
Example #3
def enum_paragraph(step_size, subword_len,
                   subword_tokenize: Callable[[str], List[Subword]],
                   doc: SimpleRankedListEntry) -> Iterable[Paragraph]:
    # load tokens and BERT subword tokens
    tokens = load(TokenizedCluewebDoc, doc.doc_id)
    subword_tokens: List[List[Subword]] = lmap(subword_tokenize, tokens)
    cursor = 0

    while cursor < len(subword_tokens):
        cursor_ed = move_cursor(subword_tokens, cursor, subword_len)
        yield Paragraph(doc_id=doc.doc_id,
                        doc_rank=doc.rank,
                        doc_score=doc.score,
                        subword_tokens=list(
                            flatten(subword_tokens[cursor:cursor_ed])),
                        tokens=tokens[cursor:cursor_ed])
        cursor += step_size
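
A sketch of how enum_paragraph might be driven; the BERT tokenizer wrapper and the ranked-list entry are stand-ins for the project's own objects:

def print_paragraphs(doc: SimpleRankedListEntry, bert_tokenizer):
    # subword_tokenize maps one whitespace token to its BERT subwords (assumed interface)
    subword_tokenize = bert_tokenizer.tokenize
    for paragraph in enum_paragraph(step_size=100,
                                    subword_len=300,
                                    subword_tokenize=subword_tokenize,
                                    doc=doc):
        print(paragraph.doc_id, paragraph.doc_rank, len(paragraph.tokens))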
Example #4
def write_tfrecord(ranked_list_d: RankedListDict, queries: List[Query],
                   q_rels: Dict[str, List[str]], save_path):
    max_seq_length = 512
    tokenizer = get_tokenizer()
    encoder = AllSegmentAsDoc(max_seq_length)
    writer = RecordWriterWrap(save_path)
    data_id = 0

    data_info = []
    for query in queries:
        if query.qid not in ranked_list_d:
            print("Warning query {} not found".format(query.qid))
            continue
        print(query.qid)
        ranked_list = ranked_list_d[query.qid]
        doc_ids = [doc_entry.doc_id for doc_entry in ranked_list]
        preload_man.preload(BertTokenizedCluewebDoc, doc_ids)
        q_tokens = tokenizer.tokenize(query.text)

        for doc_entry in ranked_list:
            try:
                tokens_list: List[List[str]] = load(BertTokenizedCluewebDoc,
                                                    doc_entry.doc_id)
                tokens = flatten(tokens_list)
                insts: List[Tuple[List,
                                  List]] = encoder.encode(q_tokens, tokens)
                for inst in insts:
                    label = doc_entry.doc_id in q_rels[query.qid]

                    input_tokens, segment_ids = inst
                    feature = get_basic_input_feature(tokenizer,
                                                      max_seq_length,
                                                      input_tokens,
                                                      segment_ids)
                    feature["label_ids"] = create_int_feature([int(label)])
                    feature["data_id"] = create_int_feature([int(data_id)])
                    writer.write_feature(feature)

                    data_info.append((data_id, query.qid, doc_entry.doc_id))
                    data_id += 1
            except KeyError:
                print("doc {} not found".format(doc_entry.doc_id))

    return data_info
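
A hedged sketch of a call site for write_tfrecord; the loaders for the ranked list, the queries, and the qrels are placeholders for whatever the surrounding project provides:

ranked_list_d = load_ranked_list_dict(ranked_list_path)  # assumed loader -> RankedListDict
queries = load_queries(query_path)                       # assumed loader -> List[Query]
q_rels = load_qrels(qrels_path)                          # assumed loader -> Dict[str, List[str]]
data_info = write_tfrecord(ranked_list_d, queries, q_rels, "train.tfrecord")
# data_info holds one (data_id, qid, doc_id) triple per written feature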
Example #5
    def get_instances(self, cid, data_id_manager, entries):
        doc_ids = lmap(lambda x: x.doc_id, entries)
        preload_man.preload(BertTokenizedCluewebDoc, doc_ids)
        n_doc_not_found = 0
        for entry in entries[:self.top_n]:
            try:
                tokens: List[List[str]] = load(BertTokenizedCluewebDoc,
                                               entry.doc_id)
                for sent_idx, sent in enumerate(tokens[:self.num_sent]):
                    for pid in self.pid_dict[int(cid)]:
                        info = {
                            'cid': cid,
                            'pid': pid,
                            'doc_id': entry.doc_id,
                            'sent_idx': sent_idx
                        }
                        yield Instance(pid, sent, data_id_manager.assign(info))
            except KeyError:
                n_doc_not_found += 1
        if n_doc_not_found:
            print("{} of {} docs not found".format(n_doc_not_found, len(doc_ids)))
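
Since get_instances is a generator, callers simply iterate it; the enclosing generator object and its inputs are assumed here:

# Hypothetical iteration: entries is the ranked document list for `cid`
for inst in generator.get_instances(cid, data_id_manager, entries):
    handle(inst)  # each Instance pairs a pid with one sentence and an assigned data id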
Example #6
    def generate(claim_lm: ClaimLM,
                 ranked_list: List[SimpleRankedListEntry]) -> List[Record]:
        claim_text = claim_lm.claim
        claim_tokens = bert_tokenizer.tokenize(claim_text)
        claim_token_len = len(claim_tokens)

        log_topic_lm = get_lm_log(smooth(claim_lm.LM, bg_lm, alpha))
        log_odd: Counter = subtract(log_topic_lm, log_bg_lm)
        doc_ids = lmap(lambda x: x.doc_id, ranked_list[:top_n])
        print("loading docs")
        preload_man.preload(BertTokenizedCluewebDoc, doc_ids)

        window_size = max_seq_length - claim_token_len - 3
        step_size = max_seq_length - 112
        enum_paragraph = enum_paragraph_functor(step_size, window_size)

        def get_record(tokens) -> Record:
            scores, masks = get_target_labels(tokens, log_odd, stopwords,
                                              fail_logger)
            return Record(claim_tokens, tokens, scores, masks)

        tokens_list: List[List[str]] = []
        not_found = 0
        for doc_id in doc_ids:
            try:
                tokens: List[str] = list(
                    flatten(load(BertTokenizedCluewebDoc, doc_id)))
                tokens_list.append(tokens)
            except KeyError:
                not_found += 1

        print("{} of {} docs not found".format(not_found, len(doc_ids)))
        paragraph_list: Iterable[List[str]] = enum_paragraph(tokens_list)
        records: List[Record] = lmap(get_record, paragraph_list)

        return records
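
A sketch of invoking generate; the ClaimLM instance and the ranked-list lookup are assumed to come from the enclosing module:

# Hypothetical call: build Record objects for one claim's ranked documents.
ranked_list = fetch_ranked_list_for_claim(claim_lm)  # assumed lookup helper
records = generate(claim_lm, ranked_list)
print("{} records generated".format(len(records)))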
Example #7
    def fetch_from_q_res_id(self,
                            query_res_id: str) -> List[SimpleRankedListEntry]:
        def translate_structure(raw_data) -> List[SimpleRankedListEntry]:
            try:
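                # duck-typing probe: if the items already expose .doc_id, use raw_data as-is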
                dummy = raw_data[0].doc_id
                r = raw_data
            except AttributeError:
                def tuple_to_ranked_entry(tpl) -> SimpleRankedListEntry:
                    doc_id, rank, score = tpl
                    return SimpleRankedListEntry(doc_id=doc_id,
                                                 rank=rank,
                                                 score=score)

                r = lmap(tuple_to_ranked_entry, raw_data)
            return r

        try:
            raw_data = load(QueryResult, query_res_id)
            data = translate_structure(raw_data)
            return data
        except KeyError:
            print(query_res_id)
            raise
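
A sketch of the fetch above; the query-result id follows the "{query_id}_{config_id}" key scheme seen in Example #2, and the fetcher object itself is assumed:

# Hypothetical lookup: both legacy tuple rows and SimpleRankedListEntry rows are handled.
q_res_id = "{}_{}".format(query_id, q_config_id)  # same key scheme as Example #2
entries = fetcher.fetch_from_q_res_id(q_res_id)
for entry in entries[:10]:
    print(entry.rank, entry.doc_id, entry.score)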
Example #8
def get_tokens(doc_id) -> List[str]:
    return load(TokenizedCluewebDoc, doc_id)
Example #9
    def get_db_item_or_make(self, table_name, doc_id):
        if has_key(table_name, doc_id):
            return load(table_name, doc_id)
        print("doc_id not found:", doc_id)
        self.launch_doc_processor(doc_id)
        return load(table_name, doc_id)
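
A sketch of how the fallback accessor might be used; the helper object and the choice of table are assumptions:

# Hypothetical use: the document is processed on demand if it is not cached yet.
doc = db_helper.get_db_item_or_make(TokenizedCluewebDoc, doc_id)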
Example #10
def load_tf(doc_id):
    return load(CluewebDocTF, doc_id)
Example #11
def load_doc(doc_id):
    return load(TokenizedCluewebDoc, doc_id)
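
The two thin wrappers in Examples #10 and #11 would be used like this; the doc id is the one from Example #12, and the shape of the term-frequency value is an assumption:

doc_id = "clueweb12-0005wb-96-30750"
tokens = load_doc(doc_id)  # tokenized document from TokenizedCluewebDoc
tf = load_tf(doc_id)       # per-document term statistics from CluewebDocTF (format assumed)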
Example #12
def main():
    doc_id = "clueweb12-0005wb-96-30750"
    doc = load(BertTokenizedCluewebDoc, doc_id)
    print("doc has {} lines", len(doc))
    print("last line:", pretty_tokens(doc[-1], True))