def number_feature(data_set_path: str, db_path: str, max_sent_num: int):
    from common.dataset.reader import JSONLineReader
    db = FeverDocDB(db_path)
    jlr = JSONLineReader()
    lines = jlr.read(data_set_path)
    num_feat = np.zeros([len(lines), max_sent_num, 3], dtype=np.int32)
    for i, line in enumerate(lines):
        claim_text = line['claim']
        claim_tokens = tokenize(claim_text)
        # collect every numeric token appearing in the claim
        all_nums = set()
        for token in claim_tokens:
            if is_token_numeric(token):
                all_nums.add(float(token))
        for j, evidence in enumerate(line['predicted_evidence']):
            if j >= max_sent_num:
                break
            page, line_num = evidence[-2], evidence[-1]
            all_evidence_nums = []
            evidence_text = evidence_num_to_text(db, page, line_num)
            evidence_tokens = tokenize(evidence_text)
            for token in evidence_tokens:
                if is_token_numeric(token):
                    all_evidence_nums.append(float(token))
            # compare the numbers in the evidence sentence against those in the claim
            has_num = len(all_evidence_nums) > 0
            has_identical_num = any(n in all_nums for n in all_evidence_nums)
            has_different_num = any(n not in all_nums for n in all_evidence_nums)
            num_feat[i][j][0], num_feat[i][j][1], num_feat[i][j][2] = _interprete_num_result(
                has_num, has_identical_num, has_different_num)
    return num_feat
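# Usage sketch (hypothetical paths, not part of the original code): build the
# [num_claims, max_sent_num, 3] numeric-overlap feature tensor for a retrieved
# predicted-evidence file; the three columns are whatever _interprete_num_result
# derives from (has_num, has_identical_num, has_different_num).
#
#     feats = number_feature("data/dev.sentences.p5.s5.jsonl",
#                            "data/fever/fever.db", max_sent_num=5)
#     feats.shape  # -> (number_of_claims, 5, 3)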
def single_sentence_set_2_ids_given_vocab(texts, vocab_dict):
    logger = LogHelper.get_logger("single_sentence_set_2_ids_given_vocab")
    doc_ids = []
    out_of_vocab_counts = 0
    for sent in texts:
        tokens = tokenize(sent)
        word_ids = []
        for token in tokens:
            if token.lower() in vocab_dict:
                word_ids.append(vocab_dict[token.lower()])
            else:
                out_of_vocab_counts += 1
                word_ids.append(vocab_dict['UNK'])
        doc_ids.append(word_ids)
    logger.debug("{} times out of vocab".format(out_of_vocab_counts))
    return doc_ids
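# Illustrative sketch (toy vocab, not from the original module): tokens missing
# from the vocab fall back to the reserved 'UNK' id, so with a whitespace-style
# tokenizer the second sentence maps its last token to 1.
#
#     toy_vocab = {'UNK': 1, 'the': 2, 'earth': 3, 'is': 4, 'round': 5}
#     single_sentence_set_2_ids_given_vocab(
#         ["The Earth is round", "The Earth is flat"], toy_vocab)
#     # -> [[2, 3, 4, 5], [2, 3, 4, 1]]   (exact ids depend on tokenize())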
def single_sentence_set_2_fasttext_embedded(sents: List[str], fasttext_model: Union[str, FastText]):
    logger = LogHelper.get_logger("single_sentence_set_2_fasttext_embedded")
    if isinstance(fasttext_model, str):
        fasttext_model = FastText.load_fasttext_format(fasttext_model)
    fasttext_embeddings = []
    for sent in sents:
        tokens = tokenize(sent)
        sent_embeddings = []
        for token in tokens:
            try:
                sent_embeddings.append(fasttext_model[token.lower()])
            except KeyError:
                # out-of-vocabulary fallback: an all-ones vector of the fastText dimensionality
                sent_embeddings.append(np.ones([dim_fasttext], np.float32))
        fasttext_embeddings.append(sent_embeddings)
    return fasttext_embeddings, fasttext_model
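# Usage sketch (model path is a placeholder): the function accepts either a
# path to a fastText binary model or an already-loaded FastText object, and it
# returns the (possibly freshly loaded) model so later calls can reuse it
# instead of reloading from disk.
#
#     train_embeddings, model = single_sentence_set_2_fasttext_embedded(
#         train_sents, "data/fasttext/wiki.en.bin")
#     dev_embeddings, _ = single_sentence_set_2_fasttext_embedded(dev_sents, model)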
    voc_dict['UNK'] = 1
    return voc_dict


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('db', help='/path/to/db/file')
    parser.add_argument('output', help='/path/to/output/pickle/file')
    args = parser.parse_args()
    LogHelper.setup()
    logger = LogHelper.get_logger("generate_vocab_all_wiki")
    db = FeverDocDB(args.db)
    vocab = set()
    for doc in tqdm(db.get_doc_ids()):
        lines = db.get_doc_lines(doc)
        lines = lines.split("\n")
        for line in lines:
            # wiki doc lines are tab-separated; the sentence text is the second field
            segments = line.split("\t")
            if len(segments) < 2:
                continue
            line = segments[1]
            if line.strip() == "":
                continue
            tokens = set(token.lower() for token in tokenize(clean_text(line)))
            vocab.update(tokens)
    logger.info("total size of vocab: " + str(len(vocab)))
    vocab_dict = vocab_map(vocab)
    del vocab
    with open(args.output, 'wb') as f:
        pickle.dump(vocab_dict, f, protocol=pickle.HIGHEST_PROTOCOL)
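# Example invocation (assuming this script is saved as generate_vocab_all_wiki.py;
# paths are placeholders):
#
#     python generate_vocab_all_wiki.py data/fever/fever.db data/fever/vocab.p
#
# The resulting pickle maps lower-cased wiki tokens to integer ids with 'UNK'
# reserved as id 1, and can later be loaded and passed to
# single_sentence_set_2_ids_given_vocab:
#
#     with open("data/fever/vocab.p", "rb") as f:
#         vocab_dict = pickle.load(f)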