def main(rank, num_threads, folder, chunk_size):
    print("loading chunk {}".format(rank), flush=True)
    documents = load_chunk(rank, folder)

    # one argument dict per worker thread; each thread gets its own share of
    # the documents and its own spaCy pipeline
    arguments = [{
        "rank": rank,
        "id": id,
        "documents": chunk,
        "nlp": spacy.load("en_core_web_sm"),
        "chunk_size": chunk_size,
    } for id, chunk in enumerate(utils.chunk_it(documents, num_threads))]

    print("starting {} threads in {}".format(num_threads, rank))
    pool = ThreadPool(num_threads)
    results = pool.map(run_thread, arguments)

    # write the processed records for this rank as "<line number>\t<json>" lines
    with open(os.path.join(folder, "kilt_{}.jsonl".format(rank)), "w+") as f:
        i = 1
        for output in results:
            for msg in output:
                f.write("{}\t{}\n".format(i, json.dumps(msg)))
                i += 1

    pool.terminate()
    pool.join()
    print("done {}".format(rank))
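
# Hedged usage sketch (assumption, not part of the original script): main() is
# meant to be launched once per rank, after the documents have been split and
# pickled into `folder` (see store_chunks below); load_chunk(rank, folder) is
# assumed to read back the rank-th pickle. Paths and values are illustrative.
#
#   for rank in range(4):
#       main(rank, num_threads=8, folder="/tmp/kilt_chunks", chunk_size=100)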
def load_ks(ks_directory, verbose=False):
    # use one worker thread per available CPU
    NUM_THREADS = int(multiprocessing.cpu_count())
    if verbose:
        print(f"loading hotpotqa knowledge source with {NUM_THREADS} threads")
    pool = ThreadPool(NUM_THREADS)

    # collect every file from the first-level subdirectories of ks_directory
    filenames = []
    directories = [
        os.path.join(ks_directory, o)
        for o in os.listdir(ks_directory)
        if os.path.isdir(os.path.join(ks_directory, o))
    ]
    for directory in directories:
        for fname in os.listdir(directory):
            path = os.path.join(directory, fname)
            if os.path.isfile(path):
                filenames.append(path)

    # one argument dict per worker, each with its own share of the files
    arguments = [{
        "id": i,
        "filenames": chunk,
        "verbose": verbose,
    } for i, chunk in enumerate(chunk_it(filenames, NUM_THREADS))]

    results = pool.map(run_thread, arguments)

    # merge the per-thread dictionaries into a single knowledge source dict
    output_dict = {}
    for x in results:
        output_dict.update(x)

    pool.terminate()
    pool.join()
    return output_dict
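
# Hedged usage sketch (assumption): load_ks returns a single dict merged from
# the per-thread results, keyed however run_thread builds its output. The
# directory below is illustrative only.
#
#   knowledge_source = load_ks("data/hotpotqa/knowledge_source", verbose=True)
#   print("{} entries loaded".format(len(knowledge_source)))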
def get_chunks(self, num_chunks):
    with open(self.input_file, "r") as infile:
        all_data = json.load(infile)
    n = len(all_data)
    print("{} examples in the dataset".format(n))
    return utils.chunk_it(all_data, num_chunks)
def feed_data(self, queries_data, logger=None):
    # split the queries evenly across the worker threads and attach each
    # share (plus the logger) to the corresponding thread's argument dict
    chunked_queries = utils.chunk_it(queries_data, self.num_threads)
    for idx, arg in enumerate(self.arguments):
        arg["queries_data"] = chunked_queries[idx]
        arg["logger"] = logger
def get_chunks(self, num_chunks):
    with open(self.input_file, "r", encoding="utf-8") as infile:
        all_data = json.load(infile)
    # the examples live under the top-level "Data" key
    all_data = all_data["Data"]
    n = len(all_data)
    print("{} examples in the dataset".format(n))
    return utils.chunk_it(all_data, num_chunks)
def get_chunks(self, num_chunks):
    with open(self.input_file, "r") as fin:
        lines = fin.readlines()
    # the whole dataset is serialized as a single JSON line
    assert len(lines) == 1
    all_data = json.loads(lines[0])
    n = len(all_data)
    print("{} examples in the dataset".format(n))
    return utils.chunk_it(all_data, num_chunks)
def store_chunks(documents, num_threads, folder):
    # write one pickled chunk of documents per worker:
    # documents_0.p, documents_1.p, ...
    for id, chunk in enumerate(utils.chunk_it(documents, num_threads)):
        out_filename = os.path.join(folder, "documents_{}.p".format(id))
        with open(out_filename, "wb") as fout:
            pickle.dump(chunk, fout)
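
# Reference sketch (assumption, not the original utils implementation): the
# functions above rely on utils.chunk_it to split a list into `n` roughly
# equal consecutive chunks, one per worker. A minimal version could look like:
def _chunk_it_reference(seq, n):
    """Split `seq` into `n` consecutive chunks of near-equal size."""
    avg = len(seq) / float(n)
    chunks, last = [], 0.0
    while last < len(seq):
        chunks.append(seq[int(last):int(last + avg)])
        last += avg
    return chunks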
def get_chunks(self, num_chunks):
    # Read the claims and build the set of wiki pages (and sentence ids)
    # in which the evidence sentences have to be found.
    page_to_evidence_sents = {}
    with open(self.claims_input_file, "r") as infile:
        for line in infile:
            claim = json.loads(line)
            if "verifiable" in claim and claim["verifiable"] == "NOT VERIFIABLE":
                continue
            evidence_sets = claim["evidence"]
            for evidence_set in evidence_sets:
                for evidence in evidence_set:
                    if evidence[2]:
                        page_id = unicodedata.normalize("NFKD", evidence[2])
                    else:
                        # Evidence entries without a page id can be filtered
                        # out/ignored. They are an artefact of merging some of
                        # the duplicates where annotators disagreed over the
                        # label.
                        break
                    sent_id = int(evidence[3])
                    if page_id not in page_to_evidence_sents:
                        page_to_evidence_sents[page_id] = {}
                    page_to_evidence_sents[page_id][sent_id] = None

    # Scan the wiki dump and fill in the text of every evidence sentence.
    for idx in range(1, 110):
        filename = self.evidence_directory_path + f"/wiki-{idx:03}.jsonl"
        print(f"processing filename {filename}")
        with open(filename, "r") as fin:
            for line in fin:
                wiki_page = json.loads(line.strip())
                page_id = wiki_page["id"]
                if page_id not in page_to_evidence_sents:
                    continue
                lines = wiki_page["lines"].split("\n")
                sentences = []
                for l in lines:
                    line_fields = l.split("\t")
                    # skip empty sentences
                    if len(line_fields) < 2 or line_fields[1] == "":
                        continue
                    # skip sentences whose first field is not a number
                    if not line_fields[0].isdigit():
                        continue
                    sent_text = line_fields[1]
                    if line_fields[0] == "":
                        # there is no id, so the newline character is likely a
                        # formatting error; ignore it and append the text to
                        # the previous sentence
                        sentences[-1]["text"] += " " + sent_text
                    else:
                        sentences.append({
                            "id": line_fields[0],
                            "text": sent_text,
                        })
                for sentence in sentences:
                    sent_id = int(sentence["id"])
                    sent_text = sentence["text"]
                    if sent_id in page_to_evidence_sents[page_id]:
                        page_to_evidence_sents[page_id][sent_id] = sent_text

    # Flatten the page -> sentence mapping into a list of examples.
    data = []
    for page_id in page_to_evidence_sents:
        for sent_id in page_to_evidence_sents[page_id]:
            sent_text = page_to_evidence_sents[page_id][sent_id]
            data.append({
                "page_id": page_id,
                "sent_id": sent_id,
                "text": sent_text,
            })

    n = len(data)
    print("{} examples in the dataset".format(n))
    return utils.chunk_it(data, num_chunks)
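
# Illustrative note (assumption about the input format): the wiki "lines"
# field parsed above is expected to hold one sentence per newline, with
# tab-separated fields where field 0 is the sentence id and field 1 is the
# sentence text (any further fields, e.g. linked entities, are ignored here).
# A made-up example:
#
#   0\tColin Firth is a British actor .\tColin Firth
#   1\tHe was born in 1960 .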