Example #1
import json
import os
from multiprocessing.pool import ThreadPool

import spacy

# also assumed from the surrounding project (not shown on this page):
# load_chunk, run_thread, utils.chunk_it


def main(rank, num_threads, folder, chunk_size):

    print("loading chunk {}".format(rank), flush=True)
    documents = load_chunk(rank, folder)

    # one argument dict (with its own spaCy pipeline) per worker thread
    arguments = [{
        "rank": rank,
        "id": id,
        "documents": chunk,
        "nlp": spacy.load("en_core_web_sm"),
        "chunk_size": chunk_size,
    } for id, chunk in enumerate(utils.chunk_it(documents, num_threads))]

    print("starting {} threads in {}".format(num_threads, rank))
    pool = ThreadPool(num_threads)
    results = pool.map(run_thread, arguments)

    # write one line per message: a running index, a tab, then the JSON record
    with open(os.path.join(folder, "kilt_{}.jsonl".format(rank)), "w") as f:
        i = 1
        for output in results:
            for msg in output:
                f.write("{}\t{}\n".format(i, json.dumps(msg)))
                i += 1

    pool.terminate()
    pool.join()
    print("done {}".format(rank))
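Every example on this page funnels its data through the project-local chunk_it helper, which the listing never shows. A minimal stand-in, assuming it only needs to split a list into num roughly equal consecutive slices (the project's real helper may behave differently):

# hypothetical stand-in for utils.chunk_it; not the project's actual code
def chunk_it(seq, num):
    # split seq into num consecutive, nearly equal-sized chunks
    num = max(1, num)
    k, m = divmod(len(seq), num)
    return [seq[i * k + min(i, m):(i + 1) * k + min(i + 1, m)]
            for i in range(num)]

With ten documents and three threads this yields slices of sizes 4, 3 and 3, so every worker gets a contiguous share of the input.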
Example #2
import multiprocessing
import os
from multiprocessing.pool import ThreadPool


def load_ks(ks_directory, verbose=False):
    # one worker thread per available CPU core
    NUM_THREADS = multiprocessing.cpu_count()

    if verbose:
        print(f"loading hotpotqa knowledge source with {NUM_THREADS} threads")
    pool = ThreadPool(NUM_THREADS)

    filenames = []
    directories = [
        os.path.join(ks_directory, o) for o in os.listdir(ks_directory)
        if os.path.isdir(os.path.join(ks_directory, o))
    ]
    for directory in directories:
        onlyfiles = [
            f for f in os.listdir(directory)
            if os.path.isfile(os.path.join(directory, f))
        ]
        for fname in onlyfiles:
            filenames.append(os.path.join(directory, fname))

    arguments = [{
        "id": i,
        "filenames": chunk,
        "verbose": verbose
    } for i, chunk in enumerate(chunk_it(filenames, NUM_THREADS))]

    results = pool.map(run_thread, arguments)
    output_dict = {}
    for x in results:
        output_dict.update(x)
    pool.terminate()
    pool.join()

    return output_dict
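The run_thread worker that pool.map dispatches to here is likewise not part of the listing. A hypothetical version, assuming each file in a worker's chunk is a JSON-lines dump of knowledge-source pages carrying an "_id" field (the real worker may expect a different schema):

import json

# hypothetical worker for the ThreadPool above; the actual run_thread may
# parse a different file format
def run_thread(args):
    thread_dict = {}
    for filename in args["filenames"]:
        if args["verbose"]:
            print("thread {} reading {}".format(args["id"], filename))
        with open(filename, "r", encoding="utf-8") as fin:
            for line in fin:
                page = json.loads(line)
                thread_dict[page["_id"]] = page
    # each worker returns its partial dict; load_ks merges them afterwards
    return thread_dict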
Example #3
    def get_chunks(self, num_chunks):
        with open(self.input_file, "r") as infile:
            all_data = json.load(infile)

        n = len(all_data)
        print("{} examples in the dataset".format(n))
        return utils.chunk_it(all_data, num_chunks)
Example #4
    def feed_data(self, queries_data, logger=None):

        chunked_queries = utils.chunk_it(queries_data, self.num_threads)

        for idx, arg in enumerate(self.arguments):
            arg["queries_data"] = chunked_queries[idx]
            arg["logger"] = logger
Example #5
    def get_chunks(self, num_chunks):
        with open(self.input_file, "r", encoding='utf-8') as infile:
            all_data = json.load(infile)

        all_data = all_data['Data']
        n = len(all_data)
        print("{} examples in the dataset".format(n))
        return utils.chunk_it(all_data, num_chunks)
Example #6
    def get_chunks(self, num_chunks):
        with open(self.input_file, "r") as fin:
            lines = fin.readlines()
            # the whole dataset is expected as a single JSON array on one line
            assert len(lines) == 1
            all_data = json.loads(lines[0])

        n = len(all_data)
        print("{} examples in the dataset".format(n))
        return utils.chunk_it(all_data, num_chunks)
Example #7
def store_chunks(documents, num_threads, folder):
    # write one pickle per chunk so each worker can later load only its slice
    for idx, chunk in enumerate(utils.chunk_it(documents, num_threads)):
        out_filename = os.path.join(folder, "documents_{}.p".format(idx))
        with open(out_filename, "wb") as fout:
            pickle.dump(chunk, fout)
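Example #1 reads these per-chunk pickles back through load_chunk, which the page does not show either; a matching counterpart might look like this (hypothetical, the project's own helper may differ):

import os
import pickle

# hypothetical inverse of store_chunks above
def load_chunk(rank, folder):
    in_filename = os.path.join(folder, "documents_{}.p".format(rank))
    with open(in_filename, "rb") as fin:
        return pickle.load(fin)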
Example #8
    def get_chunks(self, num_chunks):

        # Read claims, create a set of wiki pages to
        # find the evidence sentences in
        page_to_evidence_sents = {}

        with open(self.claims_input_file, "r") as infile:
            for line in infile:
                claim = json.loads(line)

                if claim.get("verifiable") == "NOT VERIFIABLE":
                    continue

                evidence_sets = claim["evidence"]
                for evidence_set in evidence_sets:

                    for evidence in evidence_set:
                        if not evidence[2]:
                            # evidence entries without a page id can be
                            # ignored; they are an artefact of merging
                            # duplicates where annotators disagreed over
                            # the label.
                            break
                        page_id = unicodedata.normalize("NFKD", evidence[2])

                        sent_id = int(evidence[3])

                        if page_id not in page_to_evidence_sents:
                            page_to_evidence_sents[page_id] = {}

                        page_to_evidence_sents[page_id][sent_id] = None

        # the evidence dump is split into files wiki-001.jsonl .. wiki-109.jsonl
        for idx in range(1, 110):
            filename = os.path.join(
                self.evidence_directory_path, f"wiki-{idx:03}.jsonl")
            print(f"processing filename {filename}")
            with open(filename, "r") as fin:
                for line in fin:
                    wiki_page = json.loads(line.strip())
                    page_id = wiki_page["id"]
                    if page_id not in page_to_evidence_sents:
                        continue
                    lines = wiki_page["lines"].split("\n")
                    sentences = []
                    for l in lines:
                        line_fields = l.split("\t")
                        # skip empty sentences
                        if len(line_fields) < 2 or line_fields[1] == "":
                            continue

                        sent_text = line_fields[1]

                        # there is no id, so the newline character is likely
                        # a formatting error; ignore it and append the text
                        # to the previous sentence (this has to be checked
                        # before the digit test, which would otherwise skip
                        # these lines).
                        if line_fields[0] == "" and sentences:
                            sentences[-1]["text"] += " " + sent_text
                            continue

                        # skip sentences where the first element is not a number
                        if not line_fields[0].isdigit():
                            continue

                        sentences.append({
                            "id": line_fields[0],
                            "text": sent_text,
                        })

                    for sentence in sentences:
                        sent_id = int(sentence["id"])
                        sent_text = sentence["text"]
                        if sent_id in page_to_evidence_sents[page_id]:
                            page_to_evidence_sents[page_id][sent_id] = sent_text

        data = []
        for page_id in page_to_evidence_sents:
            for sent_id in page_to_evidence_sents[page_id]:
                sent_text = page_to_evidence_sents[page_id][sent_id]
                data.append({
                    "page_id": page_id,
                    "sent_id": sent_id,
                    "text": sent_text,
                })

        n = len(data)
        print("{} examples in the dataset".format(n))
        return utils.chunk_it(data, num_chunks)