Example #1
    def download_if_missing(self):
        cachedir = self.get_cache_path()
        tmp_dir, document_dir = Path("/tmp"), cachedir / "documents"
        expected_fns = [
            document_dir / "metadata.csv", document_dir / "document_parses"
        ]
        if all([os.path.exists(f) for f in expected_fns]):
            return document_dir.as_posix()

        url = self.url % self.date
        tar_file = tmp_dir / f"covid-19-{self.date}.tar.gz"
        if not tar_file.exists():
            download_file(url, tar_file)

        with tarfile.open(tar_file) as f:
            # archive contains emb.tar.gz, metadata.csv, doc.tar.gz, and a changelog
            f.extractall(path=cachedir)
            os.rename(cachedir / self.date, document_dir)

        doc_fn = "document_parses"
        if f"{doc_fn}.tar.gz" in os.listdir(document_dir):
            with tarfile.open(document_dir / f"{doc_fn}.tar.gz") as f:
                f.extractall(path=document_dir)
        else:
            self.transform_metadata(document_dir)

        # only document_parses and metadata.csv are expected
        for fn in os.listdir(document_dir):
            if (document_dir / fn) not in expected_fns:
                os.remove(document_dir / fn)
        return document_dir.as_posix()
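
All of these examples call a download_file helper whose definition is not shown on this page. Below is a minimal sketch of what such a helper could look like, assuming only the signature inferred from the call sites (url, destination path, optional expected_hash holding a SHA-256 digest); the urllib/hashlib implementation is illustrative, not the project's actual code.

import hashlib
import urllib.request


def download_file(url, destination, expected_hash=None):
    """Illustrative sketch: fetch url into destination and optionally verify its SHA-256."""
    urllib.request.urlretrieve(url, str(destination))
    if expected_hash is not None:
        sha256 = hashlib.sha256()
        with open(destination, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                sha256.update(chunk)
        if sha256.hexdigest() != expected_hash:
            raise IOError(f"hash mismatch for {destination}: got {sha256.hexdigest()}")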
Example #2
def test_covid_round3_qrel_conversion():
    collection_config = {"name": "covid", "round": 3, "coll_type": "abstract"}
    benchmark_config = {
        "name": "covid",
        "udelqexpand": False,
        "useprevqrels": False
    }
    collection = CovidCollection(collection_config)
    benchmark = CovidBenchmark(benchmark_config, provide={"collection": collection})

    benchmark.download_if_missing()

    docid_map_tmp = "/tmp/docid.map"
    newdocid_qrels_fn = "/tmp/new.docid.qrels"
    qrel_url = "https://ir.nist.gov/covidSubmit/data/qrels-covid_d3_j0.5-3.txt"
    docid_map_url = "https://ir.nist.gov/covidSubmit/data/changedIds-May19.csv"

    download_file(docid_map_url, docid_map_tmp)
    download_file(qrel_url, newdocid_qrels_fn)
    with open(docid_map_tmp) as f:
        old2new = {line.split(",")[0]: line.split(",")[1] for line in f}
    newdocid_qrels = load_qrels(newdocid_qrels_fn)
    olddocid_qrels = benchmark.qrels

    # benchmark.qrels drops some entries (those that appeared in previous judgments),
    # so the converted olddocid_qrels has fewer entries than newdocid_qrels.
    # We therefore cannot simply assert that the converted qrels equal newdocid_qrels.
    for qid in olddocid_qrels:
        for docid in olddocid_qrels[qid]:
            newdocid = old2new.get(docid, docid)
            assert olddocid_qrels[qid][docid] == newdocid_qrels[qid][newdocid]
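
Example #2 also depends on a load_qrels helper that is not shown here. The sketch below is a guess at its behavior under the assumption that the file follows the standard TREC qrels format (qid, iteration, docid, relevance on each line); only the function name is taken from the call above.

from collections import defaultdict


def load_qrels(path):
    """Illustrative sketch: parse a TREC-format qrels file into {qid: {docid: label}}."""
    qrels = defaultdict(dict)
    with open(path) as f:
        for line in f:
            qid, _, docid, label = line.split()
            qrels[qid][docid] = int(label)
    return dict(qrels)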
Example #3
    def download_if_missing(self):
        url = "http://ciir.cs.umass.edu/downloads/Antique/antique-collection.txt"
        cachedir = self.get_cache_path()
        document_dir = os.path.join(cachedir, "documents")
        coll_filename = os.path.join(document_dir, "antique-collection.txt")

        if os.path.exists(coll_filename):
            return document_dir

        tmp_dir = cachedir / "tmp"
        tmp_filename = os.path.join(tmp_dir, "tmp.antique.file")

        os.makedirs(tmp_dir, exist_ok=True)
        os.makedirs(document_dir, exist_ok=True)

        download_file(
            url,
            tmp_filename,
            expected_hash="68b6688f5f2668c93f0e8e43384f66def768c4da46da4e9f7e2629c1c47a0c36",
        )
        self._convert_to_trec(inp_path=tmp_filename, outp_path=coll_filename)
        logger.info(f"antique collection file prepared, stored at {coll_filename}")

        shutil.rmtree(tmp_dir)  # remove the temporary directory, including any legacy files

        return document_dir
Example #4
    def download_if_missing(self):
        cachedir = self.get_cache_path()
        document_dir = cachedir / "documents"
        coll_filename = document_dir / f"csn-{self.config['lang']}-collection.txt"

        if coll_filename.exists():
            return document_dir.as_posix()

        zipfile = self.config["lang"] + ".zip"
        lang_url = f"{self.url}/{zipfile}"
        tmp_dir = cachedir / "tmp"
        zip_path = tmp_dir / zipfile

        if zip_path.exists():
            logger.info(f"{zipfile} already exists under {tmp_dir}, skipping download")
        else:
            tmp_dir.mkdir(exist_ok=True, parents=True)
            download_file(lang_url, zip_path)

        document_dir.mkdir(exist_ok=True, parents=True)  # tmp
        with ZipFile(zip_path, "r") as zipobj:
            zipobj.extractall(tmp_dir)

        pkl_path = tmp_dir / f"{self.config['lang']}_dedupe_definitions_v2.pkl"
        self._pkl2trec(pkl_path, coll_filename)
        return document_dir.as_posix()
Example #5
    def download_raw(self):
        cachedir = self.get_cache_path()
        tmp_dir = cachedir / "tmp"
        tmp_tar_fn, tmp_corpus_dir = tmp_dir / "nfcorpus.tar.gz", tmp_dir / "nfcorpus"

        os.makedirs(tmp_dir, exist_ok=True)

        if not tmp_tar_fn.exists():
            download_file(self.url, tmp_tar_fn, "ebc026d4a8bef3f866148b727e945a2073eb4045ede9b7de95dd50fd086b4256")

        with tarfile.open(tmp_tar_fn) as f:
            f.extractall(tmp_dir)
        return tmp_corpus_dir
Example #6
    def download_index(self, cachedir, url, sha256, index_directory_inside,
                       index_cache_path_string, index_expected_document_count):
        # Download the collection from URL and extract into a path in the cache directory.
        # To avoid re-downloading every call, we create an empty '/done' file in this directory on success.
        done_file = os.path.join(cachedir, "done")
        document_dir = os.path.join(cachedir, "documents")

        # already downloaded?
        if os.path.exists(done_file):
            return document_dir

        # 1. Download and extract Anserini index to a temporary location
        tmp_dir = os.path.join(cachedir, "tmp_download")
        archive_file = os.path.join(tmp_dir, "archive_file")
        os.makedirs(document_dir, exist_ok=True)
        os.makedirs(tmp_dir, exist_ok=True)
        logger.info(
            "downloading index for missing collection %s to temporary file %s",
            self.module_name, archive_file)
        download_file(url, archive_file, expected_hash=sha256)

        logger.info(
            "extracting index to %s (before moving to correct cache path)",
            tmp_dir)
        with tarfile.open(archive_file) as tar:
            tar.extractall(path=tmp_dir)

        extracted_dir = os.path.join(tmp_dir, index_directory_inside)
        if not (os.path.exists(extracted_dir)
                and os.path.isdir(extracted_dir)):
            raise ValueError(
                f"could not find expected index directory {extracted_dir} in {tmp_dir}"
            )

        # 2. Move index to its correct location in the cache
        index_dir = os.path.join(cachedir, index_cache_path_string, "index")
        if not os.path.exists(os.path.join(index_dir, "done")):
            if os.path.exists(index_dir):
                shutil.rmtree(index_dir)
            shutil.move(extracted_dir, index_dir)

        # 3. Extract raw documents from the Anserini index to document_dir
        anserini_index_to_trec_docs(index_dir, document_dir,
                                    index_expected_document_count)

        # remove temporary files and create a /done we can use to verify extraction was successful
        shutil.rmtree(tmp_dir)
        with open(done_file, "wt") as outf:
            print("", file=outf)

        return document_dir
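
The comments in Example #6 (and again in Example #9) describe a simple caching convention: an empty done marker file is written only after extraction succeeds, so an interrupted run is redone on the next call. A standalone sketch of that pattern follows; the helper names are illustrative and do not appear in the original code.

import os


def is_done(cachedir):
    # the marker exists only if a previous call finished extraction successfully
    return os.path.exists(os.path.join(cachedir, "done"))


def mark_done(cachedir):
    # touch an empty 'done' file once the documents have been fully prepared
    with open(os.path.join(cachedir, "done"), "wt") as outf:
        print("", file=outf)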
Example #7
    def download_and_extract(url, tmp_dir, expected_fns=None):
        tmp_dir.mkdir(exist_ok=True, parents=True)
        gz_name = url.split("/")[-1]
        output_gz = tmp_dir / gz_name
        if not output_gz.exists():
            logger.info(f"Downloading from {url}...")
            download_file(url, output_gz)

        extract_dir = None
        t = time()
        if str(output_gz).endswith("tar.gz"):
            tmp_dir = tmp_dir / gz_name.replace(".tar.gz", "")
            logger.info(f"tmp_dir: {tmp_dir}")
            if not tmp_dir.exists():
                logger.info(f"{tmp_dir} does not exist, extracting from {output_gz}...")
                with tarfile.open(output_gz, "r:gz") as f:
                    f.extractall(path=tmp_dir)

            if os.path.isdir(tmp_dir):  # and set(os.listdir(tmp_dir)) != expected_fns:
                extract_dir = tmp_dir
            elif not os.path.isdir(tmp_dir):  # and tmp_dir != list(expected_fns)[0]:
                extract_dir = tmp_dir.parent

        else:
            outp_fn = tmp_dir / gz_name.replace(".gz", "")
            if not outp_fn.exists():
                logger.info(f"{outp_fn} does not exist, extracting from {output_gz}...")
                with gzip.open(output_gz, "rb") as fin, open(outp_fn,
                                                             "wb") as fout:
                    shutil.copyfileobj(fin, fout)
            extract_dir = tmp_dir

        duration = int(time() - t)
        minutes, seconds = divmod(duration, 60)
        logger.info(f"{output_gz} extracted after {duration} seconds (00:{minutes:02d}:{seconds:02d})")
        return extract_dir
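
A hypothetical call to the download_and_extract helper from Example #7; the archive URL and working directory below are placeholders, not values from the original code.

from pathlib import Path

extract_dir = download_and_extract(
    "https://example.com/datasets/corpus.tar.gz",  # placeholder archive URL
    Path("/tmp/example_cache"),                    # placeholder working directory
)
print(extract_dir)  # directory that now contains the extracted files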
Example #8
    def download_if_missing(self):
        """ download query.csv and prepare queryid - query mapping file """
        if self.topic_file.exists() and self.qid_map_file.exists():
            return

        tmp_dir = Path("/tmp")
        tmp_dir.mkdir(exist_ok=True, parents=True)
        self.file_fn.mkdir(exist_ok=True, parents=True)

        query_fn = tmp_dir / "query.csv"
        if not query_fn.exists():
            download_file(self.url, query_fn)

        # prepare qid -> query mapping
        qid_map = {}
        with open(query_fn) as query_file, open(self.topic_file, "w", encoding="utf-8") as topic_file:
            for qid, line in enumerate(query_file):
                if qid != 0:  # skip the header line "query"
                    topic_file.write(topic_to_trectxt(qid, line.strip()))
                    qid_map[qid] = line
        with open(self.qid_map_file, "w") as f:
            json.dump(qid_map, f)
Example #9
    def download_if_missing(self, cachedir):
        if os.path.exists(self.config["documents"]["path"]):
            return
        elif "index_download" not in self.config["documents"]:
            raise IOError(
                f"a download URL is not available for collection={self.name}, and the collection path "
                f"{self.config['documents']['path']} does not exist; you must manually place the document "
                f"collection at this path in order to use this collection"
            )

        # Download the collection from URL and extract into a path in the cache directory.
        # To avoid re-downloading every call, we create an empty '/done' file in this directory on success.
        downloaded_collection_dir = os.path.join(cachedir, self.name,
                                                 "downloaded")
        done_file = os.path.join(downloaded_collection_dir, "done")
        document_dir = os.path.join(downloaded_collection_dir, "documents")

        self.config["documents"]["path"] = document_dir
        # already downloaded?
        if os.path.exists(done_file):
            return True

        # 1. Download and extract Anserini index to a temporary location
        tmp_dir = os.path.join(downloaded_collection_dir, "tmp")
        archive_file = os.path.join(tmp_dir, "archive_file")
        os.makedirs(document_dir, exist_ok=True)
        os.makedirs(tmp_dir, exist_ok=True)
        logger.info(
            "downloading index for missing collection %s to temporary file %s",
            self.name, archive_file)
        download_file(
            self.config["documents"]["index_download"]["url"],
            archive_file,
            expected_hash=self.config["documents"]["index_download"]["sha256"],
        )

        logger.debug("extracting to %s", tmp_dir)
        with tarfile.open(archive_file) as tar:
            tar.extractall(path=tmp_dir)

        extracted_dir = os.path.join(
            tmp_dir,
            self.config["documents"]["index_download"]["index_directory_inside"],
        )
        if not (os.path.exists(extracted_dir)
                and os.path.isdir(extracted_dir)):
            raise ValueError(
                f"could not find expected index directory {extracted_dir} in {tmp_dir}"
            )

        # 2. Move Anserini index to its correct location in the cache
        index_config = self.config["documents"]["index_download"]["index_config_string"]
        index_dir = os.path.join(cachedir, self.name, index_config, "index")
        shutil.move(extracted_dir, index_dir)

        # 3. Extract raw documents from the Anserini index to document_dir
        index_to_trec_docs(
            index_dir,
            document_dir,
            self.config["documents"]["index_download"]["expected_document_count"],
        )

        # remove temporary file and create a /done we can use to verify extraction was successful
        os.remove(archive_file)
        with open(done_file, "wt") as outf:
            print("", file=outf)

        logger.info("missing collection %s saved to %s", self.config["name"],
                    document_dir)
Example #10
    def download_if_missing(self):
        files = [self.qid_map_file, self.docid_map_file, self.qrel_file, self.topic_file, self.fold_file]
        if all([f.exists() for f in files]):
            return

        lang = self.config["lang"]

        tmp_dir = Path("/tmp")
        zip_fn = tmp_dir / f"{lang}.zip"
        if not zip_fn.exists():
            download_file(f"{self.url}/{lang}.zip", zip_fn)

        with ZipFile(zip_fn, "r") as zipobj:
            zipobj.extractall(tmp_dir)

        # prepare docid-url mapping from dedup.pkl
        pkl_fn = tmp_dir / f"{lang}_dedupe_definitions_v2.pkl"
        doc_objs = pickle.load(open(pkl_fn, "rb"))
        self._docid_map = self._prep_docid_map(doc_objs)
        assert self._get_n_docid() == len(doc_objs)

        # prepare folds, qrels, topics, docstring2qid  # TODO: shall we add negative samples?
        qrels, self._qid_map = defaultdict(dict), {}
        qids = {s: [] for s in ["train", "valid", "test"]}

        topic_file = open(self.topic_file, "w", encoding="utf-8")
        qrel_file = open(self.qrel_file, "w", encoding="utf-8")

        def gen_doc_from_gzdir(dir):
            """ generate parsed dict-format docs from all jsonl.gz files under the given directory """
            for fn in sorted(dir.glob("*.jsonl.gz")):
                with gzip.open(fn, "rb") as f:
                    for doc in f:
                        yield json.loads(doc)

        for set_name in qids:
            set_path = tmp_dir / lang / "final" / "jsonl" / set_name
            for doc in gen_doc_from_gzdir(set_path):
                code = remove_newline(" ".join(doc["code_tokens"]))
                docstring = remove_newline(" ".join(doc["docstring_tokens"]))
                n_words_in_docstring = len(docstring.split())
                if n_words_in_docstring >= 1024:
                    logger.warning(
                        "docstring has >= 1024 words; truncating the query to its first 1020 words, "
                        "since longer queries trigger Lucene's TooManyClauses error at search time"
                    )
                    docstring = " ".join(docstring.split()[:1020])  # avoid TooManyClauses

                docid = self.get_docid(doc["url"], code)
                qid = self._qid_map.get(docstring, str(len(self._qid_map)))
                qrel_file.write(f"{qid} Q0 {docid} 1\n")

                if docstring not in self._qid_map:
                    self._qid_map[docstring] = qid
                    qids[set_name].append(qid)
                    topic_file.write(topic_to_trectxt(qid, docstring))

        topic_file.close()
        qrel_file.close()

        # write to qid_map.json, docid_map, fold.json
        json.dump(self._qid_map, open(self.qid_map_file, "w"))
        json.dump(self._docid_map, open(self.docid_map_file, "w"))
        json.dump(
            {"s1": {"train_qids": qids["train"], "predict": {"dev": qids["valid"], "test": qids["test"]}}},
            open(self.fold_file, "w"),
        )
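
The fold file written at the end of Example #10 has a fixed nested layout: a single fold "s1" with training qids plus dev/test prediction qids. A short sketch of reading it back follows; the path is a placeholder standing in for self.fold_file.

import json

with open("/tmp/fold.json") as f:  # placeholder path for self.fold_file
    folds = json.load(f)

train_qids = folds["s1"]["train_qids"]
dev_qids = folds["s1"]["predict"]["dev"]
test_qids = folds["s1"]["predict"]["test"]
print(len(train_qids), len(dev_qids), len(test_qids))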