def download_if_missing(self):
    cachedir = self.get_cache_path()
    tmp_dir, document_dir = Path("/tmp"), cachedir / "documents"
    expected_fns = [document_dir / "metadata.csv", document_dir / "document_parses"]
    if all([os.path.exists(f) for f in expected_fns]):
        return document_dir.as_posix()

    url = self.url % self.date
    tar_file = tmp_dir / f"covid-19-{self.date}.tar.gz"
    if not tar_file.exists():
        download_file(url, tar_file)

    with tarfile.open(tar_file) as f:
        f.extractall(path=cachedir)  # emb.tar.gz, metadata.csv, doc.tar.gz, changelog

    os.rename(cachedir / self.date, document_dir)

    doc_fn = "document_parses"
    if f"{doc_fn}.tar.gz" in os.listdir(document_dir):
        with tarfile.open(document_dir / f"{doc_fn}.tar.gz") as f:
            f.extractall(path=document_dir)
    else:
        self.transform_metadata(document_dir)

    # only document_parses and metadata.csv are expected
    for fn in os.listdir(document_dir):
        if (document_dir / fn) not in expected_fns:
            os.remove(document_dir / fn)

    return document_dir.as_posix()
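# Derived from the code above: after download_if_missing() succeeds, the cache is expected to
# contain exactly
#
#     <cachedir>/documents/metadata.csv
#     <cachedir>/documents/document_parses/
#
# Everything else extracted from the tarball (embeddings, changelog, ...) is removed.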
def test_covid_round3_qrel_conversion():
    collection_config = {"name": "covid", "round": 3, "coll_type": "abstract"}
    benchmark_config = {"name": "covid", "udelqexpand": False, "useprevqrels": False}
    collection = CovidCollection(collection_config)
    benchmark = CovidBenchmark(benchmark_config, provide={"collection": collection})
    benchmark.download_if_missing()

    docid_map_tmp = "/tmp/docid.map"
    newdocid_qrels_fn = "/tmp/new.docid.qrels"
    qrel_url = "https://ir.nist.gov/covidSubmit/data/qrels-covid_d3_j0.5-3.txt"
    docid_map_url = "https://ir.nist.gov/covidSubmit/data/changedIds-May19.csv"
    download_file(docid_map_url, docid_map_tmp)
    download_file(qrel_url, newdocid_qrels_fn)

    with open(docid_map_tmp) as f:
        old2new = {line.split(",")[0]: line.split(",")[1].strip() for line in f}  # strip trailing newline from the new docid

    newdocid_qrels = load_qrels(newdocid_qrels_fn)
    olddocid_qrels = benchmark.qrels

    # benchmark.qrels drops the judgments that already appeared in previous rounds, so the
    # converted olddocid_qrels has fewer entries than newdocid_qrels; we therefore cannot
    # simply assert that the converted qrels equal newdocid_qrels here.
    for qid in olddocid_qrels:
        for docid in olddocid_qrels[qid]:
            newdocid = old2new.get(docid, docid)
            assert olddocid_qrels[qid][docid] == newdocid_qrels[qid][newdocid]
def download_if_missing(self):
    url = "http://ciir.cs.umass.edu/downloads/Antique/antique-collection.txt"
    cachedir = self.get_cache_path()
    document_dir = os.path.join(cachedir, "documents")
    coll_filename = os.path.join(document_dir, "antique-collection.txt")
    if os.path.exists(coll_filename):
        return document_dir

    tmp_dir = cachedir / "tmp"
    tmp_filename = os.path.join(tmp_dir, "tmp.antique.file")
    os.makedirs(tmp_dir, exist_ok=True)
    os.makedirs(document_dir, exist_ok=True)
    download_file(
        url, tmp_filename, expected_hash="68b6688f5f2668c93f0e8e43384f66def768c4da46da4e9f7e2629c1c47a0c36"
    )

    self._convert_to_trec(inp_path=tmp_filename, outp_path=coll_filename)
    logger.info(f"antique collection file prepared, stored at {coll_filename}")

    for file in os.listdir(tmp_dir):  # in case there are legacy files
        os.remove(os.path.join(tmp_dir, file))
    shutil.rmtree(tmp_dir)

    return document_dir
def download_if_missing(self):
    cachedir = self.get_cache_path()
    document_dir = cachedir / "documents"
    coll_filename = document_dir / ("csn-" + self.config["lang"] + "-collection.txt")
    if coll_filename.exists():
        return document_dir.as_posix()

    zipfile = self.config["lang"] + ".zip"
    lang_url = f"{self.url}/{zipfile}"
    tmp_dir = cachedir / "tmp"
    zip_path = tmp_dir / zipfile
    if zip_path.exists():
        logger.info(f"{zipfile} already exists under directory {tmp_dir}, skipping download")
    else:
        tmp_dir.mkdir(exist_ok=True, parents=True)
        download_file(lang_url, zip_path)

    document_dir.mkdir(exist_ok=True, parents=True)  # tmp
    with ZipFile(zip_path, "r") as zipobj:
        zipobj.extractall(tmp_dir)

    pkl_path = tmp_dir / (self.config["lang"] + "_dedupe_definitions_v2.pkl")
    self._pkl2trec(pkl_path, coll_filename)
    return document_dir.as_posix()
def download_raw(self):
    cachedir = self.get_cache_path()
    tmp_dir = cachedir / "tmp"
    tmp_tar_fn, tmp_corpus_dir = tmp_dir / "nfcorpus.tar.gz", tmp_dir / "nfcorpus"

    os.makedirs(tmp_dir, exist_ok=True)

    if not tmp_tar_fn.exists():
        download_file(self.url, tmp_tar_fn, "ebc026d4a8bef3f866148b727e945a2073eb4045ede9b7de95dd50fd086b4256")

    with tarfile.open(tmp_tar_fn) as f:
        f.extractall(tmp_dir)
    return tmp_corpus_dir
def download_index(self, cachedir, url, sha256, index_directory_inside, index_cache_path_string, index_expected_document_count):
    # Download the collection from URL and extract into a path in the cache directory.
    # To avoid re-downloading every call, we create an empty '/done' file in this directory on success.
    done_file = os.path.join(cachedir, "done")
    document_dir = os.path.join(cachedir, "documents")

    # already downloaded?
    if os.path.exists(done_file):
        return document_dir

    # 1. Download and extract Anserini index to a temporary location
    tmp_dir = os.path.join(cachedir, "tmp_download")
    archive_file = os.path.join(tmp_dir, "archive_file")
    os.makedirs(document_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)
    logger.info("downloading index for missing collection %s to temporary file %s", self.module_name, archive_file)
    download_file(url, archive_file, expected_hash=sha256)

    logger.info("extracting index to %s (before moving to correct cache path)", tmp_dir)
    with tarfile.open(archive_file) as tar:
        tar.extractall(path=tmp_dir)

    extracted_dir = os.path.join(tmp_dir, index_directory_inside)
    if not (os.path.exists(extracted_dir) and os.path.isdir(extracted_dir)):
        raise ValueError(f"could not find expected index directory {extracted_dir} in {tmp_dir}")

    # 2. Move index to its correct location in the cache
    index_dir = os.path.join(cachedir, index_cache_path_string, "index")
    if not os.path.exists(os.path.join(index_dir, "done")):
        if os.path.exists(index_dir):
            shutil.rmtree(index_dir)
        shutil.move(extracted_dir, index_dir)

    # 3. Extract raw documents from the Anserini index to document_dir
    anserini_index_to_trec_docs(index_dir, document_dir, index_expected_document_count)

    # remove temporary files and create a /done we can use to verify extraction was successful
    shutil.rmtree(tmp_dir)
    with open(done_file, "wt") as outf:
        print("", file=outf)

    return document_dir
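# Hypothetical usage sketch (not from the source): every argument value below is a placeholder,
# and `collection` is assumed to be an instantiated collection module exposing download_index()
# and get_cache_path(). The call returns the directory holding TREC-format documents exported
# from the downloaded prebuilt Anserini index.
def _demo_download_index(collection):
    return collection.download_index(
        cachedir=collection.get_cache_path(),
        url="https://example.org/prebuilt-anserini-index.tar.gz",
        sha256="<expected sha256 of the archive>",
        index_directory_inside="lucene-index.example",
        index_cache_path_string="index-anserini",
        index_expected_document_count=1000,
    )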
def download_and_extract(url, tmp_dir, expected_fns=None):
    tmp_dir.mkdir(exist_ok=True, parents=True)
    gz_name = url.split("/")[-1]
    output_gz = tmp_dir / gz_name
    if not output_gz.exists():
        logger.info(f"Downloading from {url}...")
        download_file(url, output_gz)

    extract_dir = None
    t = time()
    if str(output_gz).endswith("tar.gz"):
        tmp_dir = tmp_dir / gz_name.replace(".tar.gz", "")
        logger.info(f"tmp_dir: {tmp_dir}")
        if not tmp_dir.exists():
            logger.info(f"{tmp_dir} does not exist, extracting from {output_gz}...")
            with tarfile.open(output_gz, "r:gz") as f:
                f.extractall(path=tmp_dir)

        if os.path.isdir(tmp_dir):  # and set(os.listdir(tmp_dir)) != expected_fns:
            extract_dir = tmp_dir
        elif not os.path.isdir(tmp_dir):  # and tmp_dir != list(expected_fns)[0]:
            extract_dir = tmp_dir.parent
    else:
        outp_fn = tmp_dir / gz_name.replace(".gz", "")
        if not outp_fn.exists():
            logger.info(f"{outp_fn} does not exist, extracting from {output_gz}...")
            with gzip.open(output_gz, "rb") as fin, open(outp_fn, "wb") as fout:
                shutil.copyfileobj(fin, fout)
        extract_dir = tmp_dir

    duration = int(time() - t)
    minutes, seconds = duration // 60, duration % 60
    logger.info(f"{output_gz} extracted after {duration} seconds (00:{minutes}:{seconds})")
    return extract_dir
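# Hypothetical usage sketch (not from the source): the URL and cache path are placeholders.
# For a .tar.gz the helper typically returns the directory the archive expanded into; for a
# bare .gz it returns tmp_dir itself, which then contains the decompressed file.
def _demo_download_and_extract():
    extract_dir = download_and_extract(
        "https://example.org/data/collection.tar.gz", Path("/tmp/capreolus-demo")
    )
    print(sorted(extract_dir.iterdir()))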
def download_if_missing(self): """ download query.csv and prepare queryid - query mapping file """ if self.topic_file.exists() and self.qid_map_file.exists(): return tmp_dir = Path("/tmp") tmp_dir.mkdir(exist_ok=True, parents=True) self.file_fn.mkdir(exist_ok=True, parents=True) query_fn = tmp_dir / f"query.csv" if not query_fn.exists(): download_file(self.url, query_fn) # prepare qid - query qid_map = {} topic_file = open(self.topic_file, "w", encoding="utf-8") query_file = open(query_fn) for qid, line in enumerate(query_file): if qid != 0: # ignore the first line "query" topic_file.write(topic_to_trectxt(qid, line.strip())) qid_map[qid] = line topic_file.close() json.dump(qid_map, open(self.qid_map_file, "w"))
def download_if_missing(self, cachedir):
    if os.path.exists(self.config["documents"]["path"]):
        return
    elif "index_download" not in self.config["documents"]:
        raise IOError(
            f"a download URL is not available for collection={self.name} and the collection path "
            f"{self.config['documents']['path']} does not exist; you must manually place the document "
            "collection at this path in order to use this collection"
        )

    # Download the collection from URL and extract into a path in the cache directory.
    # To avoid re-downloading every call, we create an empty '/done' file in this directory on success.
    downloaded_collection_dir = os.path.join(cachedir, self.name, "downloaded")
    done_file = os.path.join(downloaded_collection_dir, "done")
    document_dir = os.path.join(downloaded_collection_dir, "documents")
    self.config["documents"]["path"] = document_dir

    # already downloaded?
    if os.path.exists(done_file):
        return True

    # 1. Download and extract Anserini index to a temporary location
    tmp_dir = os.path.join(downloaded_collection_dir, "tmp")
    archive_file = os.path.join(tmp_dir, "archive_file")
    os.makedirs(document_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)
    logger.info("downloading index for missing collection %s to temporary file %s", self.name, archive_file)
    download_file(
        self.config["documents"]["index_download"]["url"],
        archive_file,
        expected_hash=self.config["documents"]["index_download"]["sha256"],
    )

    logger.debug("extracting to %s", tmp_dir)
    with tarfile.open(archive_file) as tar:
        tar.extractall(path=tmp_dir)

    extracted_dir = os.path.join(tmp_dir, self.config["documents"]["index_download"]["index_directory_inside"])
    if not (os.path.exists(extracted_dir) and os.path.isdir(extracted_dir)):
        raise ValueError(f"could not find expected index directory {extracted_dir} in {tmp_dir}")

    # 2. Move Anserini index to its correct location in the cache
    index_config = self.config["documents"]["index_download"]["index_config_string"]
    index_dir = os.path.join(cachedir, self.name, index_config, "index")
    shutil.move(extracted_dir, index_dir)

    # 3. Extract raw documents from the Anserini index to document_dir
    index_to_trec_docs(index_dir, document_dir, self.config["documents"]["index_download"]["expected_document_count"])

    # remove temporary file and create a /done we can use to verify extraction was successful
    os.remove(archive_file)
    with open(done_file, "wt") as outf:
        print("", file=outf)
    logger.info("missing collection %s saved to %s", self.config["name"], document_dir)
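# For reference, the method above reads these keys from self.config; the values shown are
# illustrative placeholders, not taken from the source:
#
#     {
#         "name": "examplecollection",
#         "documents": {
#             "path": "/path/that/may/not/exist/yet",
#             "index_download": {
#                 "url": "https://example.org/prebuilt-anserini-index.tar.gz",
#                 "sha256": "<expected sha256 of the archive>",
#                 "index_directory_inside": "lucene-index.examplecollection",
#                 "index_config_string": "index-anserini",
#                 "expected_document_count": 1000,
#             },
#         },
#     }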
def download_if_missing(self):
    files = [self.qid_map_file, self.docid_map_file, self.qrel_file, self.topic_file, self.fold_file]
    if all([f.exists() for f in files]):
        return

    lang = self.config["lang"]
    tmp_dir = Path("/tmp")
    zip_fn = tmp_dir / f"{lang}.zip"
    if not zip_fn.exists():
        download_file(f"{self.url}/{lang}.zip", zip_fn)

    with ZipFile(zip_fn, "r") as zipobj:
        zipobj.extractall(tmp_dir)

    # prepare docid-url mapping from dedup.pkl
    pkl_fn = tmp_dir / f"{lang}_dedupe_definitions_v2.pkl"
    doc_objs = pickle.load(open(pkl_fn, "rb"))
    self._docid_map = self._prep_docid_map(doc_objs)
    assert self._get_n_docid() == len(doc_objs)

    # prepare folds, qrels, topics, docstring2qid
    # TODO: shall we add negative samples?
    qrels, self._qid_map = defaultdict(dict), {}
    qids = {s: [] for s in ["train", "valid", "test"]}

    topic_file = open(self.topic_file, "w", encoding="utf-8")
    qrel_file = open(self.qrel_file, "w", encoding="utf-8")

    def gen_doc_from_gzdir(dir):
        """ generate parsed dict-format doc from all jsonl.gz files under given directory """
        for fn in sorted(dir.glob("*.jsonl.gz")):
            f = gzip.open(fn, "rb")
            for doc in f:
                yield json.loads(doc)

    for set_name in qids:
        set_path = tmp_dir / lang / "final" / "jsonl" / set_name
        for doc in gen_doc_from_gzdir(set_path):
            code = remove_newline(" ".join(doc["code_tokens"]))
            docstring = remove_newline(" ".join(doc["docstring_tokens"]))
            n_words_in_docstring = len(docstring.split())
            if n_words_in_docstring >= 1024:
                logger.warning(
                    "chunk query to first 1000 words otherwise TooManyClause would be triggered "
                    "at lucene at search stage, "
                )
                docstring = " ".join(docstring.split()[:1020])  # for TooManyClause

            docid = self.get_docid(doc["url"], code)
            qid = self._qid_map.get(docstring, str(len(self._qid_map)))
            qrel_file.write(f"{qid} Q0 {docid} 1\n")

            if docstring not in self._qid_map:
                self._qid_map[docstring] = qid
                qids[set_name].append(qid)
                topic_file.write(topic_to_trectxt(qid, docstring))

    topic_file.close()
    qrel_file.close()

    # write to qid_map.json, docid_map, fold.json
    json.dump(self._qid_map, open(self.qid_map_file, "w"))
    json.dump(self._docid_map, open(self.docid_map_file, "w"))
    json.dump(
        {"s1": {"train_qids": qids["train"], "predict": {"dev": qids["valid"], "test": qids["test"]}}},
        open(self.fold_file, "w"),
    )
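# Derived from the final json.dump call above: fold.json ends up with a single fold "s1" whose
# qids come from the CodeSearchNet train/valid/test splits (the qid values below are illustrative):
#
#     {
#         "s1": {
#             "train_qids": ["0", "1", "2"],
#             "predict": {"dev": ["3"], "test": ["4"]}
#         }
#     }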