def setUp(self):
    """Create a scratch directory, fetch the covidex T5 run, and verify
    the stored round 3 cumulative qrels file against its MD5."""
    # Allow running either from the repo root or from inside clprf/.
    self.pyserini_root = '../..' if os.getcwd().endswith('clprf') else '.'
    self.tmp = f'{self.pyserini_root}/integrations/tmp{randint(0, 10000)}'

    # In the rare event there's a collision
    if os.path.exists(self.tmp):
        shutil.rmtree(self.tmp)
    os.mkdir(self.tmp)
    os.mkdir(f'{self.tmp}/runs')

    self.round3_runs = {
        'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round3-cumulative.txt':
            'dfccc32efd58a8284ae411e5c6b27ce9',
    }

    download_url(
        'https://storage.googleapis.com/neuralresearcher_data/trec_covid/data/53/covidex.t5.final.txt',
        f'{self.tmp}/runs')

    for url in self.round3_runs:
        print(f'Verifying stored run at {url}...')
        # Remove the Dropbox 'force download' parameter
        filename = re.sub('\\?dl=1$', '', url.split('/')[-1])
        download_url(url, self.tmp, md5=self.round3_runs[url], force=True)
        self.assertTrue(os.path.exists(os.path.join(self.tmp, filename)))
def setUp(self):
    """Download the pre-built CORD-19 (2020-05-01) indexes from GitLab and
    decompress the round 2 fusion baseline run into runs/."""
    download_and_unpack_index(
        'https://git.uwaterloo.ca/jimmylin/cord19-indexes/raw/master/2020-05-01/lucene-index-cord19-abstract-2020-05-01.tar.gz')
    download_and_unpack_index(
        'https://git.uwaterloo.ca/jimmylin/cord19-indexes/raw/master/2020-05-01/lucene-index-cord19-full-text-2020-05-01.tar.gz')
    download_and_unpack_index(
        'https://git.uwaterloo.ca/jimmylin/cord19-indexes/raw/master/2020-05-01/lucene-index-cord19-paragraph-2020-05-01.tar.gz')

    download_url(
        'https://git.uwaterloo.ca/jimmylin/covidex-trec-covid-runs/raw/master/round2/anserini.covid-r2.fusion1.txt.gz',
        'runs')

    # from https://stackoverflow.com/questions/31028815/how-to-unzip-gz-file-using-python
    with gzip.open('runs/anserini.covid-r2.fusion1.txt.gz', 'rb') as f_in, \
            open('runs/anserini.covid-r2.fusion1.txt', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
def setUp(self):
    """Download the CORD-19 (2020-05-01) indexes from Dropbox mirrors and
    decompress the round 2 fusion baseline run into runs/."""
    index_urls = [
        'https://www.dropbox.com/s/jdsc6wu0vbumpup/lucene-index-cord19-abstract-2020-05-01.tar.gz?dl=1',
        'https://www.dropbox.com/s/ouvp7zyqsp9y9gh/lucene-index-cord19-full-text-2020-05-01.tar.gz?dl=1',
        'https://www.dropbox.com/s/e1118vjuf58ojt4/lucene-index-cord19-paragraph-2020-05-01.tar.gz?dl=1',
    ]
    for index_url in index_urls:
        download_and_unpack_index(index_url)

    download_url('https://www.dropbox.com/s/wqb0vhxp98g7dxh/anserini.covid-r2.fusion1.txt.gz?dl=1',
                 'runs')

    # from https://stackoverflow.com/questions/31028815/how-to-unzip-gz-file-using-python
    with gzip.open('runs/anserini.covid-r2.fusion1.txt.gz', 'rb') as f_in:
        with open('runs/anserini.covid-r2.fusion1.txt', 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
def main(args):
    """Fetch the page at ``args.url``, find download links of the requested
    kind (Dropbox, GitLab, or Vault), and verify each one against the MD5
    checksum printed alongside it on the same line.

    Each verified file is downloaded to the current directory and removed
    immediately after the checksum passes.
    """
    print(args.url)
    contents = urllib.request.urlopen(args.url).read().decode('utf-8')

    # BUG FIX: patterns were non-raw strings with unescaped dots, so '.'
    # matched any character; raw strings with escaped dots match literally.
    pattern = None
    if args.dropbox:
        pattern = re.compile(r'https://www\.dropbox\.com/[^)]+')
    elif args.gitlab:
        # See https://git.uwaterloo.ca/jimmylin/anserini-indexes/-/raw/master/README.md
        # Tricky pattern to write because some lines might have two GitLab URLs
        pattern = re.compile(r'https://git\.uwaterloo\.ca/([^)]+)\.tar\.gz')
    elif args.vault:
        pattern = re.compile(r'https://vault\.cs\.uwaterloo\.ca/[^)]+')
    else:
        print('Must specify one of --dropbox, --gitlab, --vault: type of link to check.')
        exit(0)

    # Checksums appear in the page as 32 lowercase hex characters in backticks.
    md5sum_pattern = re.compile(r'`([a-z0-9]{32})`')

    for line in contents.splitlines():
        match = pattern.search(line)
        if match:
            md5sum_match = md5sum_pattern.search(line)
            if md5sum_match:
                url = match.group()
                # Vault links need a trailing '/download' to force a direct download.
                if args.vault:
                    if not url.endswith('/download'):
                        url = url + '/download'
                md5sum = md5sum_match.group(1)
                print(f'Downloading and verifying {url}')
                destination = download_url(url, '.', md5=md5sum)
                print(f'Finished downloading to {destination}, removing...')
                os.remove(destination)
def setUp(self):
    """Download and MD5-verify the stored round 3 and round 4 BM25 runs
    (GitLab mirrors) into a throwaway temp directory."""
    self.runs = {
        'https://git.uwaterloo.ca/jimmylin/covidex-trec-covid-runs/raw/master/round3/anserini.covid-r3.abstract.qq.bm25.txt':
            'd08d85c87e30d6c4abf54799806d282f',
        'https://git.uwaterloo.ca/jimmylin/covidex-trec-covid-runs/raw/master/round3/anserini.covid-r3.abstract.qdel.bm25.txt':
            'd552dff90995cd860a5727637f0be4d1',
        'https://git.uwaterloo.ca/jimmylin/covidex-trec-covid-runs/raw/master/round3/anserini.covid-r3.full-text.qq.bm25.txt':
            '6c9f4c09d842b887262ca84d61c61a1f',
        'https://git.uwaterloo.ca/jimmylin/covidex-trec-covid-runs/raw/master/round3/anserini.covid-r3.full-text.qdel.bm25.txt':
            'c5f9db7733c72eea78ece2ade44d3d35',
        'https://git.uwaterloo.ca/jimmylin/covidex-trec-covid-runs/raw/master/round3/anserini.covid-r3.paragraph.qq.bm25.txt':
            '872673b3e12c661748d8899f24d3ba48',
        'https://git.uwaterloo.ca/jimmylin/covidex-trec-covid-runs/raw/master/round3/anserini.covid-r3.paragraph.qdel.bm25.txt':
            'c1b966e4c3f387b6810211f339b35852',
        'https://git.uwaterloo.ca/jimmylin/covidex-trec-covid-runs/raw/master/round4/anserini.covid-r4.abstract.qq.bm25.txt':
            '56ac5a0410e235243ca6e9f0f00eefa1',
        'https://git.uwaterloo.ca/jimmylin/covidex-trec-covid-runs/raw/master/round4/anserini.covid-r4.abstract.qdel.bm25.txt':
            '115d6d2e308b47ffacbc642175095c74',
        'https://git.uwaterloo.ca/jimmylin/covidex-trec-covid-runs/raw/master/round4/anserini.covid-r4.full-text.qq.bm25.txt':
            'af0d10a5344f4007e6781e8d2959eb54',
        'https://git.uwaterloo.ca/jimmylin/covidex-trec-covid-runs/raw/master/round4/anserini.covid-r4.full-text.qdel.bm25.txt':
            '594d469b8f45cf808092a3d8e870eaf5',
        'https://git.uwaterloo.ca/jimmylin/covidex-trec-covid-runs/raw/master/round4/anserini.covid-r4.paragraph.qq.bm25.txt':
            '6f468b7b60aaa05fc215d237b5475aec',
        'https://git.uwaterloo.ca/jimmylin/covidex-trec-covid-runs/raw/master/round4/anserini.covid-r4.paragraph.qdel.bm25.txt':
            'b7b39629c12573ee0bfed8687dacc743',
    }

    self.tmp = f'tmp{randint(0, 10000)}'
    # In the rare event there's a collision
    if os.path.exists(self.tmp):
        shutil.rmtree(self.tmp)
    os.mkdir(self.tmp)

    for url in self.runs:
        print(f'Verifying stored run at {url}...')
        # Remove the Dropbox 'force download' parameter
        filename = re.sub('\\?dl=1$', '', url.split('/')[-1])
        download_url(url, self.tmp, md5=self.runs[url], force=True)
        self.assertTrue(os.path.exists(os.path.join(self.tmp, filename)))
        print('')
def setUp(self):
    """Create a scratch directory, download the archived round 5 runs from
    NIST, verify their MD5s, and decompress each gzipped run in place.

    BUG FIX: the gzip.open() call previously opened a hard-coded bogus path
    ('(unknown)') instead of the file that was just downloaded; it now opens
    '{self.tmp}/runs/{filename}'.
    """
    curdir = os.getcwd()
    # Allow running either from the repo root or from inside clprf/.
    if curdir.endswith('clprf'):
        self.pyserini_root = '../..'
    else:
        self.pyserini_root = '.'
    self.tmp = f'{self.pyserini_root}/integrations/tmp{randint(0, 10000)}'

    # In the rare event there's a collision
    if os.path.exists(self.tmp):
        shutil.rmtree(self.tmp)
    os.mkdir(self.tmp)
    os.mkdir(f'{self.tmp}/runs')

    self.round5_runs = {
        'https://ir.nist.gov/covidSubmit/archive/round5/covidex.r5.d2q.1s.gz':
            '2181ae5b7fe8bafbd3b41700f3ccde02',
        'https://ir.nist.gov/covidSubmit/archive/round5/covidex.r5.d2q.2s.gz':
            'e61f9b6de5ffbe1b5b82d35216968154',
        'https://ir.nist.gov/covidSubmit/archive/round5/covidex.r5.2s.gz':
            '6e517a5e044d8b7ce983f7e165cf4aeb',
        'https://ir.nist.gov/covidSubmit/archive/round5/covidex.r5.1s.gz':
            'dc9b4b45494294a8448cf0693f07f7fd'
    }

    for url in self.round5_runs:
        print(f'Verifying stored run at {url}...')
        filename = url.split('/')[-1]
        # Remove the Dropbox 'force download' parameter
        filename = re.sub(r'\?dl=1$', '', filename)
        # Name of the decompressed output: the filename minus its '.gz' suffix.
        gzip_filename = '.'.join(filename.split('.')[:-1])

        download_url(url, f'{self.tmp}/runs/', md5=self.round5_runs[url], force=True)
        self.assertTrue(os.path.exists(os.path.join(f'{self.tmp}/runs/', filename)))

        # Decompress the run that was just downloaded (was a broken path before).
        with gzip.open(f'{self.tmp}/runs/{filename}', 'rb') as f_in:
            with open(f'{self.tmp}/runs/{gzip_filename}', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
def check_runs(self, runs):
    """Download every run in ``runs`` (url -> md5) into a throwaway temp
    directory, asserting each file arrives and passes its checksum, then
    delete the directory."""
    scratch = f'tmp{randint(0, 10000)}'
    # In the rare event there's a collision
    if os.path.exists(scratch):
        shutil.rmtree(scratch)
    os.mkdir(scratch)

    for url, md5 in runs.items():
        print(f'Verifying stored run at {url}...')
        # Remove the Dropbox 'force download' parameter
        filename = re.sub('\\?dl=1$', '', url.split('/')[-1])
        download_url(url, scratch, md5=md5, force=True)
        self.assertTrue(os.path.exists(os.path.join(scratch, filename)))
        print('')

    shutil.rmtree(scratch)
def check(index):
    """For each entry in ``index`` (name -> {'md5': ..., 'urls': [...]}),
    download every mirror URL, verify it against the entry's MD5, and
    delete the file once verified."""
    for entry, info in index.items():
        print(f'# Checking "{entry}"...')
        md5sum = info['md5']
        for url in info['urls']:
            destination = download_url(url, '.', md5=md5sum)
            print(f'Finished downloading to {destination}, cleaning up.')
            os.remove(destination)
        print('\n')
def setUp(self):
    """Download and MD5-verify the stored round 3 and round 4 BM25 runs
    (Dropbox mirrors) into a throwaway temp directory."""
    self.runs = {
        'https://www.dropbox.com/s/g80cqdxud1l06wq/anserini.covid-r3.abstract.qq.bm25.txt?dl=1':
            'd08d85c87e30d6c4abf54799806d282f',
        'https://www.dropbox.com/s/sjcnxq7h0a3j3xz/anserini.covid-r3.abstract.qdel.bm25.txt?dl=1':
            'd552dff90995cd860a5727637f0be4d1',
        'https://www.dropbox.com/s/4bjx35sgosu0jz0/anserini.covid-r3.full-text.qq.bm25.txt?dl=1':
            '6c9f4c09d842b887262ca84d61c61a1f',
        'https://www.dropbox.com/s/mjt7y1ywae784d0/anserini.covid-r3.full-text.qdel.bm25.txt?dl=1':
            'c5f9db7733c72eea78ece2ade44d3d35',
        'https://www.dropbox.com/s/qwn7jd8vg2chjik/anserini.covid-r3.paragraph.qq.bm25.txt?dl=1':
            '872673b3e12c661748d8899f24d3ba48',
        'https://www.dropbox.com/s/2928i60fj2i09bt/anserini.covid-r3.paragraph.qdel.bm25.txt?dl=1':
            'c1b966e4c3f387b6810211f339b35852',
        'https://www.dropbox.com/s/mf79huhxfy96g6i/anserini.covid-r4.abstract.qq.bm25.txt?dl=1':
            '56ac5a0410e235243ca6e9f0f00eefa1',
        'https://www.dropbox.com/s/4zau6ejrkvgn9m7/anserini.covid-r4.abstract.qdel.bm25.txt?dl=1':
            '115d6d2e308b47ffacbc642175095c74',
        'https://www.dropbox.com/s/bpdopie6gqffv0w/anserini.covid-r4.full-text.qq.bm25.txt?dl=1':
            'af0d10a5344f4007e6781e8d2959eb54',
        'https://www.dropbox.com/s/rh0uy71ogbpas0v/anserini.covid-r4.full-text.qdel.bm25.txt?dl=1':
            '594d469b8f45cf808092a3d8e870eaf5',
        'https://www.dropbox.com/s/ifkjm8ff8g2aoh1/anserini.covid-r4.paragraph.qq.bm25.txt?dl=1':
            '6f468b7b60aaa05fc215d237b5475aec',
        'https://www.dropbox.com/s/keuogpx1dzinsgy/anserini.covid-r4.paragraph.qdel.bm25.txt?dl=1':
            'b7b39629c12573ee0bfed8687dacc743',
    }

    self.tmp = f'tmp{randint(0, 10000)}'
    # In the rare event there's a collision
    if os.path.exists(self.tmp):
        shutil.rmtree(self.tmp)
    os.mkdir(self.tmp)

    for url in self.runs:
        print(f'Verifying stored run at {url}...')
        # Remove the Dropbox 'force download' parameter
        filename = re.sub('\\?dl=1$', '', url.split('/')[-1])
        download_url(url, self.tmp, md5=self.runs[url], force=True)
        self.assertTrue(os.path.exists(os.path.join(self.tmp, filename)))
        print('')
def setUp(self):
    """Create a scratch directory, download and decompress the round 4
    duot5 run, and MD5-verify the stored cumulative qrels files."""
    # Allow running either from the repo root or from inside integrations/.
    self.pyserini_root = '..' if os.getcwd().endswith('integrations') else '.'
    self.tmp = f'{self.pyserini_root}/integrations/tmp{randint(0, 10000)}'

    # In the rare event there's a collision
    if os.path.exists(self.tmp):
        shutil.rmtree(self.tmp)
    os.mkdir(self.tmp)
    os.mkdir(f'{self.tmp}/runs')

    self.round4_runs = {
        'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round3-cumulative.txt':
            'dfccc32efd58a8284ae411e5c6b27ce9',
        'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round4-cumulative.txt':
            '7a5c27e8e052c49ff72d557051825973',
    }

    download_url(
        'https://ir.nist.gov/covidSubmit/archive/round4/covidex.r4.d2q.duot5.gz',
        f'{self.tmp}/runs')
    with gzip.open(f'{self.tmp}/runs/covidex.r4.d2q.duot5.gz', 'rb') as f_in:
        with open(f'{self.tmp}/runs/covidex.r4.d2q.duot5', 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

    for url in self.round4_runs:
        print(f'Verifying stored run at {url}...')
        # Remove the Dropbox 'force download' parameter
        filename = re.sub('\\?dl=1$', '', url.split('/')[-1])
        download_url(url, self.tmp, md5=self.round4_runs[url], force=True)
        self.assertTrue(os.path.exists(os.path.join(self.tmp, filename)))
        print('')
def setUp(self):
    """Download the CORD-19 (2020-05-01) indexes from Dropbox mirrors and
    decompress the round 2 fusion baseline run into runs/."""
    for index_url in (
            'https://www.dropbox.com/s/wxjoe4g71zt5za2/lucene-index-cord19-abstract-2020-05-01.tar.gz?dl=1',
            'https://www.dropbox.com/s/di27r5o2g5kat5k/lucene-index-cord19-full-text-2020-05-01.tar.gz?dl=1',
            'https://www.dropbox.com/s/6ib71scm925mclk/lucene-index-cord19-paragraph-2020-05-01.tar.gz?dl=1'):
        download_and_unpack_index(index_url)

    download_url(
        'https://www.dropbox.com/s/wqb0vhxp98g7dxh/anserini.covid-r2.fusion1.txt.gz?dl=1',
        'runs')

    # from https://stackoverflow.com/questions/31028815/how-to-unzip-gz-file-using-python
    with gzip.open('runs/anserini.covid-r2.fusion1.txt.gz', 'rb') as f_in, \
            open('runs/anserini.covid-r2.fusion1.txt', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
def main(args):
    """Fetch the page at ``args.url``, find Dropbox links with an MD5
    checksum on the same line, and download-and-verify each one,
    removing the file after a successful check."""
    print(args.url)
    contents = urllib.request.urlopen(args.url).read().decode('utf-8')

    dropbox_pattern = re.compile('https://www.dropbox.com/[^)]+')
    # Checksums appear in the page as 32 lowercase hex characters in backticks.
    md5sum_pattern = re.compile('`([a-z0-9]{32})`')

    for line in contents.splitlines():
        link_match = dropbox_pattern.search(line)
        if not link_match:
            continue
        md5sum_match = md5sum_pattern.search(line)
        if not md5sum_match:
            continue
        url = link_match.group()
        md5sum = md5sum_match.group(1)
        print(f'Downloading and verifying {url}')
        destination = download_url(url, '.', md5=md5sum)
        print(f'Finished downloading to {destination}, removing...')
        os.remove(destination)
def download_kilt_topics(cls, task: str, force=False):
    """Download the query file for a KILT ``task`` into the local cache.

    Tries each known mirror URL in turn and returns the path of the
    downloaded file. Raises ``ValueError`` when the task name is unknown
    or every mirror fails.
    """
    if task not in KILT_QUERY_INFO:
        raise ValueError(f'Unrecognized query name {task}')
    task = KILT_QUERY_INFO[task]
    md5 = task['md5']

    save_dir = os.path.join(get_cache_home(), 'queries')
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # Fall through to the next mirror on any download failure.
    for url in task['urls']:
        try:
            return download_url(url, save_dir, force=force, md5=md5)
        except (HTTPError, URLError):
            print(f'Unable to download encoded query at {url}, trying next URL...')
    raise ValueError('Unable to download encoded query at any known URLs.')
def main():
    """Drive the round 3 TREC-COVID reproduction: verify the stored runs,
    generate and fuse new runs, evaluate against the cumulative qrels,
    and check the NIST post-processed final runs.

    BUG FIX: when the required indexes are missing, the function previously
    printed a warning but kept running anyway; it now returns early.
    """
    if not (os.path.isdir(indexes[0]) and os.path.isdir(indexes[1]) and os.path.isdir(indexes[2])):
        print('Required indexes do not exist. Please download first.')
        return

    # Build the cumulative round 1+2 qrels used for evaluation.
    os.system(
        'cat src/main/resources/topics-and-qrels/qrels.covid-round1.txt ' +
        'src/main/resources/topics-and-qrels/qrels.covid-round2.txt ' +
        '> src/main/resources/topics-and-qrels/qrels.covid-round12.txt')

    round3_qrels = 'src/main/resources/topics-and-qrels/qrels.covid-round3.txt'
    round2_cumulative_qrels = 'src/main/resources/topics-and-qrels/qrels.covid-round12.txt'
    round3_cumulative_qrels = 'src/main/resources/topics-and-qrels/qrels.covid-round3-cumulative.txt'

    verify_stored_runs(stored_runs)
    perform_runs()
    perform_fusion()
    prepare_final_submissions(round2_cumulative_qrels)

    evaluate_runs(round2_cumulative_qrels, cumulative_runs)
    evaluate_runs(round3_cumulative_qrels, cumulative_runs)

    # Download the NIST post-processed runs.
    print('')
    download_url(
        'https://www.dropbox.com/s/ilqgky1tti0zvez/anserini.final-r3.fusion1.post-processed.txt?dl=1',
        'runs', force=True)
    download_url(
        'https://www.dropbox.com/s/ue3z6xxxca9krkb/anserini.final-r3.fusion2.post-processed.txt?dl=1',
        'runs', force=True)
    download_url(
        'https://www.dropbox.com/s/95vk831wp1ldnpm/anserini.final-r3.rf.post-processed.txt?dl=1',
        'runs', force=True)

    evaluate_runs(round3_qrels, final_runs)