def cityscapes():
    """Scrape the Cityscapes leaderboard into an exported task database.

    The page at ``CITYSCAPES_URL`` is fetched through ``get_soup`` and is
    expected to contain exactly three ``tablepress`` tables. The first of
    them (pixel-level semantic segmentation) is converted into the SOTA rows
    of a single "Semantic Segmentation" task.

    Returns:
        The result of ``TaskDB.export()`` for a database holding that task.

    Raises:
        DataError: If the number of ``tablepress`` tables is not three.
    """
    page = get_soup(CITYSCAPES_URL)
    tables = page.findAll("table", attrs={"class": "tablepress"})

    # Bail out early when the page layout is not the one we know how to read.
    if len(tables) != 3:
        raise DataError("Got an unexpected number of SOTA tables.")

    # Only the first table (pixel-level semantic segmentation) is used.
    segmentation_table = tables[0]

    dataset = Dataset(
        name=DATASET_NAME,
        is_subdataset=False,
        description=DATASET_DESCRIPTION,
    )

    task = Task(name="Semantic Segmentation")
    task.datasets = [dataset]
    task.source_link = Link(title="CityScapes Leaderboard", url=CITYSCAPES_URL)

    # NOTE(review): the same metric name is listed twice. This looks like a
    # copy/paste slip, but the correct second name depends on the keys that
    # get_sota_rows() emits -- confirm against that helper before changing.
    dataset.sota.metrics = ["Mean IoU (class)", "Mean IoU (class)"]
    dataset.sota.rows = get_sota_rows(segmentation_table)

    database = TaskDB()
    database.add_task(task)
    return database.export()
def eff():
    """Convert the EFF AI-metrics JSON feed into an exported task database.

    The JSON document at ``EFF_URL`` is downloaded with ``requests``. Every
    entry of its ``"problems"`` list becomes one task (renamed through
    ``EFF_TASK_CONVERSION`` when a mapping exists). Every metric of a problem
    that has a non-empty ``"measures"`` list becomes one dataset whose single
    SOTA metric is the metric's ``"scale"``, with one SOTA row per measure.

    Returns:
        The result of ``TaskDB.export()`` for all collected tasks.

    Raises:
        HttpClientError: If the HTTP status code of the response is not 200.
    """
    response = requests.get(EFF_URL)
    if response.status_code != 200:
        raise HttpClientError("Resource unavailable", response=response)
    payload = json.loads(response.text)

    database = TaskDB()
    for problem in payload["problems"]:
        raw_name = problem["name"]
        # Use the project's preferred task name when one is configured.
        task = Task(
            name=(
                EFF_TASK_CONVERSION[raw_name]
                if raw_name in EFF_TASK_CONVERSION
                else raw_name
            )
        )
        task.source_link = Link(
            title="Progress of AI Research",
            url="https://github.com/AI-metrics/AI-metrics",
        )

        collected = []
        for metric in problem["metrics"]:
            # Metrics without any recorded measure yield no dataset.
            if "measures" not in metric or not metric["measures"]:
                continue

            scale = metric["scale"]
            dataset = Dataset(
                name=metric["name"],
                is_subdataset=False,
                sota=Sota(metrics=[scale]),
            )

            for measure in metric["measures"]:
                row = SotaRow(
                    model_name=measure["name"],
                    paper_title=measure["papername"],
                    paper_url=measure["url"],
                    metrics={scale: measure["value"]},
                )
                replicated_url = measure["replicated_url"]
                if replicated_url:
                    row.code_links.append(
                        Link(title="Replicated", url=replicated_url)
                    )
                dataset.sota.rows.append(row)

            collected.append(dataset)

        task.datasets = collected
        database.add_task(task)

    return database.export()
def squad():
    """Scrape the SQuAD leaderboard into an exported task database.

    The page at ``SQUAD_URL`` is fetched through ``get_soup`` and must hold
    exactly two ``performanceTable`` tables: the first is used for the
    dataset named ``DATASET_2_NAME`` and the second for ``DATASET_1_NAME``.
    Both datasets are attached to one "Question Answering" task and report
    the metrics ``EM`` and ``F1``.

    Returns:
        The result of ``TaskDB.export()`` for a database holding that task.

    Raises:
        DataError: If the number of ``performanceTable`` tables is not two.
    """
    page = get_soup(SQUAD_URL)
    tables = page.findAll("table", attrs={"class": "performanceTable"})

    # Bail out early when the page layout is not the one we know how to read.
    if len(tables) != 2:
        raise DataError("Got an unexpected number of SOTA tables.")

    table_v2, table_v1 = tables[0], tables[1]

    dataset_v2 = Dataset(
        name=DATASET_2_NAME,
        is_subdataset=False,
        description=DATASET_2_DESCRIPTION,
    )
    dataset_v1 = Dataset(
        name=DATASET_1_NAME,
        is_subdataset=False,
        description=DATASET_1_DESCRIPTION,
    )

    task = Task(name="Question Answering")
    task.datasets = [dataset_v2, dataset_v1]
    task.source_link = Link(title="SQuAD Leaderboard", url=SQUAD_URL)

    # Both leaderboards report the same pair of metrics.
    for dataset in (dataset_v2, dataset_v1):
        dataset.sota.metrics = ["EM", "F1"]

    # Fill in the scraped rows, newest dataset version first.
    dataset_v2.sota.rows = get_sota_rows(table_v2)
    dataset_v1.sota.rows = get_sota_rows(table_v1)

    database = TaskDB()
    database.add_task(task)
    return database.export()
def _reddit_parse_metric(text):
    """Parse a ``"name: value"`` string into a one-item dict (else ``{}``)."""
    parts = [p.strip() for p in text.split(":")]
    if len(parts) == 2:
        return {parts[0]: parts[1]}
    return {}


def _reddit_parse_entries(soup):
    """Turn the ``<tr>`` rows of one parsed table into plain entry dicts."""
    entries = []
    for row in soup.findAll("tr"):
        cells = row.findAll("td")
        # Rows with fewer than four cells (e.g. header rows) carry no entry.
        if len(cells) < 4:
            continue
        c_paper, c_datasets, c_metrics, c_code = cells[:4]

        # paper ref
        paper_link = c_paper.find("a")
        paper_url = paper_link["href"] if paper_link is not None else None

        # datasets
        dataset_names = [li.text.strip() for li in c_datasets.findAll("li")]

        # metrics: one dict per <li>, kept positionally aligned with the
        # dataset names (an unparsable item yields an empty dict).
        metrics = [
            _reddit_parse_metric(li.text) for li in c_metrics.findAll("li")
        ]
        if not metrics:
            # No list items: try to use the whole cell as a single value.
            metrics.append(_reddit_parse_metric(c_metrics.text))

        # source code ref
        code_links = [
            Link(title=a.text.strip(), url=a["href"])
            for a in c_code.findAll("a")
        ]

        entries.append({
            "paper_title": c_paper.text.strip(),
            "paper_url": paper_url,
            "dataset_names": dataset_names,
            "metrics": metrics,
            "code_links": code_links,
        })
    return entries


def _reddit_build_task(task_name, category, entries):
    """Build the Task (datasets plus SOTA rows) for one table's entries."""
    t = Task(name=task_name, categories=[category])
    t.source_link = Link(title="RedditSota", url=REDITSOTA_URL)

    # Dataset names and metrics are matched by position, so only entries
    # where both lists have the same length can be used.
    usable = [
        e for e in entries if len(e["dataset_names"]) == len(e["metrics"])
    ]

    # Collect every metric name mentioned for each dataset. Each entry is
    # paired through its *own* positions: indexing another entry with the
    # current entry's position can pick the wrong dataset's metric or raise
    # IndexError. First-seen order keeps the result deterministic.
    metric_names = {}
    for e in usable:
        for dataset_name, metric in zip(e["dataset_names"], e["metrics"]):
            names = metric_names.setdefault(dataset_name, [])
            names.extend(key for key in metric if key not in names)

    # Add datasets and performance on them
    data_map = {}
    for e in usable:
        for dataset_name, metric in zip(e["dataset_names"], e["metrics"]):
            # make sure the dataset exists
            if dataset_name not in data_map:
                dataset = Dataset(
                    name=dataset_name,
                    is_subdataset=False,
                    sota=Sota(metrics=list(metric_names[dataset_name])),
                )
                data_map[dataset_name] = dataset
                t.datasets.append(dataset)
            else:
                dataset = data_map[dataset_name]

            # record the metric for this dataset
            dataset.sota.rows.append(
                SotaRow(
                    model_name="",
                    paper_title=e["paper_title"],
                    paper_url=e["paper_url"],
                    metrics=metric,
                    code_links=e["code_links"],
                )
            )
    return t


def reddit():
    """Extract Reddit SOTA tables.

    Downloads the markdown document at ``REDITSOTA_URL`` and scans it line
    by line, under these layout assumptions::

        ### Category
        #### Task
        <table> ... </table>

    Each ``<table>`` found while both a category and a task heading are
    known is parsed with BeautifulSoup and turned into one task. Table rows
    with at least four cells provide, in order: paper, datasets, metrics and
    code links. After a table is consumed the current task is reset, so a
    task heading feeds at most one table.

    Returns:
        The result of ``TaskDB.export()`` for all collected tasks.
    """
    tdb = TaskDB()

    md_lines = requests.get(REDITSOTA_URL).text.split("\n")

    category = None
    task = None
    for i, line in enumerate(md_lines):
        if line.startswith("###") and not line.startswith("####"):
            category = line.replace("###", "").strip()

        if line.startswith("####") and not line.startswith("#####"):
            task = line.replace("####", "").strip()
            # Drop a leading enumeration such as "1." or "12.". The previous
            # pattern "^[0-9+].?" removed one digit plus any one character,
            # which mangled multi-digit numbers.
            task = re.sub(r"^[0-9]+\.?", "", task).strip()

        if "<table>" not in line.lower():
            continue

        # find the end of table
        end_i = None
        for j in range(i, len(md_lines)):
            if "</table>" in md_lines[j].lower():
                end_i = j + 1
                break

        # Skip unterminated tables and tables without heading context.
        if not (end_i and task and category):
            continue

        soup = BeautifulSoup("\n".join(md_lines[i:end_i]), "html.parser")
        entries = _reddit_parse_entries(soup)

        # add and reset the task
        tdb.add_task(_reddit_build_task(task, category, entries))
        task = None

    return tdb.export()
def snli():
    """Scrape the SNLI results table into an exported task database.

    The page at ``SNLI_URL`` is fetched through ``get_soup``; the second
    table with class ``newstuff`` is read row by row. Rows whose class is
    ``header`` or ``section`` are skipped; every other row yields one SOTA
    row with test accuracy, train accuracy and parameter count. For a few
    direct-PDF links the paper title is hard-coded; otherwise the link text
    is used as the title.

    Returns:
        The result of ``TaskDB.export()`` for a database holding the single
        "Natural Language Inference" task.
    """
    # Titles for papers that are linked as bare PDFs, keyed by link target.
    known_titles = {
        "http://nlp.stanford.edu/pubs/snli_paper.pdf": (
            "A large annotated corpus for learning natural language "
            "inference"
        ),
        "https://www.nyu.edu/projects/bowman/spinn.pdf": (
            "A Fast Unified Model for Parsing and Sentence "
            "Understanding"
        ),
        (
            "https://s3-us-west-2.amazonaws.com/openai-assets/"
            "research-covers/language-unsupervised/"
            "language_understanding_paper.pdf"
        ): (
            "Improving Language Understanding by Generative "
            "Pre-Training"
        ),
        (
            "https://pdfs.semanticscholar.org/adc1/"
            "84fcb04107f95e35ea1b07ef9aad749da8d7.pdf"
        ): "Deep Fusion LSTMs for Text Semantic Matching",
    }
    metric_names = ["% Test Accuracy", "% Train Accuracy", "Parameters"]

    page = get_soup(SNLI_URL)
    results_table = page.findAll("table", attrs={"class": "newstuff"})[1]

    sota_rows = []
    for row in results_table.findAll("tr"):
        # Header and section-title rows carry no model results.
        if row.get("class") == ["header"] or row.get("class") == ["section"]:
            continue

        cells = row.findAll("td")
        anchor = cells[0].find("a")
        paper_url = anchor.get("href")
        if paper_url in known_titles:
            paper_title = known_titles[paper_url]
        else:
            paper_title = anchor.text

        # The model cell may embed a "(code)" link label; strip it out.
        model_name = cells[1].text.strip().replace("(code)", "").strip()

        params = cells[2].text.strip()
        train_acc = cells[3].text.strip()
        test_acc = cells[4].text.strip()

        sota_rows.append(
            SotaRow(
                model_name=model_name,
                paper_title=paper_title,
                paper_url=paper_url,
                metrics={
                    "% Test Accuracy": test_acc,
                    "% Train Accuracy": train_acc,
                    "Parameters": params,
                },
            )
        )

    dataset = Dataset(
        name="SNLI",
        is_subdataset=False,
        sota=Sota(metrics=metric_names, rows=sota_rows),
    )
    task = Task(
        name="Natural Language Inference",
        datasets=[dataset],
        source_link=Link(
            title="The Stanford Natural Language Inference (SNLI) Corpus",
            url="https://nlp.stanford.edu/projects/snli/",
        ),
    )

    database = TaskDB()
    database.add_task(task)
    return database.export()