def cityscapes():
    """Extract Cityscapes SOTA tables."""
    soup = get_soup(CITYSCAPES_URL)
    sota_tables = soup.findAll("table", attrs={"class": "tablepress"})
    if len(sota_tables) == 3:
        cityscapes = sota_tables[0]  # pixel-level semantic segmentation task

        dataset = Dataset(
            name=DATASET_NAME,
            is_subdataset=False,
            description=DATASET_DESCRIPTION,
        )
        task = Task(name="Semantic Segmentation")
        task.datasets = [dataset]
        task.source_link = Link(title="CityScapes Leaderboard", url=CITYSCAPES_URL)

        # scrape the evaluation values on the dataset
        dataset.sota.metrics = ["Mean IoU (class)", "Mean iIoU (class)"]
        dataset.sota.rows = get_sota_rows(cityscapes)

        tdb = TaskDB()
        tdb.add_task(task)
        return tdb.export()
    else:
        raise DataError("Got an unexpected number of SOTA tables.")
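# `get_soup` is called by the scrapers in this section but defined elsewhere
# in the repo. A minimal sketch of the assumed behavior, reusing the
# `requests`/BeautifulSoup dependencies and the `HttpClientError` type seen
# below (the body is a reconstruction, not the actual helper):
def get_soup(url):
    """Fetch a URL and return its parsed HTML (sketch)."""
    response = requests.get(url)
    if response.status_code != 200:
        raise HttpClientError("Resource unavailable", response=response)
    return BeautifulSoup(response.text, "html.parser")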
def eff():
    """Extract EFF SOTA tables."""
    response = requests.get(EFF_URL)
    if response.status_code != 200:
        raise HttpClientError("Resource unavailable", response=response)

    j = json.loads(response.text)

    tdb = TaskDB()
    for problem in j["problems"]:
        if problem["name"] in EFF_TASK_CONVERSION:
            problem_name = EFF_TASK_CONVERSION[problem["name"]]
        else:
            problem_name = problem["name"]

        task = Task(name=problem_name)
        task.source_link = Link(
            title="Progress of AI Research",
            url="https://github.com/AI-metrics/AI-metrics",
        )

        datasets = []
        for metric in problem["metrics"]:
            if "measures" in metric and metric["measures"]:
                measures = metric["measures"]
                dataset = Dataset(
                    name=metric["name"],
                    is_subdataset=False,
                    sota=Sota(metrics=[metric["scale"]]),
                )
                for measure in measures:
                    sr = SotaRow(
                        model_name=measure["name"],
                        paper_title=measure["papername"],
                        paper_url=measure["url"],
                        metrics={metric["scale"]: measure["value"]},
                    )
                    if measure["replicated_url"]:
                        sr.code_links.append(
                            Link(
                                title="Replicated",
                                url=measure["replicated_url"],
                            )
                        )
                    dataset.sota.rows.append(sr)
                datasets.append(dataset)

        task.datasets = datasets
        tdb.add_task(task)
    return tdb.export()
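# For reference, the shape of the EFF/AI-metrics JSON that `eff()` expects,
# reconstructed from the fields the function reads; the concrete names and
# values here are made up for illustration:
EFF_SAMPLE = {
    "problems": [
        {
            "name": "Image classification",
            "metrics": [
                {
                    "name": "ImageNet",
                    "scale": "Top-5 error",
                    "measures": [
                        {
                            "name": "Example model",
                            "papername": "Example paper",
                            "url": "https://example.com/paper",
                            "value": 0.05,
                            "replicated_url": "",
                        }
                    ],
                }
            ],
        }
    ]
}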
def squad():
    """Extract SQuAD SOTA tables."""
    soup = get_soup(SQUAD_URL)
    sota_tables = soup.findAll("table", attrs={"class": "performanceTable"})
    if len(sota_tables) == 2:
        squad2 = sota_tables[0]
        squad1 = sota_tables[1]

        dataset2 = Dataset(
            name=DATASET_2_NAME,
            is_subdataset=False,
            description=DATASET_2_DESCRIPTION,
        )
        dataset1 = Dataset(
            name=DATASET_1_NAME,
            is_subdataset=False,
            description=DATASET_1_DESCRIPTION,
        )
        task = Task(name="Question Answering")
        task.datasets = [dataset2, dataset1]
        task.source_link = Link(title="SQuAD Leaderboard", url=SQUAD_URL)

        # scrape the evaluation values on the two datasets
        dataset2.sota.metrics = ["EM", "F1"]
        dataset1.sota.metrics = ["EM", "F1"]
        dataset2.sota.rows = get_sota_rows(squad2)
        dataset1.sota.rows = get_sota_rows(squad1)

        tdb = TaskDB()
        tdb.add_task(task)
        return tdb.export()
    else:
        raise DataError("Got an unexpected number of SOTA tables.")
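# `get_sota_rows` is referenced above but not defined in this section. A
# plausible sketch for the SQuAD leaderboard layout, assuming a
# rank/model/EM/F1 column order; the name, column indices, and row filtering
# here are reconstructions, not the repo's actual helper:
def get_sota_rows_sketch(table):
    rows = []
    for tr in table.findAll("tr")[1:]:  # skip the header row
        cells = tr.findAll("td")
        if len(cells) < 4:
            continue
        rows.append(
            SotaRow(
                model_name=cells[1].text.strip(),
                metrics={
                    "EM": cells[2].text.strip(),
                    "F1": cells[3].text.strip(),
                },
            )
        )
    return rows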
def parse_subdatasets(
    parent: Dataset, pairs: List[Tuple[ElementTree, ElementTree]]
) -> List[Dataset]:
    """Build subdatasets from (description paragraph, table) pairs."""
    subdatasets = []
    for p, table in pairs:
        strong = p.find("strong")
        if strong is None:
            continue
        subdatasets.append(
            Dataset(
                name=strong.text.strip().strip(":"),
                is_subdataset=True,
                parent=parent,
                sota=parse_sota(table),
            )
        )
    return subdatasets
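# The (paragraph, table) pairs handed to `parse_subdatasets` come from the
# markdown element tree and are shaped roughly like this; the subdataset
# name lives in the <strong> tag, and "WikiText-2" is an illustrative name:
from xml.etree import ElementTree as ET

p_el = ET.fromstring("<p><strong>WikiText-2:</strong> description ...</p>")
table_el = ET.fromstring("<table><tr><td>...</td></tr></table>")
# parse_subdatasets(parent=dataset, pairs=[(p_el, table_el)]) would return a
# single subdataset named "WikiText-2" with its SOTA parsed from table_el.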
def fix_dataset(dataset: Dataset) -> Optional[Dataset]:
    """Walk through the dataset and return a valid one.

    A valid dataset either contains only subdatasets with non-empty SOTA
    tables or, if it has no subdatasets, has a non-empty SOTA table itself.
    """
    dataset.subdatasets = [
        subdataset
        for subdataset in dataset.subdatasets
        if len(subdataset.sota.rows) > 0
    ]
    if len(dataset.subdatasets) > 0:
        return dataset
    if len(dataset.sota.rows) > 0:
        return dataset
    return None
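# Example of `fix_dataset` in use, assuming Dataset defaults to an empty
# Sota, as the scrapers above rely on: a dataset whose only subdataset has
# an empty SOTA table, and no rows of its own, is dropped entirely.
parent = Dataset(name="Parent", is_subdataset=False)
parent.subdatasets = [
    Dataset(name="Empty child", is_subdataset=True, parent=parent)
]
assert fix_dataset(parent) is None  # no rows anywhere -> dropped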
def reddit():
    """Extract Reddit SOTA tables."""
    tdb = TaskDB()
    md = requests.get(REDITSOTA_URL).text

    # assumptions:
    # ### Category
    # #### Task
    md_lines = md.split("\n")
    category = None
    task = None
    for i in range(len(md_lines)):
        line = md_lines[i]
        if line.startswith("###") and not line.startswith("####"):
            category = line.replace("###", "").strip()
        if line.startswith("####") and not line.startswith("#####"):
            task = line.replace("####", "").strip()
            # strip a leading "1." / "2." enumeration prefix
            task = re.sub(r"^[0-9]+\.?", "", task).strip()
        if "<table>" in line.lower():
            end_i = None
            # find the end of the table
            for j in range(i, len(md_lines)):
                if "</table>" in md_lines[j].lower():
                    end_i = j + 1
                    break
            if end_i and task and category:
                html_lines = md_lines[i:end_i]
                h = "\n".join(html_lines)
                soup = BeautifulSoup(h, "html.parser")

                # parse out the individual rows
                entries = []
                rows = soup.findAll("tr")
                for row in rows:
                    cells = row.findAll("td")
                    if len(cells) >= 4:
                        # paper ref
                        c_paper = cells[0]
                        paper_title = c_paper.text.strip()
                        paper_url = None
                        if c_paper.find("a"):
                            paper_url = c_paper.find("a")["href"]

                        # datasets
                        c_datasets = cells[1]
                        c_datasets_li = c_datasets.findAll("li")
                        dataset_names = []
                        for dataset_li in c_datasets_li:
                            dataset_names.append(dataset_li.text.strip())

                        # metrics
                        c_metrics = cells[2]
                        c_metrics_li = c_metrics.findAll("li")
                        metrics = []
                        for metrics_li in c_metrics_li:
                            parts = metrics_li.text.split(":")
                            parts = [p.strip() for p in parts]
                            m = {}
                            if len(parts) == 2:
                                m[parts[0]] = parts[1]
                            metrics.append(m)
                        if not metrics:
                            # try to use the cell as a single value
                            parts = c_metrics.text.split(":")
                            parts = [p.strip() for p in parts]
                            m = {}
                            if len(parts) == 2:
                                m[parts[0]] = parts[1]
                            metrics.append(m)

                        # source code ref
                        c_code = cells[3]
                        c_code_a = c_code.findAll("a")
                        code_links = []
                        for code_a in c_code_a:
                            code_links.append(
                                Link(
                                    title=code_a.text.strip(),
                                    url=code_a["href"],
                                )
                            )

                        entries.append({
                            "paper_title": paper_title,
                            "paper_url": paper_url,
                            "dataset_names": dataset_names,
                            "metrics": metrics,
                            "code_links": code_links,
                        })

                # Add the new task
                t = Task(name=task, categories=[category])
                t.source_link = Link(title="RedditSota", url=REDITSOTA_URL)

                # Add datasets and performance on them
                data_map = {}
                for e in entries:
                    if len(e["dataset_names"]) == len(e["metrics"]):
                        for j in range(len(e["dataset_names"])):
                            dataset_name = e["dataset_names"][j]

                            # make sure the dataset exists
                            if dataset_name not in data_map:
                                # collect all the metrics mentioned for this
                                # dataset, looking up each entry's own index
                                # for the dataset
                                all_metrics = [
                                    list(
                                        ee["metrics"][
                                            ee["dataset_names"].index(
                                                dataset_name)
                                        ].keys()
                                    )
                                    for ee in entries
                                    if dataset_name in ee["dataset_names"]
                                    and len(ee["metrics"])
                                    == len(ee["dataset_names"])
                                ]
                                all_metrics = [
                                    item
                                    for sublist in all_metrics
                                    for item in sublist
                                ]
                                all_metrics = list(set(all_metrics))

                                dataset = Dataset(
                                    name=dataset_name,
                                    is_subdataset=False,
                                    sota=Sota(metrics=all_metrics),
                                )
                                data_map[dataset_name] = dataset
                                t.datasets.append(dataset)
                            else:
                                dataset = data_map[dataset_name]

                            # record the metric for this dataset
                            sr = SotaRow(
                                model_name="",
                                paper_title=e["paper_title"],
                                paper_url=e["paper_url"],
                                metrics=e["metrics"][j],
                                code_links=e["code_links"],
                            )
                            dataset.sota.rows.append(sr)

                # add and reset the task
                tdb.add_task(t)
                task = None

    return tdb.export()
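# For reference, the RedditSota markdown shape that `reddit()` assumes,
# reconstructed from the parsing logic above (abbreviated and illustrative):
#
#   ### Supervised Learning
#   #### 1. Language Modelling
#   <table>
#     <tr>
#       <td>Paper title (link)</td>                  <!-- cells[0]: paper -->
#       <td><ul><li>PTB</li></ul></td>               <!-- cells[1]: datasets -->
#       <td><ul><li>Perplexity: 47.7</li></ul></td>  <!-- cells[2]: metrics -->
#       <td><a href="...">code</a></td>              <!-- cells[3]: code -->
#     </tr>
#   </table>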
def snli():
    """Extract SNLI SOTA tables."""
    soup = get_soup(SNLI_URL)

    table = soup.findAll("table", attrs={"class": "newstuff"})[1]
    rows = table.findAll("tr")
    sota_rows = []

    # suffix = ""
    for row in rows:
        # ignore the header
        if row.get("class") == ["header"]:
            pass
        elif row.get("class") == ["section"]:
            # suffix = row.text.replace("models", "").strip()
            continue
        else:
            cells = row.findAll("td")
            a = cells[0].find("a")
            paper_url = a.get("href")
            if paper_url == "http://nlp.stanford.edu/pubs/snli_paper.pdf":
                paper_title = (
                    "A large annotated corpus for learning natural language "
                    "inference"
                )
            elif paper_url == "https://www.nyu.edu/projects/bowman/spinn.pdf":
                paper_title = (
                    "A Fast Unified Model for Parsing and Sentence "
                    "Understanding"
                )
            elif (
                paper_url == "https://s3-us-west-2.amazonaws.com/openai-assets/"
                "research-covers/language-unsupervised/"
                "language_understanding_paper.pdf"
            ):
                paper_title = (
                    "Improving Language Understanding by Generative "
                    "Pre-Training"
                )
            elif (
                paper_url == "https://pdfs.semanticscholar.org/adc1/"
                "84fcb04107f95e35ea1b07ef9aad749da8d7.pdf"
            ):
                paper_title = "Deep Fusion LSTMs for Text Semantic Matching"
            else:
                paper_title = a.text

            model_name = cells[1].text.strip()
            # if suffix:
            #     model_name = "%s (%s)" % (model_name, suffix)
            model_name = model_name.replace("(code)", "").strip()

            params = cells[2].text.strip()
            train_acc = cells[3].text.strip()
            test_acc = cells[4].text.strip()

            sota_rows.append(
                SotaRow(
                    model_name=model_name,
                    paper_title=paper_title,
                    paper_url=paper_url,
                    metrics={
                        "% Test Accuracy": test_acc,
                        "% Train Accuracy": train_acc,
                        "Parameters": params,
                    },
                )
            )

    task = Task(
        name="Natural Language Inference",
        datasets=[
            Dataset(
                name="SNLI",
                is_subdataset=False,
                sota=Sota(
                    metrics=[
                        "% Test Accuracy",
                        "% Train Accuracy",
                        "Parameters",
                    ],
                    rows=sota_rows,
                ),
            )
        ],
        source_link=Link(
            title="The Stanford Natural Language Inference (SNLI) Corpus",
            url="https://nlp.stanford.edu/projects/snli/",
        ),
    )
    tdb = TaskDB()
    tdb.add_task(task)
    return tdb.export()
def run(self, root):
    # Assumptions:
    # 1) H1 are tasks
    # 2) Everything until the next heading is the task description
    # 3) H2 are subtasks, H3 are datasets, H4 are subdatasets

    # Algorithm:
    # 1) Split the document by headings
    sections = []
    cur = []
    for el in root:
        if el.tag in {"h1", "h2", "h3", "h4", "h5"}:
            if cur:
                sections.append(cur)
            cur = [el]
        else:
            cur.append(el)
    if cur:
        sections.append(cur)

    # 2) Parse each heading section one-by-one
    task = None  # current task element being parsed
    subtask = None  # current subtask being parsed
    dataset = None  # current dataset being parsed
    for section_index in range(len(sections)):
        section = sections[section_index]
        header = section[0]
        if header.text is None:
            # Invalid section
            continue

        # Task definition
        if header.tag == "h1":
            if task is not None:
                self.parsed.append(task)
            task = Task(
                name=header.text.strip().title(),
                description=Text.parse(
                    [e for e in section if e.tag == "p"]).text,
            )
            # reset subtasks and datasets
            subtask = None
            dataset = None

        # Subtask definition
        if header.tag == "h2":
            if task is None:
                logger.error(
                    "Unexpected subtask without a parent task at: %s",
                    header.text,
                )
                continue
            # new subtask
            subtask = Task(
                name=header.text.strip().title(),
                description=Text.parse(
                    [e for e in section if e.tag == "p"]).text,
                parent=task,
            )
            task.subtasks.append(subtask)
            # reset the last dataset
            dataset = None

        # Dataset definition
        if header.tag == "h3" and "Table of content" not in header.text:
            if task is None:
                logger.error(
                    "Unexpected dataset without a parent task at: %s",
                    header.text,
                )
                continue

            tables = [t for t in section if t.tag == "table"]
            n_tables = len(tables)
            if n_tables < 2:
                text = Text.parse([e for e in section if e.tag == "p"])
                dataset = Dataset(
                    name=header.text.strip().strip(":"),
                    description=text.text,
                    links=text.links,
                )
                if n_tables == 1:
                    dataset.sota = parse_sota(tables[0])
            else:
                table_idxs = [
                    i for i, el in enumerate(section) if el.tag == "table"
                ]
                # pair each table with the paragraph directly preceding it;
                # that paragraph carries the subdataset name
                pairs = []
                for idx in table_idxs:
                    if idx >= 2 and section[idx - 1].tag == "p":
                        pairs.append((section[idx - 1], section[idx]))

                description_idxs = set(range(1, len(section))) - set(table_idxs)
                description_ps = [
                    el for i, el in enumerate(section) if i in description_idxs
                ]
                text = Text.parse(description_ps)

                dataset = Dataset(
                    name=header.text.strip().strip(":"),
                    description=text.text,
                    links=text.links,
                )
                dataset.subdatasets = parse_subdatasets(
                    parent=dataset, pairs=pairs
                )

            if subtask is not None:
                # we are in a subtask, add everything here
                subtask.datasets.append(dataset)
            else:
                task.datasets.append(dataset)

    if task:
        self.parsed.append(task)
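# `run(self, root)` matches the Python-Markdown Treeprocessor interface. A
# sketch of how such a processor would be wired up; the `SotaParser` and
# `SotaExtension` names here are hypothetical, not necessarily the repo's:
from markdown.extensions import Extension
from markdown.treeprocessors import Treeprocessor


class SotaParser(Treeprocessor):
    def __init__(self, md):
        super().__init__(md)
        self.parsed = []  # populated by run()

    # run(self, root) is the method defined above


class SotaExtension(Extension):
    def extendMarkdown(self, md):
        # priority 0: run after the built-in tree processors
        md.treeprocessors.register(SotaParser(md), "sota_parser", 0)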