Code example #1
def cityscapes():
    """Extract Cityscapes SOTA tables."""
    soup = get_soup(CITYSCAPES_URL)

    sota_tables = soup.findAll("table", attrs={"class": "tablepress"})

    if len(sota_tables) == 3:

        cityscapes = sota_tables[0]  # pixel-level semantic segmentation task

        dataset = Dataset(
            name=DATASET_NAME,
            is_subdataset=False,
            description=DATASET_DESCRIPTION,
        )

        task = Task(name="Semantic Segmentation")
        task.datasets = [dataset]
        task.source_link = Link(title="CityScapes Leaderboard",
                                url=CITYSCAPES_URL)

        # scrape the evaluation values for the dataset
        dataset.sota.metrics = ["Mean IoU (class)", "Mean iIoU (class)"]

        dataset.sota.rows = get_sota_rows(cityscapes)

        tdb = TaskDB()
        tdb.add_task(task)
        return tdb.export()
    else:
        raise DataError("Got an unexpected number of SOTA tables.")
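
Both the Cityscapes and SQuAD extractors delegate row parsing to a get_sota_rows helper that lives elsewhere in the project and is not shown on this page. As a rough sketch only (not the project's actual implementation), a helper of that shape could walk the leaderboard table with BeautifulSoup and build one SotaRow per row, reusing the SotaRow model seen in the other examples:

# Illustrative sketch of a get_sota_rows-style helper; the real helper in
# sota-extractor may differ. Assumes the model name sits in the first cell,
# the paper link in the second, and the metric values in the cells after it.
def get_sota_rows_sketch(table, metric_names=("Mean IoU (class)",)):
    rows = []
    for tr in table.findAll("tr"):
        cells = tr.findAll("td")
        if len(cells) < 2 + len(metric_names):
            continue  # header rows and malformed rows have no matching <td> cells

        link = cells[1].find("a")
        rows.append(
            SotaRow(
                model_name=cells[0].text.strip(),
                paper_title=cells[1].text.strip(),
                paper_url=link["href"] if link else None,
                metrics={
                    name: cells[2 + i].text.strip()
                    for i, name in enumerate(metric_names)
                },
            )
        )
    return rows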
Code example #2
File: eff.py  Project: sotabench/sota-extractor
def eff():
    """Extract EFF SOTA tables."""

    response = requests.get(EFF_URL)
    if response.status_code != 200:
        raise HttpClientError("Resource unavailable", response=response)
    j = json.loads(response.text)
    tdb = TaskDB()

    for problem in j["problems"]:

        if problem["name"] in EFF_TASK_CONVERSION:
            problem_name = EFF_TASK_CONVERSION[problem["name"]]
        else:
            problem_name = problem["name"]

        task = Task(name=problem_name)

        task.source_link = Link(
            title="Progress of AI Research",
            url="https://github.com/AI-metrics/AI-metrics",
        )

        datasets = []
        for metric in problem["metrics"]:
            if "measures" in metric and metric["measures"]:
                measures = metric["measures"]

                dataset = Dataset(
                    name=metric["name"],
                    is_subdataset=False,
                    sota=Sota(metrics=[metric["scale"]]),
                )

                for measure in measures:
                    sr = SotaRow(
                        model_name=measure["name"],
                        paper_title=measure["papername"],
                        paper_url=measure["url"],
                        metrics={metric["scale"]: measure["value"]},
                    )

                    if measure["replicated_url"]:
                        sr.code_links.append(
                            Link(
                                title="Replicated",
                                url=measure["replicated_url"],
                            ))

                    dataset.sota.rows.append(sr)

                datasets.append(dataset)

        task.datasets = datasets
        tdb.add_task(task)

    return tdb.export()
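
The key accesses in eff() imply a feed shaped roughly like the dictionary below. This is reconstructed from the parser above with made-up values; it is not the actual payload served at EFF_URL.

# Shape of the JSON that eff() expects, inferred from the code above.
# All values are illustrative placeholders.
eff_feed_example = {
    "problems": [
        {
            "name": "Image classification",  # renamed via EFF_TASK_CONVERSION if listed there
            "metrics": [
                {
                    "name": "ImageNet",       # becomes the Dataset name
                    "scale": "Top-5 error",   # becomes the dataset's single SOTA metric
                    "measures": [
                        {
                            "name": "Example model",       # SotaRow.model_name
                            "papername": "Example paper",  # SotaRow.paper_title
                            "url": "https://example.com/paper",
                            "value": 0.049,                # stored under the "scale" key
                            "replicated_url": "",          # non-empty -> a "Replicated" code link
                        }
                    ],
                }
            ],
        }
    ]
}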
Code example #3
File: squad.py  Project: sotabench/sota-extractor
def squad():
    """Extract SQUAD SOTA tables."""
    soup = get_soup(SQUAD_URL)

    sota_tables = soup.findAll("table", attrs={"class": "performanceTable"})

    if len(sota_tables) == 2:
        squad2 = sota_tables[0]
        squad1 = sota_tables[1]

        dataset2 = Dataset(
            name=DATASET_2_NAME,
            is_subdataset=False,
            description=DATASET_2_DESCRIPTION,
        )
        dataset1 = Dataset(
            name=DATASET_1_NAME,
            is_subdataset=False,
            description=DATASET_1_DESCRIPTION,
        )

        task = Task(name="Question Answering")
        task.datasets = [dataset2, dataset1]
        task.source_link = Link(title="SQuAD Leaderboard", url=SQUAD_URL)

        # scrape the evaluation values on the two datasets
        dataset2.sota.metrics = ["EM", "F1"]
        dataset1.sota.metrics = ["EM", "F1"]

        dataset2.sota.rows = get_sota_rows(squad2)
        dataset1.sota.rows = get_sota_rows(squad1)

        tdb = TaskDB()
        tdb.add_task(task)
        return tdb.export()
    else:
        raise DataError("Got an unexpected number of SOTA tables.")
Code example #4
def parse_subdatasets(
        parent: Dataset, pairs: List[Tuple[ElementTree,
                                           ElementTree]]) -> List[Dataset]:
    subdatasets = []
    for p, table in pairs:
        strong = p.find("strong")
        if strong is None:
            continue
        subdatasets.append(
            Dataset(
                name=strong.text.strip().strip(":"),
                is_subdataset=True,
                parent=parent,
                sota=parse_sota(table),
            ))
    return subdatasets
Code example #5
File: fixer.py  Project: rajeshkpandey/sota-extractor
def fix_dataset(dataset: Dataset) -> Optional[Dataset]:
    """Walk through the dataset and return a valid one.

    A valid dataset contains only subdatasets with non-empty SOTA tables
    or, if it has no subdatasets, its own SOTA table must be non-empty.
    """
    dataset.subdatasets = [
        subdataset for subdataset in dataset.subdatasets
        if len(subdataset.sota.rows) > 0
    ]

    if len(dataset.subdatasets) > 0:
        return dataset

    if len(dataset.sota.rows) > 0:
        return dataset

    return None
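
A quick way to see the rule fix_dataset enforces is to feed it a parent whose subdatasets have one empty and one populated SOTA table. The sketch below reuses the Dataset, Sota and SotaRow models from the other examples on this page and is illustrative only:

# Illustrative only; assumes the same Dataset/Sota/SotaRow models used in
# the surrounding examples.
parent = Dataset(
    name="Example dataset",
    is_subdataset=False,
    sota=Sota(metrics=["Accuracy"]),
)
empty_sub = Dataset(
    name="Split A", is_subdataset=True, parent=parent,
    sota=Sota(metrics=["Accuracy"]),
)
full_sub = Dataset(
    name="Split B", is_subdataset=True, parent=parent,
    sota=Sota(metrics=["Accuracy"]),
)
full_sub.sota.rows.append(
    SotaRow(
        model_name="Baseline",
        paper_title="Example paper",
        paper_url="https://example.com/paper",
        metrics={"Accuracy": "90.0"},
    )
)
parent.subdatasets = [empty_sub, full_sub]

fixed = fix_dataset(parent)
# "Split A" is dropped because its SOTA table is empty; the parent is kept
# because at least one subdataset survives.
assert fixed is not None and len(fixed.subdatasets) == 1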
Code example #6
def reddit():
    """Extract Reddit SOTA tables."""
    tdb = TaskDB()
    md = requests.get(REDITSOTA_URL).text

    # assumptions:
    # ### Category
    # #### Task
    md_lines = md.split("\n")

    category = None
    task = None
    for i in range(len(md_lines)):
        line = md_lines[i]

        if line.startswith("###") and not line.startswith("####"):
            category = line.replace("###", "").strip()

        if line.startswith("####") and not line.startswith("#####"):
            task = line.replace("####", "").strip()
            task = re.sub(r"^[0-9]+\.?", "", task).strip()  # strip leading numbering like "1."

        if "<table>" in line.lower():
            end_i = None
            # find the end of table
            for j in range(i, len(md_lines)):
                if "</table>" in md_lines[j].lower():
                    end_i = j + 1
                    break

            if end_i and task and category:
                html_lines = md_lines[i:end_i]
                h = "\n".join(html_lines)

                soup = BeautifulSoup(h, "html.parser")

                # parse out the individual rows
                entries = []
                rows = soup.findAll("tr")
                for row in rows:
                    cells = row.findAll("td")
                    if len(cells) >= 4:
                        # paper ref
                        c_paper = cells[0]
                        paper_title = c_paper.text.strip()
                        paper_url = None
                        if c_paper.find("a"):
                            paper_url = c_paper.find("a")["href"]

                        # datasets
                        c_datasets = cells[1]
                        c_datasets_li = c_datasets.findAll("li")
                        dataset_names = []
                        for dataset_li in c_datasets_li:
                            dataset_names.append(dataset_li.text.strip())

                        # metrics
                        c_metrics = cells[2]
                        c_metrics_li = c_metrics.findAll("li")
                        metrics = []
                        for metrics_li in c_metrics_li:
                            parts = metrics_li.text.split(":")
                            parts = [p.strip() for p in parts]
                            m = {}
                            if len(parts) == 2:
                                m[parts[0]] = parts[1]
                                metrics.append(m)

                        if not metrics:
                            # Try to use it as single value
                            parts = c_metrics.text.split(":")
                            parts = [p.strip() for p in parts]
                            m = {}
                            if len(parts) == 2:
                                m[parts[0]] = parts[1]
                                metrics.append(m)

                        # source code ref
                        c_code = cells[3]
                        c_code_a = c_code.findAll("a")
                        code_links = []
                        for code_a in c_code_a:
                            code_links.append(
                                Link(
                                    title=code_a.text.strip(),
                                    url=code_a["href"],
                                ))

                        entries.append({
                            "paper_title": paper_title,
                            "paper_url": paper_url,
                            "dataset_names": dataset_names,
                            "metrics": metrics,
                            "code_links": code_links,
                        })

                # Add the new task
                t = Task(name=task, categories=[category])
                t.source_link = Link(title="RedditSota", url=REDITSOTA_URL)

                # Add datasets and performance on them
                data_map = {}
                for e in entries:
                    if len(e["dataset_names"]) == len(e["metrics"]):
                        for j in range(len(e["dataset_names"])):
                            dataset_name = e["dataset_names"][j]
                            # make sure the dataset exists
                            if dataset_name not in data_map:
                                # collect all the metrics mentioned for this
                                # dataset
                                all_metrics = [
                                    list(ee["metrics"][j].keys())
                                    for ee in entries
                                    if dataset_name in ee["dataset_names"]
                                ]
                                all_metrics = [
                                    item for sublist in all_metrics
                                    for item in sublist
                                ]
                                all_metrics = list(set(all_metrics))
                                dataset = Dataset(
                                    name=dataset_name,
                                    is_subdataset=False,
                                    sota=Sota(metrics=all_metrics),
                                )
                                data_map[dataset_name] = dataset
                                t.datasets.append(dataset)
                            else:
                                dataset = data_map[dataset_name]

                            # record the metric for this dataset
                            sr = SotaRow(
                                model_name="",
                                paper_title=e["paper_title"],
                                paper_url=e["paper_url"],
                                metrics=e["metrics"][j],
                                code_links=e["code_links"],
                            )
                            dataset.sota.rows.append(sr)

                # add and reset the task
                tdb.add_task(t)
                task = None

    return tdb.export()
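
The structural assumptions in reddit() (a ### category heading, a numbered #### task heading, then an embedded HTML table with paper, datasets, metrics and code columns) correspond to a fragment like the one below. It is a hand-written illustration of the expected shape, not content fetched from REDITSOTA_URL:

# Minimal markdown fragment of the shape reddit() parses (illustrative only).
REDDIT_SOTA_SAMPLE = """
### Computer Vision

#### 1. Image Classification

<table>
  <tr>
    <td><a href="https://example.com/paper">Example paper</a></td>
    <td><ul><li>ImageNet</li></ul></td>
    <td><ul><li>Top-5 error: 4.9%</li></ul></td>
    <td><a href="https://example.com/code">PyTorch</a></td>
  </tr>
</table>
"""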
Code example #7
def snli():
    """Extract SNLI SOTA tables."""
    soup = get_soup(SNLI_URL)

    table = soup.findAll("table", attrs={"class": "newstuff"})[1]

    rows = table.findAll("tr")

    sota_rows = []
    # suffix = ""
    for row in rows:
        # ignore the header
        if row.get("class") == ["header"]:
            pass
        elif row.get("class") == ["section"]:
            # suffix = row.text.replace("models", "").strip()
            continue
        else:
            cells = row.findAll("td")

            a = cells[0].find("a")

            paper_url = a.get("href")
            if paper_url == "http://nlp.stanford.edu/pubs/snli_paper.pdf":
                paper_title = (
                    "A large annotated corpus for learning natural language "
                    "inference"
                )
            elif paper_url == "https://www.nyu.edu/projects/bowman/spinn.pdf":
                paper_title = (
                    "A Fast Unified Model for Parsing and Sentence "
                    "Understanding"
                )
            elif (
                paper_url
                == "https://s3-us-west-2.amazonaws.com/openai-assets/"
                "research-covers/language-unsupervised/"
                "language_understanding_paper.pdf"
            ):
                paper_title = (
                    "Improving Language Understanding by Generative "
                    "Pre-Training"
                )
            elif (
                paper_url == "https://pdfs.semanticscholar.org/adc1/"
                "84fcb04107f95e35ea1b07ef9aad749da8d7.pdf"
            ):
                paper_title = "Deep Fusion LSTMs for Text Semantic Matching"
            else:
                paper_title = a.text

            model_name = cells[1].text.strip()
            # if suffix:
            #    model_name = "%s (%s)" % (model_name, suffix)

            model_name = model_name.replace("(code)", "").strip()

            params = cells[2].text.strip()
            train_acc = cells[3].text.strip()
            test_acc = cells[4].text.strip()

            sota_rows.append(
                SotaRow(
                    model_name=model_name,
                    paper_title=paper_title,
                    paper_url=paper_url,
                    metrics={
                        "% Test Accuracy": test_acc,
                        "% Train Accuracy": train_acc,
                        "Parameters": params,
                    },
                )
            )

    task = Task(
        name="Natural Language Inference",
        datasets=[
            Dataset(
                name="SNLI",
                is_subdataset=False,
                sota=Sota(
                    metrics=[
                        "% Test Accuracy",
                        "% Train Accuracy",
                        "Parameters",
                    ],
                    rows=sota_rows,
                ),
            )
        ],
        source_link=Link(
            title="The Stanford Natural Language Inference (SNLI) Corpus",
            url="https://nlp.stanford.edu/projects/snli/",
        ),
    )
    tdb = TaskDB()
    tdb.add_task(task)
    return tdb.export()
Code example #8
    def run(self, root):
        # Assumptions:
        # 1) H1 are tasks
        # 2) Everything until the next heading is the task description
        # 3) H2 are subtasks, H3 are datasets, H4 are subdatasets

        # Algorithm:
        # 1) Split the document by headings
        sections = []
        cur = []
        for el in root:
            if el.tag in {"h1", "h2", "h3", "h4", "h5"}:
                if cur:
                    sections.append(cur)
                    cur = [el]
                else:
                    cur = [el]
            else:
                cur.append(el)

        if cur:
            sections.append(cur)

        # 2) Parse each heading section one-by-one
        task = None  # current task element being parsed
        subtask = None  # current subtask being parsed
        dataset = None  # current dataset being parsed

        for section_index in range(len(sections)):
            section = sections[section_index]
            header = section[0]

            if header.text is None:
                # Invalid section
                continue

            # Task definition
            if header.tag == "h1":
                if task is not None:
                    self.parsed.append(task)

                task = Task(
                    name=header.text.strip().title(),
                    description=Text.parse(
                        [e for e in section if e.tag == "p"]).text,
                )

                # reset subtasks and datasets
                subtask = None
                dataset = None

            # Subtask definition
            if header.tag == "h2":
                if task is None:
                    logger.error(
                        "Unexpected subtask without a parent task at: %s",
                        header.text,
                    )

                # new subtask
                subtask = Task(
                    name=header.text.strip().title(),
                    description=Text.parse(
                        [e for e in section if e.tag == "p"]).text,
                    parent=task,
                )
                task.subtasks.append(subtask)

                # reset the last dataset
                dataset = None

            # Dataset definition
            if header.tag == "h3" and "Table of content" not in header.text:
                if task is None:
                    logger.error(
                        "Unexpected dataset without a parent task at: %s",
                        header.text,
                    )

                tables = [t for t in section if t.tag == "table"]
                n_tables = len(tables)
                if n_tables < 2:
                    text = Text.parse([e for e in section if e.tag == "p"])
                    dataset = Dataset(
                        name=header.text.strip().strip(":"),
                        description=text.text,
                        links=text.links,
                    )
                    if n_tables == 1:
                        dataset.sota = parse_sota(tables[0])
                else:
                    table_idxs = [
                        i for i, el in enumerate(section) if el.tag == "table"
                    ]
                    pairs = []
                    for idx in table_idxs:
                        if idx >= 2 and section[idx - 1].tag == "p":
                            pairs.append((section[idx - 1], section[idx]))

                    description_idxs = set(range(
                        1, len(section))) - set(table_idxs)
                    description_ps = [
                        el for i, el in enumerate(section)
                        if i in description_idxs
                    ]
                    text = Text.parse(description_ps)
                    dataset = Dataset(
                        name=header.text.strip().strip(":"),
                        description=text.text,
                        links=text.links,
                    )
                    dataset.subdatasets = parse_subdatasets(parent=dataset,
                                                            pairs=pairs)

                if subtask is not None:
                    # we are in a subtask, add everything here
                    subtask.datasets.append(dataset)
                else:
                    task.datasets.append(dataset)

        if task:
            self.parsed.append(task)
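
Per the assumptions listed at the top of run(), the tree processor expects a markdown document whose H1 headings are tasks, H2 headings are subtasks and H3 headings are datasets, each followed by description paragraphs and optional SOTA tables. An input of that shape might look like the string below (illustrative only, and assuming table support is enabled in the markdown parser):

# Markdown of the shape the run() tree processor expects (illustrative only).
SOURCE_SAMPLE = """
# Machine Translation

Task description paragraph.

## Document-Level Translation

Subtask description paragraph.

### WMT2014 English-German

Dataset description with a [leaderboard link](https://example.com).

| Model | BLEU | Paper |
| ----- | ---- | ----- |
| Example model | 30.1 | [Example paper](https://example.com/paper) |
"""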