Example #1
from taxonomy import Problem
from scales import *
import datetime
date = datetime.date
"""
        http://melodi.ee.washington.edu/s3tp/

* * *
**_Word error rate on Switchboard (specify details): [Month, Year: Score [SWB]: Team].  Compiled by Jack Clark._**

A note about measurement: We're measuring Switchboard (SWB) and Call Home (CH) performance (mostly) from the Hub5'00 dataset, with main scores assessed in terms of word error rate on SWB.

Why do we care: Reflects the improvement of audio processing systems on speech over time.

"""
speech_recognition = Problem(name="Speech Recognition",
                             attributes=["language", "agi"])
swb_hub_500 = speech_recognition.metric(
    name="Word error rate on Switchboard trained against the Hub5'00 dataset",
    scale=error_percent,
    target=5.9)
swb_hub_500.measure(
    date(2011, 8, 31), 16.1, "CD-DNN",
    "https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/CD-DNN-HMM-SWB-Interspeech2011-Pub.pdf"
)
swb_hub_500.measure(
    date(2012, 4, 27), 18.5, "DNN-HMM",
    "https://pdfs.semanticscholar.org/ce25/00257fda92338ec0a117bea1dbc0381d7c73.pdf?_ga=1.195375081.452266805.1483390947"
)

swb_hub_500.measure(
    date(2013, 8, 25), 12.9, "DNN MMI")
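
# These snippets assume the `taxonomy` and `scales` modules from the
# AI-metrics codebase (`scales` supplies named scale descriptors such as
# error_percent, perplexity, elo). As a rough sketch only, under the
# assumption that the real implementation looks broadly like this, the API
# the examples rely on could be stubbed as:
class Metric:
    def __init__(self, name, url=None, scale=None, target=None, **kwargs):
        self.name, self.url, self.scale, self.target = name, url, scale, target
        self.notes, self.solved, self.measures = "", False, []

    def measure(self, d, value, name, url=None, **kwargs):
        # Record one datapoint: (date or None, value, system name, source URL)
        self.measures.append((d, value, name, url))

class Problem:
    def __init__(self, name, attributes=None, solved=False):
        self.name, self.attributes, self.solved = name, attributes or [], solved
        self.metrics, self.subproblems, self.notes = [], [], ""

    def metric(self, name=None, url=None, **kwargs):
        # Create and register a Metric; extra keywords (target_label,
        # axis_label, target_source, ...) are accepted and passed through.
        m = Metric(name, url, **kwargs)
        self.metrics.append(m)
        return m

    def add_subproblem(self, child):
        self.subproblems.append(child)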
Example #2
# -*- coding: utf-8 -*-
"Hand-entered data about written language problems"
from taxonomy import Problem
from scales import *
import datetime
date = datetime.date

modelling_english = Problem("Accurate modelling of human language.", ["language", "agi"])
ptperplexity = modelling_english.metric(name="Penn Treebank (perplexity when modelling English sentences)", scale=perplexity)
ptperplexity.measure(date(2016,9,26), 70.9, "Pointer Sentinel-LSTM", "https://arxiv.org/pdf/1609.07843v1.pdf")
ptperplexity.measure(date(2016,10,5), 73.4, "Variational LSTM", "https://arxiv.org/pdf/1512.05287v5.pdf")
ptperplexity.measure(date(2013,12,20), 107.5, "Deep RNN", "https://arxiv.org/abs/1312.6026")
ptperplexity.measure(date(2012,4,7), 78.8, "KN5+RNNME ensemble", "http://www.fit.vutbr.cz/~imikolov/rnnlm/google.pdf")
ptperplexity.measure(date(2012,4,7), 125.7, "KN5+cache baseline", "http://www.fit.vutbr.cz/~imikolov/rnnlm/google.pdf")

ptperplexity.measure(date(2012,7,27), 124.7, "RNNLM", "https://www.microsoft.com/en-us/research/wp-content/uploads/2012/07/rnn_ctxt_TR.sav_.pdf")
ptperplexity.measure(date(2012,7,27), 74.1, "RNN-LDA+all", "https://www.microsoft.com/en-us/research/wp-content/uploads/2012/07/rnn_ctxt_TR.sav_.pdf")
ptperplexity.measure(date(2012,7,27), 113.7, "RNN-LDA LM", "https://www.microsoft.com/en-us/research/wp-content/uploads/2012/07/rnn_ctxt_TR.sav_.pdf")
ptperplexity.measure(date(2012,7,27), 92.0, "RNN-LDA LM+KN5+cache", "https://www.microsoft.com/en-us/research/wp-content/uploads/2012/07/rnn_ctxt_TR.sav_.pdf")
ptperplexity.measure(date(2012,7,27), 80.1, "RNN-LDA ensemble", "https://www.microsoft.com/en-us/research/wp-content/uploads/2012/07/rnn_ctxt_TR.sav_.pdf")
ptperplexity.measure(None, 68.7, "RNN Dropout Regularization", "https://arxiv.org/abs/1409.2329v1")
ptperplexity.measure(None, 68.5, "RHN", "https://arxiv.org/pdf/1607.03474v3")
ptperplexity.measure(None, 66, "RHN+WT", "https://arxiv.org/pdf/1607.03474v3")
ptperplexity.measure(None, 71.3, "Variational RHN", "https://arxiv.org/abs/1607.03474")

hp_compression = modelling_english.metric(name="Hutter Prize (bits per character to encode English text)", scale=bits_per_x, target=1.3)
hp_compression.measure(date(2016,10,31), 1.313, "Surprisal-Driven Zoneout",
                   "https://pdfs.semanticscholar.org/e9bc/83f9ff502bec9cffb750468f76fdfcf5dd05.pdf")
hp_compression.measure(date(2016,10,19), 1.37, "Surprisal-Driven Feedback RNN",
                   "https://arxiv.org/pdf/1608.06027.pdf")
hp_compression.measure(date(2016,9,27), 1.39, "Hypernetworks", "https://arxiv.org/abs/1609.09106")
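
# Perplexity (Penn Treebank) and bits per character (Hutter Prize) are two
# readings of the same cross-entropy loss: perplexity = 2 ** (cross-entropy
# in bits per token). A small self-contained illustration of the conversion:
import math

def perplexity_to_bits_per_token(ppl):
    # Invert perplexity = 2 ** H to recover the cross-entropy H in bits.
    return math.log2(ppl)

print(perplexity_to_bits_per_token(70.9))  # ~6.15 bits/word (Pointer Sentinel-LSTM)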
Example #3
from taxonomy import Problem
from scales import *
import datetime
date = datetime.date
""" 
* * *
**_Generative models of CIFAR-10 Natural Images_** [Year: bits-per-subpixel, method]. Compiled by Durk Kingma.

**Why we care:**
(1) The compression=prediction=understanding=intelligence view (see Hutter prize, etc.). (Note that perplexity, log-likelihood, and #bits are all equivalent measurements.)
(2) Learning a generative model is a prominent auxiliary task towards semi-supervised learning. Current SOTA semi-supervised classification results utilize generative models.
(3) You're finding patterns in the data that let you compress it more efficiently. It's an ultimate pattern-recognition benchmark, because you're trying to find all the patterns in the data.

"""

image_generation = Problem("Drawing pictures", ["vision", "agi"])
# note: this section is not on scene generation, but making the distinction seemed like a good idea.
scene_generation = Problem(
    "Be able to generate complex scene e.g. a baboon receiving their degree at convocatoin.",
    ["vision", "world-modelling", "agi"])
scene_generation.add_subproblem(image_generation)

# NOTE: scale, and target need to be checked
image_generation_metric = image_generation.metric(
    "Generative models of CIFAR-10 images",
    scale=bits_per_x,
    axis_label="Model entropy (bits per pixel)")

image_generation_metric.measure(date(2014, 10, 30), 4.48, "NICE",
                                "https://arxiv.org/abs/1410.8516")
image_generation_metric.measure(date(2015, 2, 16), 4.13, "DRAW",
                                "https://arxiv.org/abs/1502.04623")
Example #4
from taxonomy import Problem
from scales import *
import datetime

date = datetime.date

vision = Problem("Vision", ["agi", "vision", "world-modelling"])

image_comprehension = Problem("Image comprehension", ["agi", "vision", "language", "world-modelling"])
image_classification = Problem("Image classification", ["vision", "agi"])
image_classification.add_subproblem(image_comprehension)
vision.add_subproblem(image_classification)

imagenet = image_classification.metric("Imagenet Image Recognition", "http://image-net.org", scale=error_rate, target=0.051)
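# The 0.051 target above corresponds to the ~5.1% human top-5 error estimate
# for ILSVRC classification (from Andrej Karpathy's manual-labelling
# experiment); treat the exact figure as approximate.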
imagenet.notes = """
Correctly label images from the Imagenet dataset. As of 2016, this includes:
 - Object localization for 1000 categories.
 - Object detection for 200 fully labeled categories.
 - Object detection from video for 30 fully labeled categories.
 - Scene classification for 365 scene categories (Joint with MIT Places team) on Places2 Database http://places2.csail.mit.edu.
 - Scene parsing for 150 stuff and discrete object categories (Joint with MIT Places team).
"""
imagenet.measure(date(2010,8,31), 0.28191, "NEC UIUC", "http://image-net.org/challenges/LSVRC/2010/results")
imagenet.measure(date(2011,10,26), 0.2577, "XRCE","http://image-net.org/challenges/LSVRC/2011/results")
imagenet.measure(date(2012,10,13), 0.16422, "SuperVision", "http://image-net.org/challenges/LSVRC/2012/results.html")
imagenet.measure(date(2013,11,14), 0.11743, "Clarifai","http://www.image-net.org/challenges/LSVRC/2013/results.php")
imagenet.measure(date(2014,8,18), 0.07405, "VGG", "http://image-net.org/challenges/LSVRC/2014/index")
imagenet.measure(date(2015,12,10), 0.03567, "MSRA", "http://image-net.org/challenges/LSVRC/2015/results", algorithms=["residual-networks"])
imagenet.measure(date(2016,9,26), 0.02991, "Trimps-Soushen", "http://image-net.org/challenges/LSVRC/2016/results")

# Test automatic detection of withdrawn papers
Example #5
from taxonomy import Problem
from scales import *
import datetime
date = datetime.date

read_stem_papers = Problem(
    "Read a scientific or technical paper, and comprehend its contents",
    ["language", "world-modelling", "super"])

# Getting some major results from an abstract, tables or conclusion is much easier than understanding the entire paper, its assumptions, robustness, support for its claims, etc.
extract_results = Problem(
    "Extract major numerical results or progress claims from a STEM paper",
    ["language", "world-modelling", "agi"])
read_stem_papers.add_subproblem(extract_results)

extract_results.metric("Automatically find new relevant ML results on arXiv")
extract_results.notes = """
This metric is the ability to automatically update the IPython notebook you are reading by spotting results in PDFs uploaded to arxiv.org.
Pull requests demonstrating solutions are welcome :)
"""

solve_technical_problems = Problem(
    "Given an arbitrary technical problem, solve it as well as a typical professional in that field",
    ["language", "world-modelling"])

program_induction = Problem("Writing software from specifications")
solve_technical_problems.add_subproblem(program_induction)
program_induction.metric("Card2Code",
                         url="https://github.com/deepmind/card2code",
                         scale=correct_percent)
Example #6
# -*- coding: utf-8 -*-
"Hand-entered data about written language problems"
from taxonomy import Problem
from scales import *
import datetime
date = datetime.date

modelling_english = Problem("Accurate modelling of human language.", ["language", "agi"])
ptperplexity = modelling_english.metric(name="Penn Treebank (perplexity when modelling English sentences)", scale=perplexity)
ptperplexity.measure(date(2016,9,26), 70.9, "Pointer Sentinel-LSTM", "https://arxiv.org/pdf/1609.07843v1.pdf")
ptperplexity.measure(date(2016,10,5), 73.4, "Variational LSTM", "https://arxiv.org/pdf/1512.05287v5.pdf")
ptperplexity.measure(date(2013,12,20), 107.5, "Deep RNN", "https://arxiv.org/abs/1312.6026")
ptperplexity.measure(date(2012,4,7), 78.8, "KN5+RNNME ensemble", "http://www.fit.vutbr.cz/~imikolov/rnnlm/google.pdf")
ptperplexity.measure(date(2012,4,7), 125.7, "KN5+cache baseline", "http://www.fit.vutbr.cz/~imikolov/rnnlm/google.pdf")

ptperplexity.measure(date(2012,7,27), 124.7, "RNNLM", "https://www.microsoft.com/en-us/research/wp-content/uploads/2012/07/rnn_ctxt_TR.sav_.pdf")
ptperplexity.measure(date(2012,7,27), 74.1, "RNN-LDA+all", "https://www.microsoft.com/en-us/research/wp-content/uploads/2012/07/rnn_ctxt_TR.sav_.pdf")
ptperplexity.measure(date(2012,7,27), 113.7, "RNN-LDA LM", "https://www.microsoft.com/en-us/research/wp-content/uploads/2012/07/rnn_ctxt_TR.sav_.pdf")
ptperplexity.measure(date(2012,7,27), 92.0, "RNN-LDA LM+KN5+cache", "https://www.microsoft.com/en-us/research/wp-content/uploads/2012/07/rnn_ctxt_TR.sav_.pdf")
ptperplexity.measure(date(2012,7,27), 80.1, "RNN-LDA ensemble", "https://www.microsoft.com/en-us/research/wp-content/uploads/2012/07/rnn_ctxt_TR.sav_.pdf")
ptperplexity.measure(None, 68.7, "RNN Dropout Regularization", "https://arxiv.org/abs/1409.2329v1")
ptperplexity.measure(None, 68.5, "RHN", "https://arxiv.org/pdf/1607.03474v3")
ptperplexity.measure(None, 66, "RHN+WT", "https://arxiv.org/pdf/1607.03474v3")
ptperplexity.measure(None, 71.3, "RHN", "https://arxiv.org/abs/1607.03474v2")
ptperplexity.measure(None, 65.4, "RHN+WT", "https://arxiv.org/abs/1607.03474v4")
ptperplexity.measure(None, 62.4, "Neural Architecture Search", url="https://arxiv.org/abs/1611.01578v2", venue="ICLR 2017")


hp_compression = modelling_english.metric(name="Hutter Prize (bits per character to encode English text)", scale=bits_per_x, 
                                          target=1.3, target_label="Region of human performance",
                                          target_source="http://languagelog.ldc.upenn.edu/myl/Shannon1950.pdf")
Example #7
from taxonomy import Problem
from scales import *
import datetime
date = datetime.date

""" 
* * *
**_Generative models of CIFAR-10 Natural Images_** [Year: bits-per-subpixel, method]. Compiled by Durk Kingma.

**Why we care:**
(1) The compression=prediction=understanding=intelligence view (see Hutter prize, etc.). (Note that perplexity, log-likelihood, and #bits are all equivalent measurements.)
(2) Learning a generative model is a prominent auxiliary task towards semi-supervised learning. Current SOTA semi-supervised classification results utilize generative models.
(3) You're finding patterns in the data that let you compress it more efficiently. It's an ultimate pattern-recognition benchmark, because you're trying to find all the patterns in the data.

"""

image_generation = Problem("Drawing pictures", ["vision", "agi"])
# note: this section is not on scene generation, but making the distinction seemed like a good idea.
scene_generation = Problem("Be able to generate a complex scene, e.g. a baboon receiving its degree at convocation.", ["vision", "world-modelling", "agi"])
scene_generation.add_subproblem(image_generation)

# NOTE: scale, and target need to be checked
image_generation_metric = image_generation.metric("Generative models of CIFAR-10 images", scale=bits_per_x, axis_label="Model entropy (bits per pixel)")

image_generation_metric.measure(date(2014,10,30), 4.48, "NICE", "https://arxiv.org/abs/1410.8516")
image_generation_metric.measure(date(2015,2,16), 4.13, "DRAW", "https://arxiv.org/abs/1502.04623")
image_generation_metric.measure(date(2016,5,27), 3.49, "Real NVP", "https://arxiv.org/abs/1605.08803")
image_generation_metric.measure(date(2016,6,15), 3.11, "VAE with IAF", "https://papers.nips.cc/paper/6581-improved-variational-inference-with-inverse-autoregressive-flow")
image_generation_metric.measure(date(2016,5,27), 3.0, "PixelRNN", "https://arxiv.org/abs/1605.08803")
image_generation_metric.measure(date(2016,11,4), 2.92, "PixelCNN++","https://openreview.net/forum?id=BJrFC6ceg", replicated="https://github.com/openai/pixel-cnn")
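
# "Bits per subpixel" (bits/dim) divides a model's total negative
# log-likelihood by the number of colour-channel values in a CIFAR-10 image
# (32 * 32 * 3 = 3072). A sketch of the conversion from nats per image:
import math

SUBPIXELS = 32 * 32 * 3  # CIFAR-10: 32x32 RGB images

def nats_per_image_to_bits_per_subpixel(nll_nats):
    return nll_nats / (SUBPIXELS * math.log(2))

# e.g. an NLL of ~6620 nats/image is ~3.11 bits/subpixel (the VAE-with-IAF row)
print(round(nats_per_image_to_bits_per_subpixel(6620), 2))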
Example #8
from taxonomy import Problem
from scales import *
import datetime
date = datetime.date

abstract_strategy_games = Problem("Abstract strategy games", ["agi", "abstract-games"])

playing_with_hints = Problem("Playing abstract games with extensive hints", ["abstract-games"], solved=True)
abstract_strategy_games.add_subproblem(playing_with_hints)
playing_with_hints.notes = """
  Complex abstract strategy games have been solved to super-human levels
  by computer systems with extensive rule-hinting and heuristics,
  in some cases combined with machine learning techniques.
"""
computer_chess = playing_with_hints.metric("Computer Chess", scale=elo, target=2882, target_label="Best human play", target_source="https://en.wikipedia.org/w/index.php?title=Comparison_of_top_chess_players_throughout_history&oldid=777500496#Elo_system")
computer_go = playing_with_hints.metric("Computer Go", scale=elo, target=3632, target_label="Best human play", target_source="https://www.goratings.org/en/history/")
computer_go.solved = True # until we get proper data

# For some caveats, see https://en.wikipedia.org/w/index.php?title=Chess_engine&oldid=764341963#Ratings
# We could script ingestion of data from CCRL, or get data from Katja
computer_chess.measure(date(1997,5,11), 2725, "Deep Blue", uncertainty=25, url="https://www.quora.com/What-was-Deep-Blues-Elo-rating")
computer_chess.measure(date(2006,5,27), 2995, "Rybka 1.1 64bit", uncertainty=25, url="https://web.archive.org/web/20060531091049/http://www.computerchess.org.uk/ccrl/4040/rating_list_all.html")
computer_chess.measure(date(2010,8,7), 3269, "Rybka 4 64bit", uncertainty=22, url="https://web.archive.org/web/20100923131123/http://www.computerchess.org.uk/ccrl/4040/rating_list_all.html")
computer_chess.measure(date(2013,7,20), 3248, "Houdini 3 64bit", uncertainty=16, url="https://web.archive.org/web/20130415000000*/http://www.computerchess.org.uk/ccrl/4040/rating_list_all.html")
computer_chess.measure(date(2015,7,4), 3332, "Komodo 9", uncertainty=24, url="https://web.archive.org/web/20150708104805/http://www.computerchess.org.uk/ccrl/4040/rating_list_all.html")
computer_chess.measure(date(2017,2,27), 3393, "Stockfish", uncertainty=50, url="https://web.archive.org/web/20170227044521/http://www.computerchess.org.uk/ccrl/4040/")
# Wikipedia has some nice data here:
computer_chess.measure(date(1984,12,31), 1631, "Novag Super Constellation 6502 4 MHz", url="https://en.wikipedia.org/wiki/Swedish_Chess_Computer_Association#Rating_list_year-end_leaders")
computer_chess.measure(date(1985,12,31), 1827, "Mephisto Amsterdam 68000 12 MHz", url="https://en.wikipedia.org/wiki/Swedish_Chess_Computer_Association#Rating_list_year-end_leaders")
computer_chess.measure(date(1986,12,31), 1827, "Mephisto Amsterdam 68000 12 MHz", url="https://en.wikipedia.org/wiki/Swedish_Chess_Computer_Association#Rating_list_year-end_leaders")
computer_chess.measure(date(1987,12,31), 1923, "Mephisto Dallas 68020 14 MHz", url="https://en.wikipedia.org/wiki/Swedish_Chess_Computer_Association#Rating_list_year-end_leaders")
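
# Elo gaps translate into expected scores via the standard logistic formula,
# which is what makes the "best human play" targets above interpretable.
# A sketch using the 2017 Stockfish (3393) and peak-human (2882) ratings:
def elo_expected_score(rating_a, rating_b):
    # Expected score (win = 1, draw = 0.5) of player A against player B.
    return 1.0 / (1.0 + 10 ** ((rating_b - rating_a) / 400.0))

print(round(elo_expected_score(3393, 2882), 2))  # ~0.95: the engine takes ~95% of points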
Example #9
from taxonomy import Problem
from scales import *
import datetime

date = datetime.date

vision = Problem("Vision", ["agi", "vision", "world-modelling"])

# note that there is also a lot of vision data in awty.py, which was
# originally created by the Are We There Yet? scraper. Probably it should just
# be merged into this file...

image_comprehension = Problem("Image comprehension", ["agi", "vision", "language", "world-modelling"])
image_classification = Problem("Image classification", ["vision", "agi"])
image_classification.add_subproblem(image_comprehension)
vision.add_subproblem(image_classification)

imagenet = image_classification.metric("Imagenet Image Recognition", "http://image-net.org", scale=error_rate, target=0.051)
imagenet.notes = """
Correctly label images from the Imagenet dataset. As of 2016, this includes:
 - Object localization for 1000 categories.
 - Object detection for 200 fully labeled categories.
 - Object detection from video for 30 fully labeled categories.
 - Scene classification for 365 scene categories (Joint with MIT Places team) on Places2 Database http://places2.csail.mit.edu.
 - Scene parsing for 150 stuff and discrete object categories (Joint with MIT Places team).
"""
imagenet.measure(date(2010,8,31), 0.28191, "NEC UIUC", "http://image-net.org/challenges/LSVRC/2010/results")
imagenet.measure(date(2011,10,26), 0.2577, "XRCE","http://image-net.org/challenges/LSVRC/2011/results")
imagenet.measure(date(2012,10,13), 0.16422, "AlexNet / SuperVision",
                 "http://image-net.org/challenges/LSVRC/2012/results.html",
                 algorithm_src_url="https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks")
imagenet.measure(date(2013,11,14), 0.11743, "Clarifai","http://www.image-net.org/challenges/LSVRC/2013/results.php")
Example #10
from taxonomy import Problem
from scales import *
import datetime

date = datetime.date

computer_games = Problem(
    "Play real-time computer & video games",
    ["world-modelling", "realtime-games", "agi", "language"])

games_requiring_novel_language = Problem(
    "Games that require inventing novel language, forms of speech, or communication"
)
games_requiring_speech = Problem(
    "Games that require both understanding and speaking a language")
games_requiring_speech.metric("Starcraft")

games_requiring_language_comprehension = Problem(
    "Games that require language comprehension", ["agi", "languge"])

computer_games.add_subproblem(games_requiring_novel_language)
games_requiring_novel_language.add_subproblem(games_requiring_speech)
games_requiring_speech.add_subproblem(games_requiring_language_comprehension)

# Atari 2600 Games: Breakout, Enduro, Pong, Q*Bert, Seaquest, S. Invaders. Each game has its own metric.
# We previously used data hand-compiled by Yomna Nasser and Miles Brundage; this is
# now mostly obsolete, and the data is scraped in scrapers/atari.py

simple_games = Problem("Simple video games",
                       ["world-modelling", "realtime-games", "agi"])
computer_games.add_subproblem(simple_games)
Example #11
from taxonomy import Problem
from scales import *
import datetime
date = datetime.date

computer_games = Problem("Play real-time computer & video games", ["world-modelling", "realtime-games", "agi", "language"])

games_requiring_novel_language = Problem("Games that require inventing novel language, forms of speech, or communication")
games_requiring_speech = Problem("Games that require both understanding and speaking a language")
games_requiring_speech.metric("Starcraft")

games_requiring_language_comprehension = Problem("Games that require language comprehension", ["agi", "language"])

computer_games.add_subproblem(games_requiring_novel_language)
games_requiring_novel_language.add_subproblem(games_requiring_speech)
games_requiring_speech.add_subproblem(games_requiring_language_comprehension)


# Atari 2600 Games: Breakout, Enduro, Pong, Q*Bert, Seaquest, S. Invaders. Each game has its own metric.
# We previously used data hand-compiled by Yomna Nasser and Miles Brundage; this is
# now mostly obsolete, and the data is scraped in scrapers/atari.py

simple_games = Problem("Simple video games", ["world-modelling", "realtime-games", "agi"]) 
computer_games.add_subproblem(simple_games)

# Alien
alien_metric = simple_games.metric("Atari 2600 Alien", scale=atari_linear, target=6875, target_source="https://www.semanticscholar.org/paper/Human-level-control-through-deep-reinforcement-Mnih-Kavukcuoglu/340f48901f72278f6bf78a04ee5b01df208cc508")
# alien_metric.measure(date(2015, 2, 26), 3069, "DQN", "https://www.semanticscholar.org/paper/Human-level-control-through-deep-reinforcement-Mnih-Kavukcuoglu/340f48901f72278f6bf78a04ee5b01df208cc508")
# alien_metric.measure(date(2015,11,20), 1620, "DQN","https://arxiv.org/abs/1511.06581v1")
# alien_metric.measure(date(2015,11,20), 3747.7, "DDQN","https://arxiv.org/abs/1511.06581v1")
# alien_metric.measure(date(2015,11,20), 4461.4, "Duel","https://arxiv.org/abs/1511.06581v1")
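
# Atari results are usually compared via the human-normalized score from the
# DQN paper the target above cites: 100 * (agent - random) / (human - random).
# A sketch using Alien's baselines as reported there (random play ~227.8,
# human 6875; treat the random-play figure as a recalled approximation):
def human_normalized_score(agent, random_play=227.8, human=6875.0):
    return 100.0 * (agent - random_play) / (human - random_play)

print(round(human_normalized_score(3069), 1))  # DQN's Alien score: ~42.7% of human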
Example #12
from taxonomy import Problem
from scales import *
import datetime
date = datetime.date

read_stem_papers = Problem("Read a scientific or technical paper, and comprehend its contents", ["language", "world-modelling", "super"])

# Getting some major results from an abstract, tables or conclusion is much easier than understanding the entire paper, its assumptions, robustness, support for its claims, etc.
extract_results = Problem("Extract major numerical results or progress claims from a STEM paper", ["language", "world-modelling", "agi"])
read_stem_papers.add_subproblem(extract_results)

extract_results.metric("Automatically find new relevant ML results on arXiv")
extract_results.notes = """
This metric is the ability to automatically update the IPython notebook you are reading by spotting results in PDFs uploaded to arxiv.org.
Pull requests demonstrating solutions are welcome :)
"""

solve_technical_problems = Problem("Given an arbitrary technical problem, solve it as well as a typical professional in that field", ["language", "world-modelling"])

program_induction = Problem("Writing software from specifications")
solve_technical_problems.add_subproblem(program_induction)
program_induction.metric("Card2Code", url="https://github.com/deepmind/card2code", scale=correct_percent)

vaguely_constrained_technical_problems = Problem("Solve vaguely or under-constrained technical problems")
solve_technical_problems.add_subproblem(vaguely_constrained_technical_problems)

# This subset of technical problems is much easier; here we assume that a human / worldly problem has been reduced to something that can be
# subjected to clear computational evaluation ("is this purported proof of theorem X correct?", "does this circuit perform task Y efficiently?",
# "will this airframe fly with reasonable characteristics?")
solve_constrained_technical_problems = Problem("Solve technical problems with clear constraints (proofs, circuit design, aerofoil design, etc)")
solve_technical_problems.add_subproblem(solve_constrained_technical_problems)