Code example #1
File: acoustics.py Project: zgsxwsdxg/AI-metrics
from taxonomy import Problem
from scales import *
import datetime

date = datetime.date
"""
        http://melodi.ee.washington.edu/s3tp/

* * *
**_Word error rate on Switchboard (specify details): [Month, Year: Score [SWB]: Team].  Compiled by Jack Clark._**

A note about measurement: We're measuring Switchboard (SWB) and Call Home (CH) performance (mostly) from the Hub5'00 dataset, with main scores assessed in terms of word error rate on SWB. We also create ...

Why do we care: Reflects the improvement of audio processing systems on speech over time.

"""
speech_recognition = Problem(name="Speech Recognition",
                             attributes=["language", "agi"])
swb_hub_500 = speech_recognition.metric(
    name="Word error rate on Switchboard trained against the Hub5'00 dataset",
    scale=error_percent,
    target=5.9)
swb_hub_500.measure(
    date(2011, 8, 31), 16.1, "CD-DNN",
    "https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/CD-DNN-HMM-SWB-Interspeech2011-Pub.pdf"
)
swb_hub_500.measure(
    date(2012, 4, 27), 18.5, "DNN-HMM",
    "https://pdfs.semanticscholar.org/ce25/00257fda92338ec0a117bea1dbc0381d7c73.pdf?_ga=1.195375081.452266805.1483390947"
)

swb_hub_500.measure(
    date(2013, 8, 25), 12.9, "DNN MMI",
    "http://www.danielpovey.com/files/2013_interspeech_dnn.pdf")
swb_hub_500.measure(
    date(2013, 8, 25), 12.6, "DNN sMBR",
    # URL assumed: the sMBR result appears in the same Interspeech 2013 paper
    # as the DNN MMI entry above.
    "http://www.danielpovey.com/files/2013_interspeech_dnn.pdf")
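
The error_percent scale above is word error rate (WER): the number of word
substitutions, deletions, and insertions needed to align the hypothesis
transcript with the reference, divided by the number of reference words. A
minimal sketch of the standard edit-distance computation (the function below is
illustrative, not part of the AI-metrics codebase):

def word_error_rate(reference, hypothesis):
    """WER as a percentage: (substitutions + deletions + insertions) / N."""
    ref, hyp = reference.split(), hypothesis.split()
    # Levenshtein distance over words, via dynamic programming.
    d = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        d[i][0] = i
    for j in range(len(hyp) + 1):
        d[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,         # deletion
                          d[i][j - 1] + 1,         # insertion
                          d[i - 1][j - 1] + cost)  # substitution
    return 100.0 * d[len(ref)][len(hyp)] / len(ref)

print(word_error_rate("the cat sat on the mat", "the cat sit on mat"))  # ~33.3
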
Code example #2
File: language.py Project: milesbrundage/AI-metrics
# -*- coding: utf-8 -*-
"Hand-entered data about written language problems"
from taxonomy import Problem
from scales import *
import datetime
date = datetime.date

modelling_english = Problem("Accurate modelling of human language.", ["language", "agi"])
ptperplexity = modelling_english.metric(name="Penn Treebank (Perplexity when parsing English sentences)", scale=perplexity)
ptperplexity.measure(date(2016,9,26), 70.9, "Pointer Sentinel-LSTM", "https://arxiv.org/pdf/1609.07843v1.pdf")
ptperplexity.measure(date(2016,10,5), 73.4, "Variational LSTM", "https://arxiv.org/pdf/1512.05287v5.pdf")
ptperplexity.measure(date(2013,12,20), 107.5, "Deep RNN", "https://arxiv.org/abs/1312.6026")
ptperplexity.measure(date(2012,4,7), 78.8, "KN5+RNNME ensemble", "http://www.fit.vutbr.cz/~imikolov/rnnlm/google.pdf")
ptperplexity.measure(date(2012,4,7), 125.7, "KN5+cache baseline", "http://www.fit.vutbr.cz/~imikolov/rnnlm/google.pdf")

ptperplexity.measure(date(2012,7,27), 124.7, "RNNLM", "https://www.microsoft.com/en-us/research/wp-content/uploads/2012/07/rnn_ctxt_TR.sav_.pdf")
ptperplexity.measure(date(2012,7,27), 74.1, "RNN-LDA+all", "https://www.microsoft.com/en-us/research/wp-content/uploads/2012/07/rnn_ctxt_TR.sav_.pdf")
ptperplexity.measure(date(2012,7,27), 113.7, "RNN-LDA LM", "https://www.microsoft.com/en-us/research/wp-content/uploads/2012/07/rnn_ctxt_TR.sav_.pdf")
ptperplexity.measure(date(2012,7,27), 92.0, "RNN-LDA LM+KN5+cache", "https://www.microsoft.com/en-us/research/wp-content/uploads/2012/07/rnn_ctxt_TR.sav_.pdf")
ptperplexity.measure(date(2012,7,27), 80.1, "RNN-LDA ensemble", "https://www.microsoft.com/en-us/research/wp-content/uploads/2012/07/rnn_ctxt_TR.sav_.pdf")
ptperplexity.measure(None, 68.7, "RNN Dropout Regularization", "https://arxiv.org/abs/1409.2329v1")
ptperplexity.measure(None, 68.5, "RHN", "https://arxiv.org/pdf/1607.03474v3")
ptperplexity.measure(None, 66, "RHN+WT", "https://arxiv.org/pdf/1607.03474v3")
ptperplexity.measure(None, 71.3, "Variational RHN", "https://arxiv.org/abs/1607.03474")

hp_compression = modelling_english.metric(name="Hutter Prize (bits per character to encode English text)", scale=bits_per_x, target=1.3)
hp_compression.measure(date(2016,10,31), 1.313, "Surprisal-Driven Zoneout",
                   "https://pdfs.semanticscholar.org/e9bc/83f9ff502bec9cffb750468f76fdfcf5dd05.pdf")
hp_compression.measure(date(2016,10,19), 1.37, "Surprisal-Driven Feedback RNN",
                   "https://arxiv.org/pdf/1608.06027.pdf")
hp_compression.measure(date(2016,9,27), 1.39, "Hypernetworks", "https://arxiv.org/abs/1609.09106")
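
The Hutter Prize's 1.3 bits-per-character target doubles as a compressed-size
target, because the contest is scored on the 10^8-byte enwik8 Wikipedia dump:
bits per character convert directly into archive size. A back-of-the-envelope
check (the helper below is illustrative only):

ENWIK8_CHARS = 10 ** 8  # enwik8 is the first 10^8 bytes of English Wikipedia

def archive_size_mb(bits_per_char, n_chars=ENWIK8_CHARS):
    """Convert bits per character into a compressed size in megabytes."""
    return bits_per_char * n_chars / 8 / 1e6

print(archive_size_mb(1.313))  # Surprisal-Driven Zoneout: ~16.4 MB
print(archive_size_mb(1.3))    # the 1.3 bpc target: ~16.25 MB
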
Code example #3
File: generative.py Project: zgsxwsdxg/AI-metrics
"""
(1) The compression=prediction=understanding=intelligence view (see the Hutter Prize, etc.). (Note that perplexity, log-likelihood, and #bits are all equivalent measurements; see the sketch after this docstring.)
(2) Learning a generative model is a prominent auxiliary task towards semi-supervised learning. Current SOTA semi-supervised classification results utilize generative models.
(3) You're finding patterns in the data that let you compress it more efficiently. This is the ultimate pattern-recognition benchmark, because you're trying to find the patterns in all the data.

"""

image_generation = Problem("Drawing pictures", ["vision", "agi"])
# note: this section is not about scene generation, but making the distinction seemed like a good idea.
scene_generation = Problem(
    "Be able to generate complex scene e.g. a baboon receiving their degree at convocatoin.",
    ["vision", "world-modelling", "agi"])
scene_generation.add_subproblem(image_generation)

# NOTE: scale, and target need to be checked
image_generation_metric = image_generation.metric(
    "Generative models of CIFAR-10 images",
    scale=bits_per_x,
    axis_label="Model entropy (bits per pixel)")

image_generation_metric.measure(date(2014, 10, 30), 4.48, "NICE",
                                "https://arxiv.org/abs/1410.8516")
image_generation_metric.measure(date(2015, 2, 16), 4.13, "DRAW",
                                "https://arxiv.org/abs/1502.04623")
image_generation_metric.measure(date(2016, 5, 27), 3.49, "Real NVP",
                                "https://arxiv.org/abs/1605.08803")
image_generation_metric.measure(
    date(2016, 6, 15), 3.11, "VAE with IAF",
    "https://papers.nips.cc/paper/6581-improved-variational-inference-with-inverse-autoregressive-flow"
)
image_generation_metric.measure(date(2016, 5, 27), 3.0, "PixelRNN",
                                "https://arxiv.org/abs/1605.08803")
Code example #4
File: vision.py Project: jeffalstott/AI-metrics
from taxonomy import Problem
from scales import *
import datetime

date = datetime.date

vision = Problem("Vision", ["agi", "vision", "world-modelling"])

image_comprehension = Problem("Image comprehension", ["agi", "vision", "language", "world-modelling"])
image_classification = Problem("Image classification", ["vision", "agi"])
image_classification.add_subproblem(image_comprehension)
vision.add_subproblem(image_classification)

imagenet = image_classification.metric("Imagenet Image Recognition", "http://image-net.org", scale=error_rate, target=0.051)
imagenet.notes = """
Correctly label images from the Imagenet dataset. As of 2016, this includes:
 - Object localization for 1000 categories.
 - Object detection for 200 fully labeled categories.
 - Object detection from video for 30 fully labeled categories.
 - Scene classification for 365 scene categories (Joint with MIT Places team) on Places2 Database http://places2.csail.mit.edu.
 - Scene parsing for 150 stuff and discrete object categories (Joint with MIT Places team).
"""
imagenet.measure(date(2010,8,31), 0.28191, "NEC UIUC", "http://image-net.org/challenges/LSVRC/2010/results")
imagenet.measure(date(2011,10,26), 0.2577, "XRCE","http://image-net.org/challenges/LSVRC/2011/results")
imagenet.measure(date(2012,10,13), 0.16422, "SuperVision", "http://image-net.org/challenges/LSVRC/2012/results.html")
imagenet.measure(date(2013,11,14), 0.11743, "Clarifai","http://www.image-net.org/challenges/LSVRC/2013/results.php")
imagenet.measure(date(2014,8,18), 0.07405, "VGG", "http://image-net.org/challenges/LSVRC/2014/index")
imagenet.measure(date(2015,12,10), 0.03567, "MSRA", "http://image-net.org/challenges/LSVRC/2015/results", algorithms=["residual-networks"])
imagenet.measure(date(2016,9,26), 0.02991, "Trimps-Soushen", "http://image-net.org/challenges/LSVRC/2016/results")

# Test automatic detection of withdrawn papers
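
The error_rate scale for this metric is ILSVRC's top-5 classification error: an
image is scored correct if the true label appears among the model's five
highest-confidence guesses, and the 0.051 target reflects measured human
performance on the same task. A minimal sketch of the scoring rule (the
function is illustrative, not part of the AI-metrics codebase):

def top5_error(predictions, labels):
    """predictions: one list of guesses per image, best first."""
    wrong = sum(1 for guesses, truth in zip(predictions, labels)
                if truth not in guesses[:5])
    return wrong / len(labels)

# Two images: the true label is in the first image's top five, not the second's.
preds = [["cat", "dog", "fox", "wolf", "lynx"],
         ["car", "bus", "van", "truck", "tram"]]
print(top5_error(preds, ["fox", "bicycle"]))  # 0.5
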
Code example #5
from taxonomy import Problem
from scales import *
import datetime
date = datetime.date

read_stem_papers = Problem(
    "Read a scientific or technical paper, and comprehend its contents",
    ["language", "world-modelling", "super"])

# Getting some major results from an abstract, tables, or conclusion is much easier than understanding the entire paper, its assumptions, robustness, support for its claims, etc.
extract_results = Problem(
    "Extract major numerical results or progress claims from a STEM paper",
    ["language", "world-modelling", "agi"])
read_stem_papers.add_subproblem(extract_results)

extract_results.metric("Automatically find new relevant ML results on arXiv")
extract_results.notes = """
This metric is the ability to automatically update the IPython notebook you are reading by spotting results in PDFs uploaded to arxiv.org.
Pull requests demonstrating solutions are welcome :)
"""

solve_technical_problems = Problem(
    "Given an arbitrary technical problem, solve it as well as a typical professional in that field",
    ["language", "world-modelling"])

program_induction = Problem("Writing software from specifications")
solve_technical_problems.add_subproblem(program_induction)
program_induction.metric("Card2Code",
                         url="https://github.com/deepmind/card2code",
                         scale=correct_percent)
Code example #6
from taxonomy import Problem
from scales import *
import datetime
date = datetime.date

abstract_strategy_games = Problem("Abstract strategy games", ["agi", "abstract-games"])

playing_with_hints = Problem("Playing abstract games with extensive hints", ["abstract-games"], solved=True)
abstract_strategy_games.add_subproblem(playing_with_hints)
playing_with_hints.notes = """
  Complex abstract strategy games have been solved to super-human levels
  by computer systems with extensive rule-hinting and heuristics,
  in some cases combined with machine learning techniques.
"""
computer_chess = playing_with_hints.metric("Computer Chess", scale=elo, target=2882, target_label="Best human play", target_source="https://en.wikipedia.org/w/index.php?title=Comparison_of_top_chess_players_throughout_history&oldid=777500496#Elo_system")
computer_go = playing_with_hints.metric("Computer Go", scale=elo, target=3632, target_label="Best human play", target_source="https://www.goratings.org/en/history/")
computer_go.solved = True # until we get proper data

# For some caveats, see https://en.wikipedia.org/w/index.php?title=Chess_engine&oldid=764341963#Ratings
# We could script ingestion of data from CCRL, or get data from Katja
computer_chess.measure(date(1997,5,11), 2725, "Deep Blue", uncertainty=25, url="https://www.quora.com/What-was-Deep-Blues-Elo-rating")
computer_chess.measure(date(2006,5,27), 2995, "Rybka 1.1 64bit", uncertainty=25, url="https://web.archive.org/web/20060531091049/http://www.computerchess.org.uk/ccrl/4040/rating_list_all.html")
computer_chess.measure(date(2010,8,7), 3269, "Rybka 4 64bit", uncertainty=22, url="https://web.archive.org/web/20100923131123/http://www.computerchess.org.uk/ccrl/4040/rating_list_all.html")
computer_chess.measure(date(2013,7,20), 3248, "Houdini 3 64bit", uncertainty=16, url="https://web.archive.org/web/20130415000000*/http://www.computerchess.org.uk/ccrl/4040/rating_list_all.html")
computer_chess.measure(date(2015,7,4), 3332, "Komodo 9", uncertainty=24, url="https://web.archive.org/web/20150708104805/http://www.computerchess.org.uk/ccrl/4040/rating_list_all.html")
computer_chess.measure(date(2017,2,27), 3393, "Stockfish", uncertainty=50, url="https://web.archive.org/web/20170227044521/http://www.computerchess.org.uk/ccrl/4040/")
# Wikipedia has some nice data here:
computer_chess.measure(date(1984,12,31), 1631, "Novag Super Constellation 6502 4 MHz", url="https://en.wikipedia.org/wiki/Swedish_Chess_Computer_Association#Rating_list_year-end_leaders")
computer_chess.measure(date(1985,12,31), 1827, "Mephisto Amsterdam 68000 12 MHz", url="https://en.wikipedia.org/wiki/Swedish_Chess_Computer_Association#Rating_list_year-end_leaders")
computer_chess.measure(date(1986,12,31), 1827, "Mephisto Amsterdam 68000 12 MHz", url="https://en.wikipedia.org/wiki/Swedish_Chess_Computer_Association#Rating_list_year-end_leaders")
computer_chess.measure(date(1987,12,31), 1923, "Mephisto Dallas 68020 14 MHz", url="https://en.wikipedia.org/wiki/Swedish_Chess_Computer_Association#Rating_list_year-end_leaders")
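
The elo scale on these metrics is the standard chess rating system: a rating
gap maps to an expected score through a logistic curve, E = 1 / (1 +
10^((R_opponent - R_player) / 400)). A quick reading of the numbers above (a
sketch of the standard formula, not code from the repository):

def expected_score(rating, opponent_rating):
    """Elo expected score (win probability plus half the draw probability)."""
    return 1 / (1 + 10 ** ((opponent_rating - rating) / 400))

# Stockfish's 3393 CCRL rating against the best-human target of 2882:
print(expected_score(3393, 2882))  # ~0.95, roughly 19 points out of 20

As the caveats linked above suggest, engine ratings from pools like CCRL are
not directly comparable to human FIDE ratings, so treat this as an
order-of-magnitude reading rather than a precise prediction.
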
Code example #7
File: video_games.py Project: zgsxwsdxg/AI-metrics
from taxonomy import Problem
from scales import *
import datetime

date = datetime.date

computer_games = Problem(
    "Play real-time computer & video games",
    ["world-modelling", "realtime-games", "agi", "language"])

games_requiring_novel_language = Problem(
    "Games that require inventing novel language, forms of speech, or communication"
)
games_requiring_speech = Problem(
    "Games that require both understanding and speaking a language")
games_requiring_speech.metric("Starcraft")

games_requiring_language_comprehension = Problem(
    "Games that require language comprehension", ["agi", "languge"])

computer_games.add_subproblem(games_requiring_novel_language)
games_requiring_novel_language.add_subproblem(games_requiring_speech)
games_requiring_speech.add_subproblem(games_requiring_language_comprehension)

# Atari 2600 Games: Breakout, Enduro, Pong, Q*Bert, Seaquest, S. Invaders. Each game has its own metric.
# We previously used data hand-compiled by Yomna Nasser and Miles Brundage;
# this is now mostly obsolete, and the data is scraped in scrapers/atari.py

simple_games = Problem("Simple video games",
                       ["world-modelling", "realtime-games", "agi"])
computer_games.add_subproblem(simple_games)