Example #1
0
def get_db_connection(dbname=None, building=False):
    """
    Get a global connection to the ConceptNet PostgreSQL database.

    `dbname` specifies the name of the database in PostgreSQL; it
    defaults to config.DB_NAME.
    `building` specifies whether it's okay for the DB to not exist
    (set it to True at build time).

    Connections are cached in _CONNECTIONS, so repeated calls with the
    same `dbname` return the same connection object.

    Raises IOError if the database has not been built, or if it cannot
    be reached after roughly 10 seconds of retrying.
    """
    if not building and not os.access(get_data_filename('psql/done'), os.F_OK):
        raise IOError("The ConceptNet database has not been built.")
    if dbname is None:
        dbname = config.DB_NAME
    if dbname in _CONNECTIONS:
        return _CONNECTIONS[dbname]
    for attempt in range(10):
        # Sleep before each retry rather than after each failure: the
        # original form slept one extra second after the final failure,
        # which only delayed the error.
        if attempt > 0:
            time.sleep(1)
        try:
            _CONNECTIONS[dbname] = _get_db_connection_inner(dbname)
            return _CONNECTIONS[dbname]
        except pg8000.InterfaceError:
            if attempt == 0:
                print(
                    "Database %r at %s:%s is not available, retrying for 10 seconds"
                    % (dbname, config.DB_HOSTNAME, config.DB_PORT),
                    file=sys.stderr
                )
    raise IOError(
        "Couldn't connect to database %r at %s:%s" %
        (dbname, config.DB_HOSTNAME, config.DB_PORT)
    )
Example #2
0
def get_db_connection(dbname=None, building=False):
    """
    Return a shared connection to the ConceptNet PostgreSQL database.

    `dbname` names the PostgreSQL database (config.DB_NAME by default).
    `building` allows the database to not exist yet; pass True at build
    time.
    """
    if not (building or os.access(get_data_filename('psql/done'), os.F_OK)):
        raise IOError("The ConceptNet database has not been built.")
    dbname = config.DB_NAME if dbname is None else dbname
    # Reuse a cached connection when one exists.
    try:
        return _CONNECTIONS[dbname]
    except KeyError:
        pass
    for tries_so_far in range(10):
        try:
            conn = _get_db_connection_inner(dbname)
            _CONNECTIONS[dbname] = conn
            return conn
        except pg8000.InterfaceError:
            # Announce the retry loop only once.
            if tries_so_far == 0:
                message = (
                    "Database %r at %s:%s is not available, retrying for 10 seconds"
                    % (dbname, config.DB_HOSTNAME, config.DB_PORT)
                )
                print(message, file=sys.stderr)
            time.sleep(1)
    raise IOError("Couldn't connect to database %r at %s:%s" %
                  (dbname, config.DB_HOSTNAME, config.DB_PORT))
def try_configuring_sentry(app):
    """
    Attach Sentry error reporting to `app` if a DSN file is deployed.

    Reads the DSN from deploy/sentry-dsn.txt in the data directory and
    returns the Sentry wrapper, or prints a notice and returns None when
    the file is absent.
    """
    dsn_path = get_data_filename('deploy/sentry-dsn.txt')
    if not os.path.exists(dsn_path):
        print("Sentry is not configured.")
        return None
    # Use a context manager so the DSN file handle is closed promptly
    # instead of being leaked until garbage collection.
    with open(dsn_path) as dsn_file:
        dsn = dsn_file.read().strip()
    return Sentry(app, logging=True, level=logging.ERROR, dsn=dsn)
Example #4
0
def try_configuring_sentry(app):
    """
    Set up Sentry error reporting for `app` when a DSN file exists.

    The DSN is read from deploy/sentry-dsn.txt in the data directory.
    Returns the Sentry wrapper, or None (after printing a notice) if no
    DSN file is deployed.
    """
    dsn_path = get_data_filename('deploy/sentry-dsn.txt')
    if os.path.exists(dsn_path):
        # Close the DSN file deterministically instead of leaking the
        # open handle.
        with open(dsn_path) as dsn_file:
            dsn = dsn_file.read().strip()
        return Sentry(app, logging=True, level=logging.ERROR, dsn=dsn)
    else:
        print("Sentry is not configured.")
        return None
Example #5
0
 def __init__(self, vector_filename=None, frame=None):
     """
     Store either a preloaded `frame`, or the name of a vector file to
     use later; with neither given, fall back to the 'mini' vector file.
     """
     if frame is not None:
         self.frame = frame
         self.vector_filename = None
     else:
         self.frame = None
         if vector_filename:
             self.vector_filename = vector_filename
         else:
             self.vector_filename = get_data_filename('vectors/mini.h5')
     # Not built yet.
     self.small_frame = None
     self.k = None
     self.small_k = None
     self.trie = None
     self.cache = {}
Example #6
0
def _setup():
    """
    Read the dictionary file, creating a mapping from words to their
    phonetics.

    When multiple pronunciations are given, keep the last one.
    """
    with open(get_data_filename('cmudict.0.7a')) as rhymelist:
        for entry in rhymelist:
            # Lines beginning with ';;;' are comments in the cmudict format.
            if entry.startswith(';;;'):
                continue
            word, phonetics = entry.strip().split('  ')
            PHONETIC_DICT[word] = phonetics.split(' ')
Example #7
0
 def __init__(self, vector_filename=None, frame=None, use_db=True):
     """
     Keep either a preloaded `frame` or the name of a vector file
     ('vectors/mini.h5' by default). An AssertionFinder is created
     unless `use_db` is false.
     """
     if frame is not None:
         self.frame = frame
         self.vector_filename = None
     else:
         self.frame = None
         self.vector_filename = (
             vector_filename or get_data_filename('vectors/mini.h5')
         )
     # Not initialized yet.
     self.small_frame = None
     self.k = None
     self.small_k = None
     self.finder = AssertionFinder() if use_db else None
Example #8
0
 def __init__(self, vector_filename=None, frame=None, use_db=True):
     """
     Wrap either an existing `frame` or the name of a vector file to
     use later, defaulting to the "mini" vectors. An AssertionFinder is
     created unless `use_db` is false.
     """
     if frame is not None:
         self.frame = frame
         self.vector_filename = None
     else:
         self.frame = None
         self.vector_filename = (
             vector_filename or get_data_filename("vectors/mini.h5")
         )
     # Not initialized yet.
     self.small_frame = None
     self.k = None
     self.small_k = None
     self.standardized = None
     self.finder = AssertionFinder() if use_db else None
def test_languages_exist():
    """
    Check the built language statistics file: every known language must
    appear, and common/core languages must meet minimum counts.
    """
    lang_stats_file = get_data_filename('stats/languages.txt')
    counts = {}
    # Close the stats file deterministically instead of leaking the
    # open handle.
    with open(lang_stats_file, encoding='utf-8') as stats:
        for line in stats:
            count_str, lang = line.strip().split()
            counts[lang] = int(count_str)

    for lang in ALL_LANGUAGES:
        assert lang in counts, lang

    for lang in COMMON_LANGUAGES:
        # Include the language in the failure message, matching the
        # CORE_LANGUAGES check below.
        assert counts[lang] >= 1000, (lang, counts[lang])

    for lang in CORE_LANGUAGES:
        assert counts[lang] >= 100000, (lang, counts[lang])
Example #10
0
def test_languages_exist():
    """
    Verify the built language statistics: all known languages appear,
    and common/core languages meet their minimum counts.
    """
    lang_stats_file = get_data_filename('stats/languages.txt')
    counts = {}
    # Use a context manager so the stats file is closed promptly.
    with open(lang_stats_file, encoding='utf-8') as stats:
        for line in stats:
            count_str, lang = line.strip().split()
            counts[lang] = int(count_str)

    for lang in ALL_LANGUAGES:
        assert lang in counts, lang

    for lang in COMMON_LANGUAGES:
        # Name the failing language, consistent with the core check.
        assert counts[lang] >= 1000, (lang, counts[lang])

    for lang in CORE_LANGUAGES:
        assert counts[lang] >= 100000, (lang, counts[lang])
Example #11
0
 def __init__(self, vector_filename=None, frame=None, use_db=True):
     """
     Accept either a preloaded `frame` (whose index is sorted if it
     isn't already) or the name of a vector file to use later,
     defaulting to the 'mini' vectors.
     """
     if frame is not None:
         # Keep the frame's index sorted.
         if frame.index.is_monotonic_increasing:
             self.frame = frame
         else:
             self.frame = frame.sort_index()
         self.vector_filename = None
     else:
         self.frame = None
         self.vector_filename = (
             vector_filename or get_data_filename('vectors/mini.h5')
         )
     # Not initialized yet.
     self.small_frame = None
     self.k = None
     self.small_k = None
     self.trie = None
     self.finder = AssertionFinder() if use_db else None
Example #12
0
 def __init__(self, vector_filename=None, frame=None, use_db=True):
     """
     Hold a preloaded `frame`, or else the name of a vector file
     (default 'vectors/mini.h5') to be used later. An AssertionFinder
     is created unless `use_db` is false.
     """
     if frame is None:
         self.frame = None
         if vector_filename:
             self.vector_filename = vector_filename
         else:
             self.vector_filename = get_data_filename('vectors/mini.h5')
     else:
         self.frame = frame
         self.vector_filename = None
     # Not initialized yet.
     self.small_frame = None
     self.k = None
     self.small_k = None
     self.trie = None
     self.cache = {}
     self.finder = AssertionFinder() if use_db else None
Example #13
0
from conceptnet5.assoc_query import AssocSpaceWrapper, MissingAssocSpace
from conceptnet5.util import get_data_filename
app = flask.Flask(__name__)
# Global rate limits for all API clients.
limiter = Limiter(app, global_limits=["600 per minute", "6000 per hour"])
CORS(app)

if not app.debug:
    import logging
    # Outside debug mode, record INFO-and-above messages to a log file.
    file_handler = logging.FileHandler('logs/flask_errors.log')
    file_handler.setLevel(logging.INFO)
    app.logger.addHandler(file_handler)

### Configuration ###

# Module-level lookup objects, created once at import time.
FINDER = AssertionFinder()
ASSOC_WRAPPER = AssocSpaceWrapper(get_data_filename('assoc/assoc-space-5.3'),
                                  FINDER)
commonsense_assoc = None

# The root URL can be overridden by the first command-line argument.
if len(sys.argv) == 1:
    root_url = 'http://conceptnet5.media.mit.edu/data/5.3'
else:
    root_url = sys.argv[1]


def configure_api(db_path, assertion_dir, assoc_dir=None, nshards=8):
    """
    Override the usual AssertionFinder with a new one, possibly with different
    settings. Do the same for the assoc_dir if given.

    This is useful for testing.
Example #14
0
        'put',
        'gun'
    },
    'de': {'die', 'der', 'das', 'ein', 'mir', 'uns', 'klein'}
}

# SQL query that finds the root form(s) of a word in the 'forms' table,
# excluding alternate spellings, "short" forms, and (for German)
# masculine/feminine/diminutive variants.
QUERY = """
SELECT root, form, pos FROM forms
WHERE language=? AND word=?
AND root LIKE '__%' AND form != 'alternate'
AND form NOT LIKE '%short%' AND form NOT LIKE '%Short%'
AND NOT (site_language='de' AND
  (form='masculine' OR form='feminine' OR form='diminutive'))
"""

# SQLite database of Wiktionary word forms, opened lazily by DBLemmatizer.
LEMMA_FILENAME = get_data_filename('db/wiktionary.db')


class DBLemmatizer:
    def __init__(self, filename=LEMMA_FILENAME):
        """Record the database path; the connection is opened lazily."""
        self.db = None
        self.filename = filename

    def lookup(self, language, word, pos=None):
        if self.db is None:
            self.db = sqlite3.connect(self.filename)
        if language not in LEMMATIZED_LANGUAGES:
            return word, ''
        exceptions = EXCEPTIONS.get(language, {})
        if word in exceptions:
            return exceptions[word]
Example #15
0
from conceptnet5.util import get_data_filename
from conceptnet5.readers import (conceptnet4, dbpedia, jmdict, ptt_petgame,
                                 verbosity, wiktionary_en, wordnet)
from conceptnet5.builders.combine_assertions import AssertionCombiner
import codecs
import os
import sys
import json
from nose.tools import eq_

if sys.version_info.major < 3:
    from StringIO import StringIO
else:
    from io import StringIO

# Directory containing the test input and expected-output files.
TESTDATA_DIR = get_data_filename("testdata")


def data_path(filename):
    """Return `filename` joined onto the test data directory."""
    return os.path.join(TESTDATA_DIR, filename)


# This is a multi-test: it generates a sequence of tests, consisting of the
# function to run and the arguments to give it. nosetests knows how to run
# tests with this structure.
def test_reader_modules():
    combiner = AssertionCombiner('/l/CC/By-SA')
    io_mappings = [
        (conceptnet4, 'input/conceptnet4.jsons', ['output/conceptnet4.jsons']),
        (dbpedia, 'input/dbpedia.nt',
         ['output/dbpedia.jsons', 'output/dbpedia_map.nt']),
Example #16
0
def get_assoc_data(name):
    """
    Create an AssertionFinder together with an AssocSpaceWrapper over
    the assoc space called `name`, and return both.
    """
    finder = AssertionFinder()
    assoc_path = get_data_filename('assoc/%s' % name)
    return finder, AssocSpaceWrapper(assoc_path, finder)
Example #17
0
        'die', 'der', 'das', 'ein', 'mir', 'uns', 'klein'
    }
}


# SQL query that finds the root form(s) of a word in the 'forms' table,
# skipping alternate spellings, "short" forms, and German gender or
# diminutive variants.
QUERY = """
SELECT root, form, pos FROM forms
WHERE language=? AND word=?
AND root LIKE '__%' AND form != 'alternate'
AND form NOT LIKE '%short%' AND form NOT LIKE '%Short%'
AND NOT (site_language='de' AND
  (form='masculine' OR form='feminine' OR form='diminutive'))
"""


# SQLite database of Wiktionary word forms, opened lazily by DBLemmatizer.
LEMMA_FILENAME = get_data_filename('db/wiktionary.db')


class DBLemmatizer:
    def __init__(self, filename=LEMMA_FILENAME):
        """
        Remember which SQLite file to use; no connection is made until
        the first lookup.
        """
        self.filename = filename
        self.db = None

    def lookup(self, language, word, pos=None):
        if self.db is None:
            self.db = sqlite3.connect(self.filename)
        if language not in LEMMATIZED_LANGUAGES:
            return word, ''
        exceptions = EXCEPTIONS.get(language, {})
        if word in exceptions:
            return exceptions[word]
Example #18
0
we will need translations of these names to say, parse entries from the
Japanese-language Wiktionary.

>>> CODE_TO_ENGLISH_NAME['fr']
'French'
>>> CODE_TO_ENGLISH_NAME['fra']
'French'
>>> ENGLISH_NAME_TO_CODE['French']
'fr'
"""

from conceptnet5.util import get_data_filename
import codecs
import re

# Data file of ISO 639 language codes and names.
ISO_DATA_FILENAME = get_data_filename('iso639.txt')

# Mappings between language codes and English language names (see the
# module docstring for examples); presumably populated when this module
# parses the ISO data file — confirm where they are filled in.
CODE_TO_ENGLISH_NAME = {}
ENGLISH_NAME_TO_CODE = {}

# The SUPPORTED_LANGUAGE_CODES are the ones that should appear in the
# browsable Web interface.
#
# This might be too many.
SUPPORTED_LANGUAGE_CODES = [
    'aa', 'ab', 'ae', 'af', 'ak', 'am', 'an', 'ar', 'as', 'ase', 'av', 'ay',
    'az', 'ba', 'be', 'bg', 'bh', 'bi', 'bm', 'bn', 'bo', 'br', 'bs', 'ca',
    'ce', 'ch', 'co', 'cr', 'crh', 'cs', 'cu', 'cv', 'cy', 'da', 'de', 'dv',
    'dz', 'ee', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'ff', 'fi', 'fj',
    'fo', 'fr', 'fy', 'ga', 'gd', 'gl', 'gn', 'gu', 'gv', 'ha', 'he', 'hi',
    'ho', 'hr', 'ht', 'hu', 'hy', 'hz', 'ia', 'id', 'ie', 'ig', 'ii', 'ik',
Example #19
0
we will need translations of these names to say, parse entries from the
Japanese-language Wiktionary.

>>> CODE_TO_ENGLISH_NAME['fr']
'French'
>>> CODE_TO_ENGLISH_NAME['fra']
'French'
>>> ENGLISH_NAME_TO_CODE['French']
'fr'
"""

from conceptnet5.util import get_data_filename
import codecs
import re

# Data file of ISO 639 language codes and names.
ISO_DATA_FILENAME = get_data_filename('iso639.txt')

# Mappings between language codes and English language names (see the
# module docstring for examples); presumably populated when this module
# parses the ISO data file — confirm where they are filled in.
CODE_TO_ENGLISH_NAME = {}
ENGLISH_NAME_TO_CODE = {}

# The SUPPORTED_LANGUAGE_CODES are the ones that should appear in the
# browsable Web interface.
#
# This might be too many.
SUPPORTED_LANGUAGE_CODES = [
    'aa', 'ab', 'ae', 'af', 'ak', 'am', 'an', 'ar', 'as', 'ase', 'av', 'ay',
    'az', 'ba', 'be', 'bg', 'bh', 'bi', 'bm', 'bn', 'bo', 'br', 'bs', 'ca',
    'ce', 'ch', 'co', 'cr', 'crh', 'cs', 'cu', 'cv', 'cy', 'da', 'de', 'dv',
    'dz', 'ee', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'ff', 'fi', 'fj',
    'fo', 'fr', 'fy', 'ga', 'gd', 'gl', 'gn', 'gu', 'gv', 'ha', 'he', 'hi',
    'ho', 'hr', 'ht', 'hu', 'hy', 'hz', 'ia', 'id', 'ie', 'ig', 'ii', 'ik',
Example #20
0
from __future__ import unicode_literals
import codecs
import json
from conceptnet5.formats.json_stream import JSONStreamWriter
from conceptnet5.nodes import normalized_concept_uri
from conceptnet5.edges import make_edge
from conceptnet5.util import get_data_filename

# Sentence frames for zh_TW assertions, keyed by frame ID; each value
# provides a 'text' template and a 'relation' (see handle_raw_assertion).
FRAME_DATA = json.load(
    codecs.open(get_data_filename('zh_frames.json'), encoding='utf-8'))


def handle_raw_assertion(line):
    parts = line.split(', ')
    user, frame_id, concept1, concept2 = parts
    fdata = FRAME_DATA[frame_id]
    ftext = fdata['text']
    rel = fdata['relation']

    surfaceText = ftext.replace('{1}', '[[' + concept1 + ']]').replace(
        '{2}', '[[' + concept2 + ']]')
    start = normalized_concept_uri('zh_TW', concept1)
    end = normalized_concept_uri('zh_TW', concept2)
    sources = ['/s/activity/ptt/petgame', '/s/contributor/petgame/' + user]
    yield make_edge(rel,
                    start,
                    end,
                    dataset='/d/conceptnet/4/zh',
                    license='/l/CC/By',
                    sources=sources,
                    surfaceText=surfaceText,
Example #21
0
    conceptnet4, dbpedia, jmdict, ptt_petgame, verbosity, wiktionary_en, wordnet
)
from conceptnet5.builders.combine_assertions import AssertionCombiner
import codecs
import os
import sys
import json
from nose.tools import eq_

if sys.version_info.major < 3:
    from StringIO import StringIO
else:
    from io import StringIO


# Directory containing the test input and expected-output files.
TESTDATA_DIR = get_data_filename("testdata")
def data_path(filename):
    """Return the path of `filename` within the test data directory."""
    return os.path.join(TESTDATA_DIR, filename)


# This is a multi-test: it generates a sequence of tests, consisting of the
# function to run and the arguments to give it. nosetests knows how to run
# tests with this structure.
def test_reader_modules():
    combiner = AssertionCombiner('/l/CC/By-SA')
    io_mappings = [
        (conceptnet4, 'input/conceptnet4.jsons', ['output/conceptnet4.jsons']),
        (dbpedia, 'input/dbpedia.nt', ['output/dbpedia.jsons', 'output/dbpedia_map.nt']),
        (jmdict, 'input/jmdict.xml', ['output/jmdict.jsons']),
        (ptt_petgame, 'input/ptt_petgame.csv', ['output/ptt_petgame.jsons']),
        (verbosity, 'input/verbosity.txt', ['output/verbosity.jsons']),
Example #22
0
from __future__ import unicode_literals
import codecs
import json
from conceptnet5.formats.json_stream import JSONStreamWriter
from conceptnet5.nodes import normalized_concept_uri
from conceptnet5.edges import make_edge
from conceptnet5.util import get_data_filename


# Sentence frames for zh_TW assertions, keyed by frame ID; each value
# provides a 'text' template and a 'relation' (see handle_raw_assertion).
FRAME_DATA = json.load(
    codecs.open(get_data_filename('zh_frames.json'), encoding='utf-8')
)


def handle_raw_assertion(line):
    """
    Convert one comma-separated pet-game line ("user, frame_id,
    concept1, concept2") into a ConceptNet edge, yielded as the single
    result.
    """
    user, frame_id, concept1, concept2 = line.split(', ')
    fdata = FRAME_DATA[frame_id]
    # Substitute the two concepts into the frame's surface-text template.
    surface = fdata['text'].replace('{1}', '[[' + concept1 + ']]')
    surface = surface.replace('{2}', '[[' + concept2 + ']]')
    start = normalized_concept_uri('zh_TW', concept1)
    end = normalized_concept_uri('zh_TW', concept2)
    sources = ['/s/activity/ptt/petgame', '/s/contributor/petgame/' + user]
    yield make_edge(
        fdata['relation'], start, end,
        dataset='/d/conceptnet/4/zh',
        license='/l/CC/By',
        sources=sources,
        surfaceText=surface,
        weight=1
    )

def handle_file(input_filename, output_file):
    out = JSONStreamWriter(output_file)
Example #23
0
def get_assoc_data(name):
    """
    Return a (finder, assoc_wrapper) pair for the assoc space `name`.
    """
    assertion_finder = AssertionFinder()
    assoc_wrapper = AssocSpaceWrapper(
        get_data_filename('assoc/%s' % name), assertion_finder
    )
    return assertion_finder, assoc_wrapper
Example #24
0
from conceptnet5.assoc_query import AssocSpaceWrapper, MissingAssocSpace
from conceptnet5.util import get_data_filename, get_support_data_filename



### Configuration ###

VERSION = '5.3'
API_URL = '/data/5.3'
WORKING_DIR = os.getcwd()
# Static and template directories default to ./static and ./templates,
# but can be overridden through environment variables.
STATIC_PATH = os.environ.get('CONCEPTNET_WEB_STATIC', os.path.join(WORKING_DIR, 'static'))
TEMPLATE_PATH = os.environ.get('CONCEPTNET_WEB_TEMPLATES', os.path.join(WORKING_DIR, 'templates'))

# Module-level lookup objects, created once at import time.
FINDER = AssertionFinder()
ASSOC_WRAPPER = AssocSpaceWrapper(
    get_data_filename('assoc/assoc-space-%s' % VERSION), FINDER
)

app = flask.Flask(
    'conceptnet5',
    template_folder=TEMPLATE_PATH,
    static_folder=STATIC_PATH
)
# Don't escape non-ASCII characters in JSON responses.
app.config['JSON_AS_ASCII'] = False
limiter = Limiter(app, global_limits=["600 per minute", "6000 per hour"])
CORS(app)

# The root URL can be overridden by the first command-line argument.
if len(sys.argv) == 1:
    root_url = 'http://conceptnet5.media.mit.edu/data/%s' % VERSION
else:
    root_url = sys.argv[1]
Example #25
0
from conceptnet5.assoc_query import AssocSpaceWrapper, MissingAssocSpace
from conceptnet5.util import get_data_filename, get_support_data_filename

### Configuration ###

VERSION = '5.3'
API_URL = '/data/5.3'
WORKING_DIR = os.getcwd()
# Static and template directories default to ./static and ./templates,
# but can be overridden through environment variables.
STATIC_PATH = os.environ.get('CONCEPTNET_WEB_STATIC',
                             os.path.join(WORKING_DIR, 'static'))
TEMPLATE_PATH = os.environ.get('CONCEPTNET_WEB_TEMPLATES',
                               os.path.join(WORKING_DIR, 'templates'))

# Module-level lookup objects, created once at import time.
FINDER = AssertionFinder()
ASSOC_WRAPPER = AssocSpaceWrapper(
    get_data_filename('assoc/assoc-space-%s' % VERSION), FINDER)

app = flask.Flask('conceptnet5',
                  template_folder=TEMPLATE_PATH,
                  static_folder=STATIC_PATH)
# Don't escape non-ASCII characters in JSON responses.
app.config['JSON_AS_ASCII'] = False
limiter = Limiter(app, global_limits=["600 per minute", "6000 per hour"])
CORS(app)

# The root URL can be overridden by the first command-line argument.
if len(sys.argv) == 1:
    root_url = 'http://conceptnet5.media.mit.edu/data/%s' % VERSION
else:
    root_url = sys.argv[1]


def configure_api(db_path, assertion_dir, assoc_dir=None, nshards=8):
Example #26
0
    """
    Deserialize composite scores that save_composite_scores previously
    serialized to the given path.
    """
    with open(path, 'rb') as fp:
        canonical_scores = msgpack.unpack(fp, use_list=False, encoding='utf-8')
    scores = {
        source: (np.float32(s0), np.float32(s1))
        for source, (s0, s1) in canonical_scores.items()
    }
    return scores


if __name__ == '__main__':
    accumulator = TimeAccumulator()
    edges_filename = get_data_filename('collated/sorted/edges-shuf.csv')
    vectors_filename = get_data_filename('vectors/numberbatch-biased.h5')
    model_filename = get_data_filename('sme/sme.model')
    scores_by_dataset_filename = get_data_filename(
        'sme/scores_by_dataset.msgpack')
    scores_by_source_filename = get_data_filename(
        'sme/scores_by_source.msgpack')
    dataset_scores_filename = get_data_filename(
        'sme/dataset_composite_scores.msgpack')
    source_scores_filename = get_data_filename(
        'sme/source_composite_scores.msgpack')
    model = SemanticMatchingModel.load_model(model_filename)
    print('Scoring edges....')
    with stopwatch(accumulator):
        scores_by_dataset, scores_by_source = evaluate_sources(
            model, edges_filename, convert_logits_to_probas=True)
Example #27
0
 def __init__(self, db_filename=None, edge_dir=None, nshards=8):
     """
     Record the locations of the assertion database and edge directory,
     falling back to the standard data filenames when not given.
     """
     self.search_index = None
     if db_filename:
         self._db_filename = db_filename
     else:
         self._db_filename = get_data_filename('db/assertions.db')
     if edge_dir:
         self._edge_dir = edge_dir
     else:
         self._edge_dir = get_data_filename('assertions')
     self.nshards = nshards
Example #28
0
app = flask.Flask(__name__)
# Global rate limits for all API clients.
limiter = Limiter(app, global_limits=["600 per minute", "6000 per hour"])
CORS(app)

if not app.debug:
    import logging
    # Outside debug mode, record INFO-and-above messages to a log file.
    file_handler = logging.FileHandler('logs/flask_errors.log')
    file_handler.setLevel(logging.INFO)
    app.logger.addHandler(file_handler)


### Configuration ###

# Module-level lookup objects, created once at import time.
FINDER = AssertionFinder()
ASSOC_WRAPPER = AssocSpaceWrapper(
    get_data_filename('assoc/assoc-space-5.3'), FINDER
)
commonsense_assoc = None

# The root URL can be overridden by the first command-line argument.
if len(sys.argv) == 1:
    root_url = 'http://conceptnet5.media.mit.edu/data/5.3'
else:
    root_url = sys.argv[1]


def configure_api(db_path, assertion_dir, assoc_dir=None, nshards=8):
    """
    Override the usual AssertionFinder with a new one, possibly with different
    settings. Do the same for the assoc_dir if given.

    This is useful for testing.
def test_relations_recorded():
    """
    Every relation that appears in the built data should be recorded in
    ALL_RELATIONS.
    """
    built_relations_file = get_data_filename('stats/relations.txt')
    built_relations = collect_relations(built_relations_file)
    recorded_relations = set(ALL_RELATIONS)
    missing_relations = built_relations - recorded_relations
    # Report which relations are missing, not just that some are:
    # `assert len(x) == 0` hides the offending values on failure.
    assert not missing_relations, missing_relations
Example #30
0
 def __init__(self, db_filename=None, edge_dir=None, nshards=8):
     """
     Remember the assertion database filename and edge directory,
     defaulting to the standard data locations.
     """
     self.search_index = None
     self._db_filename = (
         db_filename if db_filename else get_data_filename('db/assertions.db')
     )
     self._edge_dir = edge_dir if edge_dir else get_data_filename('assertions')
     self.nshards = nshards