def get_db_connection(dbname=None, building=False):
    """
    Get a global connection to the ConceptNet PostgreSQL database.

    `dbname` specifies the name of the database in PostgreSQL.

    `building` specifies whether it's okay for the DB to not exist
    (set it to True at build time).

    Raises IOError when the database has not been built, or when no
    connection could be established after 10 attempts.
    """
    # The 'psql/done' marker file is written when the database build finishes.
    if not building and not os.access(get_data_filename('psql/done'), os.F_OK):
        raise IOError("The ConceptNet database has not been built.")
    if dbname is None:
        dbname = config.DB_NAME
    # Reuse a cached global connection for this database name if we have one.
    if dbname in _CONNECTIONS:
        return _CONNECTIONS[dbname]
    for attempt in range(10):
        try:
            _CONNECTIONS[dbname] = _get_db_connection_inner(dbname)
            return _CONNECTIONS[dbname]
        except pg8000.InterfaceError:
            # Announce the retry loop only once, on the first failure.
            if attempt == 0:
                print(
                    "Database %r at %s:%s is not available, retrying for 10 seconds"
                    % (dbname, config.DB_HOSTNAME, config.DB_PORT),
                    file=sys.stderr
                )
            # Fix: don't sleep after the final failed attempt; the original
            # wasted one extra second before raising the error below.
            if attempt < 9:
                time.sleep(1)
    raise IOError(
        "Couldn't connect to database %r at %s:%s"
        % (dbname, config.DB_HOSTNAME, config.DB_PORT)
    )
def get_db_connection(dbname=None, building=False):
    """
    Get a global connection to the ConceptNet PostgreSQL database.

    `dbname` specifies the name of the database in PostgreSQL.

    `building` specifies whether it's okay for the DB to not exist
    (set it to True at build time).
    """
    # A marker file records that the database build completed.
    if not building and not os.access(get_data_filename('psql/done'), os.F_OK):
        raise IOError("The ConceptNet database has not been built.")
    if dbname is None:
        dbname = config.DB_NAME
    # Hand back the cached connection if this database was connected before.
    if dbname in _CONNECTIONS:
        return _CONNECTIONS[dbname]
    location = (dbname, config.DB_HOSTNAME, config.DB_PORT)
    for try_num in range(10):
        try:
            connection = _get_db_connection_inner(dbname)
        except pg8000.InterfaceError:
            # Print the retry notice only on the first failure, then wait
            # a second before each subsequent attempt.
            if try_num == 0:
                print(
                    "Database %r at %s:%s is not available, retrying for 10 seconds"
                    % location,
                    file=sys.stderr)
            time.sleep(1)
        else:
            _CONNECTIONS[dbname] = connection
            return connection
    raise IOError("Couldn't connect to database %r at %s:%s" % location)
def try_configuring_sentry(app):
    """
    Attach a Sentry error reporter to the given Flask app, if a DSN file
    has been deployed at 'deploy/sentry-dsn.txt'.

    Returns the Sentry object, or None when no DSN file is present.
    """
    dsn_path = get_data_filename('deploy/sentry-dsn.txt')
    if not os.path.exists(dsn_path):
        print("Sentry is not configured.")
        return None
    # Fix: use a context manager so the DSN file handle is closed promptly;
    # the original `open(dsn_path).read()` left it to the garbage collector.
    with open(dsn_path) as dsn_file:
        dsn = dsn_file.read().strip()
    return Sentry(app, logging=True, level=logging.ERROR, dsn=dsn)
def __init__(self, vector_filename=None, frame=None):
    """
    Set up the object with either an in-memory `frame` of vectors or the
    name of a vector file (defaulting to the bundled 'vectors/mini.h5').
    """
    if frame is not None:
        self.frame = frame
        self.vector_filename = None
    else:
        self.frame = None
        self.vector_filename = vector_filename or get_data_filename('vectors/mini.h5')
    # Attributes that start out unset; presumably filled in on demand
    # elsewhere in the class (not visible here).
    self.small_frame = None
    self.k = None
    self.small_k = None
    self.trie = None
    self.cache = {}
def _setup():
    """
    Read the dictionary file, creating a mapping from words to their
    phonetics in the module-level PHONETIC_DICT.

    When multiple pronunciations are given, keep the last one: later
    lines simply overwrite earlier entries for the same word.
    """
    with open(get_data_filename('cmudict.0.7a')) as rhymelist:
        for line in rhymelist:
            # Lines beginning with ';;;' are comments in the cmudict format.
            if line.startswith(';;;'):
                continue
            # NOTE(review): cmudict conventionally separates the word from
            # its phoneme list with two spaces -- confirm this separator
            # matches the actual data file.
            word, phon = line.strip().split(' ')
            phon = phon.split(' ')
            PHONETIC_DICT[word] = phon
def __init__(self, vector_filename=None, frame=None, use_db=True):
    """
    Set up the object with either an in-memory `frame` of vectors or the
    name of a vector file (defaulting to the bundled 'vectors/mini.h5').

    If `use_db` is true, an AssertionFinder is created as well.
    """
    if frame is not None:
        self.frame = frame
        self.vector_filename = None
    else:
        self.frame = None
        self.vector_filename = vector_filename or get_data_filename('vectors/mini.h5')
    # Attributes that start out unset.
    self.small_frame = None
    self.k = None
    self.small_k = None
    self.finder = AssertionFinder() if use_db else None
def __init__(self, vector_filename=None, frame=None, use_db=True):
    """
    Set up the object with either an in-memory `frame` of vectors or the
    name of a vector file (defaulting to the bundled 'vectors/mini.h5').

    If `use_db` is true, an AssertionFinder is created as well.
    """
    if frame is not None:
        self.frame = frame
        self.vector_filename = None
    else:
        self.frame = None
        self.vector_filename = vector_filename or get_data_filename("vectors/mini.h5")
    # Attributes that start out unset.
    self.small_frame = None
    self.k = None
    self.small_k = None
    self.standardized = None
    self.finder = AssertionFinder() if use_db else None
def test_languages_exist():
    """
    Check the built language statistics: every language in ALL_LANGUAGES
    must appear, COMMON_LANGUAGES need at least 1000 entries each, and
    CORE_LANGUAGES need at least 100000.
    """
    lang_stats_file = get_data_filename('stats/languages.txt')
    counts = {}
    # Fix: close the stats file deterministically instead of leaking the
    # handle that `for line in open(...)` would leave behind.
    with open(lang_stats_file, encoding='utf-8') as stats:
        for line in stats:
            count_str, lang = line.strip().split()
            counts[lang] = int(count_str)
    for lang in ALL_LANGUAGES:
        assert lang in counts, lang
    for lang in COMMON_LANGUAGES:
        assert counts[lang] >= 1000, counts[lang]
    for lang in CORE_LANGUAGES:
        assert counts[lang] >= 100000, (lang, counts[lang])
def __init__(self, vector_filename=None, frame=None, use_db=True):
    """
    Set up the object with either an in-memory `frame` of vectors or the
    name of a vector file (defaulting to the bundled 'vectors/mini.h5').

    A provided frame is sorted by its index if it isn't already sorted.
    If `use_db` is true, an AssertionFinder is created as well.
    """
    if frame is not None:
        self.frame = frame
        # Sort the index once up front when it isn't already monotonic.
        if not self.frame.index.is_monotonic_increasing:
            self.frame = self.frame.sort_index()
        self.vector_filename = None
    else:
        self.frame = None
        self.vector_filename = vector_filename or get_data_filename('vectors/mini.h5')
    # Attributes that start out unset.
    self.small_frame = None
    self.k = None
    self.small_k = None
    self.trie = None
    self.finder = AssertionFinder() if use_db else None
def __init__(self, vector_filename=None, frame=None, use_db=True):
    """
    Set up the object with either an in-memory `frame` of vectors or the
    name of a vector file (defaulting to the bundled 'vectors/mini.h5').

    If `use_db` is true, an AssertionFinder is created as well.
    """
    if frame is not None:
        self.frame = frame
        self.vector_filename = None
    else:
        self.frame = None
        self.vector_filename = vector_filename or get_data_filename('vectors/mini.h5')
    # Attributes that start out unset.
    self.small_frame = None
    self.k = None
    self.small_k = None
    self.trie = None
    self.cache = {}
    self.finder = AssertionFinder() if use_db else None
from conceptnet5.assoc_query import AssocSpaceWrapper, MissingAssocSpace from conceptnet5.util import get_data_filename app = flask.Flask(__name__) limiter = Limiter(app, global_limits=["600 per minute", "6000 per hour"]) CORS(app) if not app.debug: import logging file_handler = logging.FileHandler('logs/flask_errors.log') file_handler.setLevel(logging.INFO) app.logger.addHandler(file_handler) ### Configuration ### FINDER = AssertionFinder() ASSOC_WRAPPER = AssocSpaceWrapper(get_data_filename('assoc/assoc-space-5.3'), FINDER) commonsense_assoc = None if len(sys.argv) == 1: root_url = 'http://conceptnet5.media.mit.edu/data/5.3' else: root_url = sys.argv[1] def configure_api(db_path, assertion_dir, assoc_dir=None, nshards=8): """ Override the usual AssertionFinder with a new one, possibly with different settings. Do the same for the assoc_dir if given. This is useful for testing.
'put', 'gun' }, 'de': {'die', 'der', 'das', 'ein', 'mir', 'uns', 'klein'} } QUERY = """ SELECT root, form, pos FROM forms WHERE language=? AND word=? AND root LIKE '__%' AND form != 'alternate' AND form NOT LIKE '%short%' AND form NOT LIKE '%Short%' AND NOT (site_language='de' AND (form='masculine' OR form='feminine' OR form='diminutive')) """ LEMMA_FILENAME = get_data_filename('db/wiktionary.db') class DBLemmatizer: def __init__(self, filename=LEMMA_FILENAME): self.filename = filename self.db = None def lookup(self, language, word, pos=None): if self.db is None: self.db = sqlite3.connect(self.filename) if language not in LEMMATIZED_LANGUAGES: return word, '' exceptions = EXCEPTIONS.get(language, {}) if word in exceptions: return exceptions[word]
from conceptnet5.util import get_data_filename from conceptnet5.readers import (conceptnet4, dbpedia, jmdict, ptt_petgame, verbosity, wiktionary_en, wordnet) from conceptnet5.builders.combine_assertions import AssertionCombiner import codecs import os import sys import json from nose.tools import eq_ if sys.version_info.major < 3: from StringIO import StringIO else: from io import StringIO TESTDATA_DIR = get_data_filename("testdata") def data_path(filename): return os.path.join(TESTDATA_DIR, filename) # This is a multi-test: it generates a sequence of tests, consisting of the # function to run and the arguments to give it. nosetests knows how to run # tests with this structure. def test_reader_modules(): combiner = AssertionCombiner('/l/CC/By-SA') io_mappings = [ (conceptnet4, 'input/conceptnet4.jsons', ['output/conceptnet4.jsons']), (dbpedia, 'input/dbpedia.nt', ['output/dbpedia.jsons', 'output/dbpedia_map.nt']),
def get_assoc_data(name):
    """
    Return a (finder, assoc_wrapper) pair for the association space stored
    under 'assoc/<name>' in the data directory.
    """
    finder = AssertionFinder()
    assoc_filename = get_data_filename('assoc/%s' % name)
    return finder, AssocSpaceWrapper(assoc_filename, finder)
'die', 'der', 'das', 'ein', 'mir', 'uns', 'klein' } } QUERY = """ SELECT root, form, pos FROM forms WHERE language=? AND word=? AND root LIKE '__%' AND form != 'alternate' AND form NOT LIKE '%short%' AND form NOT LIKE '%Short%' AND NOT (site_language='de' AND (form='masculine' OR form='feminine' OR form='diminutive')) """ LEMMA_FILENAME = get_data_filename('db/wiktionary.db') class DBLemmatizer: def __init__(self, filename=LEMMA_FILENAME): self.filename = filename self.db = None def lookup(self, language, word, pos=None): if self.db is None: self.db = sqlite3.connect(self.filename) if language not in LEMMATIZED_LANGUAGES: return word, '' exceptions = EXCEPTIONS.get(language, {}) if word in exceptions: return exceptions[word]
we will need translations of these names to say, parse entries from the Japanese-language Wiktionary. >>> CODE_TO_ENGLISH_NAME['fr'] 'French' >>> CODE_TO_ENGLISH_NAME['fra'] 'French' >>> ENGLISH_NAME_TO_CODE['French'] 'fr' """ from conceptnet5.util import get_data_filename import codecs import re ISO_DATA_FILENAME = get_data_filename('iso639.txt') CODE_TO_ENGLISH_NAME = {} ENGLISH_NAME_TO_CODE = {} # The SUPPORTED_LANGUAGE_CODES are the ones that should appear in the # browsable Web interface. # # This might be too many. SUPPORTED_LANGUAGE_CODES = [ 'aa', 'ab', 'ae', 'af', 'ak', 'am', 'an', 'ar', 'as', 'ase', 'av', 'ay', 'az', 'ba', 'be', 'bg', 'bh', 'bi', 'bm', 'bn', 'bo', 'br', 'bs', 'ca', 'ce', 'ch', 'co', 'cr', 'crh', 'cs', 'cu', 'cv', 'cy', 'da', 'de', 'dv', 'dz', 'ee', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'ff', 'fi', 'fj', 'fo', 'fr', 'fy', 'ga', 'gd', 'gl', 'gn', 'gu', 'gv', 'ha', 'he', 'hi', 'ho', 'hr', 'ht', 'hu', 'hy', 'hz', 'ia', 'id', 'ie', 'ig', 'ii', 'ik',
from __future__ import unicode_literals import codecs import json from conceptnet5.formats.json_stream import JSONStreamWriter from conceptnet5.nodes import normalized_concept_uri from conceptnet5.edges import make_edge from conceptnet5.util import get_data_filename FRAME_DATA = json.load( codecs.open(get_data_filename('zh_frames.json'), encoding='utf-8')) def handle_raw_assertion(line): parts = line.split(', ') user, frame_id, concept1, concept2 = parts fdata = FRAME_DATA[frame_id] ftext = fdata['text'] rel = fdata['relation'] surfaceText = ftext.replace('{1}', '[[' + concept1 + ']]').replace( '{2}', '[[' + concept2 + ']]') start = normalized_concept_uri('zh_TW', concept1) end = normalized_concept_uri('zh_TW', concept2) sources = ['/s/activity/ptt/petgame', '/s/contributor/petgame/' + user] yield make_edge(rel, start, end, dataset='/d/conceptnet/4/zh', license='/l/CC/By', sources=sources, surfaceText=surfaceText,
conceptnet4, dbpedia, jmdict, ptt_petgame, verbosity, wiktionary_en, wordnet ) from conceptnet5.builders.combine_assertions import AssertionCombiner import codecs import os import sys import json from nose.tools import eq_ if sys.version_info.major < 3: from StringIO import StringIO else: from io import StringIO TESTDATA_DIR = get_data_filename("testdata") def data_path(filename): return os.path.join(TESTDATA_DIR, filename) # This is a multi-test: it generates a sequence of tests, consisting of the # function to run and the arguments to give it. nosetests knows how to run # tests with this structure. def test_reader_modules(): combiner = AssertionCombiner('/l/CC/By-SA') io_mappings = [ (conceptnet4, 'input/conceptnet4.jsons', ['output/conceptnet4.jsons']), (dbpedia, 'input/dbpedia.nt', ['output/dbpedia.jsons', 'output/dbpedia_map.nt']), (jmdict, 'input/jmdict.xml', ['output/jmdict.jsons']), (ptt_petgame, 'input/ptt_petgame.csv', ['output/ptt_petgame.jsons']), (verbosity, 'input/verbosity.txt', ['output/verbosity.jsons']),
from __future__ import unicode_literals import codecs import json from conceptnet5.formats.json_stream import JSONStreamWriter from conceptnet5.nodes import normalized_concept_uri from conceptnet5.edges import make_edge from conceptnet5.util import get_data_filename FRAME_DATA = json.load( codecs.open(get_data_filename('zh_frames.json'), encoding='utf-8') ) def handle_raw_assertion(line): parts = line.split(', ') user, frame_id, concept1, concept2 = parts fdata = FRAME_DATA[frame_id] ftext = fdata['text'] rel = fdata['relation'] surfaceText = ftext.replace('{1}', '[[' + concept1 + ']]').replace('{2}', '[[' + concept2 + ']]') start = normalized_concept_uri('zh_TW', concept1) end = normalized_concept_uri('zh_TW', concept2) sources = ['/s/activity/ptt/petgame', '/s/contributor/petgame/' + user] yield make_edge(rel, start, end, dataset='/d/conceptnet/4/zh', license='/l/CC/By', sources=sources, surfaceText=surfaceText, weight=1) def handle_file(input_filename, output_file): out = JSONStreamWriter(output_file)
def get_assoc_data(name):
    """
    Build an AssertionFinder and an AssocSpaceWrapper over the association
    space named `name`, returning them as a pair.
    """
    finder = AssertionFinder()
    wrapper = AssocSpaceWrapper(get_data_filename('assoc/%s' % name), finder)
    return (finder, wrapper)
from conceptnet5.assoc_query import AssocSpaceWrapper, MissingAssocSpace
from conceptnet5.util import get_data_filename, get_support_data_filename

### Configuration ###

# Version string, used both in the API URL and in data filenames below.
VERSION = '5.3'
API_URL = '/data/5.3'

# Static files and templates default to directories under the current
# working directory, overridable through environment variables.
WORKING_DIR = os.getcwd()
STATIC_PATH = os.environ.get('CONCEPTNET_WEB_STATIC', os.path.join(WORKING_DIR, 'static'))
TEMPLATE_PATH = os.environ.get('CONCEPTNET_WEB_TEMPLATES', os.path.join(WORKING_DIR, 'templates'))

FINDER = AssertionFinder()
ASSOC_WRAPPER = AssocSpaceWrapper(
    get_data_filename('assoc/assoc-space-%s' % VERSION), FINDER
)

app = flask.Flask(
    'conceptnet5', template_folder=TEMPLATE_PATH, static_folder=STATIC_PATH
)
# Allow non-ASCII characters to appear unescaped in JSON responses.
app.config['JSON_AS_ASCII'] = False
# Rate limits applied globally to all endpoints.
limiter = Limiter(app, global_limits=["600 per minute", "6000 per hour"])
CORS(app)

# The root URL may be overridden by the first command-line argument.
if len(sys.argv) == 1:
    root_url = 'http://conceptnet5.media.mit.edu/data/%s' % VERSION
else:
    root_url = sys.argv[1]
from conceptnet5.assoc_query import AssocSpaceWrapper, MissingAssocSpace from conceptnet5.util import get_data_filename, get_support_data_filename ### Configuration ### VERSION = '5.3' API_URL = '/data/5.3' WORKING_DIR = os.getcwd() STATIC_PATH = os.environ.get('CONCEPTNET_WEB_STATIC', os.path.join(WORKING_DIR, 'static')) TEMPLATE_PATH = os.environ.get('CONCEPTNET_WEB_TEMPLATES', os.path.join(WORKING_DIR, 'templates')) FINDER = AssertionFinder() ASSOC_WRAPPER = AssocSpaceWrapper( get_data_filename('assoc/assoc-space-%s' % VERSION), FINDER) app = flask.Flask('conceptnet5', template_folder=TEMPLATE_PATH, static_folder=STATIC_PATH) app.config['JSON_AS_ASCII'] = False limiter = Limiter(app, global_limits=["600 per minute", "6000 per hour"]) CORS(app) if len(sys.argv) == 1: root_url = 'http://conceptnet5.media.mit.edu/data/%s' % VERSION else: root_url = sys.argv[1] def configure_api(db_path, assertion_dir, assoc_dir=None, nshards=8):
""" Deserialize composite scores serialized by save_composite_scores to the given path. """ with open(path, 'rb') as fp: canonical_scores = msgpack.unpack(fp, use_list=False, encoding='utf-8') scores = { source: (np.float32(s0), np.float32(s1)) for source, (s0, s1) in canonical_scores.items() } return scores if __name__ == '__main__': accumulator = TimeAccumulator() edges_filename = get_data_filename('collated/sorted/edges-shuf.csv') vectors_filename = get_data_filename('vectors/numberbatch-biased.h5') model_filename = get_data_filename('sme/sme.model') scores_by_dataset_filename = get_data_filename( 'sme/scores_by_dataset.msgpack') scores_by_source_filename = get_data_filename( 'sme/scores_by_source.msgpack') dataset_scores_filename = get_data_filename( 'sme/dataset_composite_scores.msgpack') source_scores_filename = get_data_filename( 'sme/source_composite_scores.msgpack') model = SemanticMatchingModel.load_model(model_filename) print('Scoring edges....') with stopwatch(accumulator): scores_by_dataset, scores_by_source = evaluate_sources( model, edges_filename, convert_logits_to_probas=True)
def __init__(self, db_filename=None, edge_dir=None, nshards=8):
    """
    Record the locations of the assertion database and the edge directory,
    falling back to the bundled data paths when none are given.
    """
    # No search index until one is needed.
    self.search_index = None
    self.nshards = nshards
    self._edge_dir = edge_dir or get_data_filename('assertions')
    self._db_filename = db_filename or get_data_filename('db/assertions.db')
app = flask.Flask(__name__) limiter = Limiter(app, global_limits=["600 per minute", "6000 per hour"]) CORS(app) if not app.debug: import logging file_handler = logging.FileHandler('logs/flask_errors.log') file_handler.setLevel(logging.INFO) app.logger.addHandler(file_handler) ### Configuration ### FINDER = AssertionFinder() ASSOC_WRAPPER = AssocSpaceWrapper( get_data_filename('assoc/assoc-space-5.3'), FINDER ) commonsense_assoc = None if len(sys.argv) == 1: root_url = 'http://conceptnet5.media.mit.edu/data/5.3' else: root_url = sys.argv[1] def configure_api(db_path, assertion_dir, assoc_dir=None, nshards=8): """ Override the usual AssertionFinder with a new one, possibly with different settings. Do the same for the assoc_dir if given. This is useful for testing.
def test_relations_recorded():
    """
    Assert that every relation appearing in the built stats file is one of
    the relations recorded in ALL_RELATIONS.
    """
    built_relations = collect_relations(get_data_filename('stats/relations.txt'))
    unrecorded = built_relations - set(ALL_RELATIONS)
    assert len(unrecorded) == 0
def __init__(self, db_filename=None, edge_dir=None, nshards=8):
    """
    Record where the assertion database and the edge directory live,
    defaulting to the bundled data paths.
    """
    self.search_index = None
    self._db_filename = (
        db_filename if db_filename else get_data_filename('db/assertions.db')
    )
    self._edge_dir = edge_dir if edge_dir else get_data_filename('assertions')
    self.nshards = nshards