def prepare_db(inputs, dbfile): """ Build a SQLite database that extracts some information from our parsed versions of Wiktionary. This is information that is needed by later reader steps, such as which words are known in which languages, and which words are forms of other words. """ # If the database already exists, delete it first try: os.unlink(dbfile) except FileNotFoundError: pass db = sqlite3.connect(dbfile) make_tables(db) try: for filename in inputs: filepath = pathlib.Path(filename) file_language = filepath.name.split('.')[0] for item in read_json_stream(filename): if 'rel' in item: tfrom = item['from'] tto = item['to'] # For all non-definition relations, record the fact that # the given entry name exists in the given language. We'll # use these to disambiguate definitions later. if item['rel'] != 'definition': if 'language' in tfrom and valid_language(tfrom['language']): add_title( db, file_language, tfrom['language'], tfrom['text'] ) if 'language' in tto and valid_language(tto['language']): add_title(db, file_language, tto['language'], tto['text']) # Record word forms so we can build a lemmatizer from them. if item['rel'].startswith('form/'): form_name = item['rel'][5:] # Look for the part of speech, first in the 'from' term, # then in the 'to' term. pos = tfrom.get('pos', tto.get('pos', '?')) # Use only Etymology 1 entries for learning word forms. if (tfrom.get('etym') or '1') == '1': language = tfrom.get('language', tto.get('language')) if ( valid_language(language) and tfrom['text'] != tto['text'] ): add_form( db, file_language, language, tfrom['text'], pos, tto['text'], form_name, ) db.commit() finally: db.close()
def convert_to_assoc(input_filename, output_filename):
    """
    Convert a JSON stream to a tab-separated "CSV" of concept-to-concept
    associations.

    The relation is mostly ignored, except:

    - Negative relations create associations between concepts suffixed with
      '/neg'
    - An assertion that means "People want X" in English or Chinese is
      converted to an assertion between X and "good", and also X and the
      negation of "bad"
    - Combining both of these, an assertion that "People don't want X" moves
      the negation so that X is associated with "not good" and "bad".

    The result can be used to predict word associations using ConceptNet by
    using dimensionality reduction, as in the `assoc_space` package.

    The relation is mostly ignored because we have not yet found a good way to
    take the relation into account in dimensionality reduction.
    """
    # Fix: the original opened the output file and never closed it; use a
    # context manager so the file is flushed and closed even on error.
    with codecs.open(output_filename, 'w', encoding='utf-8') as out_stream:
        for info in read_json_stream(input_filename):
            startc = reduce_concept(info['start'])
            endc = reduce_concept(info['end'])
            rel = info['rel']
            weight = info['weight']

            if 'dbpedia' in info['sources'] and '/or/' not in info['sources']:
                # DBPedia associations are still too numerous and too weird to
                # associate.
                continue

            pairs = []
            if startc == '/c/en/person':
                # "People want X" → X is good (and not bad); "People don't
                # want X" → X is bad (and not good).
                if rel == '/r/Desires':
                    pairs = [('/c/en/good', endc), ('/c/en/bad/neg', endc)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/en/bad', endc), ('/c/en/good/neg', endc)]
                else:
                    pairs = [(startc, endc)]
            elif startc == '/c/zh/人':
                # Same rewriting for the Chinese concept for "person".
                if rel == '/r/Desires':
                    pairs = [('/c/zh/良好', endc), ('/c/zh/不良/neg', endc)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/zh/良好/neg', endc), ('/c/zh/不良', endc)]
                else:
                    pairs = [(startc, endc)]
            else:
                negated = (rel.startswith('/r/Not') or rel.startswith('/r/Antonym'))
                if not negated:
                    pairs = [(startc, endc)]
                else:
                    # A negated relation associates each concept with the
                    # negation of the other.
                    pairs = [(startc, endc + '/neg'), (startc + '/neg', endc)]

            for (start, end) in pairs:
                line = "%(start)s\t%(end)s\t%(weight)s" % {
                    'start': start,
                    'end': end,
                    'weight': weight,
                }
                print(line, file=out_stream)
def json_to_msgpack(input_filename, output_filename):
    """
    Convert a JSON stream (with one object per line) to a msgpack stream.
    """
    out_stream = MsgpackStreamWriter(output_filename)
    # Fix: close the writer even if reading the input raises, so buffered
    # msgpack data is not lost.
    try:
        for obj in read_json_stream(input_filename):
            out_stream.write(obj)
    finally:
        out_stream.close()
def json_to_csv(input_filename, output_filename):
    """
    Convert a JSON stream to tab-separated lines containing a fixed set of
    assertion fields, one assertion per line.
    """
    # Fix: the original never closed the output file; a context manager
    # guarantees it is flushed and closed.
    with codecs.open(output_filename, 'w', encoding='utf-8') as out_stream:
        for info in read_json_stream(input_filename):
            # Normalize nullable text fields so '\t'.join gets only strings.
            if info.get('surfaceText') is None:
                info['surfaceText'] = ''
            # NOTE(review): 'context' is normalized here but is not among the
            # output columns below — presumably kept for consistency with
            # json_to_unique_csv; confirm whether it should be emitted.
            if info.get('context') is None:
                info['context'] = ''
            info['weight'] = str(info['weight'])
            columns = ['uri', 'rel', 'start', 'end', 'weight', 'source_uri',
                       'id', 'dataset', 'surfaceText']
            column_values = [info.get(col) for col in columns]
            line = '\t'.join(column_values)
            # A newline inside a field would corrupt the one-line-per-record
            # format.
            assert '\n' not in line
            print(line, file=out_stream)
def convert_to_tab_separated(input_filename, output_filename):
    """
    Convert a JSON stream to tab-separated lines of assertion fields, one
    assertion per line.
    """
    # Fix: the original never closed the output file; also, indexing
    # info['surfaceText'] raised KeyError when the key was absent, and a None
    # 'context' value would crash '\t'.join. Use .get-based normalization as
    # the sibling converters do.
    with codecs.open(output_filename, 'w', encoding='utf-8') as out_stream:
        for info in read_json_stream(input_filename):
            if info.get('surfaceText') is None:
                info['surfaceText'] = ''
            if info.get('context') is None:
                info['context'] = ''
            info['weight'] = str(info['weight'])
            columns = [
                'uri', 'rel', 'start', 'end', 'context', 'weight',
                'source_uri', 'id', 'dataset', 'surfaceText'
            ]
            column_values = [info.get(col) for col in columns]
            line = '\t'.join(column_values)
            print(line, file=out_stream)
def test_msgpack_to_json():
    """
    Round-trip test: write DATA as a msgpack stream, convert it to a JSON
    stream with msgpack_to_json, and check that reading the JSON stream
    yields DATA unchanged.
    """
    with TemporaryDirectory(prefix='conceptnet-test') as tmpdir:
        # Fix: the original had these two paths swapped — msgpack data was
        # written to 'test.jsons' and the JSON output to 'test.msgpack'.
        # The test still passed, but the names contradicted the contents.
        msgpack_path = os.path.join(tmpdir, 'test.msgpack')
        json_path = os.path.join(tmpdir, 'test.jsons')

        writer = MsgpackStreamWriter(msgpack_path)
        for item in DATA:
            writer.write(item)
        writer.close()

        msgpack_to_json(msgpack_path, json_path)
        reader = read_json_stream(json_path)
        # zip_longest makes the test fail if either stream is shorter.
        for known, read in zip_longest(DATA, reader):
            eq_(known, read)
def json_to_unique_csv(input_filename, output_filename):
    """
    Convert a JSON stream to tab-separated lines, writing each assertion at
    most once per unique (uri, source_uri, dataset, weight) combination.
    """
    cache = set()
    # Extracts the fields that determine uniqueness.
    cached = itemgetter('uri', 'source_uri', 'dataset', 'weight')
    # Fix: the original never closed the output file; use a context manager.
    with codecs.open(output_filename, 'w', encoding='utf-8') as out_stream:
        for info in read_json_stream(input_filename):
            # Normalize nullable text fields so '\t'.join gets only strings.
            if info.get('surfaceText') is None:
                info['surfaceText'] = ''
            if info.get('context') is None:
                info['context'] = ''
            # weight must be a string before it is joined into the dedup key.
            info['weight'] = str(info['weight'])
            cached_item = ' '.join(cached(info))
            # Guard clause replaces the original `if …: pass / else:` shape.
            if cached_item in cache:
                continue
            cache.add(cached_item)
            columns = ['uri', 'rel', 'start', 'end', 'context', 'weight',
                       'source_uri', 'id', 'dataset', 'surfaceText']
            column_values = [info.get(col) for col in columns]
            line = '\t'.join(column_values)
            # A newline inside a field would corrupt the one-line-per-record
            # format.
            assert '\n' not in line
            print(line, file=out_stream)
def segmented_stream(input_file):
    """
    Read a JSON stream delimited by 'heading' entries, marking where the
    parser started parsing a new page. We distinguish these entries by the
    fact that they contain a 'title' key.

    Yield tuples of (heading, [items]), where [items] are the stream items
    that appear under the given heading.
    """
    heading = None
    items = []
    for item in read_json_stream(input_file):
        if 'title' in item:
            if heading is not None:
                yield heading, items
            heading = item
            # Fix: rebind to a NEW list instead of items.clear(). clear()
            # mutated the very list object just yielded, so a consumer that
            # kept a reference to it would see it emptied on the next
            # iteration of this generator.
            items = []
        else:
            items.append(item)
    # Flush the final segment, which has no following heading to trigger it.
    if heading is not None:
        yield heading, items
def convert_to_solr(input_filename, output_filename):
    """
    Convert a JSON stream to a different JSON file that can be loaded into
    Solr.

    A JSON stream differs from standard JSON in that it contains several
    objects separated by line breaks.

    A Solr input file differs from standard JSON in a different way: it is
    represented as a single object with many fields. The values of these
    fields are the various different objects, but the key of each field must
    be "add". Having many values with the same key is incompatible with
    Python dictionaries, but is technically allowed by the JSON grammar.

    To create the output JSON file in Python, we have to write its components
    incrementally.
    """
    # Fix: the original never closed the output file, so the closing brace
    # could remain unflushed; a context manager guarantees the write.
    with codecs.open(output_filename, "w", encoding="utf-8") as out:
        print("{", file=out)
        for info in read_json_stream(input_filename):
            boost = info["weight"]
            # Handle searchable lemmas
            info["relLemmas"] = ""
            info["startLemmas"] = " ".join(uri_to_lemmas(info["start"]))
            info["endLemmas"] = " ".join(uri_to_lemmas(info["end"]))
            # Solr boosts must be positive; skip zero/negative-weight entries.
            if boost > 0:
                if "surfaceText" in info and info["surfaceText"] is None:
                    del info["surfaceText"]

                solr_struct = {"doc": info, "boost": boost}
                solr_fragment = '\t"add": %s,' % json.dumps(solr_struct)
                print(solr_fragment, file=out)
        # The "commit" entry also absorbs the trailing comma left by the last
        # "add" fragment.
        print('\t"commit": {}', file=out)
        print("}", file=out)
def convert_to_solr(input_filename, output_filename):
    """
    Convert a JSON stream to a different JSON file that can be loaded into
    Solr.

    A JSON stream differs from standard JSON in that it contains several
    objects separated by line breaks.

    A Solr input file differs from standard JSON in a different way: it is
    represented as a single object with many fields. The values of these
    fields are the various different objects, but the key of each field must
    be "add". Having many values with the same key is incompatible with
    Python dictionaries, but is technically allowed by the JSON grammar.

    To create the output JSON file in Python, we have to write its components
    incrementally.
    """
    # Fix: the original never closed the output file, so the closing brace
    # could remain unflushed; a context manager guarantees the write.
    with codecs.open(output_filename, 'w', encoding='utf-8') as out:
        print("{", file=out)
        for info in read_json_stream(input_filename):
            boost = info['weight']
            # Handle searchable lemmas
            info['relLemmas'] = ''
            info['startLemmas'] = ' '.join(uri_to_lemmas(info['start']))
            info['endLemmas'] = ' '.join(uri_to_lemmas(info['end']))
            # Solr boosts must be positive; skip zero/negative-weight entries.
            if boost > 0:
                if 'surfaceText' in info and info['surfaceText'] is None:
                    del info['surfaceText']

                solr_struct = {'doc': info, 'boost': boost}
                solr_fragment = '\t"add": %s,' % json.dumps(solr_struct)
                print(solr_fragment, file=out)
        # The "commit" entry also absorbs the trailing comma left by the last
        # "add" fragment.
        print('\t"commit": {}', file=out)
        print('}', file=out)
def json_to_msgpack(input_filename, output_filename):
    """
    Convert a JSON stream (with one object per line) to a msgpack stream.
    """
    out_stream = MsgpackStreamWriter(output_filename)
    # Fix: close the writer even if reading the input raises, so buffered
    # msgpack data is not lost.
    try:
        for obj in read_json_stream(input_filename):
            out_stream.write(obj)
    finally:
        out_stream.close()
def transform_file(self, input_filename, output_file):
    """
    Apply handle_assertion to each object in the JSON stream at
    `input_filename`, writing the resulting objects to a msgpack stream
    at `output_file`.
    """
    out = MsgpackStreamWriter(output_file)
    # Fix: the original never closed the writer, so buffered msgpack data
    # could be lost; close it even if a transformation raises.
    try:
        for obj in read_json_stream(input_filename):
            # handle_assertion may yield zero or more objects per input.
            for new_obj in self.handle_assertion(obj):
                out.write(new_obj)
    finally:
        out.close()