def handle_file(input_filename, output_file):
    out = JSONStreamWriter(output_file)
    for line in codecs.open(input_filename, encoding='utf-8'):
        line = line.strip()
        if line:
            for new_obj in handle_raw_assertion(line):
                out.write(new_obj)
def msgpack_to_json(input_filename, output_filename):
    """
    Convert a msgpack stream to a JSON stream (with one object per line).
    """
    out_stream = JSONStreamWriter(output_filename)
    for obj in read_msgpack_stream(input_filename):
        out_stream.write(obj)
    out_stream.close()
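# A minimal sketch (an assumption, not the ConceptNet implementation) of the
# two streaming helpers used above, to show the formats involved. It assumes
# the 'msgpack' package; 'read_msgpack_stream_sketch' and
# 'JSONStreamWriterSketch' are hypothetical names.
import codecs
import json
import msgpack

def read_msgpack_stream_sketch(filename):
    # Yield one decoded object at a time from a concatenated msgpack stream.
    with open(filename, 'rb') as stream:
        for obj in msgpack.Unpacker(stream, raw=False):
            yield obj

class JSONStreamWriterSketch(object):
    # Write one JSON object per line ("JSON lines" format).
    def __init__(self, filename):
        self.stream = codecs.open(filename, 'w', encoding='utf-8')

    def write(self, obj):
        self.stream.write(json.dumps(obj, ensure_ascii=False) + '\n')

    def close(self):
        self.stream.close()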
def test_json_to_msgpack():
    with TemporaryDirectory(prefix='conceptnet-test') as tmpdir:
        json_path = os.path.join(tmpdir, 'test.jsons')
        msgpack_path = os.path.join(tmpdir, 'test.msgpack')
        writer = JSONStreamWriter(json_path)
        for item in DATA:
            writer.write(item)
        writer.close()

        json_to_msgpack(json_path, msgpack_path)
        reader = read_msgpack_stream(msgpack_path)
        # zip_longest pads the shorter sequence with None, so a length
        # mismatch between DATA and the stream also fails the eq_ check.
        for known, read in zip_longest(DATA, reader):
            eq_(known, read)
def build_from_dir(dirname, output_file):
    """
    Read a GlobalMind database exported in YAML files, translate it into
    ConceptNet 5 edges, and write those edges to disk using a
    JSONStreamWriter.
    """
    out = JSONStreamWriter(output_file)

    userdata = yaml.load_all(open(dirname + '/GMUser.yaml'))
    users = {}
    for userinfo in userdata:
        users[userinfo['pk']] = userinfo

    frame_data = yaml.load_all(open(dirname + '/GMFrame.yaml'))
    frames = {}
    for frame in frame_data:
        frames[frame['pk']] = frame['fields']

    assertiondata = yaml.load_all(open(dirname + '/GMAssertion.yaml'))
    assertions = {}
    for assertion in assertiondata:
        obj = assertion['fields']
        frame = frames[obj['frame']]
        frametext = frame['text']
        userinfo = users[obj['author']]
        username = userinfo['fields']['username']

        # GlobalMind provides information about what country the user is
        # from, which we can preserve in the contributor URI.
        #
        # If I got to re-choose these URIs, I would distinguish usernames
        # with a country code from those without a country code by something
        # more than the number of slashes, and I would write the country code
        # in capital letters.
        userlocale = userinfo['fields']['ccode'].lower()
        if userlocale:
            user_source = "/s/contributor/globalmind/%s/%s" % (userlocale, username)
        else:
            user_source = "/s/contributor/globalmind/%s" % username

        sources = [
            user_source,
            "/s/activity/globalmind/assert"
        ]

        lang = LANG_CODES[obj['lcode']]
        start = normalized_concept_uri(lang, obj['node1'])
        end = normalized_concept_uri(lang, obj['node2'])
        rel = '/r/' + RELATION_MAP.get(frame['relation'], frame['relation'])

        # Fix the messy English frame "around in".
        if ' around ' in frametext:
            if obj['node2'].startswith('in '):
                frametext = frametext.replace(' around ', ' in ')
                obj['node2'] = obj['node2'][3:]
            else:
                frametext = frametext.replace(' around ', ' near ')
                rel = '/r/LocatedNear'

        # Fix more awkward English. I wonder how bad the other languages are.
        frametext = frametext.replace('hits your head', 'comes to mind')
        frametext = frametext.replace(': [node1], [node2]',
                                      ' [node1] and [node2]')

        node1 = u'[[' + obj['node1'] + u']]'
        node2 = u'[[' + obj['node2'] + u']]'
        surfaceText = frametext.replace('//', '').replace('[node1]', node1)\
                               .replace('[node2]', node2)

        edge = make_edge(rel, start, end,
                         dataset='/d/globalmind',
                         license=Licenses.cc_attribution,
                         sources=sources,
                         surfaceText=surfaceText,
                         weight=1)
        out.write(edge)
        assertions[assertion['pk']] = edge

    translationdata = yaml.load_all(open(dirname + '/GMTranslation.yaml'))
    for translation in translationdata:
        obj = translation['fields']
        assertion1 = assertions[obj['assertion1']]
        assertion2 = assertions[obj['assertion2']]
        start = assertion1['uri']
        end = assertion2['uri']
        rel = '/r/TranslationOf'
        text1 = assertion1['surfaceText'].replace('[[', '').replace(']]', '')
        text2 = assertion2['surfaceText'].replace('[[', '').replace(']]', '')
        lang1 = LANG_NAMES[get_lang(assertion1)]
        lang2 = LANG_NAMES[get_lang(assertion2)]
        surfaceText = u"[[%s]] in %s means [[%s]] in %s." % (text1, lang1,
                                                             text2, lang2)

        userinfo = users[obj['author']]
        username = userinfo['fields']['username']
        userlocale = userinfo['fields']['ccode'].lower()
        if userlocale:
            user_source = "/s/contributor/globalmind/%s/%s" % (userlocale, username)
        else:
            user_source = "/s/contributor/globalmind/%s" % username

        sources = [
            user_source,
            "/s/activity/globalmind/translate"
        ]
        edge = make_edge(rel, start, end,
                         dataset='/d/globalmind',
                         license=Licenses.cc_attribution,
                         sources=sources,
                         surfaceText=surfaceText,
                         weight=1)
        out.write(edge)
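# A quick illustration of the contributor-URI branch above. '_contributor_uri'
# is a hypothetical helper restating the inline logic, and 'KR'/'alice'/'bob'
# are made-up values, not taken from the GlobalMind export.
def _contributor_uri(ccode, username):
    userlocale = ccode.lower()
    if userlocale:
        return "/s/contributor/globalmind/%s/%s" % (userlocale, username)
    else:
        return "/s/contributor/globalmind/%s" % username

assert _contributor_uri('KR', 'alice') == '/s/contributor/globalmind/kr/alice'
assert _contributor_uri('', 'bob') == '/s/contributor/globalmind/bob'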
def convert_to_json(input_filename, output_filename):
    out_stream = JSONStreamWriter(output_filename)
    for obj in read_msgpack_stream(input_filename):
        out_stream.write(obj)
    out_stream.close()
class FindTranslations(ContentHandler):
    def __init__(self, output_file='wiktionary.json'):
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None
        self.curTitle = ''
        self.curText = ''
        self.locales = []
        self.curRelation = None
        self.writer = JSONStreamWriter(output_file)

    def startElement(self, name, attrs):
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False

    def characters(self, text):
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out
                self.inArticle = False

    def handleArticle(self, title, text):
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)
        if line.startswith('===') and line.endswith('==='):
            pos = line.strip('= ')
            if pos == 'Synonyms':
                self.curRelation = 'Synonym'
            elif pos == 'Antonym':
                self.curRelation = 'Antonym'
            elif pos == 'Related terms':
                self.curRelation = 'RelatedTo'
            elif pos == 'Derived terms':
                if not line.startswith('===='):
                    # this is at the same level as the part of speech;
                    # now we don't know what POS these apply to
                    self.pos = None
                self.curRelation = 'DerivedFrom'
            else:
                self.curRelation = None
            if pos in PARTS_OF_SPEECH:
                self.pos = PARTS_OF_SPEECH[pos]
        elif language_match:
            self.lang = language_match.group(1)
            self.langcode = LANGUAGES.get(self.lang)
        elif chinese_match:
            scripttag = chinese_match.group(2)
            self.locales = []
            if 's' in scripttag:
                self.locales.append('_CN')
            if 't' in scripttag:
                self.locales.append('_TW')
        elif line[0:1] == '#' and self.lang != 'English' and self.lang is not None:
            defn = line[1:].strip()
            if defn[0:1] not in ':*#':
                for defn2 in filter_line(defn):
                    if not ascii_enough(defn2):
                        continue
                    if 'Index:' in title:
                        continue
                    if self.langcode == 'zh':
                        for locale in self.locales:
                            self.output_translation(title, defn2, locale)
                    elif self.langcode:
                        self.output_translation(title, defn2)
        elif line[0:4] == '----':
            self.pos = None
            self.lang = None
            self.langcode = None
            self.curRelation = None
        elif trans_top_match:
            pos = self.pos or 'n'
            sense = trans_top_match.group(1).split(';')[0].strip('.')
            if 'translations' in sense.lower():
                self.curSense = None
            else:
                self.curSense = (pos, sense)
        elif trans_tag_match:
            lang = trans_tag_match.group(1)
            translation = trans_tag_match.group(2)
            if self.curSense is not None and self.lang == 'English':
                # handle Chinese separately
                if lang not in ('cmn', 'yue', 'zh-yue', 'zh'):
                    self.output_sense_translation(lang, translation, title,
                                                  self.curSense)
        elif '{{trans-bottom}}' in line:
            self.curSense = None
        elif line.startswith('* ') and self.curRelation and self.langcode:
            relatedmatch = WIKILINK.search(line)
            if relatedmatch:
                related = relatedmatch.group(1)
                self.output_monolingual(self.langcode, self.curRelation,
                                        related, title)

    def output_monolingual(self, lang, relation, term1, term2):
        if term_is_bad(term1) or term_is_bad(term2):
            return
        source = normalized_concept_uri(lang, term1)
        if self.pos:
            target = normalized_concept_uri(lang, term2, self.pos)
        else:
            target = normalized_concept_uri(lang, term2)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        edge = make_edge('/r/' + relation, source, target,
                         '/d/wiktionary/%s/%s' % (lang, lang),
                         license=Licenses.cc_sharealike,
                         sources=[SOURCE, MONOLINGUAL],
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, english, sense):
        pos, disambiguation = sense
        if ('Wik' in foreign or 'Wik' in english
                or term_is_bad(foreign) or term_is_bad(english)):
            return
        # Quick fix that drops definitions written in Lojban syntax
        if lang == 'jbo' and re.search(r'x[1-5]', english):
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = normalized_concept_uri(
            lang, unicodedata.normalize('NFKC', foreign)
        )
        target = normalized_concept_uri('en', english, pos, disambiguation)
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % CODE_TO_ENGLISH_NAME[lang.split('_')[0]]
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        surfaceText = "[[%s]] %s [[%s (%s)]]" % (
            foreign, surfaceRel, english,
            disambiguation.split('/')[-1].replace('_', ' ')
        )
        edge = make_edge(relation, source, target,
                         '/d/wiktionary/en/%s' % lang,
                         license=Licenses.cc_sharealike,
                         sources=[SOURCE, TRANSLATE],
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_translation(self, foreign, english, locale=''):
        if term_is_bad(foreign) or term_is_bad(english):
            return
        # Quick fix that drops definitions written in Lojban syntax
        if self.langcode == 'jbo' and re.search(r'x[1-5]', english):
            return
        source = normalized_concept_uri(self.langcode + locale, foreign)
        target = normalized_concept_uri('en', english)
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (
                CODE_TO_ENGLISH_NAME[self.langcode.split('_')[0]])
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
        edge = make_edge(relation, source, target,
                         '/d/wiktionary/en/%s' % self.langcode,
                         license=Licenses.cc_sharealike,
                         sources=[SOURCE, INTERLINGUAL],
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)
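# A minimal usage sketch: FindTranslations is a SAX ContentHandler, so it can
# be driven over a Wiktionary XML dump with the standard library's streaming
# parser. The dump filename here is an assumption.
import xml.sax

handler = FindTranslations(output_file='wiktionary.json')
xml.sax.parse('enwiktionary-pages-articles.xml', handler)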
def handle_file(infile, outfile):
    count = 0
    outcomes = defaultdict(int)
    sources = ['/s/site/verbosity']
    writer = JSONStreamWriter(outfile)
    for line in open(infile):
        parts = line.strip().split('\t')
        if not parts:
            outcomes['blank'] += 1
            continue

        # The first 5 columns of the Verbosity output file are:
        #
        #   left: the word being clued
        #   relation: the relation between the word and the clue that the
        #       clue-giver chose, in a form such as "it is part of"
        #   right: the one or two words used as the clue
        #   freq: the number of different times this clue was given
        #   orderscore: the average position in the list of clues
        #
        # 'orderscore' is a number from 0 to 999, representing the average
        # quantile of its position in the list of clues. (It's like a
        # percentile, except there are 1000 of them, not 100.)
        #
        # A clue that's always given first has an orderscore of 0. A clue
        # that always appears halfway through the list has an orderscore of
        # 500.
        #
        # This may seem like a strange thing to measure, and I didn't come up
        # with it, but it actually turns out to be somewhat informative.
        # A clue with an orderscore of 0 is probably a good common-sense
        # relation, representing the first thing that comes to mind. A clue
        # with a high orderscore may be a move of desperation after several
        # other clues have failed. It causes the guesser to get the answer
        # soon afterward, but perhaps because it's a "cheating" move. So,
        # low orderscores represent better common-sense relations.
        left, relation, right, freq, orderscore = parts[:5]
        freq = int(freq)
        orderscore = int(orderscore)

        # Test each word of the clue against the list of flagged clue words.
        flagged = False
        for rword in right.split():
            if BAD_CLUE_REGEX.match(rword):
                flagged = True
                break
        if flagged:
            outcomes['flag word'] += 1
            continue
        if len(right) < 3:
            outcomes['clue too short'] += 1
            continue
        if len(right.split()[-1]) == 1:
            outcomes['letter'] += 1
            continue

        # The Verbosity interface and gameplay did not particularly encourage
        # players to choose an appropriate relation. In practice, players seem
        # to have used them all interchangeably, except for the negative
        # relation "it is the opposite of", expressing /r/Antonym.
        #
        # Another way that players expressed negative relations was to use
        # 'not' as the first word of their clue; we make that into an instance
        # of /r/Antonym as well.
        #
        # In all other cases, we replace the relation with the most general
        # positive relation, /r/RelatedTo.
        rel = '/r/RelatedTo'
        reltext = 'is related to'
        if right.startswith('not '):
            rel = '/r/Antonym'
            right = right[4:]
            reltext = 'is not'
        if relation == 'it is the opposite of':
            rel = '/r/Antonym'
            reltext = 'is the opposite of'

        # The "sounds-like score" determines whether this clue seems to be a
        # pun or rhyme, rather than an actual common-sense relationship. If
        # the sounds-like score is over 0.35, skip the assertion.
        sls = sounds_like_score(left, right)
        if sls > 0.35:
            outcomes['text similarity'] += 1
            continue

        # Calculate a score for the assertion:
        #
        #   - The number of times it's been used as a clue
        #   - ...with a linear penalty for a high sounds-like score
        #   - ...and a linear penalty for high orderscores
        #
        # The penalties are multiplicative factors from 0 to 1, which decrease
        # linearly as the relevant penalties increase. If a clue is given N
        # times, with a sounds-like score of 0 and an orderscore of 0, it will
        # get an overall score of 2N - 1. This is a formula we should probably
        # revisit.
        #
        # The weight is the score divided by 100. All divisions are floating
        # point, as defined by the __future__ import at the top of this
        # module.
        score = (freq * 2 - 1) * (1 - sls) * (1 - orderscore / 1000)
        if score <= 0.5:
            outcomes['low score'] += 1
            continue
        weight = score / 100

        # If the clue on the right is a two-word phrase, we make additional
        # connections to both words individually. We label them with the
        # rule-based source '/s/rule/split_words' to track that this happened.
        rightwords = [right]
        if ' ' in right:
            morewords = [word for word in right.split(' ')
                         if word not in STOPWORDS]
            rightwords.extend(morewords)

        for i, rightword in enumerate(rightwords):
            edge_sources = list(sources)
            if i > 0:
                edge_sources.append('/s/rule/split_words')
            # Build the natural-language-ish surface text for this clue
            text = '[[%s]] %s [[%s]]' % (left, reltext, rightword)
            count += 1
            outcomes['success'] += 1
            leftc = normalized_concept_uri('en', left)
            rightc = normalized_concept_uri('en', rightword)
            # Use edge_sources here, not sources, so that split-word edges
            # actually carry the '/s/rule/split_words' label promised above.
            edge = make_edge(rel, leftc, rightc,
                             dataset='/d/verbosity',
                             license=Licenses.cc_attribution,
                             sources=edge_sources,
                             surfaceText=text,
                             weight=weight)
            writer.write(edge)

    # Count the various outcomes. This can be used as a sanity check. It
    # also was used for a graph in a ConceptNet 5 paper.
    print("Verbosity outcomes: %s" % outcomes)
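# A worked example of the scoring formula above, with made-up numbers: a clue
# given 10 times, with a sounds-like score of 0.2 and an orderscore of 300,
# scores (10 * 2 - 1) * (1 - 0.2) * (1 - 300 / 1000) = 19 * 0.8 * 0.7 = 10.64,
# which passes the 0.5 cutoff and gets a weight of 10.64 / 100 = 0.1064.
# 'verbosity_score' is a hypothetical standalone restatement of the formula,
# not part of the module; dividing by 1000.0 forces float division without
# relying on the __future__ import.
def verbosity_score(freq, sls, orderscore):
    return (freq * 2 - 1) * (1 - sls) * (1 - orderscore / 1000.0)

assert abs(verbosity_score(10, 0.2, 300) - 10.64) < 1e-9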
def run_wordnet(input_dir, output_file, sw_map_file):
    out = JSONStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    synset_senses = defaultdict(list)
    sense_synsets = {}
    labels = {}
    glossary = {}
    concept_map = {}
    sense_to_synset = {}

    # Parse lines such as:
    #   wn30:synset-Aeolian-noun-2 rdfs:label "Aeolian"@en-us .
    for subj, rel, obj, objtag in reader.parse_file(
            os.path.join(input_dir, 'wordnet-synset.ttl')):
        if resource_name(rel) == 'label':
            # Everything in WordNet is in English
            assert objtag == 'en'
            labels[subj] = obj

    for subj, rel, obj, objtag in reader.parse_file(
            os.path.join(input_dir, 'wordnet-glossary.ttl')):
        if resource_name(rel) == 'gloss':
            assert objtag == 'en'
            # Take the definition up to the first semicolon
            text = obj.split(';')[0]
            # Remove introductory phrases with a colon
            text = text.split(': ', 1)[-1]
            # Remove parenthesized expressions
            while True:
                newtext = re.sub(r'\(.+?\) ?', '', text).strip()
                if newtext == text or newtext == '':
                    break
                else:
                    text = newtext
            glossary[subj] = text.replace('/', '_')

    # Get the list of word senses in each synset, and make a bidirectional
    # mapping.
    #
    # Example line:
    #   wn30:synset-Aeolian-noun-2 wn20schema:containsWordSense
    #       wn30:wordsense-Aeolian-noun-2 .
    for subj, rel, obj, objtag in reader.parse_file(
            os.path.join(input_dir,
                         'full/wordnet-wordsense-synset-relations.ttl')):
        if resource_name(rel) == 'containsWordSense':
            synset_senses[subj].append(obj)
            sense_synsets[obj] = subj

    # Assign every synset to a disambiguated concept.
    for synset in synset_senses:
        synset_name = labels[synset]
        synset_pos = synset.split('-')[-2]
        pos = PARTS_OF_SPEECH[synset_pos]
        disambig = glossary[synset]
        concept = normalized_concept_uri('en', synset_name, pos, disambig)
        concept_map[synset] = concept

    # Map senses to their synsets.
    for sense, synset in sense_synsets.items():
        sense_to_synset[sense] = synset

    for filename in (
        'wordnet-attribute.ttl', 'wordnet-causes.ttl',
        'wordnet-classifiedby.ttl', 'wordnet-entailment.ttl',
        'wordnet-hyponym.ttl', 'wordnet-instances.ttl',
        'wordnet-membermeronym.ttl', 'wordnet-partmeronym.ttl',
        'wordnet-sameverbgroupas.ttl', 'wordnet-similarity.ttl',
        'wordnet-substancemeronym.ttl', 'full/wordnet-antonym.ttl',
        'full/wordnet-derivationallyrelated.ttl',
        'full/wordnet-participleof.ttl',
        'full/wordnet-pertainsto.ttl', 'full/wordnet-seealso.ttl'
    ):
        filepath = os.path.join(input_dir, filename)
        if os.path.exists(filepath):
            for web_subj, web_rel, web_obj, objtag in reader.parse_file(filepath):
                # If this relation involves word senses, map them to their
                # synsets first.
                if web_subj in sense_to_synset:
                    web_subj = sense_to_synset[web_subj]
                if web_obj in sense_to_synset:
                    web_obj = sense_to_synset[web_obj]
                subj = concept_map[web_subj]
                obj = concept_map[web_obj]

                pred_label = resource_name(web_rel)
                if pred_label in REL_MAPPING:
                    mapped_rel = REL_MAPPING[pred_label]

                    # Handle WordNet relations that are the reverse of
                    # ConceptNet relations. Change the word 'meronym' to
                    # 'holonym' if necessary.
                    if mapped_rel.startswith('~'):
                        subj, obj = obj, subj
                        web_subj, web_obj = web_obj, web_subj
                        web_rel = web_rel.replace('meronym', 'holonym')
                        mapped_rel = mapped_rel[1:]
                    rel = join_uri('r', mapped_rel)
                else:
                    rel = join_uri('r', 'wordnet', pred_label)

                map_out.write_link(web_rel, full_conceptnet_url(rel))
                map_out.write_link(web_subj, full_conceptnet_url(subj))
                map_out.write_link(web_obj, full_conceptnet_url(obj))
                edge = make_edge(
                    rel, subj, obj, dataset='/d/wordnet/3.0',
                    license='/l/CC/By', sources=SOURCE, weight=2.0
                )
                out.write(edge)
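# A small, mechanical illustration of the '~' reversal convention handled
# above, assuming REL_MAPPING marks reversed relations with a leading '~'.
# The '~PartOf' value and the concept URIs are hypothetical, chosen only to
# show the swap-and-strip behavior:
mapped_rel = '~PartOf'
subj, obj = '/c/en/wheel', '/c/en/car'
if mapped_rel.startswith('~'):
    subj, obj = obj, subj
    mapped_rel = mapped_rel[1:]
assert (mapped_rel, subj, obj) == ('PartOf', '/c/en/car', '/c/en/wheel')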
def transform_file(self, input_filename, output_file):
    out = JSONStreamWriter(output_file)
    for obj in read_json_stream(input_filename):
        for new_obj in self.handle_assertion(obj):
            out.write(new_obj)
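# A minimal sketch (an assumption, not the ConceptNet implementation) of the
# read_json_stream counterpart to JSONStreamWriter: it yields one JSON object
# per non-blank line. 'read_json_stream_sketch' is a hypothetical name.
import codecs
import json

def read_json_stream_sketch(filename):
    with codecs.open(filename, encoding='utf-8') as stream:
        for line in stream:
            line = line.strip()
            if line:
                yield json.loads(line)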