def subwords_to_edges(language, input, output):
    """
    Morfessor hypothesizes ways to break words into sub-word chunks. Produce
    edges from these sub-words that can be used in retrofitting.
    """
    writer = MsgpackStreamWriter(output)
    for line in input:
        line = line.rstrip()
        if not line or line.startswith('#'):
            continue

        # Remove the unnecessary count ("1 ") from the start of each line
        line = line.split(' ', 1)[1]
        chunks = line.split(' + ')

        # Strip a possible trailing underscore, which would particularly show
        # up in the way we segment ATOMIC_SPACE_LANGUAGES (Vietnamese)
        full_text = ''.join(chunks).strip('_')
        end = join_uri('c', language, full_text)
        for chunk in chunks:
            if chunk != '_':
                start = join_uri('x', language, chunk.strip('_'))
                edge = make_edge(
                    '/r/SubwordOf',
                    start,
                    end,
                    dataset='/d/morphology',
                    license=Licenses.cc_attribution,
                    sources=MORPH_SOURCES,
                    weight=0.01,
                )
                writer.write(edge)
    writer.close()

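# The sketch below is not part of the reader; it is a hedged illustration of how
# a hypothetical Morfessor segmentation line is taken apart by the string
# operations in subwords_to_edges above. The sample line and its trailing
# underscore are made up for demonstration.
sample = "1 con + cept + net_"            # hypothetical Morfessor output line
line = sample.rstrip().split(' ', 1)[1]   # drop the leading count -> "con + cept + net_"
chunks = line.split(' + ')                # ['con', 'cept', 'net_']
full_text = ''.join(chunks).strip('_')    # 'conceptnet'
subword_texts = [c.strip('_') for c in chunks if c != '_']
assert full_text == 'conceptnet'
assert subword_texts == ['con', 'cept', 'net']
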
def build_sources(parts_dict, preposition_fix=False):
    """
    Create the 'source' information for an assertion.

    The output is a list of (conjunction, weight) tuples, where 'conjunction'
    is a list of sources that combined to produce this assertion. Later,
    inside the 'make_edge' function, these will be combined into an '/and'
    node.
    """
    activity = parts_dict["activity"]

    creator_node = join_uri(
        '/s/contributor/omcs',
        normalize_text(parts_dict["creator"], lowercase=False)
    )
    activity_node = join_uri('/s/activity/omcs', normalize_text(activity))
    if preposition_fix:
        conjunction = [creator_node, activity_node, '/s/rule/preposition_fix']
    else:
        conjunction = [creator_node, activity_node]
    weighted_sources = [(conjunction, 1)]

    for vote in parts_dict["votes"]:
        username = vote[0]
        vote_int = vote[1]
        conjunction = [
            join_uri('/s/contributor/omcs', username),
            '/s/activity/omcs/vote'
        ]
        weighted_sources.append((conjunction, vote_int))
    return weighted_sources

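# A hedged sketch of the (conjunction, weight) structure this version of
# build_sources returns, using a made-up contributor, activity, and voter; the
# exact URI text depends on how join_uri and normalize_text render the inputs.
example_weighted_sources = [
    # the creator's conjunction: contributor URI + activity URI (+ optional rule)
    (['/s/contributor/omcs/some_user', '/s/activity/omcs/omcs1_1'], 1),
    # one conjunction per vote, weighted by the vote value
    (['/s/contributor/omcs/another_user', '/s/activity/omcs/vote'], 1),
]
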
def build_sources(parts_dict, preposition_fix=False):
    """
    Create the 'source' information for an assertion.

    The output is a list of dictionaries, one per source that contributed to
    this assertion. Each dictionary has a 'contributor' URI, an 'activity'
    URI, and a 'weight', plus a 'process' URI when the preposition fix
    applies. These dictionaries become the 'sources' of the assertion when it
    is built with 'make_edge'.
    """
    creator_source = {}
    creator_node = join_uri("/s/contributor/omcs", standardize_username(parts_dict["creator"]))
    creator_source["contributor"] = creator_node

    activity = parts_dict["activity"]
    activity_node = join_uri("/s/activity/omcs", standardize_text(activity))
    creator_source["activity"] = activity_node

    if preposition_fix:
        creator_source["process"] = "/s/process/preposition_fix"
    creator_source["weight"] = 1.0
    sources = [creator_source]

    for vote in parts_dict["votes"]:
        username = vote[0]
        if username == parts_dict["creator"]:
            continue

        vote_int = vote[1]
        vote_source = {
            "contributor": join_uri("/s/contributor/omcs", standardize_username(username)),
            "activity": "/s/activity/omcs/vote",
            "weight": float(vote_int),
        }
        sources.append(vote_source)
    return sources

def build_sources(parts_dict, preposition_fix=False):
    """
    Create the 'source' information for an assertion.

    The output is a list of (conjunction, weight) tuples, where 'conjunction'
    is a list of sources that combined to produce this assertion. Later,
    inside the 'make_edge' function, these will be combined into an '/and'
    node.
    """
    activity = parts_dict["activity"]

    creator_node = join_uri('/s/contributor/omcs', standardize_text(parts_dict["creator"]))
    activity_node = join_uri('/s/activity/omcs', standardize_text(activity))
    if preposition_fix:
        conjunction = [creator_node, activity_node, '/s/rule/preposition_fix']
    else:
        conjunction = [creator_node, activity_node]
    weighted_sources = [(conjunction, 1)]

    for vote in parts_dict["votes"]:
        username = vote[0]
        if username == parts_dict["creator"]:
            continue

        vote_int = vote[1]
        conjunction = [
            join_uri('/s/contributor/omcs', standardize_text(username)),
            '/s/activity/omcs/vote'
        ]
        weighted_sources.append((conjunction, vote_int))
    return weighted_sources

def build_relation(parts_dict):
    """
    Update relation names to ConceptNet 5's names. Mostly we preserve the same
    names, but any instance of "ConceptuallyRelatedTo" becomes "RelatedTo".
    Statements with negative polarity get new negative relations.
    """
    relname = parts_dict["relname"]
    polarity = parts_dict["polarity"]

    if relname == 'ConceptuallyRelatedTo':
        relname = 'RelatedTo'

    if polarity > 0:
        relation = join_uri('/r', relname)
    else:
        relation = join_uri('/r', 'Not' + relname)

    return relation

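# A standalone, hedged illustration of the mapping build_relation performs. The
# real code builds URIs with join_uri, which is mimicked here with plain string
# concatenation, and the example relation names are hypothetical inputs.
for relname, polarity in [('ConceptuallyRelatedTo', 1), ('Desires', -1), ('IsA', 1)]:
    name = 'RelatedTo' if relname == 'ConceptuallyRelatedTo' else relname
    uri = '/r/' + (name if polarity > 0 else 'Not' + name)
    print(relname, polarity, '->', uri)
# Expected output:
#   ConceptuallyRelatedTo 1 -> /r/RelatedTo
#   Desires -1 -> /r/NotDesires
#   IsA 1 -> /r/IsA
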
def build_sources(parts_dict, preposition_fix=False):
    """
    Create the 'source' information for an assertion.

    The output is a list of dictionaries, one per source that contributed to
    this assertion. Each dictionary has a 'contributor' URI, an 'activity'
    URI, and a 'weight', plus a 'process' URI when the preposition fix
    applies. These dictionaries become the 'sources' of the assertion when it
    is built with 'make_edge'.
    """
    creator_source = {}
    creator_node = join_uri('/s/contributor/omcs', standardize_username(parts_dict["creator"]))
    creator_source['contributor'] = creator_node

    activity = parts_dict["activity"]
    activity = '_'.join(simple_tokenize(activity.replace('_', ' ')))
    activity_node = join_uri('/s/activity/omcs', activity)
    creator_source['activity'] = activity_node

    if preposition_fix:
        creator_source['process'] = '/s/process/preposition_fix'
    creator_source['weight'] = 1.
    sources = [creator_source]

    for vote in parts_dict["votes"]:
        username = vote[0]
        if username == parts_dict["creator"]:
            continue

        vote_int = vote[1]
        vote_source = {
            'contributor': join_uri('/s/contributor/omcs', standardize_username(username)),
            'activity': '/s/activity/omcs/vote',
            'weight': float(vote_int)
        }
        sources.append(vote_source)
    return sources

def build_sources(parts_dict, preposition_fix=False):
    """
    Create the 'source' information for an assertion.

    The output is a list of dictionaries, one per source that contributed to
    this assertion. Each dictionary has a 'contributor' URI, an 'activity'
    URI, and a 'weight', plus a 'process' URI when the preposition fix
    applies. These dictionaries become the 'sources' of the assertion when it
    is built with 'make_edge'.
    """
    creator_source = {}
    creator_node = join_uri(
        '/s/contributor/omcs', standardize_username(parts_dict["creator"])
    )
    creator_source['contributor'] = creator_node

    activity = parts_dict["activity"]
    activity = '_'.join(simple_tokenize(activity.replace('_', ' ')))
    activity_node = join_uri('/s/activity/omcs', activity)
    creator_source['activity'] = activity_node

    if preposition_fix:
        creator_source['process'] = '/s/process/preposition_fix'
    creator_source['weight'] = 1.
    sources = [creator_source]

    for vote in parts_dict["votes"]:
        username = vote[0]
        if username == parts_dict["creator"]:
            continue

        vote_int = vote[1]
        vote_source = {
            'contributor': join_uri(
                '/s/contributor/omcs', standardize_username(username)
            ),
            'activity': '/s/activity/omcs/vote',
            'weight': float(vote_int),
        }
        sources.append(vote_source)
    return sources

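# A hedged sketch of the list of source dictionaries produced by the versions of
# build_sources above that return dicts, for a hypothetical contribution by
# 'some_user' that received one vote from 'another_user'. The precise URI text
# depends on standardize_username and join_uri.
example_sources = [
    {
        'contributor': '/s/contributor/omcs/some_user',
        'activity': '/s/activity/omcs/omcs1_1',
        'weight': 1.0,
    },
    {
        'contributor': '/s/contributor/omcs/another_user',
        'activity': '/s/activity/omcs/vote',
        'weight': 1.0,
    },
]
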
def lemmatize_uri(self, uri):
    pieces = split_uri(uri)
    if len(pieces) < 2:
        return uri
    language = pieces[1]
    text = pieces[2]
    rest = pieces[3:]
    if rest:
        pos = rest[0]
    else:
        pos = None

    root, _form = self.lookup(language, text, pos)
    return join_uri('c', language, root, *rest)

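# A hedged sketch of the URI decomposition that lemmatize_uri relies on, under
# the assumption that split_uri simply strips the leading slash and splits on
# '/'; the lemmatizer lookup itself is not reproduced here.
uri = '/c/en/examples/n'
pieces = uri.lstrip('/').split('/')      # ['c', 'en', 'examples', 'n']
language, text, rest = pieces[1], pieces[2], pieces[3:]
pos = rest[0] if rest else None
assert (language, text, pos) == ('en', 'examples', 'n')
# After looking up the root form of 'examples' (presumably 'example'), the URI
# would be rebuilt with the same trailing pieces, giving '/c/en/example/n'.
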
def reduce_concept(concept):
    """
    Remove the part of speech and disambiguation (if present) from a concept,
    leaving a potentially ambiguous concept that can be matched against
    surface text.

    Additionally, remove the region tag from Chinese assertions, so they are
    considered simply as assertions about Chinese regardless of whether it is
    Traditional or Simplified Chinese. In the cases where they overlap, this
    helps to make the information more complete.

    >>> reduce_concept('/c/en/cat/n/feline')
    '/c/en/cat'
    >>> reduce_concept('/c/zh_TW/良好')
    '/c/zh/良好'
    """
    parts = split_uri(concept)
    # Unify simplified and traditional Chinese in associations.
    if parts[1] == 'zh_CN' or parts[1] == 'zh_TW':
        parts[1] = 'zh'
    return join_uri(*parts[:3])

def reduce_concept(concept):
    """
    Remove the part of speech and disambiguation (if present) from a concept,
    leaving a potentially ambiguous concept that can be matched against
    surface text.

    Additionally, simplify language tags to a bare language. The main purpose
    is to remove the region tag from Chinese assertions, so they are
    considered simply as assertions about Chinese regardless of whether it is
    Traditional or Simplified Chinese. In the cases where they overlap, this
    helps to make the information more complete.

    >>> reduce_concept('/c/en/cat/n/feline')
    '/c/en/cat'
    >>> reduce_concept('/c/zh_TW/良好')
    '/c/zh/良好'
    """
    parts = split_uri(concept)
    langtag = parts[1]
    if parts[1] != '[':
        langcode = langcodes.get(langtag).language
        if langcode:
            parts[1] = langcode
    return join_uri(*parts[:3])

def build_data_set(parts_dict):
    lang = parts_dict['lang']
    dataset = join_uri('/d/conceptnet/4', lang)
    return dataset

def msgpack_to_assoc(input_filename, output_filename):
    """
    Convert a msgpack stream to a tab-separated "CSV" of concept-to-concept
    associations.

    As a special case, we convert some "Desires" and "NotDesires" relations
    to "HasProperty" relations, so that:

    - An assertion that means "People want X" in English or Chinese is
      converted to an association meaning "X is good"
    - An assertion that "People don't want X" is converted to an association
      meaning "X is bad"

    The result is used to build machine-learning models that recognize
    semantic similarities between words, and particularly the ConceptNet
    Numberbatch embedding space.
    """
    with open(output_filename, 'w', encoding='utf-8') as out_stream:
        weight_by_dataset = defaultdict(float)
        count_by_dataset = defaultdict(int)
        prefixed = set()
        for info in read_msgpack_stream(input_filename):
            start_uri = info['start']
            end_uri = info['end']
            if not (get_uri_language(start_uri) in COMMON_LANGUAGES and
                    get_uri_language(end_uri) in COMMON_LANGUAGES):
                continue

            rel = info['rel']
            weight = info['weight']
            dataset = info['dataset']

            for uri in (start_uri, end_uri):
                pieces = split_uri(uri)
                if len(pieces) > 3 and (uri, dataset) not in prefixed:
                    prefix = join_uri(*pieces[:3])
                    prefixed.add((uri, dataset))
                    line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                        start=uri, end=prefix, weight=1., dataset=dataset,
                        rel='/r/SenseOf')
                    weight_by_dataset[dataset] += 1.
                    count_by_dataset[dataset] += 1
                    print(line, file=out_stream)

            if start_uri == '/c/en/person' or start_uri == '/c/en/people':
                if rel == '/r/Desires':
                    pairs = [('/c/en/good', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/en/bad', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            elif start_uri == '/c/zh/人':
                if rel == '/r/Desires':
                    pairs = [('/c/zh/良好', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/zh/不良', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            else:
                pairs = [(start_uri, end_uri)]

            for (start, end) in pairs:
                line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                    start=start, end=end, weight=weight, dataset=dataset,
                    rel=rel)
                weight_by_dataset[dataset] += weight
                count_by_dataset[dataset] += 1
                print(line, file=out_stream)

    avg_weight_by_dataset = {
        dataset: weight_by_dataset[dataset] / count_by_dataset[dataset]
        for dataset in count_by_dataset
    }
    print("Average weights:")
    print(avg_weight_by_dataset)

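# A hedged, self-contained sketch of the '/r/SenseOf' line that msgpack_to_assoc
# writes for a disambiguated concept, under the assumption that split_uri and
# join_uri simply split and rejoin on '/'; the dataset value is a made-up
# placeholder.
uri = '/c/en/cat/n/feline'
pieces = uri.lstrip('/').split('/')          # ['c', 'en', 'cat', 'n', 'feline']
if len(pieces) > 3:
    prefix = '/' + '/'.join(pieces[:3])      # '/c/en/cat'
    line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
        start=uri, end=prefix, weight=1., dataset='/d/example', rel='/r/SenseOf'
    )
    # Prints the full URI, its prefix '/c/en/cat', the weight 1.0, the dataset,
    # and '/r/SenseOf', separated by tabs.
    print(line)
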
def run_wordnet(input_dir, output_file, sw_map_file):
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    synset_senses = defaultdict(list)
    sense_synsets = {}

    labels = {}
    glossary = {}
    concept_map = {}
    sense_to_synset = {}

    # Parse lines such as:
    # wn30:synset-Aeolian-noun-2 rdfs:label "Aeolian"@en-us .
    for subj, rel, obj, objtag in reader.parse_file(os.path.join(input_dir, 'wordnet-synset.ttl')):
        if resource_name(rel) == 'label':
            # Everything in WordNet is in English
            assert objtag == 'en'
            labels[subj] = obj

    for subj, rel, obj, objtag in reader.parse_file(os.path.join(input_dir, 'wordnet-glossary.ttl')):
        if resource_name(rel) == 'gloss':
            assert objtag == 'en'

            # Take the definition up to the first semicolon
            text = obj.split(';')[0]

            # Remove introductory phrases with a colon
            text = text.split(': ', 1)[-1]

            # Remove parenthesized expressions
            while True:
                newtext = re.sub(r'\(.+?\) ?', '', text).strip()
                if newtext == text or newtext == '':
                    break
                else:
                    text = newtext

            glossary[subj] = text.replace('/', '_')

    # Get the list of word senses in each synset, and make a bidirectional mapping.
    #
    # Example line:
    # wn30:synset-Aeolian-noun-2 wn20schema:containsWordSense wn30:wordsense-Aeolian-noun-2 .
    for subj, rel, obj, objtag in reader.parse_file(os.path.join(input_dir, 'full/wordnet-wordsense-synset-relations.ttl')):
        if resource_name(rel) == 'containsWordSense':
            synset_senses[subj].append(obj)
            sense_synsets[obj] = subj

    # Assign every synset to a disambiguated concept.
    for synset in synset_senses:
        synset_name = labels[synset]
        synset_pos = synset.split('-')[-2]
        pos = PARTS_OF_SPEECH[synset_pos]
        disambig = glossary[synset]

        concept = standardized_concept_uri('en', synset_name, pos, disambig)
        concept_map[synset] = concept

    # Map senses to their synsets.
    for sense, synset in sense_synsets.items():
        sense_to_synset[sense] = synset

    for filename in (
        'wordnet-attribute.ttl',
        'wordnet-causes.ttl',
        'wordnet-classifiedby.ttl',
        'wordnet-entailment.ttl',
        'wordnet-hyponym.ttl',
        'wordnet-instances.ttl',
        'wordnet-membermeronym.ttl',
        'wordnet-partmeronym.ttl',
        'wordnet-sameverbgroupas.ttl',
        'wordnet-similarity.ttl',
        'wordnet-substancemeronym.ttl',
        'full/wordnet-antonym.ttl',
        'full/wordnet-derivationallyrelated.ttl',
        'full/wordnet-participleof.ttl',
        'full/wordnet-pertainsto.ttl',
        'full/wordnet-seealso.ttl'
    ):
        filepath = os.path.join(input_dir, filename)
        if os.path.exists(filepath):
            for web_subj, web_rel, web_obj, objtag in reader.parse_file(filepath):
                # If this relation involves word senses, map them to their synsets
                # first.
                if web_subj in sense_to_synset:
                    web_subj = sense_to_synset[web_subj]
                if web_obj in sense_to_synset:
                    web_obj = sense_to_synset[web_obj]
                subj = concept_map[web_subj]
                obj = concept_map[web_obj]
                pred_label = resource_name(web_rel)
                if pred_label in REL_MAPPING:
                    mapped_rel = REL_MAPPING[pred_label]

                    # Handle WordNet relations that are the reverse of ConceptNet
                    # relations. Change the word 'meronym' to 'holonym' if
                    # necessary.
                    if mapped_rel.startswith('~'):
                        subj, obj = obj, subj
                        web_subj, web_obj = web_obj, web_subj
                        web_rel = web_rel.replace('meronym', 'holonym')
                        mapped_rel = mapped_rel[1:]
                    rel = join_uri('r', mapped_rel)
                else:
                    rel = join_uri('r', 'wordnet', pred_label)

                map_out.write_link(web_rel, full_conceptnet_url(rel))
                map_out.write_link(web_subj, full_conceptnet_url(subj))
                map_out.write_link(web_obj, full_conceptnet_url(obj))
                edge = make_edge(
                    rel, subj, obj, dataset='/d/wordnet/3.0',
                    license='/l/CC/By', sources=SOURCE, weight=2.0
                )
                out.write(edge)

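# A hedged, standalone illustration of the gloss-cleaning loop in run_wordnet,
# applied to a made-up WordNet-style gloss; only the standard library 're'
# module is needed.
import re

text = "a small domesticated feline (Felis catus); often kept as a pet"
text = text.split(';')[0]        # keep the definition up to the first semicolon
text = text.split(': ', 1)[-1]   # drop an introductory phrase with a colon, if any
while True:
    newtext = re.sub(r'\(.+?\) ?', '', text).strip()
    if newtext == text or newtext == '':
        break
    else:
        text = newtext
print(text)   # 'a small domesticated feline'
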
def build_data_set(parts_dict):
    lang = parts_dict["lang"]
    dataset = join_uri("/d/conceptnet/4", lang)
    return dataset

def msgpack_to_assoc(input_filename, output_filename):
    """
    Convert a msgpack stream to a tab-separated "CSV" of concept-to-concept
    associations.

    The relation is mostly ignored, except:

    - An assertion that means "People want X" in English or Chinese is
      converted to an association between X and "good"
    - An assertion that "People don't want X" is converted to an association
      between X and "bad"

    The result can be used to predict word associations using ConceptNet by
    using dimensionality reduction, as in the `assoc_space` package.

    FIXME: the above is out of date, we use conceptnet5.vectors now

    The relation is mostly ignored because we have not yet found a good way
    to take the relation into account in dimensionality reduction.
    """
    with open(output_filename, 'w', encoding='utf-8') as out_stream:
        weight_by_dataset = defaultdict(float)
        count_by_dataset = defaultdict(int)
        prefixed = set()
        for info in read_msgpack_stream(input_filename):
            start_uri = info['start']
            end_uri = info['end']
            if not (
                get_uri_language(start_uri) in COMMON_LANGUAGES
                and get_uri_language(end_uri) in COMMON_LANGUAGES
            ):
                continue

            rel = info['rel']
            weight = info['weight']
            dataset = info['dataset']

            pairs = []
            for uri in (start_uri, end_uri):
                pieces = split_uri(uri)
                if len(pieces) > 3 and (uri, dataset) not in prefixed:
                    prefix = join_uri(*pieces[:3])
                    prefixed.add((uri, dataset))
                    line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                        start=uri, end=prefix, weight=1., dataset=dataset,
                        rel='/r/SenseOf'
                    )
                    weight_by_dataset[dataset] += 1.
                    count_by_dataset[dataset] += 1
                    print(line, file=out_stream)

            if start_uri == '/c/en/person':
                if rel == '/r/Desires':
                    pairs = [('/c/en/good', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/en/bad', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            elif start_uri == '/c/zh/人':
                if rel == '/r/Desires':
                    pairs = [('/c/zh/良好', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/zh/不良', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            else:
                pairs = [(start_uri, end_uri)]

            for (start, end) in pairs:
                line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                    start=start, end=end, weight=weight, dataset=dataset,
                    rel=rel
                )
                weight_by_dataset[dataset] += weight
                count_by_dataset[dataset] += 1
                print(line, file=out_stream)

    avg_weight_by_dataset = {
        dataset: weight_by_dataset[dataset] / count_by_dataset[dataset]
        for dataset in count_by_dataset
    }
    print("Average weights:")
    print(avg_weight_by_dataset)

def msgpack_to_assoc(input_filename, output_filename):
    """
    Convert a msgpack stream to a tab-separated "CSV" of concept-to-concept
    associations.

    The relation is mostly ignored, except:

    - An assertion that means "People want X" in English or Chinese is
      converted to an association between X and "good"
    - An assertion that "People don't want X" is converted to an association
      between X and "bad"

    The result can be used to predict word associations using ConceptNet by
    using dimensionality reduction, as in the `assoc_space` package.

    The relation is mostly ignored because we have not yet found a good way
    to take the relation into account in dimensionality reduction.
    """
    with open(output_filename, 'w', encoding='utf-8') as out_stream:
        weight_by_dataset = defaultdict(float)
        count_by_dataset = defaultdict(int)
        prefixed = set()
        for info in read_msgpack_stream(input_filename):
            start_uri = info['start']
            end_uri = info['end']
            if not (
                start_uri.startswith('/c/')
                and end_uri.startswith('/c/')
                and get_uri_language(start_uri) in COMMON_LANGUAGES
                and get_uri_language(end_uri) in COMMON_LANGUAGES
            ):
                continue

            rel = info['rel']
            weight = info['weight']
            dataset = info['dataset']

            pairs = []
            for uri in (start_uri, end_uri):
                pieces = split_uri(uri)
                if len(pieces) > 3 and (uri, dataset) not in prefixed:
                    prefix = join_uri(*pieces[:3])
                    prefixed.add((uri, dataset))
                    line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                        start=uri, end=prefix, weight=1., dataset=dataset,
                        rel='/r/SenseOf'
                    )
                    weight_by_dataset[dataset] += 1.
                    count_by_dataset[dataset] += 1
                    print(line, file=out_stream)

            if start_uri == '/c/en/person':
                if rel == '/r/Desires':
                    pairs = [('/c/en/good', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/en/bad', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            elif start_uri == '/c/zh/人':
                if rel == '/r/Desires':
                    pairs = [('/c/zh/良好', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/zh/不良', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            else:
                pairs = [(start_uri, end_uri)]

            for (start, end) in pairs:
                line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                    start=start, end=end, weight=weight, dataset=dataset,
                    rel=rel
                )
                weight_by_dataset[dataset] += weight
                count_by_dataset[dataset] += 1
                print(line, file=out_stream)

    avg_weight_by_dataset = {
        dataset: weight_by_dataset[dataset] / count_by_dataset[dataset]
        for dataset in count_by_dataset
    }
    print("Average weights:")
    print(avg_weight_by_dataset)

def run_wordnet(input_dir, output_file, sw_map_file):
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    synset_senses = defaultdict(list)
    sense_synsets = {}

    labels = {}
    glossary = {}
    concept_map = {}
    sense_to_synset = {}

    # Parse lines such as:
    # wn30:synset-Aeolian-noun-2 rdfs:label "Aeolian"@en-us .
    for subj, rel, obj, objtag in reader.parse_file(
            os.path.join(input_dir, 'wordnet-synset.ttl')):
        if resource_name(rel) == 'label':
            # Everything in WordNet is in English
            assert objtag == 'en'
            labels[subj] = obj

    for subj, rel, obj, objtag in reader.parse_file(
            os.path.join(input_dir, 'wordnet-glossary.ttl')):
        if resource_name(rel) == 'gloss':
            assert objtag == 'en'

            # Take the definition up to the first semicolon
            text = obj.split(';')[0]

            # Remove introductory phrases with a colon
            text = text.split(': ', 1)[-1]

            # Remove parenthesized expressions
            while True:
                newtext = re.sub(r'\(.+?\) ?', '', text).strip()
                if newtext == text or newtext == '':
                    break
                else:
                    text = newtext

            glossary[subj] = text.replace('/', '_')

    # Get the list of word senses in each synset, and make a bidirectional mapping.
    #
    # Example line:
    # wn30:synset-Aeolian-noun-2 wn20schema:containsWordSense wn30:wordsense-Aeolian-noun-2 .
    for subj, rel, obj, objtag in reader.parse_file(
            os.path.join(input_dir, 'full/wordnet-wordsense-synset-relations.ttl')):
        if resource_name(rel) == 'containsWordSense':
            synset_senses[subj].append(obj)
            sense_synsets[obj] = subj

    # Assign every synset to a disambiguated concept.
    for synset in synset_senses:
        synset_name = labels[synset]
        synset_pos = synset.split('-')[-2]
        pos = PARTS_OF_SPEECH[synset_pos]
        disambig = glossary[synset]

        concept = standardized_concept_uri('en', synset_name, pos, disambig)
        concept_map[synset] = concept

    # Map senses to their synsets.
    for sense, synset in sense_synsets.items():
        sense_to_synset[sense] = synset

    for filename in ('wordnet-attribute.ttl', 'wordnet-causes.ttl',
                     'wordnet-classifiedby.ttl', 'wordnet-entailment.ttl',
                     'wordnet-hyponym.ttl', 'wordnet-instances.ttl',
                     'wordnet-membermeronym.ttl', 'wordnet-partmeronym.ttl',
                     'wordnet-sameverbgroupas.ttl', 'wordnet-similarity.ttl',
                     'wordnet-substancemeronym.ttl', 'full/wordnet-antonym.ttl',
                     'full/wordnet-derivationallyrelated.ttl',
                     'full/wordnet-participleof.ttl',
                     'full/wordnet-pertainsto.ttl',
                     'full/wordnet-seealso.ttl'):
        filepath = os.path.join(input_dir, filename)
        if os.path.exists(filepath):
            for web_subj, web_rel, web_obj, objtag in reader.parse_file(
                    filepath):
                # If this relation involves word senses, map them to their synsets
                # first.
                if web_subj in sense_to_synset:
                    web_subj = sense_to_synset[web_subj]
                if web_obj in sense_to_synset:
                    web_obj = sense_to_synset[web_obj]
                subj = concept_map[web_subj]
                obj = concept_map[web_obj]
                pred_label = resource_name(web_rel)
                if pred_label in REL_MAPPING:
                    mapped_rel = REL_MAPPING[pred_label]

                    # Handle WordNet relations that are the reverse of ConceptNet
                    # relations. Change the word 'meronym' to 'holonym' if
                    # necessary.
                    if mapped_rel.startswith('~'):
                        subj, obj = obj, subj
                        web_subj, web_obj = web_obj, web_subj
                        web_rel = web_rel.replace('meronym', 'holonym')
                        mapped_rel = mapped_rel[1:]
                    rel = join_uri('r', mapped_rel)
                else:
                    rel = join_uri('r', 'wordnet', pred_label)

                map_out.write_link(web_rel, full_conceptnet_url(rel))
                map_out.write_link(web_subj, full_conceptnet_url(subj))
                map_out.write_link(web_obj, full_conceptnet_url(obj))
                edge = make_edge(rel, subj, obj, dataset='/d/wordnet/3.0',
                                 license='/l/CC/By', sources=SOURCE, weight=2.0)
                out.write(edge)

def msgpack_to_assoc(input_filename, output_filename):
    """
    Convert a msgpack stream to a tab-separated "CSV" of concept-to-concept
    associations.

    As a special case, we convert some "Desires" and "NotDesires" relations
    to "HasProperty" relations, so that:

    - An assertion that means "People want X" in English or Chinese is
      converted to an association meaning "X is good"
    - An assertion that "People don't want X" is converted to an association
      meaning "X is bad"

    The result is used to build machine-learning models that recognize
    semantic similarities between words, and particularly the ConceptNet
    Numberbatch embedding space.
    """
    with open(output_filename, 'w', encoding='utf-8') as out_stream:
        weight_by_dataset = defaultdict(float)
        count_by_dataset = defaultdict(int)
        prefixed = set()
        for info in read_msgpack_stream(input_filename):
            start_uri = info['start']
            end_uri = info['end']
            if not (
                get_uri_language(start_uri) in COMMON_LANGUAGES
                and get_uri_language(end_uri) in COMMON_LANGUAGES
            ):
                continue

            rel = info['rel']
            weight = info['weight']
            dataset = info['dataset']

            for uri in (start_uri, end_uri):
                pieces = split_uri(uri)
                if len(pieces) > 3 and (uri, dataset) not in prefixed:
                    prefix = join_uri(*pieces[:3])
                    prefixed.add((uri, dataset))
                    line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                        start=uri,
                        end=prefix,
                        weight=1.,
                        dataset=dataset,
                        rel='/r/SenseOf',
                    )
                    weight_by_dataset[dataset] += 1.
                    count_by_dataset[dataset] += 1
                    print(line, file=out_stream)

            if start_uri == '/c/en/person' or start_uri == '/c/en/people':
                if rel == '/r/Desires':
                    pairs = [('/c/en/good', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/en/bad', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            elif start_uri == '/c/zh/人':
                if rel == '/r/Desires':
                    pairs = [('/c/zh/良好', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/zh/不良', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            else:
                pairs = [(start_uri, end_uri)]

            for (start, end) in pairs:
                line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                    start=start, end=end, weight=weight, dataset=dataset, rel=rel
                )
                weight_by_dataset[dataset] += weight
                count_by_dataset[dataset] += 1
                print(line, file=out_stream)

    avg_weight_by_dataset = {
        dataset: weight_by_dataset[dataset] / count_by_dataset[dataset]
        for dataset in count_by_dataset
    }
    print("Average weights:")
    print(avg_weight_by_dataset)
