def build_relation(parts_dict):
    """Return the normalized relation URI for an assertion's parts.

    The legacy name 'ConceptuallyRelatedTo' is first mapped to 'RelatedTo'.
    Positive polarity yields '/r/<Name>'; anything else is negated as
    '/r/Not<Name>'.
    """
    name = parts_dict["relname"]
    if name == 'ConceptuallyRelatedTo':
        name = 'RelatedTo'
    prefix = '/r/' if parts_dict["polarity"] > 0 else '/r/Not'
    return normalize_uri(prefix + name)
def build_relation(parts_dict):
    """Build the '/r/...' (or negated '/r/Not...') relation URI.

    Reads 'relname' and 'polarity' from parts_dict; the obsolete
    'ConceptuallyRelatedTo' name is rewritten to 'RelatedTo'.
    """
    relname = parts_dict["relname"]
    if relname == "ConceptuallyRelatedTo":
        relname = "RelatedTo"
    if parts_dict["polarity"] > 0:
        return normalize_uri("/r/" + relname)
    return normalize_uri("/r/Not" + relname)
def build_relation(raw_assertion):
    """Derive the relation URI from a raw assertion object.

    A non-positive frame frequency produces the negated '/r/Not<Name>'
    form; 'ConceptuallyRelatedTo' is normalized to 'RelatedTo' first.
    """
    name = raw_assertion.frame.relation.name
    if name == 'ConceptuallyRelatedTo':
        name = 'RelatedTo'
    if raw_assertion.frame.frequency.value > 0:
        return normalize_uri('/r/' + name)
    return normalize_uri('/r/Not' + name)
def build_sources(raw_assertion):
    """Collect (source-URI list, weight) pairs for a raw assertion.

    The creator/activity conjunction carries weight 1; each vote adds a
    (voter, vote-activity) conjunction weighted by the vote value.
    """
    creator = normalize_uri(u'/s/contributor/omcs/' + raw_assertion.creator.username)
    activity = normalize_uri(u'/s/activity/omcs/' + raw_assertion.sentence.activity.name)
    result = [([creator, activity], 1)]
    for vote in raw_assertion.votes.all():
        voter = normalize_uri('/s/contributor/omcs/' + vote.user.username)
        result.append(([voter, normalize_uri(u'/s/activity/omcs/vote')], vote.vote))
    return result
def build_relation(parts_dict):
    """Translate a polarity + relation name into a normalized '/r/' URI."""
    relname = parts_dict["relname"]
    if relname == 'ConceptuallyRelatedTo':
        relname = 'RelatedTo'
    if parts_dict["polarity"] > 0:
        relation = normalize_uri('/r/' + relname)
    else:
        # Non-positive polarity means the statement is negated.
        relation = normalize_uri('/r/Not' + relname)
    return relation
def build_sources(parts_dict):
    """Build weighted source conjunctions from a parsed-parts dict.

    parts_dict["votes"] is an iterable of (username, vote-weight) pairs;
    the creator/activity pair always contributes weight 1.
    """
    creator_node = normalize_uri(u'/s/contributor/omcs/' + parts_dict["creator"])
    activity_node = normalize_uri(u'/s/activity/omcs/' + parts_dict["activity"])
    sources = [([creator_node, activity_node], 1)]
    for vote in parts_dict["votes"]:
        voter_node = normalize_uri('/s/contributor/omcs/' + vote[0])
        sources.append(([voter_node, normalize_uri(u'/s/activity/omcs/vote')], vote[1]))
    return sources
def build_sources(raw_assertion):
    """Return the weighted source lists backing a raw assertion.

    Format: [(list_of_source_uris, weight), ...] -- one entry for the
    creator+activity, one per recorded vote.
    """
    who = normalize_uri(u'/s/contributor/omcs/' + raw_assertion.creator.username)
    what = normalize_uri(u'/s/activity/omcs/' + raw_assertion.sentence.activity.name)
    sources = [([who, what], 1)]
    for vote in raw_assertion.votes.all():
        sources.append((
            [normalize_uri('/s/contributor/omcs/' + vote.user.username),
             normalize_uri(u'/s/activity/omcs/vote')],
            vote.vote,
        ))
    return sources
def build_sources(parts_dict, preposition_fix=False):
    """Build weighted source lists; optionally tag the preposition-fix rule.

    When preposition_fix is true, '/s/rule/preposition_fix' joins the
    creator/activity conjunction. Votes come as (username, weight) pairs.
    """
    creator_node = normalize_uri(u'/s/contributor/omcs/' + parts_dict["creator"])
    activity_node = normalize_uri(u'/s/activity/omcs/' + parts_dict["activity"])
    conjunction = [creator_node, activity_node]
    if preposition_fix:
        conjunction.append('/s/rule/preposition_fix')
    sources = [(conjunction, 1)]
    for vote in parts_dict["votes"]:
        sources.append((
            [normalize_uri('/s/contributor/omcs/' + vote[0]),
             normalize_uri(u'/s/activity/omcs/vote')],
            vote[1],
        ))
    return sources
def handle_triple(line):
    # Parse one N-Triples line (<subj> <pred> <obj> .) into a ConceptNet
    # edge, writing dedup mappings to sw_map and the edge to writer
    # (both module-level).
    items = line.split()
    # All three terms must be angle-bracketed URIs; strip the brackets.
    for i in xrange(3):
        if not (items[i].startswith('<') and items[i].endswith('>')): return
        items[i] = items[i][1:-1]
    subj, pred, obj = items[:3]
    # Skip noise: homepage links, the generic 'work' object, feature/Thing
    # classes, disambiguation-style '__' names, and 'List_of' pages.
    if 'foaf/0.1/homepage' in pred or obj == 'work' or '_Feature' in obj or '#Thing' in obj or '__' in subj or '__' in obj or 'List_of' in subj or 'List_of' in obj: return
    subj_concept = make_concept_uri(translate_wp_url(subj), 'en')
    obj_concept = make_concept_uri(translate_wp_url(obj), 'en')
    webrel = map_web_relation(pred)
    # Predicates with no ConceptNet mapping are dropped.
    if webrel is None: return
    rel = normalize_uri('/r/'+webrel)
    # Record each raw->mapped pair once; sw_map_used is the dedup set.
    if (pred, rel) not in sw_map_used:
        sw_map_used.add((pred, rel))
        sw_map.write({'from': pred, 'to': rel})
    if (subj, subj_concept) not in sw_map_used:
        sw_map_used.add((subj, subj_concept))
        sw_map.write({'from': subj, 'to': subj_concept})
    if (obj, obj_concept) not in sw_map_used:
        sw_map_used.add((obj, obj_concept))
        sw_map.write({'from': obj, 'to': obj_concept})
    # All DBPedia edges get a fixed 0.5 weight in the /d/dbpedia/en dataset.
    edge = make_edge(rel, subj_concept, obj_concept, dataset='/d/dbpedia/en', license='/l/CC/By-SA', sources=['/s/dbpedia/3.7'], context='/ctx/all', weight=0.5)
    writer.write(edge)
def handle_triple(line):
    """Turn one DBPedia N-Triples line into a ConceptNet edge.

    Unusable lines (non-URI terms, filtered subjects/objects, unmapped
    predicates) are silently skipped. New raw->mapped pairs are appended
    to the module-level sw_map writer, and the edge goes to writer.
    """
    fields = line.split()
    for idx in xrange(3):
        term = fields[idx]
        if not (term.startswith('<') and term.endswith('>')):
            return
        fields[idx] = term[1:-1]
    subj, pred, obj = fields[:3]
    # Filter out homepage links, class/feature objects, '__'-style names
    # and list pages -- none of them make good concepts.
    if 'foaf/0.1/homepage' in pred or obj == 'work' or '_Feature' in obj or '#Thing' in obj or '__' in subj or '__' in obj or 'List_of' in subj or 'List_of' in obj:
        return
    subj_concept = make_concept_uri(translate_wp_url(subj), 'en')
    obj_concept = make_concept_uri(translate_wp_url(obj), 'en')
    webrel = map_web_relation(pred)
    if webrel is None:
        return
    rel = normalize_uri('/r/' + webrel)
    # Emit each raw->mapped pair to the semantic-web map exactly once.
    for raw_term, mapped in ((pred, rel), (subj, subj_concept), (obj, obj_concept)):
        if (raw_term, mapped) not in sw_map_used:
            sw_map_used.add((raw_term, mapped))
            sw_map.write({'from': raw_term, 'to': mapped})
    writer.write(make_edge(
        rel, subj_concept, obj_concept,
        dataset='/d/dbpedia/en',
        license='/l/CC/By-SA',
        sources=['/s/dbpedia/3.7'],
        context='/ctx/all',
        weight=0.5,
    ))
def _create_node_by_type(self, uri, properties=None):
    """
    creates generic node object, parses uri, takes out args, identifies
    type of node and runs relevant method

    args:
    uri -- identifier of intended node, used in index
    properties -- (optional) properties for assertions (see assertions)

    raises:
    ValueError -- if the uri is too short or names an unknown type
    """
    # A mutable default argument ({}) would be shared across all calls and
    # could leak state between them; create a fresh dict per call instead.
    if properties is None:
        properties = {}
    # Apply normalization to the URI here. All downstream functions can
    # assume it's normalized.
    uri = normalize_uri(uri)
    if uri.count('/') < 2:
        raise ValueError("""
        The URI %r is too short. You can't create the root or a type with
        this method.
        """ % uri)
    _, _type, rest = uri.split('/', 2)
    # Check if this is a web_concept
    if uri.find('http') == 0:
        return self._create_web_concept_node(
            '/web_concept/%s' % uri, uri, properties)
    # getattr without a default raises AttributeError for unknown types,
    # which made the friendlier ValueError below unreachable -- pass None
    # so the check actually runs.
    method = getattr(self, '_create_%s_node' % _type, None)
    if method is None:
        raise ValueError("I don't know how to create type %r" % _type)
    return method(uri, rest, properties)
def get_node(self, uri):
    """
    Look a node up in the main index by its (normalized) uri.

    args:
    uri -- the uri of the node in question

    Returns the matching document, or None when no node has that uri.
    """
    return self.db.nodes.find_one({'uri': normalize_uri(uri)})
def get_prefix(self, uri_prefix):
    """
    Yield every node whose uri begins with uri_prefix.

    args:
    uri_prefix -- the prefix which the uri of the nodes must have

    Implemented as an anchored regex search over the node index.
    """
    return self.get_regex('^' + normalize_uri(uri_prefix))
def build_sources(parts_dict, preposition_fix=False):
    """Assemble the weighted source conjunctions for an assertion.

    The base conjunction is creator + activity (plus the
    '/s/rule/preposition_fix' rule when requested), weight 1. Each
    (username, weight) vote pair adds its own conjunction.
    """
    base_nodes = [
        normalize_uri(u'/s/contributor/omcs/' + parts_dict["creator"]),
        normalize_uri(u'/s/activity/omcs/' + parts_dict["activity"]),
    ]
    if preposition_fix:
        base_nodes = base_nodes + ['/s/rule/preposition_fix']
    collected = [(base_nodes, 1)]
    for vote in parts_dict["votes"]:
        voter = normalize_uri('/s/contributor/omcs/' + vote[0])
        collected.append(([voter, normalize_uri(u'/s/activity/omcs/vote')], vote[1]))
    return collected
def get_or_create_source(self, source_list):
    """
    Fetch the source node named by the given path components, creating it
    when it does not exist yet. Convenience wrapper.

    args:
    source_list -- list of source components, e.g.
                   ['contributor', 'omcs', 'bedume'] for
                   '/source/contributor/omcs/bedume'
    """
    uri = normalize_uri("/source/" + "/".join(source_list))
    existing = self.get_node(uri)
    if existing:
        return existing
    return self._create_node_by_type(uri, {})
def handle_raw_assertion(raw, writer): try: lang = raw.language_id assert lang == 'ja' if raw.frame.goodness < 1: return polarity = raw.frame.frequency.value activity = raw.sentence.activity.name if 'rubycommons' in activity: return # build the assertion frame_text = raw.frame.text frame_text = frame_text.replace('{1}', '[[%s]]' % raw.text1).replace( '{2}', '[[%s]]' % raw.text2) activity_node = normalize_uri(u'/s/site/nadya.jp') startText = ' '.join(JA.normalize_list(raw.text1)) endText = ' '.join(JA.normalize_list(raw.text2)) if startText != raw.text1: print raw.text1.encode('utf-8'), '=>', startText.encode('utf-8') normalize_uri('/text/' + lang + '/' + startText) end = normalize_uri('/text/' + lang + '/' + endText) relname = raw.frame.relation.name if relname == 'ConceptuallyRelatedTo': relname = 'RelatedTo' if polarity > 0: relation = normalize_uri('/r/' + relname) else: relation = normalize_uri('/r/Not' + relname) dataset = normalize_uri('/d/nadya.jp') score = raw.score sources = [([activity_node], score / 5.)] for source_list, weight in sources: if 'commons2_reject' in ' '.join(source_list): weight = -1 start = make_concept_uri(startText, lang) end = make_concept_uri(endText, lang) edge = make_edge(relation, start, end, dataset, LICENSE, source_list, '/ctx/all', frame_text, weight=weight) writer.write(edge) except Exception: import traceback traceback.print_exc()
def _any_to_uri(self, obj):
    """
    Converts any given input in the form of an id, uri or node into a
    uri string.

    args:
    obj -- the object to be converted

    raises:
    TypeError -- when obj is none of the supported kinds
    """
    if isinstance(obj, basestring):
        return normalize_uri(obj)
    elif hasattr(obj, '__getitem__'):
        # Node-like mapping: trust its stored uri.
        return obj['uri']
    elif obj == 0:
        # backwards compatibility: 0 used to denote the root node
        return u'/'
    else:
        # Name the offending value instead of raising a bare TypeError.
        raise TypeError("Can't interpret %r as a URI" % (obj,))
def handle_raw_assertion(raw, writer):
    # Convert one raw nadya.jp (Japanese) assertion into a ConceptNet edge.
    # Any exception is printed and swallowed so one bad row cannot abort
    # the batch.
    try:
        lang = raw.language_id
        assert lang == 'ja'
        # Skip low-quality frames and rubycommons-sourced sentences.
        if raw.frame.goodness < 1: return
        polarity = raw.frame.frequency.value
        activity = raw.sentence.activity.name
        if 'rubycommons' in activity: return
        # build the assertion: mark both arguments up inline in the frame text
        frame_text = raw.frame.text
        frame_text = frame_text.replace('{1}', '[[%s]]' % raw.text1).replace('{2}', '[[%s]]' % raw.text2)
        activity_node = normalize_uri(u'/s/site/nadya.jp')
        # Japanese text is tokenized/normalized before becoming a concept.
        startText = ' '.join(JA.normalize_list(raw.text1))
        endText = ' '.join(JA.normalize_list(raw.text2))
        if startText != raw.text1:
            print raw.text1.encode('utf-8'), '=>', startText.encode('utf-8')
        # NOTE(review): the next call's result is discarded, and 'end' is
        # reassigned below before use -- both look like dead code; verify
        # normalize_uri has no side effects before removing.
        normalize_uri('/text/'+lang+'/'+startText)
        end = normalize_uri('/text/'+lang+'/'+endText)
        relname = raw.frame.relation.name
        if relname == 'ConceptuallyRelatedTo': relname = 'RelatedTo'
        # Non-positive polarity negates the relation.
        if polarity > 0:
            relation = normalize_uri('/r/'+relname)
        else:
            relation = normalize_uri('/r/Not'+relname)
        dataset = normalize_uri('/d/nadya.jp')
        score = raw.score
        # Score is scaled by 1/5 to give the source weight
        # (presumably scores run 0-5 -- TODO confirm).
        sources = [([activity_node], score/5.)]
        for source_list, weight in sources:
            # Rejected commons2 assertions get a hard negative weight.
            if 'commons2_reject' in ' '.join(source_list):
                weight = -1
            start = make_concept_uri(startText, lang)
            end = make_concept_uri(endText, lang)
            edge = make_edge(relation, start, end, dataset, LICENSE, source_list, '/ctx/all', frame_text, weight=weight)
            writer.write(edge)
    except Exception:
        import traceback
        traceback.print_exc()
def get_node_w_score(self, uri):
    """
    functions in the same manner as get_node, but also queries the
    justification database to attach the node's score under 'score'.

    args:
    uri -- the uri of the node in question

    Returns the node document with a 'score' key (None when the node has
    no justification entry), or None when no node matches -- previously a
    missing node crashed with a TypeError while attaching the score.
    """
    uri = normalize_uri(uri)
    return_dict = self.db.nodes.find_one({'uri': uri})
    if return_dict is None:
        # Mirror get_node's contract for missing nodes.
        return None
    score = self.db.justification.find_one({'_id': uri})
    return_dict['score'] = None if score is None else score['value']
    return return_dict
def get_regex(self, uri_regex):
    """
    returns a generator of nodes whose uri regular expression matches
    uri_regex

    args:
    uri_regex -- the regex which the uri of the nodes must match
    """
    uri_regex = normalize_uri(uri_regex)
    latest_result = ''
    while True:
        hasMore = False
        # Page through matches 100 at a time, using the last uri seen as a
        # cursor ($gt) so each batch resumes after the previous one.
        # NOTE(review): correctness assumes results come back ordered by
        # uri; confirm an index/sort guarantees that, or pages may skip or
        # repeat nodes.
        for node in self.db.nodes.find \
            ({ 'uri' : {'$regex' : uri_regex, '$gt' : latest_result}}) \
            .limit(100):
            yield node
            hasMore = True
            latest_result = node['uri']
        # An empty batch means the cursor is exhausted.
        if not hasMore:
            break
def handle_raw_assertion(raw, writer):
    # Convert one raw OMCS assertion into ConceptNet 4 edges, one per
    # source conjunction. Exceptions are printed and swallowed so a bad
    # row cannot abort the batch.
    try:
        lang = raw.language_id
        # Skip low-quality frames; Chinese goes through a separate pipeline.
        if raw.frame.goodness < 1: return
        if lang.startswith('zh'): return
        polarity = raw.frame.frequency.value
        activity = raw.sentence.activity.name
        if 'rubycommons' in activity: return
        # build the assertion: fill the negation slot, then mark both
        # arguments up inline in the frame text.
        frame_text = raw.frame.text
        if polarity > 0:
            frame_text = frame_text.replace('{%}', '')
        else:
            frame_text = frame_text.replace('{%}', 'not')
        frame_text = frame_text.replace('{1}', '[[%s]]' % raw.text1).replace('{2}', '[[%s]]' % raw.text2)
        creator_node = normalize_uri(u'/s/contributor/omcs/'+raw.creator.username)
        activity_node = normalize_uri(u'/s/activity/omcs/'+activity)
        startText = raw.text1
        endText = raw.text2
        # NOTE(review): the next call's result is discarded, and 'end' is
        # reassigned inside the loop below before use -- both look like
        # dead code; verify normalize_uri is side-effect free before removing.
        normalize_uri('/text/'+lang+'/'+raw.text1)
        end = normalize_uri('/text/'+lang+'/'+raw.text2)
        relname = raw.frame.relation.name
        if relname == 'ConceptuallyRelatedTo': relname = 'RelatedTo'
        # Non-positive polarity negates the relation.
        if polarity > 0:
            relation = normalize_uri('/r/'+relname)
        else:
            relation = normalize_uri('/r/Not'+relname)
        dataset = normalize_uri('/d/conceptnet/4/'+lang)
        # One weight-1 conjunction for creator+activity, plus one per vote.
        sources = [([creator_node, activity_node], 1)]
        for vote in raw.votes.all():
            sources.append(([normalize_uri('/s/contributor/omcs/'+vote.user.username), normalize_uri(u'/s/activity/omcs/vote')], vote.vote))
        for source_list, weight in sources:
            bad = False
            # Rejected commons2 assertions get a hard negative weight.
            if 'commons2_reject' in ' '.join(source_list):
                weight = -1
            # NOTE(review): start/end are loop-invariant and could be
            # computed once before this loop.
            start = make_concept_uri(startText, lang)
            end = make_concept_uri(endText, lang)
            # Drop known-bad contributions from user 'bedume' that mention
            # flagged concepts or places.
            if 'bedume' in ' '.join(source_list):
                for flagged in BEDUME_FLAGGED_CONCEPTS + BEDUME_FLAGGED_PLACES:
                    check = '/'+flagged.replace(' ', '_')
                    if start.endswith(check) or end.endswith(check):
                        bad = True
                        print "flagged:", str(raw)
                        break
            if not bad:
                edge = make_edge(relation, start, end, dataset, LICENSE, source_list, '/ctx/all', frame_text, weight=weight)
                writer.write(edge)
    except Exception:
        import traceback
        traceback.print_exc()
def _any_to_uri(self, obj):
    """Normalize a uri string; reject anything that is not a string."""
    if not isinstance(obj, basestring):
        raise TypeError
    return normalize_uri(obj)
def build_sources(raw_assertion):
    """Weight the single nadya.jp site source by score / 5."""
    site = normalize_uri(u'/s/site/nadya.jp')
    return [([site], raw_assertion.score / 5.)]
def test_normalize_uri():
    """normalize_uri strips/underscores whitespace and must be idempotent."""
    once = normalize_uri(' one two')
    assert once == u'one_two'
    assert normalize_uri(once) == u'one_two'
def build_data_set(raw_assertion):
    """Return the ConceptNet 4 dataset URI for the assertion's language."""
    return normalize_uri('/d/conceptnet/4/' + raw_assertion.language_id)
def build_data_set(raw_assertion):
    """Dataset URI '/d/conceptnet/4/<lang>' for this raw assertion."""
    language = raw_assertion.language_id
    return normalize_uri('/d/conceptnet/4/' + language)
def build_data_set(parts_dict):
    """Return the '/d/conceptnet/4/<lang>' dataset URI from parts_dict."""
    return normalize_uri("/d/conceptnet/4/" + parts_dict["lang"])
def build_sources(parts_dict):
    """Scale the nadya.jp score into a single weighted site source."""
    site_node = normalize_uri(u"/s/site/nadya.jp")
    return [([site_node], parts_dict["score"] / 5.0)]
def build_data_set():
    """The nadya.jp reader always writes into the '/d/nadya.jp' dataset."""
    return normalize_uri('/d/nadya.jp')
def build_sources(parts_dict):
    """Single source conjunction: the nadya.jp site, weighted by score/5."""
    weight = parts_dict["score"] / 5.
    return [([normalize_uri(u'/s/site/nadya.jp')], weight)]
def build_data_set():
    """Return the fixed dataset URI for nadya.jp data."""
    return normalize_uri("/d/nadya.jp")
def build_data_set(parts_dict):
    """Map parts_dict['lang'] to its ConceptNet 4 dataset URI."""
    language = parts_dict['lang']
    return normalize_uri('/d/conceptnet/4/' + language)