def run_single_process():
    writer = MultiWriter('conceptnet4_nadya')
    raw_assertions = RawAssertion.objects.filter()
    for raw_assertion in raw_assertions:
        edges = handle_raw_assertion(raw_assertion)
        for edge in edges:
            writer.write(edge)
def sum_assertions(file_index):
    weights = defaultdict(float)
    assertions = {}
    ccby = defaultdict(bool)
    for line in codecs.open(CURRENT_DIR + '/data/temp/core_' + str(file_index) + '.txt', 'r', 'utf-8'):
        uri, rel, start, end, context, weight, sources, id, dataset = line.split('\t')[:9]
        if uri != 'uri' and context == '/ctx/all':
            weight = float(weight)
            weights[uri] += float(weight)
            assertions[uri] = (rel, start, end, context, weights[uri])
            if not (dataset.startswith('/d/reverb')
                    or dataset.startswith('/d/wiktionary')
                    or dataset.startswith('/d/dbpedia')):
                ccby[uri] = True
    writer_core = MultiWriter('assertion_totals_core')
    #writer_sa = MultiWriter('assertion_totals_sa')
    for uri, values in assertions.iteritems():
        relation, start, end, context, weight = values
        if ccby[uri]:
            license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation, start, end, dataset, license,
                         ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
        if license == '/l/CC/By':
            writer_core.write(edge)
        #else:
        #    writer_sa.write(edge)
    writer_core.close()
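# A minimal driver for sum_assertions(), assuming the split files
# data/temp/core_0.txt ... core_{N-1}.txt were produced by an earlier
# splitting step. The file count here is a placeholder, not part of the
# original script.
if __name__ == '__main__':
    num_files = 10  # hypothetical count of split core files
    for file_index in range(num_files):
        sum_assertions(file_index)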
def run_single_process(): writer = MultiWriter("conceptnet4_nadya") path = "./raw_data/" for filename in os.listdir(path): for raw_assertion in codecs.open(path + filename, encoding="utf-8", errors="replace"): edges = handle_raw_flat_assertion(raw_assertion) for edge in edges: writer.write(edge)
def run_single_process():
    writer = MultiWriter('conceptnet4')
    path = "./raw_data/"
    for filename in os.listdir(path):
        for raw_assertion in codecs.open(path + filename, encoding='utf-8', errors='replace'):
            edges = handle_raw_assertion(raw_assertion)
            for edge in edges:
                writer.write(edge)
def run_single_process():
    writer = MultiWriter('conceptnet4_zh')
    path = "./raw_data/"
    for filename in os.listdir(path):
        for line in codecs.open(path + filename, encoding='utf-8', errors='replace'):
            aggregate_assertion(line)
    for assertion, users in assertion_map.items():
        edges = handle_raw_assertion((assertion, users))
        for edge in edges:
            writer.write(edge)
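# `aggregate_assertion` and `assertion_map` are used above but not shown.
# A sketch of what they plausibly do, assuming the "user, frame_id,
# concept1, concept2" line format seen in the petgame reader later in this
# collection; this is a hypothetical reconstruction, not the original helper.
assertion_map = {}

def aggregate_assertion(line):
    line = line.strip()
    if not line:
        return
    user, frame_id, concept1, concept2 = line.split(', ')
    # group identical assertions, collecting the users who contributed them
    assertion_map.setdefault((frame_id, concept1, concept2), []).append(user)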
def __init__(self):
    self.lang = None
    self.langcode = None
    self.inArticle = False
    self.inTitle = False
    self.curSense = None
    self.curTitle = ''
    self.curText = ''
    self.locales = []
    self.curRelation = None
    self.writer = MultiWriter('wiktionary')
def __init__(self):
    self.lang = None
    self.langcode = None
    self.inArticle = False
    self.inTitle = False
    self.curSense = None
    self.curTitle = ''
    self.curText = ''
    self.locales = []
    self.curRelation = None
    self.writer = MultiWriter('wiktionary_ja')
    self.nosensetrans = None  # non-sense-specific translation
def create_processes(self):
    processes = []
    for i in range(self.num_threads):
        writer = MultiWriter(self.writer_name + "_" + str(i))
        p = Process(target=self.pull_lines, args=(self.queue, writer))
        p.daemon = True
        p.start()
        processes.append(p)
    return processes
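# The worker target `pull_lines` is referenced above but not shown. A
# plausible sketch, written as a free function for brevity, assuming the
# queue carries raw input lines, None is used as a shutdown sentinel, and
# a handler like handle_raw_assertion() turns each line into edges (all
# assumptions, not the original code):
def pull_lines(queue, writer):
    while True:
        line = queue.get()
        if line is None:  # sentinel: producer is done
            break
        for edge in handle_raw_assertion(line):
            writer.write(edge)
    writer.close()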
def __init__(self): self.lang = None self.langcode = None self.inArticle = False self.inTitle = False self.curSense = None self.curTitle = "" self.curText = "" self.locales = [] self.curRelation = None self.writer = MultiWriter("wiktionary") self.trans = False # in translation mode
def build_core_from_csvs(csv_files):
    weights = defaultdict(float)
    assertions = {}
    ccby = defaultdict(bool)
    for csv_file in csv_files:
        print "currently in file: " + str(csv_file)
        for line in codecs.open(csv_file, encoding='utf-8'):
            uri, rel, start, end, context, weight, sources, id, dataset = line.split('\t')[:9]
            if uri != 'uri' and context == '/ctx/all':
                weight = float(weight)
                weights[uri] += float(weight)
                assertions[uri] = (rel, start, end, context, weights[uri])
                if not (dataset.startswith('/d/reverb')
                        or dataset.startswith('/d/wiktionary')
                        or dataset.startswith('/d/dbpedia')):
                    ccby[uri] = True
    print 'writing'
    writer_core = MultiWriter('assertion_totals_core')
    #writer_sa = MultiWriter('assertion_totals_sa')
    for uri, values in assertions.iteritems():
        relation, start, end, context, weight = values
        if ccby[uri]:
            license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation, start, end, dataset, license,
                         ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
        if license == '/l/CC/By':
            writer_core.write(edge)
        #else:
        #    writer_sa.write(edge)
    writer_core.close()
def build_core_from_csvs(csv_files):
    weights = defaultdict(float)
    assertions = {}
    ccby = defaultdict(bool)
    for csv_file in csv_files:
        print "currently in file: " + str(csv_file)
        for line in codecs.open(csv_file, encoding='utf-8'):
            uri, rel, start, end, context, weight, sources, id, dataset = line.split(
                '\t')[:9]
            if uri != 'uri' and context == '/ctx/all':
                weight = float(weight)
                weights[uri] += float(weight)
                assertions[uri] = (rel, start, end, context, weights[uri])
                if not (dataset.startswith('/d/reverb')
                        or dataset.startswith('/d/wiktionary')
                        or dataset.startswith('/d/dbpedia')):
                    ccby[uri] = True
    print 'writing'
    writer_core = MultiWriter('assertion_totals_core')
    #writer_sa = MultiWriter('assertion_totals_sa')
    for uri, values in assertions.iteritems():
        relation, start, end, context, weight = values
        if ccby[uri]:
            license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation, start, end, dataset, license,
                         ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
        if license == '/l/CC/By':
            writer_core.write(edge)
        #else:
        #    writer_sa.write(edge)
    writer_core.close()
def sum_assertions(file_index):
    weights = defaultdict(float)
    assertions = {}
    ccby = defaultdict(bool)
    for line in codecs.open(
            CURRENT_DIR + '/data/temp/core_' + str(file_index) + '.txt', 'r',
            'utf-8'):
        uri, rel, start, end, context, weight, sources, id, dataset = line.split(
            '\t')[:9]
        if uri != 'uri' and context == '/ctx/all':
            weight = float(weight)
            weights[uri] += float(weight)
            assertions[uri] = (rel, start, end, context, weights[uri])
            if not (dataset.startswith('/d/reverb')
                    or dataset.startswith('/d/wiktionary')
                    or dataset.startswith('/d/dbpedia')):
                ccby[uri] = True
    writer_core = MultiWriter('assertion_totals_core')
    #writer_sa = MultiWriter('assertion_totals_sa')
    for uri, values in assertions.iteritems():
        relation, start, end, context, weight = values
        if ccby[uri]:
            license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation, start, end, dataset, license,
                         ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
        if license == '/l/CC/By':
            writer_core.write(edge)
        #else:
        #    writer_sa.write(edge)
    writer_core.close()
    #             disambig = sense_name
    #             break
    # if disambig is None:
    #     disambig = glossary[synset]
    #if disambig is None:
    #    disambig = '*'
    node = make_concept_uri(synset_name, 'en', pos + '/' + disambig)
    if synset not in mapping:
        mapping[synset] = node

# Map senses to the same nodes.
for sense, synset in sense_synsets.items():
    mapping[sense] = mapping[synset]

sources = ['/s/wordnet/3.0']
writer = MultiWriter('wordnet3')
sw_map = FlatEdgeWriter('data/sw/wordnet30.map.json')
sw_map_used = set()

for line in chain(
    open('raw_data/wordnet-attribute.ttl'),
    open('raw_data/wordnet-causes.ttl'),
    open('raw_data/wordnet-classifiedby.ttl'),
    open('raw_data/wordnet-entailment.ttl'),
    open('raw_data/wordnet-hyponym.ttl'),
    open('raw_data/wordnet-instances.ttl'),
    open('raw_data/wordnet-membermeronym.ttl'),
    open('raw_data/wordnet-partmeronym.ttl'),
    open('raw_data/wordnet-sameverbgroupas.ttl'),
    open('raw_data/wordnet-similarity.ttl'),
    open('raw_data/wordnet-substancemeronym.ttl'),
        relname = raw.frame.relation.name
        if relname == 'ConceptuallyRelatedTo':
            relname = 'RelatedTo'
        if polarity > 0:
            relation = normalize_uri('/r/' + relname)
        else:
            relation = normalize_uri('/r/Not' + relname)
        dataset = normalize_uri('/d/nadya.jp')
        score = raw.score
        sources = [([activity_node], score / 5.)]
        for source_list, weight in sources:
            if 'commons2_reject' in ' '.join(source_list):
                weight = -1
            start = make_concept_uri(startText, lang)
            end = make_concept_uri(endText, lang)
            edge = make_edge(relation, start, end, dataset, LICENSE,
                             source_list, '/ctx/all', frame_text, weight=weight)
            writer.write(edge)
    except Exception:
        import traceback
        traceback.print_exc()

if __name__ == '__main__':
    writer = MultiWriter('nadya.jp')
    queryset_foreach(RawAssertion.objects.filter(),
                     lambda item: handle_raw_assertion(item, writer))
    writer.close()
class FindTranslations(ContentHandler):
    def __init__(self):
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None
        self.curTitle = ""
        self.curText = ""
        self.locales = []
        self.curRelation = None
        self.writer = MultiWriter("wiktionary")
        self.trans = False  # in translation mode

    def startElement(self, name, attrs):
        if name == "page":
            self.inArticle = True
            self.curText = []
        elif name == "title":
            self.inTitle = True
            self.curTitle = ""

    def endElement(self, name):
        if name == "page":
            self.inArticle = False
            self.handleArticle(self.curTitle, "".join(self.curText))
        elif name == "title":
            self.inTitle = False

    def characters(self, text):
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out
                self.inArticle = False

    def handleArticle(self, title, text):
        lines = text.split("\n")
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_bottom_match = TRANS_BOTTOM.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)

        ### Get translation
        if trans_top_match:  # start translation part
            self.trans = True
        if self.trans and trans_bottom_match:  # end translation part
            self.trans = False
        if self.trans and line.startswith("*{{"):  # get translation
            lang = line[3:5]  # get language of translation
            # find all translations of that language
            translations = re.findall(u"\{\{Ü.*?\|.*?\|(.*?)\}\}", line)
            for translation in translations:  # iterate over translations
                self.output_sense_translation(lang, translation, title, "")

        ### Get relation
        if line.startswith("{{Synonyme}}"):  # synonym
            self.curRelation = "synonym"
        elif line.startswith(u"{{Gegenwörter}}"):  # antonym
            self.curRelation = "antonym"
        elif line.startswith("{{Oberbegriffe}}"):  # hypernym
            self.curRelation = "hypernym"
        elif line.startswith("{{Unterbegriffe}}"):  # hyponym
            self.curRelation = "hyponym"
        elif line.startswith("{{Redewendungen}}"):  # idiom
            self.curRelation = "idiom"
        elif line.startswith("{{Charakteristische Wortkombinationen}}"):
            # word combination
            self.curRelation = "word combination"
        elif line.startswith("{{Wortbildungen}}"):  # morphology
            self.curRelation = "morphology"
        if self.curRelation and line == "":  # end relation
            self.curRelation = None
        if self.curRelation:
            related_words_or_phrases = re.findall(r"\[\[(.*?)\]\]", line)
            for related_word in related_words_or_phrases:
                self.output_monolingual("deu", self.curRelation,
                                        related_word, title)

    def output_monolingual(self, lang, relation, term1, term2):
        if "Wik" in term1 or "Wik" in term2:
            return
        source = make_concept_uri(term1, lang)
        if self.pos:
            target = make_concept_uri(term2, lang, self.pos)
        else:
            target = make_concept_uri(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        # print surfaceText
        edge = make_edge(
            "/r/" + relation,
            source,
            target,
            "/d/wiktionary/%s/%s" % (lang, lang),
            license="/l/CC/By-SA",
            sources=[SOURCE, MONOLINGUAL],
            context="/ctx/all",
            weight=1.5,
            surfaceText=surfaceText,
        )
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, german, disambiguation):
        if "Wik" in foreign or "Wik" in german:
            return
        if lang == "zh-cn":
            lang = "zh_CN"
        elif lang == "zh-tw":
            lang = "zh_TW"
        source = make_concept_uri(unicodedata.normalize("NFKC", foreign), lang)
        target = make_concept_uri(german, "de", disambiguation)
        relation = "/r/TranslationOf"
        try:
            surfaceRel = "is %s for" % (langs.english_name(lang))
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        surfaceText = "[[%s]] %s [[%s (%s)]]" % (
            foreign,
            surfaceRel,
            german,  # was the undefined name `english`; this method's term is `german`
            disambiguation.split("/")[-1].replace("_", " "),
        )
        # print surfaceText
        edge = make_edge(
            relation,
            source,
            target,
            "/d/wiktionary/en/%s" % lang,
            license="/l/CC/By-SA",
            sources=[SOURCE, TRANSLATE],
            context="/ctx/all",
            weight=1.5,
            surfaceText=surfaceText,
        )
        self.writer.write(edge)

    def output_translation(self, foreign, english, locale=""):
        source = make_concept_uri(unicodedata.normalize("NFKC", foreign),
                                  self.langcode + locale)
        target = make_concept_uri(english, "en")
        relation = "/r/TranslationOf"
        try:
            surfaceRel = "is %s for" % (langs.english_name(self.langcode))
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
        edge = make_edge(
            relation,
            source,
            target,
            "/d/wiktionary/en/%s" % self.langcode,
            license="/l/CC/By-SA",
            sources=[SOURCE, INTERLINGUAL],
            context="/ctx/all",
            weight=1.5,
            surfaceText=surfaceText,
        )
        self.writer.write(edge)
""" Get data from DBPedia. """ __author__ = 'Justin Venezuela ([email protected]), Rob Speer ([email protected])' from metanl.english import normalize_topic, un_camel_case from conceptnet5.nodes import make_concept_uri, normalize_uri from conceptnet5.edges import make_edge, MultiWriter, FlatEdgeWriter import urllib import urllib2 source = '/s/web/dbpedia.org' WRITER_NUM = 1 writer = MultiWriter('dbpedia.%d' % WRITER_NUM) sw_map = FlatEdgeWriter('data/sw/dbpedia.map.json') sw_map_used = set() def cycle_writer(): global writer, WRITER_NUM writer.close() WRITER_NUM += 1 writer = MultiWriter('dbpedia.%d' % WRITER_NUM) def translate_wp_url(url): url = urllib.unquote(url).decode('utf-8', 'ignore') return un_camel_case(url.strip('/').split('/')[-1].split('#')[-1])
weights = defaultdict(float)
assertions = {}
ccby = defaultdict(bool)
for line in codecs.open('data/flat/CORE', encoding='utf-8'):
    uri, rel, start, end, context, weight, sources, id, dataset = line.split('\t')[:9]
    if uri != 'uri' and context == '/ctx/all':
        weight = float(weight)
        weights[uri] += float(weight)
        assertions[uri] = (rel, start, end, context, weights[uri])  # store the accumulated weight
        if not (dataset.startswith('/d/reverb')
                or dataset.startswith('/d/wiktionary')
                or dataset.startswith('/d/dbpedia')):
            ccby[uri] = True

print 'writing'
writer_core = MultiWriter('assertion_totals_core')
#writer_sa = MultiWriter('assertion_totals_sa')
for uri, values in assertions.iteritems():
    relation, start, end, context, weight = values
    if ccby[uri]:
        license = '/l/CC/By'
        dataset = '/d/conceptnet/5/combined-core'
    else:
        license = '/l/CC/By-SA'
        dataset = '/d/conceptnet/5/combined-sa'
    edge = make_edge(relation, start, end, dataset, license,
                     ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
    if license == '/l/CC/By':
        writer_core.write(edge)
    #else:
    #    writer_sa.write(edge)
class FindTranslations(ContentHandler):
    def __init__(self):
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None
        self.curTitle = ''
        self.curText = ''
        self.locales = []
        self.curRelation = None
        self.writer = MultiWriter('wiktionary')

    def startElement(self, name, attrs):
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False

    def characters(self, text):
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out
                self.inArticle = False

    def handleArticle(self, title, text):
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)
        if line.startswith('===') and line.endswith('==='):
            pos = line.strip('= ')
            if pos == 'Synonyms':
                self.curRelation = 'Synonym'
            elif pos == 'Antonym':
                self.curRelation = 'Antonym'
            elif pos == 'Related terms':
                self.curRelation = 'ConceptuallyRelatedTo'
            elif pos == 'Derived terms':
                if not line.startswith('===='):
                    # this is at the same level as the part of speech;
                    # now we don't know what POS these apply to
                    self.pos = None
                self.curRelation = 'DerivedFrom'
            else:
                self.curRelation = None
                if pos in PARTS_OF_SPEECH:
                    self.pos = PARTS_OF_SPEECH[pos]
        elif language_match:
            self.lang = language_match.group(1)
            self.langcode = LANGUAGES.get(self.lang)
        elif chinese_match:
            scripttag = chinese_match.group(2)
            self.locales = []
            if 's' in scripttag:
                self.locales.append('_CN')
            if 't' in scripttag:
                self.locales.append('_TW')
        elif line[0:1] == '#' and self.lang != 'English' and self.lang is not None:
            defn = line[1:].strip()
            if defn[0:1] not in ':*#':
                for defn2 in filter_line(defn):
                    if not ascii_enough(defn2):
                        continue
                    if 'Index:' in title:
                        continue
                    if self.langcode == 'zh':
                        for locale in self.locales:
                            self.output_translation(title, defn2, locale)
                    elif self.langcode:
                        self.output_translation(title, defn2)
        elif line[0:4] == '----':
            self.pos = None
            self.lang = None
            self.langcode = None
            self.curRelation = None
        elif trans_top_match:
            pos = self.pos or 'n'
            sense = trans_top_match.group(1).split(';')[0].strip('.')
            if 'translations' in sense.lower():
                self.curSense = None
            else:
                self.curSense = pos+'/'+sense
        elif trans_tag_match:
            lang = trans_tag_match.group(1)
            translation = trans_tag_match.group(2)
            if self.curSense is not None and self.lang == 'English':
                # handle Chinese separately
                if lang not in ('cmn', 'yue', 'zh-yue', 'zh'):
                    self.output_sense_translation(lang, translation, title,
                                                  self.curSense)
        elif '{{trans-bottom}}' in line:
            self.curSense = None
        elif line.startswith('* ') and self.curRelation and self.langcode:
            relatedmatch = WIKILINK.search(line)
            if relatedmatch:
                related = relatedmatch.group(1)
                self.output_monolingual(self.langcode, self.curRelation,
                                        related, title)

    def output_monolingual(self, lang, relation, term1, term2):
        if 'Wik' in term1 or 'Wik' in term2:
            return
        source = make_concept_uri(term1, lang)
        if self.pos:
            target = make_concept_uri(term2, lang, self.pos)
        else:
            target = make_concept_uri(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        #print surfaceText
        edge = make_edge('/r/'+relation, source, target,
                         '/d/wiktionary/%s/%s' % (lang, lang),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, MONOLINGUAL],
                         context='/ctx/all', weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, english, disambiguation):
        if 'Wik' in foreign or 'Wik' in english:
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = make_concept_uri(
            unicodedata.normalize('NFKC', foreign), lang
        )
        target = make_concept_uri(
            english, 'en', disambiguation
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(lang))
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        surfaceText = "[[%s]] %s [[%s (%s)]]" % (
            foreign, surfaceRel, english,
            disambiguation.split('/')[-1].replace('_', ' '))
        #print surfaceText
        edge = make_edge(relation, source, target,
                         '/d/wiktionary/en/%s' % lang,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, TRANSLATE],
                         context='/ctx/all', weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_translation(self, foreign, english, locale=''):
        source = make_concept_uri(
            unicodedata.normalize('NFKC', foreign),
            self.langcode+locale
        )
        target = make_concept_uri(english, 'en')
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(self.langcode))
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
        edge = make_edge(relation, source, target,
                         '/d/wiktionary/en/%s' % self.langcode,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, INTERLINGUAL],
                         context='/ctx/all', weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)
def create_processes(self):
    for i in range(self.num_threads):
        writer = MultiWriter(self.writer_name + "_" + str(i), self.isTest)
        p = Process(target=self.pull_lines, args=(self.queue, writer))
        #p.daemon = True
        p.start()
class FindTranslations(ContentHandler):
    def __init__(self):
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None
        self.curTitle = ''
        self.curText = ''
        self.locales = []
        self.curRelation = None
        self.writer = MultiWriter('wiktionary')
        self.trans = False  # in translation mode

    def startElement(self, name, attrs):
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False

    def characters(self, text):
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out
                self.inArticle = False

    def handleArticle(self, title, text):
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_bottom_match = TRANS_BOTTOM.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)

        ### Get translation
        if trans_top_match:  # start translation part
            self.trans = True
        if self.trans and trans_bottom_match:  # end translation part
            self.trans = False
        if self.trans and line.startswith('*{{'):  # get translation
            lang = line[3:5]  # get language of translation
            # find all translations of that language
            translations = re.findall(u"\{\{Ü.*?\|.*?\|(.*?)\}\}", line)
            for translation in translations:  # iterate over translations
                self.output_sense_translation(lang, translation, title, '')

        ### Get relation
        if line.startswith('{{Synonyme}}'):  # synonym
            self.curRelation = 'synonym'
        elif line.startswith(u'{{Gegenwörter}}'):  # antonym
            self.curRelation = 'antonym'
        elif line.startswith('{{Oberbegriffe}}'):  # hypernym
            self.curRelation = 'hypernym'
        elif line.startswith('{{Unterbegriffe}}'):  # hyponym
            self.curRelation = 'hyponym'
        elif line.startswith('{{Redewendungen}}'):  # idiom
            self.curRelation = 'idiom'
        elif line.startswith('{{Charakteristische Wortkombinationen}}'):
            # word combination
            self.curRelation = 'word combination'
        elif line.startswith('{{Wortbildungen}}'):  # morphology
            self.curRelation = 'morphology'
        if self.curRelation and line == '':  # end relation
            self.curRelation = None
        if self.curRelation:
            related_words_or_phrases = re.findall(r"\[\[(.*?)\]\]", line)
            for related_word in related_words_or_phrases:
                self.output_monolingual('deu', self.curRelation,
                                        related_word, title)

    def output_monolingual(self, lang, relation, term1, term2):
        if 'Wik' in term1 or 'Wik' in term2:
            return
        source = make_concept_uri(term1, lang)
        if self.pos:
            target = make_concept_uri(term2, lang, self.pos)
        else:
            target = make_concept_uri(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        #print surfaceText
        edge = make_edge('/r/'+relation, source, target,
                         '/d/wiktionary/%s/%s' % (lang, lang),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, MONOLINGUAL],
                         context='/ctx/all', weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, german, disambiguation):
        if 'Wik' in foreign or 'Wik' in german:
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = make_concept_uri(
            unicodedata.normalize('NFKC', foreign), lang
        )
        target = make_concept_uri(
            german, 'de', disambiguation
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(lang))
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        # was the undefined name `english`; this method's term is `german`
        surfaceText = "[[%s]] %s [[%s (%s)]]" % (
            foreign, surfaceRel, german,
            disambiguation.split('/')[-1].replace('_', ' '))
        #print surfaceText
        edge = make_edge(relation, source, target,
                         '/d/wiktionary/en/%s' % lang,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, TRANSLATE],
                         context='/ctx/all', weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_translation(self, foreign, english, locale=''):
        source = make_concept_uri(
            unicodedata.normalize('NFKC', foreign),
            self.langcode+locale
        )
        target = make_concept_uri(english, 'en')
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(self.langcode))
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
        edge = make_edge(relation, source, target,
                         '/d/wiktionary/en/%s' % self.langcode,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, INTERLINGUAL],
                         context='/ctx/all', weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)
counts = defaultdict(int)
text_similarities = []

flag_out = open('data/output/flagged_assertions.txt', 'w')
similar_out = open('data/output/text_similarity.txt', 'w')
weak_out = open('data/output/weak_assertions.txt', 'w')
good_out = open('data/output/ok_assertions.txt', 'w')

sources = ['/s/site/verbosity']
writer = None
if make_json:
    writer = MultiWriter('verbosity')

for line in open('raw_data/verbosity.txt'):
    parts = line.strip().split('\t')
    if not parts:
        counts['blank'] += 1
        continue
    left, relation, right, freq, orderscore = parts[:5]

    # catch bad stuff
    flagged = False
    for rword in right.split():
        if bad_regex_no_biscuit.match(rword):
        if current_obj is None:
            current_obj = obj
            current_score = obj['weight']
            obj['surfaceRel'] = obj['rel']
        elif obj['weight'] == current_score:
            if normalize(obj['arg1']) == normalize(current_obj['arg1']) \
               and normalize(obj['arg2']) == normalize(current_obj['arg2']):
                current_obj['rel'] = obj['rel']
                output_edge(current_obj, writer)
                current_obj = None
                current_score = None
        else:
            if current_obj is not None:
                output_edge(current_obj, writer)
            current_obj = obj
            current_score = obj['weight']
            obj['surfaceRel'] = obj['rel']
    if current_obj is not None:
        output_edge(current_obj, writer)
    writer.close()

if __name__ == '__main__':
    writer = MultiWriter('reverb-wp-frontpage')
    for file_to_read in REVERB_FILES:
        lines = codecs.open(file_to_read, encoding='utf-8', errors='replace')
        handle_lines(lines, writer)
        sources = [([creator_node, activity_node], 1)]
        for vote in raw.votes.all():
            sources.append(([normalize_uri('/s/contributor/omcs/' + vote.user.username),
                             normalize_uri(u'/s/activity/omcs/vote')], vote.vote))
        for source_list, weight in sources:
            bad = False
            if 'commons2_reject' in ' '.join(source_list):
                weight = -1
            start = make_concept_uri(startText, lang)
            end = make_concept_uri(endText, lang)
            if 'bedume' in ' '.join(source_list):
                for flagged in BEDUME_FLAGGED_CONCEPTS + BEDUME_FLAGGED_PLACES:
                    check = '/' + flagged.replace(' ', '_')
                    if start.endswith(check) or end.endswith(check):
                        bad = True
                        print "flagged:", str(raw)
                        break
            if not bad:
                edge = make_edge(relation, start, end, dataset, LICENSE,
                                 source_list, '/ctx/all', frame_text, weight=weight)
                writer.write(edge)
    except Exception:
        import traceback
        traceback.print_exc()

if __name__ == '__main__':
    writer = MultiWriter('conceptnet4')
    queryset_foreach(RawAssertion.objects.filter(),
                     lambda item: handle_raw_assertion(item, writer))
    writer.close()
        dataset = normalize_uri('/d/nadya.jp')
        score = raw.score
        sources = [([activity_node], score / 5.)]
        for source_list, weight in sources:
            if 'commons2_reject' in ' '.join(source_list):
                weight = -1
            start = make_concept_uri(startText, lang)
            end = make_concept_uri(endText, lang)
            edge = make_edge(relation, start, end, dataset, LICENSE,
                             source_list, '/ctx/all', frame_text, weight=weight)
            writer.write(edge)
    except Exception:
        import traceback
        traceback.print_exc()

if __name__ == '__main__':
    writer = MultiWriter('nadya.jp')
    queryset_foreach(RawAssertion.objects.filter(),
                     lambda item: handle_raw_assertion(item, writer))
    writer.close()
from conceptnet.models import *
import os
import codecs

from conceptnet5.nodes import make_concept_uri
from conceptnet5.edges import make_edge, MultiWriter

sparse_pieces = []
for filename in os.listdir('.'):
    if filename.startswith('conceptnet_zh_'):
        writer = MultiWriter(filename.split('.')[0])
        for line in codecs.open(filename, encoding='utf-8', errors='replace'):
            line = line.strip()
            if line:
                parts = line.split(', ')
                user, frame_id, concept1, concept2 = parts
                frame = Frame.objects.get(id=int(frame_id))
                ftext = frame.text
                relation = frame.relation.name
                rel = '/r/' + relation
                surfaceText = ftext.replace(u'{1}', u'[[' + concept1 + u']]').replace(
                    u'{2}', u'[[' + concept2 + u']]')
                start = make_concept_uri(concept1, 'zh_TW')
                end = make_concept_uri(concept2, 'zh_TW')
                sources = [
                    '/s/contributor/petgame/' + user,
                    '/s/activity/ntt/petgame'
                ]
                edge = make_edge(rel, start,
class FindTranslations(ContentHandler):
    def __init__(self):
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None
        self.curTitle = ''
        self.curText = ''
        self.locales = []
        self.curRelation = None
        self.writer = MultiWriter('wiktionary')

    def startElement(self, name, attrs):
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False

    def characters(self, text):
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out
                self.inArticle = False

    def handleArticle(self, title, text):
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)
        if line.startswith('===') and line.endswith('==='):
            pos = line.strip('= ')
            if pos == 'Synonyms':
                self.curRelation = 'Synonym'
            elif pos == 'Antonym':
                self.curRelation = 'Antonym'
            elif pos == 'Related terms':
                self.curRelation = 'ConceptuallyRelatedTo'
            elif pos == 'Derived terms':
                if not line.startswith('===='):
                    # this is at the same level as the part of speech;
                    # now we don't know what POS these apply to
                    self.pos = None
                self.curRelation = 'DerivedFrom'
            else:
                self.curRelation = None
                if pos in PARTS_OF_SPEECH:
                    self.pos = PARTS_OF_SPEECH[pos]
        elif language_match:
            self.lang = language_match.group(1)
            self.langcode = LANGUAGES.get(self.lang)
        elif chinese_match:
            scripttag = chinese_match.group(2)
            self.locales = []
            if 's' in scripttag:
                self.locales.append('_CN')
            if 't' in scripttag:
                self.locales.append('_TW')
        elif line[0:1] == '#' and self.lang != 'English' and self.lang is not None:
            defn = line[1:].strip()
            if defn[0:1] not in ':*#':
                for defn2 in filter_line(defn):
                    if not ascii_enough(defn2):
                        continue
                    if 'Index:' in title:
                        continue
                    if self.langcode == 'zh':
                        for locale in self.locales:
                            self.output_translation(title, defn2, locale)
                    elif self.langcode:
                        self.output_translation(title, defn2)
        elif line[0:4] == '----':
            self.pos = None
            self.lang = None
            self.langcode = None
            self.curRelation = None
        elif trans_top_match:
            pos = self.pos or 'n'
            sense = trans_top_match.group(1).split(';')[0].strip('.')
            if 'translations' in sense.lower():
                self.curSense = None
            else:
                self.curSense = pos + '/' + sense
        elif trans_tag_match:
            lang = trans_tag_match.group(1)
            translation = trans_tag_match.group(2)
            if self.curSense is not None and self.lang == 'English':
                # handle Chinese separately
                if lang not in ('cmn', 'yue', 'zh-yue', 'zh'):
                    self.output_sense_translation(lang, translation, title,
                                                  self.curSense)
        elif '{{trans-bottom}}' in line:
            self.curSense = None
        elif line.startswith('* ') and self.curRelation and self.langcode:
            relatedmatch = WIKILINK.search(line)
            if relatedmatch:
                related = relatedmatch.group(1)
                self.output_monolingual(self.langcode, self.curRelation,
                                        related, title)

    def output_monolingual(self, lang, relation, term1, term2):
        if 'Wik' in term1 or 'Wik' in term2:
            return
        source = make_concept_uri(term1, lang)
        if self.pos:
            target = make_concept_uri(term2, lang, self.pos)
        else:
            target = make_concept_uri(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        print surfaceText
        edge = make_edge('/r/' + relation, source, target,
                         '/d/wiktionary/%s/%s' % (lang, lang),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, MONOLINGUAL],
                         context='/ctx/all', weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, english, disambiguation):
        if 'Wik' in foreign or 'Wik' in english:
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = make_concept_uri(unicodedata.normalize('NFKC', foreign), lang)
        target = make_concept_uri(english, 'en', disambiguation)
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(lang))
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        surfaceText = "[[%s]] %s [[%s (%s)]]" % (
            foreign, surfaceRel, english,
            disambiguation.split('/')[-1].replace('_', ' '))
        print surfaceText
        edge = make_edge(relation, source, target,
                         '/d/wiktionary/en/%s' % lang,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, TRANSLATE],
                         context='/ctx/all', weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_translation(self, foreign, english, locale=''):
        source = make_concept_uri(unicodedata.normalize('NFKC', foreign),
                                  self.langcode + locale)
        target = make_concept_uri(english, 'en')
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(self.langcode))
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
        edge = make_edge(relation, source, target,
                         '/d/wiktionary/en/%s' % self.langcode,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, INTERLINGUAL],
                         context='/ctx/all', weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)
def cycle_writer():
    global writer, WRITER_NUM
    writer.close()
    WRITER_NUM += 1
    writer = MultiWriter('dbpedia.%d' % WRITER_NUM)
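# cycle_writer() rotates the module-level writer so one long DBPedia run
# doesn't accumulate everything in a single output file. A hedged usage
# sketch, assuming some iterable `edges` of already-built edges and a
# rotation interval chosen here arbitrarily:
for i, edge in enumerate(edges):
    writer.write(edge)
    if (i + 1) % 100000 == 0:  # hypothetical rotation interval
        cycle_writer()
writer.close()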
from conceptnet5.nodes import make_concept_uri
from conceptnet5.edges import MultiWriter, make_edge
import yaml

userdata = yaml.load_all(open('./GMUser.yaml'))
users = {}
writer = MultiWriter('globalmind')

lang_codes = {
    'eng': 'en',
    'cht': 'zh_TW',
    'chs': 'zh_CN',
    'jpn': 'ja',
    'kor': 'ko',
    'spa': 'es',
}

lang_names = {
    'eng': 'English',
    'en': 'English',
    'cht': 'Traditional Chinese',
    'zh_TW': 'Traditional Chinese',
    'chs': 'Simplified Chinese',
    'zh_CN': 'Simplified Chinese',
    'jpn': 'Japanese',
    'ja': 'Japanese',
    'kor': 'Korean',
    'ko': 'Korean',
    'spa': 'Spanish',
    'es': 'Spanish'
}
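# yaml.load_all() above yields one document per user record in GMUser.yaml.
# A hedged sketch of filling the `users` map, assuming each document is a
# dict keyed by a 'pk' field (the key name is an assumption, not confirmed
# by the original code):
for userinfo in userdata:
    if userinfo is not None:
        users[userinfo['pk']] = userinfo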
class FindTranslations(ContentHandler):
    def __init__(self):
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None
        self.curTitle = ''
        self.curText = ''
        self.locales = []
        self.curRelation = None
        self.writer = MultiWriter('wiktionary_ja')
        self.nosensetrans = None  # non-sense-specific translation

    def startElement(self, name, attrs):
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False

    def characters(self, text):
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out
                self.inArticle = False

    def handleArticle(self, title, text):
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_bottom_match = TRANS_BOTTOM.match(line)
        trans_match = TRANS.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)

        if language_match:
            self.langcode = get_language_code(language_match.group(1))

        ### Get sense-specific translation
        if trans_top_match:  # start translation part
            pos = self.pos or 'n'
            # get translation sense
            if trans_top_match.group(1):
                sense = trans_top_match.group(1).lstrip('|')
                self.curSense = pos + '/' + sense
                return
            else:
                self.curSense = pos
                return
        if trans_bottom_match:  # end translation part
            self.curSense = None
            return
        if self.curSense and line[0:5] == '*[[{{':  # get translation
            lang = line[5:].split('}')[0]  # get language of translation
            if lang in LANGUAGES_3_TO_2:  # convert 3-letter code to 2-letter code
                lang = LANGUAGES_3_TO_2[lang]
            # find all translations of that language
            translations = re.findall(r"\[\[(.*?)\]\]", line)[1:]
            for translation in translations:  # iterate over translations
                self.output_sense_translation(lang, translation, title,
                                              self.curSense)
            return

        ### Get relation
        if line.startswith('===={{rel}}===='):  # start relation part
            self.curRelation = 'ConceptuallyRelatedTo'
            return
        if self.curRelation and self.langcode:  # within relation part
            if line.startswith('*'):  # get relation
                relations = re.findall(r"\{\{(.*?)\}\}", line)
                if len(relations) > 0:
                    if relations[0] == 'syn':  # synonym
                        self.curRelation = 'Synonym'
                    if relations[0] == 'drv':  # derivative
                        self.curRelation = 'Derivative'
                related_words = re.findall(r"\[\[(.*?)\]\]", line)
                for related_word in related_words:
                    self.output_monolingual(self.langcode, self.curRelation,
                                            related_word, title)
                self.curRelation = 'ConceptuallyRelatedTo'  # back to default
            else:
                self.curRelation = None

        ### Get non-sense-specific translation
        if trans_match:
            self.nosensetrans = 1  # *maybe* start non-sense-specific translation
        if self.nosensetrans == 1 and line.startswith('{{top}}'):
            self.nosensetrans = 2  # start non-sense-specific translation
        if self.nosensetrans == 2:
            if line.startswith('{{bottom}}'):
                self.nosensetrans = None
                return
            if line.startswith('*{{'):
                lang = line[3:].split('}')[0]
                if lang in LANGUAGES_3_TO_2:  # convert 3-letter code to 2-letter code
                    lang = LANGUAGES_3_TO_2[lang]
                translations = re.findall(r"\[\[(.*?)\]\]", line)
                for translation in translations:
                    self.output_sense_translation(lang, translation, title, '')

    def output_monolingual(self, lang, relation, term1, term2):
        # skip Wiktionary: links and templates
        if u'ウィク' in term1 or u'ウィク' in term2:
            return
        if u'テンプレート' in term1 or u'テンプレート' in term2:
            return
        if lang in LANGUAGES_3_TO_2:  # convert 3-letter code to 2-letter code
            lang = LANGUAGES_3_TO_2[lang]
        source = make_concept_uri_safe(term1, lang)
        if self.pos:
            target = make_concept_uri_safe(term2, lang, self.pos)
        else:
            target = make_concept_uri_safe(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        #print surfaceText
        edge = make_edge('/r/' + relation, source, target,
                         '/d/wiktionary/ja/%s' % (lang),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, MONOLINGUAL],
                         context='/ctx/all', weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, translated, disambiguation):
        if u':' in foreign or u':' in translated:
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = make_concept_uri_safe(
            unicodedata.normalize('NFKC', foreign), lang
        )
        target = make_concept_uri_safe(
            translated, self.langcode, disambiguation
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(lang))
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        if disambiguation and '/' in disambiguation:
            surfaceText = "[[%s]] %s [[%s (%s)]]" % (
                foreign, surfaceRel, translated,
                disambiguation.split('/')[-1].replace('_', ' '))
        else:
            surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, translated)
        #print surfaceText
        edge = make_edge(relation, source, target,
                         '/d/wiktionary/ja/%s' % (self.langcode),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, TRANSLATE],
                         context='/ctx/all', weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_translation(self, foreign, japanese, locale=''):
        source = make_concept_uri_safe(
            unicodedata.normalize('NFKC', foreign),
            self.langcode + locale
        )
        target = make_concept_uri_safe(japanese, 'ja')
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(self.langcode))
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, japanese)
        edge = make_edge(relation, source, target,
                         '/d/wiktionary/ja/%s' % self.langcode,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, INTERLINGUAL],
                         context='/ctx/all', weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)
ccby = defaultdict(bool)
for line in codecs.open('data/flat/CORE', encoding='utf-8'):
    uri, rel, start, end, context, weight, sources, id, dataset = line.split(
        '\t')[:9]
    if uri != 'uri' and context == '/ctx/all':
        weight = float(weight)
        weights[uri] += float(weight)
        assertions[uri] = (rel, start, end, context, weights[uri])  # store the accumulated weight
        if not (dataset.startswith('/d/reverb')
                or dataset.startswith('/d/wiktionary')
                or dataset.startswith('/d/dbpedia')):
            ccby[uri] = True

print 'writing'
writer_core = MultiWriter('assertion_totals_core')
#writer_sa = MultiWriter('assertion_totals_sa')
for uri, values in assertions.iteritems():
    relation, start, end, context, weight = values
    if ccby[uri]:
        license = '/l/CC/By'
        dataset = '/d/conceptnet/5/combined-core'
    else:
        license = '/l/CC/By-SA'
        dataset = '/d/conceptnet/5/combined-sa'
    edge = make_edge(relation, start, end, dataset, license,
                     ['/s/rule/sum_edges'],