def run_single_process():
    """Convert every stored RawAssertion into edges and write them out."""
    out = MultiWriter('conceptnet4_nadya')
    for assertion in RawAssertion.objects.filter():
        for built_edge in handle_raw_assertion(assertion):
            out.write(built_edge)
def sum_assertions(file_index):
    """Sum edge weights per assertion URI from one temp core file, then
    write one totalled edge per URI to the 'assertion_totals_core' writer.

    file_index: integer suffix selecting data/temp/core_<index>.txt.
    Only rows in the '/ctx/all' context are counted; the header row
    (uri == 'uri') is skipped.
    """
    weights = defaultdict(float)
    assertions = {}
    ccby = defaultdict(bool)

    # `with` guarantees the input file is closed even if parsing fails
    # (the original left the codecs handle open until GC).
    with codecs.open(CURRENT_DIR + '/data/temp/core_' + str(file_index) + '.txt',
                     'r', 'utf-8') as infile:
        for line in infile:
            # Only the first nine tab-separated columns matter here.
            (uri, rel, start, end, context, weight,
             sources, assertion_id, dataset) = line.split('\t')[:9]
            if uri != 'uri' and context == '/ctx/all':
                # Convert once (original converted the same value twice).
                weights[uri] += float(weight)
                assertions[uri] = (rel, start, end, context, weights[uri])
                # Everything except reverb/wiktionary/dbpedia data is CC-By.
                if not dataset.startswith(('/d/reverb', '/d/wiktionary', '/d/dbpedia')):
                    ccby[uri] = True

    writer_core = MultiWriter('assertion_totals_core')
    #writer_sa = MultiWriter('assertion_totals_sa')
    for uri, values in assertions.iteritems():
        relation, start, end, context, weight = values
        if ccby[uri]:
            license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation, start, end, dataset, license,
                         ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
        # Only CC-By edges are written; the -SA writer is disabled.
        if license == '/l/CC/By':
            writer_core.write(edge)
        #else:
            #writer_sa.write(edge)
    writer_core.close()
Example #3
0
def run_single_process():
    """Walk the full RawAssertion table and emit edges for each row."""
    sink = MultiWriter('conceptnet4_nadya')
    queryset = RawAssertion.objects.filter()
    for row in queryset:
        produced = handle_raw_assertion(row)
        for produced_edge in produced:
            sink.write(produced_edge)
def run_single_process():
    """Read every file in ./raw_data/ and write the edges built from each line.

    Files are decoded as UTF-8; undecodable bytes are replaced rather than
    aborting the run.
    """
    writer = MultiWriter("conceptnet4_nadya")
    path = "./raw_data/"
    for filename in os.listdir(path):
        # Close each input file deterministically (original relied on GC).
        with codecs.open(path + filename, encoding="utf-8", errors="replace") as infile:
            for raw_assertion in infile:
                for edge in handle_raw_flat_assertion(raw_assertion):
                    writer.write(edge)
def run_single_process():
    """Read every file in ./raw_data/ and write edges for each raw assertion.

    Files are decoded as UTF-8 with undecodable bytes replaced.
    """
    writer = MultiWriter('conceptnet4')
    path = "./raw_data/"
    for filename in os.listdir(path):
        # Close each input file deterministically (original relied on GC).
        with codecs.open(path + filename, encoding='utf-8', errors='replace') as infile:
            for raw_assertion in infile:
                for edge in handle_raw_assertion(raw_assertion):
                    writer.write(edge)
Example #6
0
def run_single_process():
    """Aggregate every zh raw-data line, then write edges per assertion.

    First pass feeds each line to aggregate_assertion(), which (from its
    use below) populates the module-level assertion_map; second pass turns
    each (assertion, users) pair into edges.
    """
    writer = MultiWriter('conceptnet4_zh')
    path = "./raw_data/"
    for filename in os.listdir(path):
        # Close each input file deterministically (original relied on GC).
        with codecs.open(path + filename, encoding='utf-8', errors='replace') as infile:
            for line in infile:
                aggregate_assertion(line)
    for assertion, users in assertion_map.items():
        for edge in handle_raw_assertion((assertion, users)):
            writer.write(edge)
Example #7
0
def run_single_process():
    """Read every file in ./raw_data/ and write edges for each raw assertion.

    Files are decoded as UTF-8 with undecodable bytes replaced.
    """
    writer = MultiWriter('conceptnet4')
    path = "./raw_data/"
    for filename in os.listdir(path):
        # Close each input file deterministically (original relied on GC).
        with codecs.open(path + filename,
                         encoding='utf-8',
                         errors='replace') as infile:
            for raw_assertion in infile:
                for edge in handle_raw_assertion(raw_assertion):
                    writer.write(edge)
Example #8
0
def run_single_process():
    """Aggregate every zh raw-data line, then write edges per assertion.

    aggregate_assertion() (from its use below) populates the module-level
    assertion_map, which is then drained into edges.
    """
    writer = MultiWriter('conceptnet4_zh')
    path = "./raw_data/"
    for filename in os.listdir(path):
        # Close each input file deterministically (original relied on GC).
        with codecs.open(path + filename,
                         encoding='utf-8',
                         errors='replace') as infile:
            for line in infile:
                aggregate_assertion(line)
    for assertion, users in assertion_map.items():
        for edge in handle_raw_assertion((assertion, users)):
            writer.write(edge)
Example #9
0
def build_core_from_csvs(csv_files):
    """Sum edge weights per assertion URI across several CSV dumps, then
    write one totalled edge per URI to the 'assertion_totals_core' writer.

    csv_files: iterable of paths to tab-separated dump files.
    Only rows in the '/ctx/all' context count; the header row
    (uri == 'uri') is skipped.
    """
    weights = defaultdict(float)
    assertions = {}
    ccby = defaultdict(bool)

    for csv_file in csv_files:
        # print(...) with a single argument behaves identically under
        # Python 2's print statement, so this stays compatible.
        print("currently in file: " + str(csv_file))
        # `with` guarantees each input file is closed (original relied on GC).
        with codecs.open(csv_file, encoding='utf-8') as infile:
            for line in infile:
                # Only the first nine tab-separated columns matter here.
                (uri, rel, start, end, context, weight,
                 sources, assertion_id, dataset) = line.split('\t')[:9]
                if uri != 'uri' and context == '/ctx/all':
                    # Convert once (original converted the same value twice).
                    weights[uri] += float(weight)
                    assertions[uri] = (rel, start, end, context, weights[uri])
                    # Everything except reverb/wiktionary/dbpedia is CC-By.
                    if not dataset.startswith(('/d/reverb', '/d/wiktionary', '/d/dbpedia')):
                        ccby[uri] = True

    print('writing')
    writer_core = MultiWriter('assertion_totals_core')
    #writer_sa = MultiWriter('assertion_totals_sa')

    for uri, values in assertions.iteritems():
        relation, start, end, context, weight = values
        if ccby[uri]:
            license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation,
                         start,
                         end,
                         dataset,
                         license, ['/s/rule/sum_edges'],
                         '/ctx/all',
                         weight=weight)
        # Only CC-By edges are written; the -SA writer is disabled.
        if license == '/l/CC/By':
            writer_core.write(edge)
        #else:
        #writer_sa.write(edge)
    writer_core.close()
Example #10
0
def sum_assertions(file_index):
    """Sum edge weights per assertion URI from one temp core file, then
    write one totalled edge per URI to the 'assertion_totals_core' writer.

    file_index: integer suffix selecting data/temp/core_<index>.txt.
    Only rows in the '/ctx/all' context count; the header row
    (uri == 'uri') is skipped.
    """
    weights = defaultdict(float)
    assertions = {}
    ccby = defaultdict(bool)

    # `with` guarantees the input file is closed even if parsing fails
    # (the original left the codecs handle open until GC).
    with codecs.open(
            CURRENT_DIR + '/data/temp/core_' + str(file_index) + '.txt', 'r',
            'utf-8') as infile:
        for line in infile:
            # Only the first nine tab-separated columns matter here.
            (uri, rel, start, end, context, weight,
             sources, assertion_id, dataset) = line.split('\t')[:9]
            if uri != 'uri' and context == '/ctx/all':
                # Convert once (original converted the same value twice).
                weights[uri] += float(weight)
                assertions[uri] = (rel, start, end, context, weights[uri])
                # Everything except reverb/wiktionary/dbpedia is CC-By.
                if not dataset.startswith(('/d/reverb', '/d/wiktionary', '/d/dbpedia')):
                    ccby[uri] = True

    writer_core = MultiWriter('assertion_totals_core')
    #writer_sa = MultiWriter('assertion_totals_sa')
    for uri, values in assertions.iteritems():
        relation, start, end, context, weight = values
        if ccby[uri]:
            license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation,
                         start,
                         end,
                         dataset,
                         license, ['/s/rule/sum_edges'],
                         '/ctx/all',
                         weight=weight)
        # Only CC-By edges are written; the -SA writer is disabled.
        if license == '/l/CC/By':
            writer_core.write(edge)
        #else:
        #writer_sa.write(edge)
    writer_core.close()
Example #11
0
def build_core_from_csvs(csv_files):
    """Sum edge weights per assertion URI across several CSV dumps, then
    write one totalled edge per URI to the 'assertion_totals_core' writer.

    csv_files: iterable of paths to tab-separated dump files.
    Only rows in the '/ctx/all' context count; the header row
    (uri == 'uri') is skipped.
    """
    weights = defaultdict(float)
    assertions = {}
    ccby = defaultdict(bool)

    for csv_file in csv_files:
        # print(...) with a single argument behaves identically under
        # Python 2's print statement, so this stays compatible.
        print("currently in file: " + str(csv_file))
        # `with` guarantees each input file is closed (original relied on GC).
        with codecs.open(csv_file, encoding='utf-8') as infile:
            for line in infile:
                # Only the first nine tab-separated columns matter here.
                (uri, rel, start, end, context, weight,
                 sources, assertion_id, dataset) = line.split('\t')[:9]
                if uri != 'uri' and context == '/ctx/all':
                    # Convert once (original converted the same value twice).
                    weights[uri] += float(weight)
                    assertions[uri] = (rel, start, end, context, weights[uri])
                    # Everything except reverb/wiktionary/dbpedia is CC-By.
                    if not dataset.startswith(('/d/reverb', '/d/wiktionary', '/d/dbpedia')):
                        ccby[uri] = True

    print('writing')
    writer_core = MultiWriter('assertion_totals_core')
    #writer_sa = MultiWriter('assertion_totals_sa')

    for uri, values in assertions.iteritems():
        relation, start, end, context, weight = values
        if ccby[uri]:
            license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation, start, end, dataset, license,
                         ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
        # Only CC-By edges are written; the -SA writer is disabled.
        if license == '/l/CC/By':
            writer_core.write(edge)
        #else:
            #writer_sa.write(edge)
    writer_core.close()
class FindTranslations(ContentHandler):
    """SAX handler that mines a German Wiktionary XML dump.

    Emits ConceptNet edges for (a) translations found inside
    {{trans-top}}...{{trans-bottom}} blocks and (b) monolingual German
    relations (synonyms, antonyms, hypernyms, ...) found in the
    corresponding template sections.
    """

    def __init__(self):
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None
        self.curTitle = ''
        self.curText = ''
        self.locales = []
        self.curRelation = None
        self.writer = MultiWriter('wiktionary')
        self.trans = False  # in translation mode

    def startElement(self, name, attrs):
        # A <page> element starts a new article; accumulate its text chunks.
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False

    def characters(self, text):
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out of absurdly large articles
                self.inArticle = False

    def handleArticle(self, title, text):
        """Process one article's wikitext line by line."""
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_bottom_match = TRANS_BOTTOM.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)

        ### Get translation
        if trans_top_match:  # start translation part
            self.trans = True
        if self.trans and trans_bottom_match:  # end translation part
            self.trans = False
        if self.trans and line.startswith('*{{'):  # get translation
            lang = line[3:5]  # get language of translation
            # find all translations of that language
            translations = re.findall(u"\{\{Ü.*?\|.*?\|(.*?)\}\}", line)
            for translation in translations:  # iterate over translations
                self.output_sense_translation(lang, translation, title, '')

        ### Get relation
        if line.startswith('{{Synonyme}}'):  # synonym
            self.curRelation = 'synonym'
        elif line.startswith(u'{{Gegenwörter}}'):  # antonym
            self.curRelation = 'antonym'
        elif line.startswith('{{Oberbegriffe}}'):  # hypernym
            self.curRelation = 'hypernym'
        elif line.startswith('{{Unterbegriffe}}'):  # hyponym
            self.curRelation = 'hyponym'
        elif line.startswith('{{Redewendungen}}'):  # idiom
            self.curRelation = 'idiom'
        elif line.startswith('{{Charakteristische Wortkombinationen}}'):
            # word combination (continuation backslash from the original
            # was unnecessary and fragile; plain block form instead)
            self.curRelation = 'word combination'
        elif line.startswith('{{Wortbildungen}}'):  # morphology
            self.curRelation = 'morphology'
        if self.curRelation and line == '':  # a blank line ends the relation section
            self.curRelation = None
        if self.curRelation:
            related_words_or_phrases = re.findall(r"\[\[(.*?)\]\]", line)
            for related_word in related_words_or_phrases:
                self.output_monolingual('deu', self.curRelation,
                                        related_word, title)

    def output_monolingual(self, lang, relation, term1, term2):
        """Write one German-internal relation edge (term1 relation term2)."""
        # skip Wiktionary-namespace artifacts
        if 'Wik' in term1 or 'Wik' in term2:
            return
        source = make_concept_uri(term1, lang)
        if self.pos:
            target = make_concept_uri(term2, lang, self.pos)
        else:
            target = make_concept_uri(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        #print surfaceText

        edge = make_edge('/r/'+relation, source, target, '/d/wiktionary/%s/%s' % (lang, lang),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, MONOLINGUAL],
                         context='/ctx/all',
                         weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, german, disambiguation):
        """Write a TranslationOf edge: `foreign` (in `lang`) -> `german`."""
        if 'Wik' in foreign or 'Wik' in german:
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = make_concept_uri(
          unicodedata.normalize('NFKC', foreign), lang
        )
        target = make_concept_uri(
          german, 'de', disambiguation
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(lang))
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        # BUG FIX: the original interpolated an undefined name `english`
        # here (copied from the English-dump variant of this class), which
        # raised NameError on every call. The parameter is `german`.
        surfaceText = "[[%s]] %s [[%s (%s)]]" % (
            foreign, surfaceRel, german,
            disambiguation.split('/')[-1].replace('_', ' '))
        #print surfaceText
        edge = make_edge(relation, source, target, '/d/wiktionary/en/%s' % lang,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, TRANSLATE],
                         context='/ctx/all',
                         weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_translation(self, foreign, english, locale=''):
        """Write a TranslationOf edge from `foreign` (current language) to English."""
        source = make_concept_uri(
          unicodedata.normalize('NFKC', foreign),
          self.langcode+locale
        )
        target = make_concept_uri(
          english, 'en'
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(self.langcode))
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
        edge = make_edge(relation, source, target, '/d/wiktionary/en/%s' % self.langcode,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, INTERLINGUAL],
                         context='/ctx/all',
                         weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)
Example #13
0
    
    score = (freq*2-1) * (1000-orderscore) * (1-sls) / 1000
    if score <= 0:
        counts['low score'] += 1
        weak_out.write(line)
        continue

    count += 1
    counts['success'] += 1
    good_out.write(line)
    
    if make_json:
        edge = make_edge(rel, left, right, '/d/verbosity',
                         '/l/CC/By', sources, surfaceText=text,
                         weight = score/10.0)
        writer.write(edge)


# Script epilogue: flush/close every output stream opened above.
# The edge writer exists only when JSON output was requested.
if make_json:
    writer.close()

flag_out.close()
good_out.close()
weak_out.close()
similar_out.close()

# Dump the collected text-similarity scores, one per line.
simout = open('data/output/similarity-scores.txt', 'w')
for sim in text_similarities:
    print >> simout, sim
simout.close()
Example #14
0
class FindTranslations(ContentHandler):
    """SAX handler that mines an English Wiktionary XML dump.

    Emits ConceptNet edges for cross-language translations (from
    {{trans-top}} tables and foreign-word definition lines) and for
    monolingual relations (Synonyms, Antonyms, Related/Derived terms).
    State is carried across lines of the current article in self.lang,
    self.pos, self.curSense, self.curRelation and self.locales.
    """

    def __init__(self):
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None
        self.curTitle = ''
        self.curText = ''
        self.locales = []
        self.curRelation = None
        self.writer = MultiWriter('wiktionary')

    def startElement(self, name, attrs):
        # A <page> element starts a new article; text chunks accumulate in a list.
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False
    
    def characters(self, text):
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out
                self.inArticle = False

    def handleArticle(self, title, text):
        """Process one article's wikitext line by line, resetting POS state."""
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        """Dispatch one stripped wikitext line against the section state machine."""
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)
        if line.startswith('===') and line.endswith('==='):
            # A === heading: either a relation section or a part of speech.
            pos = line.strip('= ')
            if pos == 'Synonyms':
                self.curRelation = 'Synonym'
            # NOTE(review): English Wiktionary section headings are plural
            # ('Antonyms'); comparing against singular 'Antonym' looks like
            # it can never match — confirm against the dump before changing.
            elif pos == 'Antonym':
                self.curRelation = 'Antonym'
            elif pos == 'Related terms':
                self.curRelation = 'ConceptuallyRelatedTo'
            elif pos == 'Derived terms':
                if not line.startswith('===='):
                    # this is at the same level as the part of speech;
                    # now we don't know what POS these apply to
                    self.pos = None
                self.curRelation = 'DerivedFrom'
            else:
                self.curRelation = None
                if pos in PARTS_OF_SPEECH:
                    self.pos = PARTS_OF_SPEECH[pos]
        elif language_match:
            # New language section: remember its name and short code (if known).
            self.lang = language_match.group(1)
            self.langcode = LANGUAGES.get(self.lang)
        elif chinese_match:
            # Chinese script tag: 's' = simplified (_CN), 't' = traditional (_TW).
            scripttag = chinese_match.group(2)
            self.locales = []
            if 's' in scripttag:
                self.locales.append('_CN')
            if 't' in scripttag:
                self.locales.append('_TW')
        elif line[0:1] == '#' and self.lang != 'English' and self.lang is not None:
            # Definition line of a non-English word: the definition itself is
            # an English gloss, i.e. a translation of the title word.
            defn = line[1:].strip()
            if defn[0:1] not in ':*#':
                for defn2 in filter_line(defn):
                    if not ascii_enough(defn2): continue
                    if 'Index:' in title: continue
                    if self.langcode == 'zh':
                        for locale in self.locales:
                            self.output_translation(title, defn2, locale)
                    elif self.langcode:
                        self.output_translation(title, defn2)
        elif line[0:4] == '----':
            # Horizontal rule separates language sections; reset all state.
            self.pos = None
            self.lang = None
            self.langcode = None
            self.curRelation = None
        elif trans_top_match:
            # Start of a translation table; capture its sense label.
            pos = self.pos or 'n'
            sense = trans_top_match.group(1).split(';')[0].strip('.')
            if 'translations' in sense.lower():
                self.curSense = None
            else:
                self.curSense = pos+'/'+sense
        elif trans_tag_match:
            lang = trans_tag_match.group(1)
            translation = trans_tag_match.group(2)
            if self.curSense is not None and self.lang == 'English':
                # handle Chinese separately
                if lang not in ('cmn', 'yue', 'zh-yue', 'zh'):
                    self.output_sense_translation(lang, translation, title,
                                                  self.curSense)
        elif '{{trans-bottom}}' in line:
            self.curSense = None
        elif line.startswith('* ') and self.curRelation and self.langcode:
            relatedmatch = WIKILINK.search(line)
            if relatedmatch:
                related = relatedmatch.group(1)
                self.output_monolingual(self.langcode, self.curRelation,
                                        related, title)
    
    def output_monolingual(self, lang, relation, term1, term2):
        """Write one same-language relation edge (term1 relation term2)."""
        # skip Wiktionary-namespace artifacts
        if 'Wik' in term1 or 'Wik' in term2:
            return
        source = make_concept_uri(term1, lang)
        if self.pos:
            target = make_concept_uri(term2, lang, self.pos)
        else:
            target = make_concept_uri(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        #print surfaceText

        edge = make_edge('/r/'+relation, source, target, '/d/wiktionary/%s/%s' % (lang, lang),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, MONOLINGUAL],
                         context='/ctx/all',
                         weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, english, disambiguation):
        """Write a TranslationOf edge: `foreign` (in `lang`) -> `english`."""
        if 'Wik' in foreign or 'Wik' in english:
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = make_concept_uri(
          unicodedata.normalize('NFKC', foreign), lang
        )
        target = make_concept_uri(
          english, 'en', disambiguation
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(lang))
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        surfaceText = "[[%s]] %s [[%s (%s)]]" % (foreign, surfaceRel, english, disambiguation.split('/')[-1].replace('_', ' '))
        #print surfaceText
        edge = make_edge(relation, source, target, '/d/wiktionary/en/%s' % lang,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, TRANSLATE],
                         context='/ctx/all',
                         weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)
        
    def output_translation(self, foreign, english, locale=''):
        """Write a TranslationOf edge from `foreign` (current section language) to English."""
        source = make_concept_uri(
          unicodedata.normalize('NFKC', foreign),
          self.langcode+locale
        )
        target = make_concept_uri(
          english, 'en'
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(self.langcode))
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
        edge = make_edge(relation, source, target, '/d/wiktionary/en/%s' % self.langcode,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, INTERLINGUAL],
                         context='/ctx/all',
                         weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)
class FindTranslations(ContentHandler):
    """SAX handler that mines a Japanese Wiktionary XML dump.

    Emits ConceptNet edges for sense-specific and non-sense-specific
    translations and for related-word sections ({{rel}}/syn/drv).
    Per-article state lives in self.langcode, self.pos, self.curSense,
    self.curRelation and self.nosensetrans.
    """

    def __init__(self):
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None
        self.curTitle = ''
        self.curText = ''
        self.locales = []
        self.curRelation = None
        self.writer = MultiWriter('wiktionary_ja')
        self.nosensetrans = None # non-sense-specific translation

    def startElement(self, name, attrs):
        # A <page> element starts a new article; text chunks accumulate in a list.
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False
    
    def characters(self, text):
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out
                self.inArticle = False

    def handleArticle(self, title, text):
        """Process one article's wikitext line by line, resetting POS state."""
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        """Dispatch one stripped wikitext line against the section state machine."""
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_bottom_match = TRANS_BOTTOM.match(line)
        trans_match = TRANS.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)

        if language_match:
            self.langcode = get_language_code(language_match.group(1))
        
        ### Get sense-specific translation
        if trans_top_match: # start translation part
            pos = self.pos or 'n'
            # get translation sense
            if trans_top_match.group(1):
                sense = trans_top_match.group(1).lstrip('|')
                self.curSense = pos+'/'+sense
                return
            else:
                self.curSense = pos
                return
        if trans_bottom_match: # end translation part
            self.curSense = None
            return
        if self.curSense and line[0:5] == '*[[{{': # get translation
            lang = line[5:].split('}')[0]  # get language of translation
            if lang in LANGUAGES_3_TO_2:   # convert 3-letter code to 2-letter code
                lang = LANGUAGES_3_TO_2[lang]
            # find all translations of that language
            # [1:] skips the first wikilink, which is the language name itself
            translations = re.findall(r"\[\[(.*?)\]\]", line)[1:] 
            for translation in translations: # iterate over translations
                self.output_sense_translation(lang, translation, title, \
                                              self.curSense)
            return

        ### Get relation
        if line.startswith('===={{rel}}===='): # start relation part
            self.curRelation = 'ConceptuallyRelatedTo'
            return
        if self.curRelation and self.langcode: # within relation part
            if line.startswith('*'): # get relation
                relations = re.findall(r"\{\{(.*?)\}\}", line)
                if len(relations) > 0:
                    if relations[0] == 'syn': # synonym
                        self.curRelation = 'Synonym'
                    if relations[0] == 'drv': # derivative
                        self.curRelation = 'Derivative'                    
                related_words = re.findall(r"\[\[(.*?)\]\]", line)
                for related_word in related_words:
                    self.output_monolingual(self.langcode, self.curRelation, \
                                            related_word, title)
                self.curRelation = 'ConceptuallyRelatedTo' # back to default
            else:
                self.curRelation = None

        ### Get non-sense-specific translation
        # Two-step state: 1 = TRANS header seen, 2 = inside a {{top}} block.
        if trans_match: 
            self.nosensetrans = 1 # *maybe* start non-sense-specific translation
        if self.nosensetrans == 1 and line.startswith('{{top}}'):
            self.nosensetrans = 2 # start non-sense-specific translation            
        if self.nosensetrans == 2:
            if line.startswith('{{bottom}}'):
                self.nosensetrans = None
                return
            if line.startswith('*{{'):
                lang = line[3:].split('}')[0]
                if lang in LANGUAGES_3_TO_2: # convert 3-letter code to 2-letter code
                    lang = LANGUAGES_3_TO_2[lang]
                translations = re.findall(r"\[\[(.*?)\]\]", line)
                for translation in translations:
                    self.output_sense_translation(lang, translation, title, '')
    
    def output_monolingual(self, lang, relation, term1, term2):
        """Write one same-language relation edge (term1 relation term2)."""
        # skip Wiktionary: links and templates
        if u'ウィク' in term1 or u'ウィク' in term2:
            return
        if u'テンプレート' in term1 or u'テンプレート' in term2:
            return

        if lang in LANGUAGES_3_TO_2: # convert 3-letter code to 2-letter code
            lang = LANGUAGES_3_TO_2[lang]
        source = make_concept_uri_safe(term1, lang)
        if self.pos:
            target = make_concept_uri_safe(term2, lang, self.pos)
        else:
            target = make_concept_uri_safe(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        #print surfaceText

        edge = make_edge('/r/'+relation, source, target, '/d/wiktionary/ja/%s' % (lang),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, MONOLINGUAL],
                         context='/ctx/all',
                         weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, translated, disambiguation):
        """Write a TranslationOf edge: `foreign` (in `lang`) -> `translated`."""
        # colon marks namespace links (e.g. Wiktionary:...) — skip those
        if u':' in foreign or u':' in translated:
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = make_concept_uri_safe(
          unicodedata.normalize('NFKC', foreign), lang
        )
        target = make_concept_uri_safe(
          translated, self.langcode, disambiguation
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(lang))
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        if disambiguation and '/' in disambiguation:
            surfaceText = "[[%s]] %s [[%s (%s)]]" % (foreign, surfaceRel, translated, disambiguation.split('/')[-1].replace('_', ' '))
        else:
            surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, translated)
        #print surfaceText
        edge = make_edge(relation, source, target, '/d/wiktionary/ja/%s' % (self.langcode),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, TRANSLATE],
                         context='/ctx/all',
                         weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)
        
    def output_translation(self, foreign, japanese, locale=''):
        """Write a TranslationOf edge from `foreign` (current section language) to Japanese."""
        source = make_concept_uri_safe(
          unicodedata.normalize('NFKC', foreign),
          self.langcode+locale
        )
        target = make_concept_uri_safe(
          japanese, 'ja'
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(self.langcode))
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, japanese)
        edge = make_edge(relation, source, target, '/d/wiktionary/ja/%s' % self.langcode,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, INTERLINGUAL],
                         context='/ctx/all',
                         weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)
Example #16
0
class FindTranslations(ContentHandler):
    """SAX ContentHandler that scans a German Wiktionary XML dump and emits
    ConceptNet edges for translations and for intra-German lexical relations
    (synonyms, antonyms, hypernyms, hyponyms, idioms, ...)."""

    def __init__(self):
        self.lang = None          # name of the language section being read
        self.langcode = None      # short code corresponding to self.lang
        self.inArticle = False    # currently inside a <page> element
        self.inTitle = False      # currently inside a <title> element
        self.curSense = None
        self.curTitle = ""
        self.curText = ""         # becomes a list of chunks once a page starts
        self.locales = []
        self.curRelation = None   # relation implied by the current section
        self.writer = MultiWriter("wiktionary")
        self.trans = False  # in translation mode

    def startElement(self, name, attrs):
        """Track entry into <page> and <title> elements."""
        if name == "page":
            self.inArticle = True
            self.curText = []
        elif name == "title":
            self.inTitle = True
            self.curTitle = ""

    def endElement(self, name):
        """On </page>, hand the accumulated article text to handleArticle."""
        if name == "page":
            self.inArticle = False
            self.handleArticle(self.curTitle, "".join(self.curText))
        elif name == "title":
            self.inTitle = False

    def characters(self, text):
        """Accumulate character data for the title or the article body."""
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out on pathologically long pages
                self.inArticle = False

    def handleArticle(self, title, text):
        """Process one article's wikitext, line by line."""
        lines = text.split("\n")
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        """Interpret a single wikitext line: translation tables delimited by
        TRANS_TOP/TRANS_BOTTOM, and relation sections such as {{Synonyme}}."""
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_bottom_match = TRANS_BOTTOM.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)

        ### Get translation
        if trans_top_match:  # start translation part
            self.trans = True
        if self.trans and trans_bottom_match:  # end translation part
            self.trans = False
        if self.trans and line.startswith("*{{"):  # get translation
            lang = line[3:5]  # get language of translation
            # find all translations of that language
            translations = re.findall(u"\{\{Ü.*?\|.*?\|(.*?)\}\}", line)
            for translation in translations:  # iterate over translations
                self.output_sense_translation(lang, translation, title, "")

        ### Get relation
        if line.startswith("{{Synonyme}}"):  # synonym
            self.curRelation = "synonym"
        elif line.startswith(u"{{Gegenwörter}}"):  # antonym
            self.curRelation = "antonym"
        elif line.startswith("{{Oberbegriffe}}"):  # hypernym
            self.curRelation = "hypernym"
        elif line.startswith("{{Unterbegriffe}}"):  # hyponym
            self.curRelation = "hyponym"
        elif line.startswith("{{Redewendungen}}"):  # idiom
            self.curRelation = "idiom"
        elif line.startswith("{{Charakteristische Wortkombinationen}}"):
        # word combination
            self.curRelation = "word combination"
        elif line.startswith("{{Wortbildungen}}"):  # morphology
            self.curRelation = "morphology"
        if self.curRelation and line == "":  # blank line ends the section
            self.curRelation = None
        if self.curRelation:
            # Every [[wikilink]] on the line is related to the headword.
            related_words_or_phrases = re.findall(r"\[\[(.*?)\]\]", line)
            for related_word in related_words_or_phrases:
                self.output_monolingual("deu", self.curRelation, related_word, title)

    def output_monolingual(self, lang, relation, term1, term2):
        """Write an edge for a monolingual (German-German) lexical relation."""
        if "Wik" in term1 or "Wik" in term2:
            return
        source = make_concept_uri(term1, lang)
        if self.pos:
            target = make_concept_uri(term2, lang, self.pos)
        else:
            target = make_concept_uri(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        # print surfaceText

        edge = make_edge(
            "/r/" + relation,
            source,
            target,
            "/d/wiktionary/%s/%s" % (lang, lang),
            license="/l/CC/By-SA",
            sources=[SOURCE, MONOLINGUAL],
            context="/ctx/all",
            weight=1.5,
            surfaceText=surfaceText,
        )
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, german, disambiguation):
        """Write a TranslationOf edge from a foreign term to the German
        headword it translates."""
        if "Wik" in foreign or "Wik" in german:
            return
        # Normalize Wiktionary's Chinese subtags to ConceptNet locales.
        if lang == "zh-cn":
            lang = "zh_CN"
        elif lang == "zh-tw":
            lang = "zh_TW"
        source = make_concept_uri(unicodedata.normalize("NFKC", foreign), lang)
        target = make_concept_uri(german, "de", disambiguation)
        relation = "/r/TranslationOf"
        try:
            surfaceRel = "is %s for" % (langs.english_name(lang))
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        # BUG FIX: the original referenced an undefined name `english` here,
        # raising NameError whenever this method ran; the German headword is
        # the correct right-hand term.  Also skip the "( )" suffix when no
        # sense disambiguation was supplied (the only caller passes "").
        if disambiguation and "/" in disambiguation:
            surfaceText = "[[%s]] %s [[%s (%s)]]" % (
                foreign,
                surfaceRel,
                german,
                disambiguation.split("/")[-1].replace("_", " "),
            )
        else:
            surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, german)
        # print surfaceText
        edge = make_edge(
            relation,
            source,
            target,
            "/d/wiktionary/en/%s" % lang,
            license="/l/CC/By-SA",
            sources=[SOURCE, TRANSLATE],
            context="/ctx/all",
            weight=1.5,
            surfaceText=surfaceText,
        )
        self.writer.write(edge)

    def output_translation(self, foreign, english, locale=""):
        """Write a TranslationOf edge from a foreign term to its English
        equivalent, tagged with an optional Chinese locale suffix."""
        source = make_concept_uri(unicodedata.normalize("NFKC", foreign), self.langcode + locale)
        target = make_concept_uri(english, "en")
        relation = "/r/TranslationOf"
        try:
            surfaceRel = "is %s for" % (langs.english_name(self.langcode))
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
        edge = make_edge(
            relation,
            source,
            target,
            "/d/wiktionary/en/%s" % self.langcode,
            license="/l/CC/By-SA",
            sources=[SOURCE, INTERLINGUAL],
            context="/ctx/all",
            weight=1.5,
            surfaceText=surfaceText,
        )
        self.writer.write(edge)
Example #17
0
class FindTranslations(ContentHandler):
    """SAX ContentHandler that scans an English Wiktionary XML dump and
    emits ConceptNet edges for translations (from translation tables and
    from foreign-word definition glosses) and for lexical relations such as
    synonyms, antonyms, related terms, and derived terms."""

    def __init__(self):
        self.lang = None          # name of the language section being read
        self.langcode = None      # ConceptNet code for self.lang
        self.inArticle = False    # currently inside a <page> element
        self.inTitle = False      # currently inside a <title> element
        self.curSense = None      # "pos/sense" label of the open translation table
        self.curTitle = ''
        self.curText = ''         # becomes a list of chunks once a page starts
        self.locales = []         # Chinese locale suffixes ('_CN', '_TW')
        self.curRelation = None   # relation implied by the current section header
        self.writer = MultiWriter('wiktionary')

    def startElement(self, name, attrs):
        """Track entry into <page> and <title> elements."""
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        """On </page>, hand the accumulated article text to handleArticle."""
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False

    def characters(self, text):
        """Accumulate character data for the title or the article body."""
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out on pathologically long pages
                self.inArticle = False

    def handleArticle(self, title, text):
        """Process one article's wikitext, line by line."""
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        """Interpret one wikitext line: section headers, language headers,
        translation tables, definition glosses, and relation lists."""
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)
        if line.startswith('===') and line.endswith('==='):
            pos = line.strip('= ')
            if pos == 'Synonyms':
                self.curRelation = 'Synonym'
            elif pos in ('Antonyms', 'Antonym'):
                # BUG FIX: Wiktionary's section header is the plural
                # "Antonyms"; the original compared against the singular
                # only, so this branch could never fire.
                self.curRelation = 'Antonym'
            elif pos == 'Related terms':
                self.curRelation = 'ConceptuallyRelatedTo'
            elif pos == 'Derived terms':
                if not line.startswith('===='):
                    # this is at the same level as the part of speech;
                    # now we don't know what POS these apply to
                    self.pos = None
                self.curRelation = 'DerivedFrom'
            else:
                self.curRelation = None
                if pos in PARTS_OF_SPEECH:
                    self.pos = PARTS_OF_SPEECH[pos]
        elif language_match:
            self.lang = language_match.group(1)
            self.langcode = LANGUAGES.get(self.lang)
        elif chinese_match:
            # {{zh-...}} script tag: decide which locales the entry covers.
            scripttag = chinese_match.group(2)
            self.locales = []
            if 's' in scripttag:
                self.locales.append('_CN')
            if 't' in scripttag:
                self.locales.append('_TW')
        elif line[
                0:
                1] == '#' and self.lang != 'English' and self.lang is not None:
            # A definition line in a foreign-language section: its English
            # gloss is a translation of the headword.
            defn = line[1:].strip()
            if defn[0:1] not in ':*#':
                for defn2 in filter_line(defn):
                    if not ascii_enough(defn2): continue
                    if 'Index:' in title: continue
                    if self.langcode == 'zh':
                        # Chinese entries are emitted once per locale.
                        for locale in self.locales:
                            self.output_translation(title, defn2, locale)
                    elif self.langcode:
                        self.output_translation(title, defn2)
        elif line[0:4] == '----':
            # Horizontal rule separates language sections; reset state.
            self.pos = None
            self.lang = None
            self.langcode = None
            self.curRelation = None
        elif trans_top_match:
            pos = self.pos or 'n'
            sense = trans_top_match.group(1).split(';')[0].strip('.')
            if 'translations' in sense.lower():
                self.curSense = None
            else:
                self.curSense = pos + '/' + sense
        elif trans_tag_match:
            lang = trans_tag_match.group(1)
            translation = trans_tag_match.group(2)
            if self.curSense is not None and self.lang == 'English':
                # handle Chinese separately
                if lang not in ('cmn', 'yue', 'zh-yue', 'zh'):
                    self.output_sense_translation(lang, translation, title,
                                                  self.curSense)
        elif '{{trans-bottom}}' in line:
            self.curSense = None
        elif line.startswith('* ') and self.curRelation and self.langcode:
            relatedmatch = WIKILINK.search(line)
            if relatedmatch:
                related = relatedmatch.group(1)
                self.output_monolingual(self.langcode, self.curRelation,
                                        related, title)

    def output_monolingual(self, lang, relation, term1, term2):
        """Write an edge for a monolingual lexical relation."""
        if 'Wik' in term1 or 'Wik' in term2:
            return
        source = make_concept_uri(term1, lang)
        if self.pos:
            target = make_concept_uri(term2, lang, self.pos)
        else:
            target = make_concept_uri(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        # Parenthesized print: identical output under Python 2, and keeps
        # the module parseable under Python 3.
        print(surfaceText)

        edge = make_edge('/r/' + relation,
                         source,
                         target,
                         '/d/wiktionary/%s/%s' % (lang, lang),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, MONOLINGUAL],
                         context='/ctx/all',
                         weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, english, disambiguation):
        """Write a TranslationOf edge from a foreign term to the English
        headword, disambiguated by part of speech and sense."""
        if 'Wik' in foreign or 'Wik' in english:
            return
        # Normalize Wiktionary's Chinese subtags to ConceptNet locales.
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = make_concept_uri(unicodedata.normalize('NFKC', foreign), lang)
        target = make_concept_uri(english, 'en', disambiguation)
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(lang))
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        surfaceText = "[[%s]] %s [[%s (%s)]]" % (
            foreign, surfaceRel, english,
            disambiguation.split('/')[-1].replace('_', ' '))
        print(surfaceText)
        edge = make_edge(relation,
                         source,
                         target,
                         '/d/wiktionary/en/%s' % lang,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, TRANSLATE],
                         context='/ctx/all',
                         weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_translation(self, foreign, english, locale=''):
        """Write a TranslationOf edge from a foreign term to its English
        equivalent, tagged with an optional Chinese locale suffix."""
        source = make_concept_uri(unicodedata.normalize('NFKC', foreign),
                                  self.langcode + locale)
        target = make_concept_uri(english, 'en')
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(self.langcode))
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
        edge = make_edge(relation,
                         source,
                         target,
                         '/d/wiktionary/en/%s' % self.langcode,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, INTERLINGUAL],
                         context='/ctx/all',
                         weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)
# Sum the weights of identical assertion URIs across the flat CORE file,
# then write one combined edge per URI.  (weights, assertions, and ccby are
# the accumulators set up earlier in this script.)
for line in codecs.open('data/flat/CORE', encoding='utf-8'):
    uri, rel, start, end, context, weight, sources, id, dataset = line.split('\t')[:9]
    if uri != 'uri' and context == '/ctx/all':
        weight = float(weight)
        weights[uri] += float(weight)
        # BUG FIX: record the accumulated total weights[uri], not just this
        # line's weight — otherwise the summation is computed but discarded
        # and the emitted edge reflects only the last contributing line.
        # (Matches the behavior of sum_assertions elsewhere in this file.)
        assertions[uri] = (rel, start, end, context, weights[uri])
        # Only assertions whose datasets are all CC-By-licensed qualify for
        # the core build; reverb/wiktionary/dbpedia are CC-By-SA.
        if not (dataset.startswith('/d/reverb') or dataset.startswith('/d/wiktionary') or dataset.startswith('/d/dbpedia')):
            ccby[uri] = True

print('writing')
writer_core = MultiWriter('assertion_totals_core')
#writer_sa = MultiWriter('assertion_totals_sa')

for uri, values in assertions.iteritems():
    if ccby[uri]:
        license = '/l/CC/By'
        dataset = '/d/conceptnet/5/combined-core'
    else:
        license = '/l/CC/By-SA'
        dataset = '/d/conceptnet/5/combined-sa'
    relation, start, end, context, weight = values
    edge = make_edge(relation, start, end, dataset, license, ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
    if license == '/l/CC/By':
        writer_core.write(edge)
    #else:
    #    writer_sa.write(edge)
writer_core.close()
#writer_sa.close()

Example #19
0
    if score <= 0:
        counts['low score'] += 1
        weak_out.write(line)
        continue

    count += 1
    counts['success'] += 1
    good_out.write(line)
    
    if make_json:
        left = make_concept_uri(unicode(left), 'en')
        right = make_concept_uri(unicode(right), 'en')
        edge = make_edge(rel, left, right, '/d/verbosity',
                         '/l/CC/By', sources, surfaceText=text,
                         weight = score/10.0)
        writer.write(edge)


# Flush the JSON edge writer only if JSON output was requested above.
if make_json:
    writer.close()

# Close all of the per-category output streams.
flag_out.close()
good_out.close()
weak_out.close()
similar_out.close()

# Dump the collected text-similarity scores, one per line.
simout = open('data/output/similarity-scores.txt', 'w')
for sim in text_similarities:
    print >> simout, sim
simout.close()
Example #20
0
        if not (dataset.startswith('/d/reverb')
                or dataset.startswith('/d/wiktionary')
                or dataset.startswith('/d/dbpedia')):
            ccby[uri] = True

# Write one summed-up edge per assertion URI.  Assertions whose sources are
# all CC-By-licensed go into the combined-core dataset; the rest would
# belong to the By-SA build, whose writer is currently commented out.
print 'writing'
writer_core = MultiWriter('assertion_totals_core')
#writer_sa = MultiWriter('assertion_totals_sa')

for uri, weight in assertions.iteritems():
    if ccby[uri]:
        license = '/l/CC/By'
        dataset = '/d/conceptnet/5/combined-core'
    else:
        license = '/l/CC/By-SA'
        dataset = '/d/conceptnet/5/combined-sa'
    # Unpack the stored record; this rebinds `weight` from the loop
    # variable to the weight saved in the assertions table.
    relation, start, end, context, weight = assertions[uri]
    edge = make_edge(relation,
                     start,
                     end,
                     dataset,
                     license, ['/s/rule/sum_edges'],
                     '/ctx/all',
                     weight=weight)
    if license == '/l/CC/By':
        writer_core.write(edge)
    #else:
    #    writer_sa.write(edge)
writer_core.close()
#writer_sa.close()