Example #1
 def __init__(self, out_filename='wiktionary.json'):
     self.lang = None
     self.langcode = None
     self.inArticle = False
     self.inTitle = False
     self.curSense = None
     self.curTitle = ''
     self.curText = ''
     self.locales = []
     self.curRelation = None
     self.writer = FlatEdgeWriter(out_filename)
Example #2
 def __init__(self, out_filename='wiktionary_ja.json'):
     self.lang = None
     self.langcode = None
     self.inArticle = False
     self.inTitle = False
     self.curSense = None
     self.curTitle = ''
     self.curText = ''
     self.locales = []
     self.curRelation = None
     self.writer = FlatEdgeWriter(out_filename)
     self.nosensetrans = None  # non-sense-specific translation
Example #3
 def __init__(self, out_filename='wiktionary.json'):
     self.lang = None
     self.langcode = None
     self.inArticle = False
     self.inTitle = False
     self.curSense = None
     self.curTitle = ''
     self.curText = ''
     self.locales = []
     self.curRelation = None
     self.writer = FlatEdgeWriter(out_filename)
Example #4
 def __init__(self, out_filename='wiktionary_ja.json'):
     self.lang = None
     self.langcode = None
     self.inArticle = False
     self.inTitle = False
     self.curSense = None
     self.curTitle = ''
     self.curText = ''
     self.locales = []
     self.curRelation = None
     self.writer = FlatEdgeWriter(out_filename)
     self.nosensetrans = None # non-sense-specific translation
Example #5
# stdlib and conceptnet5 imports used below (the conceptnet5 names match
# the imports shown in Example #7)
import re
from collections import defaultdict
from itertools import chain
from conceptnet5.nodes import make_concept_uri
from conceptnet5.edges import make_edge, FlatEdgeWriter

def run_wordnet(input_dir, output_file, sw_map_file):
    mapping = {}
    labels = {}
    prefixes = {}
    glossary = {}
    synset_senses = defaultdict(list)
    synset_sense_names = defaultdict(list)
    sense_name_synsets = defaultdict(list)
    sense_synsets = defaultdict(list)

    parts_of_speech = {
        'noun': 'n',
        'verb': 'v',
        'adjective': 'a',
        'adjectivesatellite': 'a',
        'adverb': 'r',
    }

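    # A value beginning with '~' marks a relation that WordNet states in the
    # opposite direction from ConceptNet; the edge loop below swaps subject
    # and object (and renames 'meronym' to 'holonym') before stripping '~'.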
    rel_mapping = {
        'attribute': 'Attribute',
        'causes': 'Causes',
        'classifiedByRegion': 'HasContext',
        'classifiedByUsage': 'HasContext',
        'classifiedByTopic': 'HasContext',
        'entails': 'Entails',
        'hyponymOf': 'IsA',
        'instanceOf': 'InstanceOf',
        'memberMeronymOf': 'MemberOf',
        'partMeronymOf': 'PartOf',
        'sameVerbGroupAs': 'SimilarTo',
        'similarTo': 'SimilarTo',
        'substanceMeronymOf': '~MadeOf',
        'antonymOf': 'Antonym',
        'derivationallyRelated': '~DerivedFrom',
        'pertainsTo': 'PertainsTo',
        'seeAlso': 'RelatedTo',
    }

    def resolve_prefix(entry):
        prefix, name = entry.split(':')
        return prefixes[prefix] + name

    def handle_line(line):
        """
        Get the (subj, obj, pred) parts of a line, unless it's a blank line
        or a prefix definition, in which case return None.
        """
        line = line.decode('utf-8').strip()
        if not line:
            return None
        parts = line.split(None, 2)
        if parts[0] == '@prefix':
            prefix = parts[1].strip(': ')
            value = parts[2].strip('<>. ')
            prefixes[prefix] = value
            return None
        return parts[0], parts[1], parts[2].strip('. ')

    # First, get the human-readable label and gloss for every synset.
    for line in chain(
        open(input_dir + '/wordnet-synset.ttl'),
        open(input_dir + '/full/wordnet-wordsensesandwords.ttl'),
        open(input_dir + '/wordnet-glossary.ttl')
    ):
        parts = handle_line(line)
        if parts is None:
            continue
        if parts[1] == 'rdfs:label':
            subj = resolve_prefix(parts[0])
            obj = parts[2].split('"')[1]
            labels[subj] = obj
        elif parts[1] == 'wn20schema:gloss':
            subj = resolve_prefix(parts[0])
            obj = parts[2].split('"')[1]
            glossary[subj] = obj.split(';')[0]
            while '(' in glossary[subj] and ')' in glossary[subj]:
                glossary[subj] = re.sub(r"\([^)]+\) ?", r"", glossary[subj])

    # Get the list of word senses in each synset, and make a bidirectional mapping.
    for line in open(input_dir + '/full/wordnet-wordsense-synset-relations.ttl'):
        parts = handle_line(line)
        if parts is None:
            continue
        if parts[1] == 'wn20schema:containsWordSense':
            subj = resolve_prefix(parts[0])
            obj = resolve_prefix(parts[2].strip('. '))
            synset_senses[subj].append(obj)
            sense_synsets[obj] = subj
            sense_name = labels[obj]
            synset_sense_names[subj].append(sense_name)
            sense_name_synsets[sense_name].append(subj)

    # Assign every synset a disambiguation name.
    for synset in synset_senses:
        senses = sorted(synset_senses[synset])
        synset_name = labels[synset]
        synset_pos = synset.split('-')[-2]
        pos = parts_of_speech[synset_pos]
        disambig = glossary[synset].replace('/', '_')
        # TODO: take into account domains, etc.
        #
        #if len(sense_name_synsets[synset_name]) > 1:
        #    for sense in senses:
        #        sense_name = labels[sense]
        #        more_synsets = sense_name_synsets[sense_name]
        #        if len(more_synsets) == 1:
        #            disambig = sense_name
        #            break
        #    if disambig is None:
        #        disambig = glossary[synset]
        #if disambig is None:
        #    disambig = '*'
        node = make_concept_uri(synset_name, 'en', pos+'/'+disambig)
        if synset not in mapping:
            mapping[synset] = node

    # Map senses to the same nodes.
    for sense, synset in sense_synsets.items():
        mapping[sense] = mapping[synset]

    sources = ['/s/wordnet/3.0']
    writer = FlatEdgeWriter(output_file)
    sw_map = FlatEdgeWriter(sw_map_file)
    sw_map_used = set()

    for line in chain(
        open(input_dir + '/wordnet-attribute.ttl'),
        open(input_dir + '/wordnet-causes.ttl'),
        open(input_dir + '/wordnet-classifiedby.ttl'),
        open(input_dir + '/wordnet-entailment.ttl'),
        open(input_dir + '/wordnet-hyponym.ttl'),
        open(input_dir + '/wordnet-instances.ttl'),
        open(input_dir + '/wordnet-membermeronym.ttl'),
        open(input_dir + '/wordnet-partmeronym.ttl'),
        open(input_dir + '/wordnet-sameverbgroupas.ttl'),
        open(input_dir + '/wordnet-similarity.ttl'),
        open(input_dir + '/wordnet-substancemeronym.ttl'),
        open(input_dir + '/full/wordnet-antonym.ttl'),
        open(input_dir + '/full/wordnet-derivationallyrelated.ttl'),
        open(input_dir + '/full/wordnet-participleof.ttl'),
        open(input_dir + '/full/wordnet-pertainsto.ttl'),
        open(input_dir + '/full/wordnet-seealso.ttl'),
    ):
        parts = handle_line(line)
        if parts is None:
            continue
        web_subj = resolve_prefix(parts[0])
        web_rel = resolve_prefix(parts[1])
        web_obj = resolve_prefix(parts[2])
        subj = mapping[web_subj]
        obj = mapping[web_obj]
        pred_label = parts[1].split(':')[-1]
        if pred_label in rel_mapping:
            mapped = rel_mapping[pred_label]
            if mapped.startswith('~'):
                subj, obj = obj, subj
                web_subj, web_obj = web_obj, web_subj
                web_rel = web_rel.replace('meronym', 'holonym')
                mapped = mapped[1:]
            pred = '/r/'+mapped
        else:
            pred = '/r/wordnet/'+pred_label

        if (web_rel, pred) not in sw_map_used:
            sw_map.write({'from': web_rel, 'to': pred})
            sw_map_used.add((web_rel, pred))
        if (web_subj, subj) not in sw_map_used:
            sw_map.write({'from': web_subj, 'to': subj})
            sw_map_used.add((web_subj, subj))
        if (web_obj, obj) not in sw_map_used:
            sw_map.write({'from': web_obj, 'to': obj})
            sw_map_used.add((web_obj, obj))

        edge = make_edge(
            pred, subj, obj, '/d/wordnet/3.0',
            license='/l/CC/By', sources=sources,
            context='/ctx/all', weight=2.0
        )
        writer.write(edge)

    writer.close()
    sw_map.close()
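
A hypothetical driver for this reader; the input directory and output path are made-up examples, while the sw-map filename matches the one used in Example #9:

if __name__ == '__main__':
    run_wordnet('data/wordnet-rdf', 'data/output/wordnet3.json',
                'data/sw/wordnet30.map.json')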
Example #6
import re
import unicodedata
from xml.sax.handler import ContentHandler
from conceptnet5.edges import make_edge, FlatEdgeWriter

# Module-level regexes (LANGUAGE_HEADER, TRANS_TOP, TRANS_BOTTOM, TRANS,
# TRANS_TAG, CHINESE_TAG), the LANGUAGES_3_TO_2 table, source URIs (SOURCE,
# MONOLINGUAL, TRANSLATE, INTERLINGUAL) and helpers (make_concept_uri_safe,
# get_language_code, langs) are defined elsewhere in this module.

class FindTranslations(ContentHandler):
    def __init__(self, out_filename='wiktionary_ja.json'):
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None
        self.curTitle = ''
        self.curText = ''
        self.locales = []
        self.curRelation = None
        self.writer = FlatEdgeWriter(out_filename)
        self.nosensetrans = None # non-sense-specific translation

    def startElement(self, name, attrs):
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False
    
    def characters(self, text):
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out
                self.inArticle = False

    def handleArticle(self, title, text):
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_bottom_match = TRANS_BOTTOM.match(line)
        trans_match = TRANS.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)

        if language_match:
            self.langcode = get_language_code(language_match.group(1))
        
        ### Get sense-specific translation
        if trans_top_match: # start translation part
            pos = self.pos or 'n'
            # get translation sense
            if trans_top_match.group(1):
                sense = trans_top_match.group(1).lstrip('|')
                self.curSense = pos+'/'+sense
                return
            else:
                self.curSense = pos
                return
        if trans_bottom_match: # end translation part
            self.curSense = None
            return
        if self.curSense and line[0:5] == '*[[{{': # get translation
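            # e.g. a (hypothetical) line '*[[{{fra}}]]: [[mot]], [[parole]]'
            # yields lang 'fra' -> 'fr' and translations ['mot', 'parole']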
            lang = line[5:].split('}')[0]  # get language of translation
            if lang in LANGUAGES_3_TO_2:   # convert 3-letter code to 2-letter code
                lang = LANGUAGES_3_TO_2[lang]
            # find all translations of that language
            translations = re.findall(r"\[\[(.*?)\]\]", line)[1:] 
            for translation in translations: # iterate over translations
                self.output_sense_translation(lang, translation, title, \
                                              self.curSense)
            return

        ### Get relation
        if line.startswith('===={{rel}}===='): # start relation part
            self.curRelation = 'ConceptuallyRelatedTo'
            return
        if self.curRelation and self.langcode: # within relation part
            if line.startswith('*'): # get relation
                relations = re.findall(r"\{\{(.*?)\}\}", line)
                if len(relations) > 0:
                    if relations[0] == 'syn': # synonym
                        self.curRelation = 'Synonym'
                    if relations[0] == 'drv': # derivative
                        self.curRelation = 'Derivative'                    
                related_words = re.findall(r"\[\[(.*?)\]\]", line)
                for related_word in related_words:
                    self.output_monolingual(self.langcode, self.curRelation, \
                                            related_word, title)
                self.curRelation = 'ConceptuallyRelatedTo' # back to default
            else:
                self.curRelation = None

        ### Get non-sense-specific translation
        if trans_match: 
            self.nosensetrans = 1 # *maybe* start non-sense-specific translation
        if self.nosensetrans == 1 and line.startswith('{{top}}'):
            self.nosensetrans = 2 # start non-sense-specific translation            
        if self.nosensetrans == 2:
            if line.startswith('{{bottom}}'):
                self.nosensetrans = None
                return
            if line.startswith('*{{'):
                lang = line[3:].split('}')[0]
                if lang in LANGUAGES_3_TO_2: # convert 3-letter code to 2-letter code
                    lang = LANGUAGES_3_TO_2[lang]
                translations = re.findall(r"\[\[(.*?)\]\]", line)
                for translation in translations:
                    self.output_sense_translation(lang, translation, title, '')
    
    def output_monolingual(self, lang, relation, term1, term2):
        # skip Wiktionary: links and templates
        if u'ウィク' in term1 or u'ウィク' in term2:
            return
        if u'テンプレート' in term1 or u'テンプレート' in term2:
            return

        if lang in LANGUAGES_3_TO_2: # convert 3-letter code to 2-letter code
            lang = LANGUAGES_3_TO_2[lang]
        source = make_concept_uri_safe(term1, lang)
        if self.pos:
            target = make_concept_uri_safe(term2, lang, self.pos)
        else:
            target = make_concept_uri_safe(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        #print surfaceText

        edge = make_edge('/r/'+relation, source, target, '/d/wiktionary/ja/%s' % (lang),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, MONOLINGUAL],
                         context='/ctx/all',
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, translated, disambiguation):
        if u':' in foreign or u':' in translated:
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = make_concept_uri_safe(
          unicodedata.normalize('NFKC', foreign), lang
        )
        target = make_concept_uri_safe(
          translated, self.langcode, disambiguation
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(lang))
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        if disambiguation and '/' in disambiguation:
            surfaceText = "[[%s]] %s [[%s (%s)]]" % (foreign, surfaceRel, translated, disambiguation.split('/')[-1].replace('_', ' '))
        else:
            surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, translated)
        #print surfaceText
        edge = make_edge(relation, source, target, '/d/wiktionary/ja/%s' % (self.langcode),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, TRANSLATE],
                         context='/ctx/all',
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)
        
    def output_translation(self, foreign, japanese, locale=''):
        source = make_concept_uri_safe(
          unicodedata.normalize('NFKC', foreign),
          self.langcode+locale
        )
        target = make_concept_uri_safe(
          japanese, 'ja'
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(self.langcode))
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, japanese)
        edge = make_edge(relation, source, target, '/d/wiktionary/ja/%s' % self.langcode,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, INTERLINGUAL],
                         context='/ctx/all',
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)
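
A minimal sketch of driving this handler over a Japanese Wiktionary XML dump; the dump path is a made-up example, and FlatEdgeWriter.close() matches its use in Example #5:

import xml.sax

def run_wiktionary_ja(dump_path='jawiktionary-pages-articles.xml'):
    # stream pages through startElement/characters/endElement
    handler = FindTranslations()
    xml.sax.parse(open(dump_path), handler)
    handler.writer.close()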
Example #7
"""
Get data from DBPedia.
"""

__author__ = 'Justin Venezuela ([email protected]), Rob Speer ([email protected])'

from metanl.english import normalize_topic, un_camel_case
from conceptnet5.nodes import make_concept_uri, normalize_uri
from conceptnet5.edges import make_edge, MultiWriter, FlatEdgeWriter
import urllib
import urllib2

source = '/s/web/dbpedia.org'
WRITER_NUM = 1
writer = MultiWriter('dbpedia.%d' % WRITER_NUM)
sw_map = FlatEdgeWriter('data/sw/dbpedia.map.json')
sw_map_used = set()


def cycle_writer():
    global writer, WRITER_NUM
    writer.close()
    WRITER_NUM += 1
    writer = MultiWriter('dbpedia.%d' % WRITER_NUM)


def translate_wp_url(url):
    url = urllib.unquote(url).decode('utf-8', 'ignore')
    return un_camel_case(url.strip('/').split('/')[-1].split('#')[-1])
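
A sketch of the intended use; the URL is a made-up example, and the exact casing of the result depends on metanl's un_camel_case:

topic = translate_wp_url('http://dbpedia.org/resource/Caf%C3%A9')
# percent-decodes to u'Café', keeps the last path/fragment component,
# and splits apart any CamelCase words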

Example #8
from collections import defaultdict
from conceptnet5.nodes import make_concept_uri
from conceptnet5.edges import make_edge, FlatEdgeWriter

# bad_regex_no_biscuit and sounds_like_score are module-level helpers
# defined elsewhere in this module.

def run_verbosity(infile, outfile):
    maxscore = 0
    count = 0
    counts = defaultdict(int)
    text_similarities = []

    sources = ['/s/site/verbosity']

    writer = FlatEdgeWriter(outfile)

    for line in open(infile):
        parts = line.strip().split('\t')
        if len(parts) < 5:
            # ''.split('\t') returns [''], so checking `if not parts` never
            # fires; count the fields instead to skip blank or short lines
            counts['blank'] += 1
            continue
        left, relation, right, freq, orderscore = parts[:5]

        # filter out unusable clues
        flagged = False

        for rword in right.split():
            if bad_regex_no_biscuit.match(rword):
                flagged = True
                break
        if flagged:
            #print "FLAGGED:", right
            counts['flag word'] += 1
            continue
        if len(right) < 3:
            counts['clue too short'] += 1
            continue
        if len(right.split()[-1]) == 1:
            counts['letter'] += 1
            continue
        if right.startswith('add') or right.startswith('delete') or right.startswith('remove'):
            counts['flag word'] += 1
            continue

        freq = int(freq)
        orderscore = int(orderscore)
        rel = '/r/RelatedTo'
        reltext = 'is related to'
        if right.startswith('not '):
            rel = '/r/Antonym'
            right = right[4:]
            reltext = 'is not'
        if relation == 'it is the opposite of':
            rel = '/r/Antonym'
            reltext = 'is the opposite of'

        rightwords = [right]
        if ' ' in right:
            rightwords.extend(right.split(' '))

        sls = sounds_like_score(left, right)
        text_similarities.append(sls)
        if sls > 0.35:
            counts['text similarity'] += 1
            continue
        
        for i, rightword in enumerate(rightwords):
            edge_sources = list(sources)
            if i > 0:
                edge_sources.append('/s/rule/split_words')
            text = '[[%s]] %s [[%s]]' % (left, reltext, rightword)
            
            sls = sounds_like_score(left, rightword)
            text_similarities.append(sls)
            if sls > 0.35:
                counts['text similarity'] += 1
                continue
            
            score = (freq*2-1) * (1000-orderscore) * (1-sls) / 1000
            if score <= 0:
                counts['low score'] += 1
                continue

            #weight = math.log(1 + score/10.0) / math.log(2)
            weight = score / 100.0
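            # worked example: freq=2, orderscore=100, sls=0.1 gives
            # score = 3 * 900 * 0.9 / 1000 = 2.43, hence weight = 0.0243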

            count += 1
            counts['success'] += 1
            
            leftc = make_concept_uri(unicode(left), 'en')
            rightc = make_concept_uri(unicode(rightword), 'en')
            edge = make_edge(rel, leftc, rightc, '/d/verbosity',
                             '/l/CC/By', sources, surfaceText=text,
                             weight=weight)
            writer.write(edge)

    # close the output file once all edges have been written
    writer.close()
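
A hypothetical invocation, assuming a tab-separated Verbosity dump whose rows are (left term, relation text, clue, frequency, order score), the layout the parsing above expects:

if __name__ == '__main__':
    run_verbosity('data/verbosity.txt', 'data/output/verbosity.json')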
Example #9
    #            break
    #    if disambig is None:
    #        disambig = glossary[synset]
    #if disambig is None:
    #    disambig = '*'
    node = make_concept_uri(synset_name, 'en', pos+'/'+disambig)
    if synset not in mapping:
        mapping[synset] = node

# Map senses to the same nodes.
for sense, synset in sense_synsets.items():
    mapping[sense] = mapping[synset]

sources = ['/s/wordnet/3.0']
writer = MultiWriter('wordnet3')
sw_map = FlatEdgeWriter('data/sw/wordnet30.map.json')
sw_map_used = set()

for line in chain(
    open('raw_data/wordnet-attribute.ttl'),
    open('raw_data/wordnet-causes.ttl'),
    open('raw_data/wordnet-classifiedby.ttl'),
    open('raw_data/wordnet-entailment.ttl'),
    open('raw_data/wordnet-hyponym.ttl'),
    open('raw_data/wordnet-instances.ttl'),
    open('raw_data/wordnet-membermeronym.ttl'),
    open('raw_data/wordnet-partmeronym.ttl'),
    open('raw_data/wordnet-sameverbgroupas.ttl'),
    open('raw_data/wordnet-similarity.ttl'),
    open('raw_data/wordnet-substancemeronym.ttl'),
    open('raw_data/full/wordnet-antonym.ttl'),
Example #10
class FindTranslations(ContentHandler):
    def __init__(self, out_filename='wiktionary.json'):
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None
        self.curTitle = ''
        self.curText = ''
        self.locales = []
        self.curRelation = None
        self.writer = FlatEdgeWriter(out_filename)

    def startElement(self, name, attrs):
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False
    
    def characters(self, text):
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out
                self.inArticle = False

    def handleArticle(self, title, text):
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)
        if line.startswith('===') and line.endswith('==='):
            pos = line.strip('= ')
            if pos == 'Synonyms':
                self.curRelation = 'Synonym'
            elif pos == 'Antonyms':  # the Wiktionary heading is plural
                self.curRelation = 'Antonym'
            elif pos == 'Related terms':
                self.curRelation = 'ConceptuallyRelatedTo'
            elif pos == 'Derived terms':
                if not line.startswith('===='):
                    # this is at the same level as the part of speech;
                    # now we don't know what POS these apply to
                    self.pos = None
                self.curRelation = 'DerivedFrom'
            else:
                self.curRelation = None
                if pos in PARTS_OF_SPEECH:
                    self.pos = PARTS_OF_SPEECH[pos]
        elif language_match:
            self.lang = language_match.group(1)
            self.langcode = LANGUAGES.get(self.lang)
        elif chinese_match:
            scripttag = chinese_match.group(2)
            self.locales = []
            if 's' in scripttag:
                self.locales.append('_CN')
            if 't' in scripttag:
                self.locales.append('_TW')
        elif line[0:1] == '#' and self.lang != 'English' and self.lang is not None:
            defn = line[1:].strip()
            if defn[0:1] not in ':*#':
                for defn2 in filter_line(defn):
                    if not ascii_enough(defn2): continue
                    if 'Index:' in title: continue
                    if self.langcode == 'zh':
                        for locale in self.locales:
                            self.output_translation(title, defn2, locale)
                    elif self.langcode:
                        self.output_translation(title, defn2)
        elif line[0:4] == '----':
            self.pos = None
            self.lang = None
            self.langcode = None
            self.curRelation = None
        elif trans_top_match:
            pos = self.pos or 'n'
            sense = trans_top_match.group(1).split(';')[0].strip('.')
            if 'translations' in sense.lower():
                self.curSense = None
            else:
                self.curSense = pos+'/'+sense
        elif trans_tag_match:
            lang = trans_tag_match.group(1)
            translation = trans_tag_match.group(2)
            if self.curSense is not None and self.lang == 'English':
                # handle Chinese separately
                if lang not in ('cmn', 'yue', 'zh-yue', 'zh'):
                    self.output_sense_translation(lang, translation, title,
                                                  self.curSense)
        elif '{{trans-bottom}}' in line:
            self.curSense = None
        elif line.startswith('* ') and self.curRelation and self.langcode:
            relatedmatch = WIKILINK.search(line)
            if relatedmatch:
                related = relatedmatch.group(1)
                self.output_monolingual(self.langcode, self.curRelation,
                                        related, title)
    
    def output_monolingual(self, lang, relation, term1, term2):
        if 'Wik' in term1 or 'Wik' in term2:
            return
        source = make_concept_uri(term1, lang)
        if self.pos:
            target = make_concept_uri(term2, lang, self.pos)
        else:
            target = make_concept_uri(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        #print surfaceText

        edge = make_edge('/r/'+relation, source, target, '/d/wiktionary/%s/%s' % (lang, lang),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, MONOLINGUAL],
                         context='/ctx/all',
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, english, disambiguation):
        if 'Wik' in foreign or 'Wik' in english:
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = make_concept_uri(
          unicodedata.normalize('NFKC', foreign), lang
        )
        target = make_concept_uri(
          english, 'en', disambiguation
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(lang))
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        surfaceText = "[[%s]] %s [[%s (%s)]]" % (foreign, surfaceRel, english, disambiguation.split('/')[-1].replace('_', ' '))
        #print surfaceText
        edge = make_edge(relation, source, target, '/d/wiktionary/en/%s' % lang,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, TRANSLATE],
                         context='/ctx/all',
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)
        
    def output_translation(self, foreign, english, locale=''):
        source = make_concept_uri(
          unicodedata.normalize('NFKC', foreign),
          self.langcode+locale
        )
        target = make_concept_uri(
          english, 'en'
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(self.langcode))
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
        edge = make_edge(relation, source, target, '/d/wiktionary/en/%s' % self.langcode,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, INTERLINGUAL],
                         context='/ctx/all',
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)
Example #11
class FindTranslations(ContentHandler):
    def __init__(self, out_filename='wiktionary.json'):
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None
        self.curTitle = ''
        self.curText = ''
        self.locales = []
        self.curRelation = None
        self.writer = FlatEdgeWriter(out_filename)

    def startElement(self, name, attrs):
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False

    def characters(self, text):
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out
                self.inArticle = False

    def handleArticle(self, title, text):
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)
        if line.startswith('===') and line.endswith('==='):
            pos = line.strip('= ')
            if pos == 'Synonyms':
                self.curRelation = 'Synonym'
            elif pos == 'Antonyms':  # the Wiktionary heading is plural
                self.curRelation = 'Antonym'
            elif pos == 'Related terms':
                self.curRelation = 'ConceptuallyRelatedTo'
            elif pos == 'Derived terms':
                if not line.startswith('===='):
                    # this is at the same level as the part of speech;
                    # now we don't know what POS these apply to
                    self.pos = None
                self.curRelation = 'DerivedFrom'
            else:
                self.curRelation = None
                if pos in PARTS_OF_SPEECH:
                    self.pos = PARTS_OF_SPEECH[pos]
        elif language_match:
            self.lang = language_match.group(1)
            self.langcode = LANGUAGES.get(self.lang)
        elif chinese_match:
            scripttag = chinese_match.group(2)
            self.locales = []
            if 's' in scripttag:
                self.locales.append('_CN')
            if 't' in scripttag:
                self.locales.append('_TW')
        elif line[0:1] == '#' and self.lang != 'English' and self.lang is not None:
            defn = line[1:].strip()
            if defn[0:1] not in ':*#':
                for defn2 in filter_line(defn):
                    if not ascii_enough(defn2): continue
                    if 'Index:' in title: continue
                    if self.langcode == 'zh':
                        for locale in self.locales:
                            self.output_translation(title, defn2, locale)
                    elif self.langcode:
                        self.output_translation(title, defn2)
        elif line[0:4] == '----':
            self.pos = None
            self.lang = None
            self.langcode = None
            self.curRelation = None
        elif trans_top_match:
            pos = self.pos or 'n'
            sense = trans_top_match.group(1).split(';')[0].strip('.')
            if 'translations' in sense.lower():
                self.curSense = None
            else:
                self.curSense = pos + '/' + sense
        elif trans_tag_match:
            lang = trans_tag_match.group(1)
            translation = trans_tag_match.group(2)
            if self.curSense is not None and self.lang == 'English':
                # handle Chinese separately
                if lang not in ('cmn', 'yue', 'zh-yue', 'zh'):
                    self.output_sense_translation(lang, translation, title,
                                                  self.curSense)
        elif '{{trans-bottom}}' in line:
            self.curSense = None
        elif line.startswith('* ') and self.curRelation and self.langcode:
            relatedmatch = WIKILINK.search(line)
            if relatedmatch:
                related = relatedmatch.group(1)
                self.output_monolingual(self.langcode, self.curRelation,
                                        related, title)

    def output_monolingual(self, lang, relation, term1, term2):
        if 'Wik' in term1 or 'Wik' in term2:
            return
        source = make_concept_uri(term1, lang)
        if self.pos:
            target = make_concept_uri(term2, lang, self.pos)
        else:
            target = make_concept_uri(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        #print surfaceText

        edge = make_edge('/r/' + relation,
                         source,
                         target,
                         '/d/wiktionary/%s/%s' % (lang, lang),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, MONOLINGUAL],
                         context='/ctx/all',
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, english, disambiguation):
        if 'Wik' in foreign or 'Wik' in english:
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = make_concept_uri(unicodedata.normalize('NFKC', foreign), lang)
        target = make_concept_uri(english, 'en', disambiguation)
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(lang))
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        surfaceText = "[[%s]] %s [[%s (%s)]]" % (
            foreign, surfaceRel, english,
            disambiguation.split('/')[-1].replace('_', ' '))
        #print surfaceText
        edge = make_edge(relation,
                         source,
                         target,
                         '/d/wiktionary/en/%s' % lang,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, TRANSLATE],
                         context='/ctx/all',
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_translation(self, foreign, english, locale=''):
        source = make_concept_uri(unicodedata.normalize('NFKC', foreign),
                                  self.langcode + locale)
        target = make_concept_uri(english, 'en')
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(self.langcode))
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
        edge = make_edge(relation,
                         source,
                         target,
                         '/d/wiktionary/en/%s' % self.langcode,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, INTERLINGUAL],
                         context='/ctx/all',
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)
Example #12
class FindTranslations(ContentHandler):
    def __init__(self, out_filename='wiktionary_ja.json'):
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None
        self.curTitle = ''
        self.curText = ''
        self.locales = []
        self.curRelation = None
        self.writer = FlatEdgeWriter(out_filename)
        self.nosensetrans = None  # non-sense-specific translation

    def startElement(self, name, attrs):
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False

    def characters(self, text):
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out
                self.inArticle = False

    def handleArticle(self, title, text):
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_bottom_match = TRANS_BOTTOM.match(line)
        trans_match = TRANS.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)

        if language_match:
            self.langcode = get_language_code(language_match.group(1))

        ### Get sense-specific translation
        if trans_top_match:  # start translation part
            pos = self.pos or 'n'
            # get translation sense
            if trans_top_match.group(1):
                sense = trans_top_match.group(1).lstrip('|')
                self.curSense = pos + '/' + sense
                return
            else:
                self.curSense = pos
                return
        if trans_bottom_match:  # end translation part
            self.curSense = None
            return
        if self.curSense and line[0:5] == '*[[{{':  # get translation
            lang = line[5:].split('}')[0]  # get language of translation
            if lang in LANGUAGES_3_TO_2:  # convert 3-letter code to 2-letter code
                lang = LANGUAGES_3_TO_2[lang]
            # find all translations of that language
            translations = re.findall(r"\[\[(.*?)\]\]", line)[1:]
            for translation in translations:  # iterate over translations
                self.output_sense_translation(lang, translation, title, \
                                              self.curSense)
            return

        ### Get relation
        if line.startswith('===={{rel}}===='):  # start relation part
            self.curRelation = 'ConceptuallyRelatedTo'
            return
        if self.curRelation and self.langcode:  # within relation part
            if line.startswith('*'):  # get relation
                relations = re.findall(r"\{\{(.*?)\}\}", line)
                if len(relations) > 0:
                    if relations[0] == 'syn':  # synonym
                        self.curRelation = 'Synonym'
                    if relations[0] == 'drv':  # derivative
                        self.curRelation = 'Derivative'
                related_words = re.findall(r"\[\[(.*?)\]\]", line)
                for related_word in related_words:
                    self.output_monolingual(self.langcode, self.curRelation, \
                                            related_word, title)
                self.curRelation = 'ConceptuallyRelatedTo'  # back to default
            else:
                self.curRelation = None

        ### Get non-sense-specific translation
        if trans_match:
            self.nosensetrans = 1  # *maybe* start non-sense-specific translation
        if self.nosensetrans == 1 and line.startswith('{{top}}'):
            self.nosensetrans = 2  # start non-sense-specific translation
        if self.nosensetrans == 2:
            if line.startswith('{{bottom}}'):
                self.nosensetrans = None
                return
            if line.startswith('*{{'):
                lang = line[3:].split('}')[0]
                if lang in LANGUAGES_3_TO_2:  # convert 3-letter code to 2-letter code
                    lang = LANGUAGES_3_TO_2[lang]
                translations = re.findall(r"\[\[(.*?)\]\]", line)
                for translation in translations:
                    self.output_sense_translation(lang, translation, title, '')

    def output_monolingual(self, lang, relation, term1, term2):
        # skip Wiktionary: links and templates
        if u'ウィク' in term1 or u'ウィク' in term2:
            return
        if u'テンプレート' in term1 or u'テンプレート' in term2:
            return

        if lang in LANGUAGES_3_TO_2:  # convert 3-letter code to 2-letter code
            lang = LANGUAGES_3_TO_2[lang]
        source = make_concept_uri_safe(term1, lang)
        if self.pos:
            target = make_concept_uri_safe(term2, lang, self.pos)
        else:
            target = make_concept_uri_safe(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        #print surfaceText

        edge = make_edge('/r/' + relation,
                         source,
                         target,
                         '/d/wiktionary/ja/%s' % (lang),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, MONOLINGUAL],
                         context='/ctx/all',
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, translated,
                                 disambiguation):
        if u':' in foreign or u':' in translated:
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = make_concept_uri_safe(unicodedata.normalize('NFKC', foreign),
                                       lang)
        target = make_concept_uri_safe(translated, self.langcode,
                                       disambiguation)
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(lang))
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        if disambiguation and '/' in disambiguation:
            surfaceText = "[[%s]] %s [[%s (%s)]]" % (
                foreign, surfaceRel, translated,
                disambiguation.split('/')[-1].replace('_', ' '))
        else:
            surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel,
                                                translated)
        #print surfaceText
        edge = make_edge(relation,
                         source,
                         target,
                         '/d/wiktionary/ja/%s' % (self.langcode),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, TRANSLATE],
                         context='/ctx/all',
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_translation(self, foreign, japanese, locale=''):
        source = make_concept_uri_safe(unicodedata.normalize('NFKC', foreign),
                                       self.langcode + locale)
        target = make_concept_uri_safe(japanese, 'ja')
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(self.langcode))
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, japanese)
        edge = make_edge(relation,
                         source,
                         target,
                         '/d/wiktionary/ja/%s' % self.langcode,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, INTERLINGUAL],
                         context='/ctx/all',
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)