Code Example #1
def handle_file(input_filename, output_file):
    out = JSONStreamWriter(output_file)
    for line in codecs.open(input_filename, encoding='utf-8'):
        line = line.strip()
        if line:
            for new_obj in handle_raw_assertion(line):
                out.write(new_obj)
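
All of the examples on this page write their output through JSONStreamWriter, which is assumed to expose a write(obj) method and a close() method and to emit one JSON object per line. The class below is only a minimal sketch of that assumed contract, not the implementation from the conceptnet5 package.

import codecs
import json

class MinimalJSONStreamWriter(object):
    """Minimal sketch of the assumed JSONStreamWriter contract."""

    def __init__(self, filename):
        self.stream = codecs.open(filename, 'w', encoding='utf-8')

    def write(self, obj):
        # One JSON object per line; ensure_ascii=False keeps non-ASCII text readable.
        self.stream.write(json.dumps(obj, ensure_ascii=False) + '\n')

    def close(self):
        self.stream.close()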
Code Example #2
File: ptt_petgame.py Project: artpar/conceptnet5
def handle_file(input_filename, output_file):
    out = JSONStreamWriter(output_file)
    for line in codecs.open(input_filename, encoding='utf-8'):
        line = line.strip()
        if line:
            for new_obj in handle_raw_assertion(line):
                out.write(new_obj)
Code Example #3
File: convert.py Project: commonsense/conceptnet5
def msgpack_to_json(input_filename, output_filename):
    """
    Convert a msgpack stream to a JSON stream (with one object per line).
    """
    out_stream = JSONStreamWriter(output_filename)
    for obj in read_msgpack_stream(input_filename):
        out_stream.write(obj)
    out_stream.close()
Code Example #4
def __init__(self, output_file='wiktionary.json'):
    self.lang = None
    self.langcode = None
    self.inArticle = False
    self.inTitle = False
    self.curSense = None
    self.curTitle = ''
    self.curText = ''
    self.locales = []
    self.curRelation = None
    self.writer = JSONStreamWriter(output_file)
Code Example #5
def test_json_to_msgpack():
    with TemporaryDirectory(prefix='conceptnet-test') as tmpdir:
        json_path = os.path.join(tmpdir, 'test.jsons')
        msgpack_path = os.path.join(tmpdir, 'test.msgpack')
        
        writer = JSONStreamWriter(json_path)
        for item in DATA:
            writer.write(item)
        writer.close()

        json_to_msgpack(json_path, msgpack_path)
        reader = read_msgpack_stream(msgpack_path)
        for known, read in zip_longest(DATA, reader):
            eq_(known, read)
Code Example #6
File: wiktionary_en.py Project: artpar/conceptnet5
def __init__(self, output_file='wiktionary.json'):
    self.lang = None
    self.langcode = None
    self.inArticle = False
    self.inTitle = False
    self.curSense = None
    self.curTitle = ''
    self.curText = ''
    self.locales = []
    self.curRelation = None
    self.writer = JSONStreamWriter(output_file)
Code Example #7
def msgpack_to_json(input_filename, output_filename):
    """
    Convert a msgpack stream to a JSON stream (with one object per line).
    """
    out_stream = JSONStreamWriter(output_filename)
    for obj in read_msgpack_stream(input_filename):
        out_stream.write(obj)
    out_stream.close()
Code Example #8
def test_json_to_msgpack():
    with TemporaryDirectory(prefix='conceptnet-test') as tmpdir:
        json_path = os.path.join(tmpdir, 'test.jsons')
        msgpack_path = os.path.join(tmpdir, 'test.msgpack')

        writer = JSONStreamWriter(json_path)
        for item in DATA:
            writer.write(item)
        writer.close()

        json_to_msgpack(json_path, msgpack_path)
        reader = read_msgpack_stream(msgpack_path)
        for known, read in zip_longest(DATA, reader):
            eq_(known, read)
Code Example #9
def handle_file(infile, outfile):
    count = 0
    outcomes = defaultdict(int)

    sources = ['/s/site/verbosity']

    writer = JSONStreamWriter(outfile)

    for line in open(infile):
        parts = line.strip().split('\t')
        if not parts:
            outcomes['blank'] += 1
            continue

        # The first 5 columns of the Verbosity output file are:
        #
        #   left: the word being clued
        #   relation: the relation between the word and the clue that the
        #             clue-giver chose, in a form such as "it is part of"
        #   right: the one or two words used as the clue
        #   freq: the number of different times this clue was given
        #   orderscore: the average position in the list of clues
        #
        # 'orderscore' is a number from 0 to 999, representing the average
        # quantile of its position in the list of clues. (It's like a
        # percentile, except there are 1000 of them, not 100.)
        #
        # A clue that's always given first has an orderscore of 0. A clue
        # that always appears halfway through the list has an orderscore of
        # 500.
        #
        # This may seem like a strange thing to measure, and I didn't come up
        # with it, but it actually turns out to be somewhat informative.
        # A clue with an orderscore of 0 is probably a good common-sense
        # relation, representing the first thing that comes to mind. A clue
        # with a high order score may be a move of desperation after several
        # other clues have failed. It causes the guesser to get the answer
        # soon afterward, but perhaps because it's a "cheating" move. So,
        # low orderscores represent better common sense relations.
        left, relation, right, freq, orderscore = parts[:5]
        freq = int(freq)
        orderscore = int(orderscore)

        # Test each word
        flagged = False
        for rword in right.split():
            if BAD_CLUE_REGEX.match(rword):
                flagged = True
                break

        if flagged:
            outcomes['flag word'] += 1
            continue
        if len(right) < 3:
            outcomes['clue too short'] += 1
            continue
        if len(right.split()[-1]) == 1:
            outcomes['letter'] += 1
            continue

        # The Verbosity interface and gameplay did not particularly encourage
        # players to choose an appropriate relation. In practice, players seem
        # to have used them all interchangeably, except for the negative
        # relation "it is the opposite of", expressing /r/Antonym.
        #
        # Another way that players expressed negative relations was to use
        # 'not' as the first word of their clue; we make that into an instance
        # of /r/Antonym as well.
        #
        # In other cases, the relation is a positive relation, so we replace it
        # with the most general positive relation, /r/RelatedTo.
        rel = '/r/RelatedTo'
        reltext = 'is related to'
        if right.startswith('not '):
            rel = '/r/Antonym'
            right = right[4:]
            reltext = 'is not'
        if relation == 'it is the opposite of':
            rel = '/r/Antonym'
            reltext = 'is the opposite of'

        # The "sounds-like score" determines whether this clue seems to be a
        # pun or rhyme, rather than an actual common-sense relationship. If
        # the sounds-like score is over 0.35, skip the assertion.
        sls = sounds_like_score(left, right)
        if sls > 0.35:
            outcomes['text similarity'] += 1
            continue

        # Calculate a score for the assertion:
        #
        #   - The number of times it's been used as a clue
        #   - ...with a linear penalty for a high sounds-like score
        #   - ...and a linear penalty for high orderscores
        #
        # The penalties are multiplicative factors from 0 to 1, which decrease
        # linearly as the relevant penalties increase. If a clue is given N
        # times, with a sounds-like score of 0 and an orderscore of 0, it will
        # get an overall score of 2N - 1. This is a formula we should probably
        # revisit.
        #
        # The weight is the score divided by 100. All divisions are floating
        # point, as defined by the __future__ import at the top of this module.
        score = (freq * 2 - 1) * (1 - sls) * (1 - orderscore / 1000)
        if score <= 0.5:
            outcomes['low score'] += 1
            continue

        weight = score / 100

        # If the clue on the right is a two-word phrase, we make additional
        # connections to both words individually. We label them with the
        # rule-based source '/s/rule/split_words' to track that this happened.
        rightwords = [right]
        if ' ' in right:
            morewords = [
                word for word in right.split(' ') if word not in STOPWORDS
            ]
            rightwords.extend(morewords)

        for i, rightword in enumerate(rightwords):
            edge_sources = list(sources)
            if i > 0:
                edge_sources.append('/s/rule/split_words')

            # Build the natural-language-ish surface text for this clue
            text = '[[%s]] %s [[%s]]' % (left, reltext, rightword)

            count += 1
            outcomes['success'] += 1

            leftc = normalized_concept_uri('en', left)
            rightc = normalized_concept_uri('en', rightword)
            edge = make_edge(rel,
                             leftc,
                             rightc,
                             dataset='/d/verbosity',
                             license=Licenses.cc_attribution,
                             sources=edge_sources,
                             surfaceText=text,
                             weight=weight)
            writer.write(edge)

    # Report the counts of the various outcomes. This can be used as a
    # sanity check; it was also used for a graph in a ConceptNet 5 paper.
    print("Verbosity outcomes: %s" % outcomes)
Code Example #10
File: convert.py Project: jungle-cat/conceptnet5
def msgpack_to_json(input_filename, output_filename):
    out_stream = JSONStreamWriter(output_filename)
    for obj in read_msgpack_stream(input_filename):
        out_stream.write(obj)
    out_stream.close()
Code Example #11
File: globalmind.py Project: artpar/conceptnet5
def build_from_dir(dirname, output_file):
    """
    Read a GlobalMind database exported in YAML files, translate
    it into ConceptNet 5 edges, and write those edges to disk using
    a JSONStreamWriter.
    """
    out = JSONStreamWriter(output_file)
    userdata = yaml.load_all(open(dirname + '/GMUser.yaml'))
    users = {}

    for userinfo in userdata:
        users[userinfo['pk']] = userinfo

    frame_data = yaml.load_all(open(dirname + '/GMFrame.yaml'))
    frames = {}
    for frame in frame_data:
        frames[frame['pk']] = frame['fields']

    assertiondata = yaml.load_all(open(dirname + '/GMAssertion.yaml'))
    assertions = {}
    for assertion in assertiondata:
        obj = assertion['fields']
        frame = frames[obj['frame']]
        frametext = frame['text']
        userinfo = users[obj['author']]
        username = userinfo['fields']['username']

        # GlobalMind provides information about what country the user is from, which
        # we can preserve in the contributor URI.
        #
        # If I got to re-choose these URIs, I would distinguish usernames with
        # a country code from those without a country code by something more
        # than the number of slashes, and I would write the country code in
        # capital letters.
        userlocale = userinfo['fields']['ccode'].lower()
        if userlocale:
            user_source = "/s/contributor/globalmind/%s/%s" % (userlocale, username)
        else:
            user_source = "/s/contributor/globalmind/%s" % username

        sources = [
            user_source,
            "/s/activity/globalmind/assert"
        ]

        lang = LANG_CODES[obj['lcode']]
        start = normalized_concept_uri(lang, obj['node1'])
        end = normalized_concept_uri(lang, obj['node2'])
        rel = '/r/' + RELATION_MAP.get(frame['relation'], frame['relation'])

        # fix messy english "around in"
        if ' around ' in frametext:
            if obj['node2'].startswith('in '):
                frametext = frametext.replace(' around ', ' in ')
                obj['node2'] = obj['node2'][3:]
            else:
                frametext = frametext.replace(' around ', ' near ')
                rel = '/r/LocatedNear'

        # fix more awkward English. I wonder how bad the other languages are.
        frametext = frametext.replace('hits your head', 'comes to mind')
        frametext = frametext.replace(': [node1], [node2]', ' [node1] and [node2]')

        node1 = u'[[' + obj['node1'] + u']]'
        node2 = u'[[' + obj['node2'] + u']]'
        surfaceText = frametext.replace('//', '').replace('[node1]', node1).replace('[node2]', node2)
        edge = make_edge(rel, start, end,
                         dataset='/d/globalmind',
                         license='/l/CC/By',
                         sources=sources,
                         surfaceText=surfaceText,
                         weight=1)
        out.write(edge)
        assertions[assertion['pk']] = edge

    translationdata = yaml.load_all(open(dirname + '/GMTranslation.yaml'))
    for translation in translationdata:
        obj = translation['fields']
        assertion1 = assertions[obj['assertion1']]
        assertion2 = assertions[obj['assertion2']]
        start = assertion1['uri']
        end = assertion2['uri']
        rel = '/r/TranslationOf'
        text1 = assertion1['surfaceText'].replace('[[', '').replace(']]', '')
        text2 = assertion2['surfaceText'].replace('[[', '').replace(']]', '')
        lang1 = LANG_NAMES[get_lang(assertion1)]
        lang2 = LANG_NAMES[get_lang(assertion2)]
        surfaceText = u"[[%s]] in %s means [[%s]] in %s." % (text1, lang1, text2, lang2)
        userinfo = users[obj['author']]
        username = userinfo['fields']['username']

        userlocale = userinfo['fields']['ccode'].lower()
        if userlocale:
            user_source = "/s/contributor/globalmind/%s/%s" % (userlocale, username)
        else:
            user_source = "/s/contributor/globalmind/%s" % username

        sources = [
            user_source,
            "/s/activity/globalmind/translate"
        ]
        edge = make_edge(rel, start, end,
                         dataset='/d/globalmind',
                         license=Licenses.cc_attribution,
                         sources=sources,
                         surfaceText=surfaceText,
                         weight=1)
        out.write(edge)
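
One usage note on the YAML loading above: recent versions of PyYAML warn when yaml.load_all is called without an explicit Loader, and the plain loader can construct arbitrary Python objects. If you adapt this code, the usual replacement looks like the sketch below (a general PyYAML remark, not part of the original module):

import codecs
import yaml

users = {}
with codecs.open('GMUser.yaml', encoding='utf-8') as f:
    # safe_load_all avoids the implicit-Loader warning and arbitrary object construction
    for userinfo in yaml.safe_load_all(f):
        users[userinfo['pk']] = userinfo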
Code Example #12
def combine_assertions(csv_filename, output_file, license):
    """
    Take in a tab-separated, sorted "CSV" file, named `csv_filename`, of
    distinct edges which should be grouped together into assertions. Output a
    JSON stream of assertions to `output_file`.

    The combined assertions will all have the dataset of the first edge that
    produces them, and the license of the strongest license being combined
    (which should be passed in as `license`).

    This process requires its input to be a sorted CSV so that all edges for
    the same assertion will appear consecutively.
    """
    # The current_... variables accumulate information about the current
    # assertion. When the URI changes, we can output this assertion.
    current_uri = None
    current_data = {}
    current_contributors = set()
    current_surface = None
    current_dataset = None
    current_weight = 0.
    current_sources = []

    out = JSONStreamWriter(output_file)
    for line in codecs.open(csv_filename, encoding='utf-8'):
        line = line.rstrip('\n')
        if not line:
            continue
        # Interpret the columns of the file.
        parts = line.split('\t')
        (uri, rel, start, end, context, weight, source_uri, id, this_dataset,
         surface) = parts[:10]
        surface = surface.strip()
        weight = float(weight)

        # If the uri is 'uri', this was a header line, which isn't supposed
        # to be there.
        assert uri != 'uri'

        # If the uri is the same as current_uri, accumulate more information.
        if uri == current_uri:
            current_weight += weight
            if source_uri not in current_sources:
                contributors = extract_contributors(source_uri)
                if not contributors & current_contributors:
                    current_sources.append(source_uri)
                    current_contributors |= contributors
            # We use the first surface form we see as the surface form for
            # the whole assertion.
            if (current_surface is None) and surface:
                current_surface = surface

        # Otherwise, it's a new assertion.
        else:
            if current_uri is not None:
                output_assertion(out,
                                 dataset=current_dataset,
                                 license=license,
                                 sources=current_sources,
                                 surfaceText=current_surface,
                                 weight=weight_scale(current_weight),
                                 uri=current_uri,
                                 **current_data)
            current_uri = uri
            current_data = {'rel': rel, 'start': start, 'end': end}
            current_weight = weight
            current_sources = [source_uri]
            current_contributors = extract_contributors(source_uri)
            current_surface = surface or None
            current_dataset = this_dataset

    if current_uri is not None:
        output_assertion(out,
                         rel=rel,
                         start=start,
                         end=end,
                         dataset=current_dataset,
                         license=license,
                         sources=current_sources,
                         surfaceText=current_surface,
                         weight=weight_scale(current_weight),
                         uri=current_uri)
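
Because the input is sorted, rows that belong to the same assertion are consecutive, which is exactly what itertools.groupby expects. The sketch below expresses only that grouping pattern, using the same column layout as the function above; it is an illustration, not a drop-in replacement for output_assertion.

import codecs
from itertools import groupby

def iter_edge_rows(csv_filename):
    for line in codecs.open(csv_filename, encoding='utf-8'):
        line = line.rstrip('\n')
        if line:
            yield line.split('\t')

def grouped_assertions(csv_filename):
    # Column 0 is the assertion URI, column 5 is the weight (see the
    # unpacking in combine_assertions above).
    for uri, rows in groupby(iter_edge_rows(csv_filename), key=lambda parts: parts[0]):
        rows = list(rows)
        total_weight = sum(float(parts[5]) for parts in rows)
        yield uri, rows, total_weight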
Code Example #13
File: dbpedia.py Project: vamsijkrishna/conceptnet5
def handle_file(filename, output_file, sw_map_file):
    reader = NTriplesReader()
    out = JSONStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    for line in open(filename, 'rb'):
        handle_triple(line.decode('utf-8').strip(), reader, out, map_out)
Code Example #14
File: msgpack_to_json.py Project: 205113/conceptnet5
def convert_to_json(input_filename, output_filename):
    out_stream = JSONStreamWriter(output_filename)
    for obj in read_msgpack_stream(input_filename):
        out_stream.write(obj)
    out_stream.close()
Code Example #15
def transform_file(self, input_filename, output_file):
    out = JSONStreamWriter(output_file)
    for obj in read_json_stream(input_filename):
        for new_obj in self.handle_assertion(obj):
            out.write(new_obj)
Code Example #16
class FindTranslations(ContentHandler):
    def __init__(self, output_file='wiktionary.json'):
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None
        self.curTitle = ''
        self.curText = ''
        self.locales = []
        self.curRelation = None
        self.writer = JSONStreamWriter(output_file)

    def startElement(self, name, attrs):
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False

    def characters(self, text):
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out
                self.inArticle = False

    def handleArticle(self, title, text):
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)
        if line.startswith('===') and line.endswith('==='):
            pos = line.strip('= ')
            if pos == 'Synonyms':
                self.curRelation = 'Synonym'
            elif pos in ('Antonym', 'Antonyms'):
                self.curRelation = 'Antonym'
            elif pos == 'Related terms':
                self.curRelation = 'RelatedTo'
            elif pos == 'Derived terms':
                if not line.startswith('===='):
                    # this is at the same level as the part of speech;
                    # now we don't know what POS these apply to
                    self.pos = None
                self.curRelation = 'DerivedFrom'
            else:
                self.curRelation = None
                if pos in PARTS_OF_SPEECH:
                    self.pos = PARTS_OF_SPEECH[pos]
        elif language_match:
            self.lang = language_match.group(1)
            self.langcode = LANGUAGES.get(self.lang)
        elif chinese_match:
            scripttag = chinese_match.group(2)
            self.locales = []
            if 's' in scripttag:
                self.locales.append('_CN')
            if 't' in scripttag:
                self.locales.append('_TW')
        elif line[0:1] == '#' and self.lang != 'English' and self.lang is not None:
            defn = line[1:].strip()
            if defn[0:1] not in ':*#':
                for defn2 in filter_line(defn):
                    if not ascii_enough(defn2): continue
                    if 'Index:' in title: continue
                    if self.langcode == 'zh':
                        for locale in self.locales:
                            self.output_translation(title, defn2, locale)
                    elif self.langcode:
                        self.output_translation(title, defn2)
        elif line[0:4] == '----':
            self.pos = None
            self.lang = None
            self.langcode = None
            self.curRelation = None
        elif trans_top_match:
            pos = self.pos or 'n'
            sense = trans_top_match.group(1).split(';')[0].strip('.')
            if 'translations' in sense.lower():
                self.curSense = None
            else:
                self.curSense = (pos, sense)
        elif trans_tag_match:
            lang = trans_tag_match.group(1)
            translation = trans_tag_match.group(2)
            if self.curSense is not None and self.lang == 'English':
                # handle Chinese separately
                if lang not in ('cmn', 'yue', 'zh-yue', 'zh'):
                    self.output_sense_translation(lang, translation, title,
                                                  self.curSense)
        elif '{{trans-bottom}}' in line:
            self.curSense = None
        elif line.startswith('* ') and self.curRelation and self.langcode:
            relatedmatch = WIKILINK.search(line)
            if relatedmatch:
                related = relatedmatch.group(1)
                self.output_monolingual(self.langcode, self.curRelation,
                                        related, title)

    def output_monolingual(self, lang, relation, term1, term2):
        if term_is_bad(term1) or term_is_bad(term2):
            return
        source = normalized_concept_uri(lang, term1)
        if self.pos:
            target = normalized_concept_uri(lang, term2, self.pos)
        else:
            target = normalized_concept_uri(lang, term2)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)

        edge = make_edge('/r/' + relation,
                         source,
                         target,
                         '/d/wiktionary/%s/%s' % (lang, lang),
                         license=Licenses.cc_sharealike,
                         sources=[SOURCE, MONOLINGUAL],
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, english, sense):
        pos, disambiguation = sense
        if 'Wik' in foreign or 'Wik' in english or term_is_bad(
                foreign) or term_is_bad(english):
            return
        # Quick fix that drops definitions written in Lojban syntax
        if lang == 'jbo' and re.search(r'x[1-5]', english):
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = normalized_concept_uri(lang,
                                        unicodedata.normalize('NFKC', foreign))
        target = normalized_concept_uri('en', english, pos, disambiguation)
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (
                CODE_TO_ENGLISH_NAME[lang.split('_')[0]])
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        surfaceText = "[[%s]] %s [[%s (%s)]]" % (
            foreign, surfaceRel, english,
            disambiguation.split('/')[-1].replace('_', ' '))
        edge = make_edge(relation,
                         source,
                         target,
                         '/d/wiktionary/en/%s' % lang,
                         license=Licenses.cc_sharealike,
                         sources=[SOURCE, TRANSLATE],
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_translation(self, foreign, english, locale=''):
        if term_is_bad(foreign) or term_is_bad(english):
            return
        # Quick fix that drops definitions written in Lojban syntax
        if self.langcode == 'jbo' and re.search(r'x[1-5]', english):
            return
        source = normalized_concept_uri(self.langcode + locale, foreign)
        target = normalized_concept_uri('en', english)
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (
                CODE_TO_ENGLISH_NAME[self.langcode.split('_')[0]])
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
        edge = make_edge(relation,
                         source,
                         target,
                         '/d/wiktionary/en/%s' % self.langcode,
                         license=Licenses.cc_sharealike,
                         sources=[SOURCE, INTERLINGUAL],
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)
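
FindTranslations is a SAX ContentHandler, so it is meant to be driven by a SAX parser over a Wiktionary XML dump. A typical invocation would look like the sketch below; the dump filename is a placeholder, and closing the writer assumes the close() method shown in the other examples on this page.

import xml.sax

handler = FindTranslations(output_file='wiktionary.json')
# The parser streams the dump and calls startElement / characters / endElement
# on the handler for every page it encounters.
xml.sax.parse('enwiktionary-pages-articles.xml', handler)
handler.writer.close()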
Code Example #17
File: verbosity.py Project: artpar/conceptnet5
def handle_file(infile, outfile):
    count = 0
    outcomes = defaultdict(int)

    sources = ['/s/site/verbosity']

    writer = JSONStreamWriter(outfile)

    for line in open(infile):
        parts = line.strip().split('\t')
        if not parts:
            outcomes['blank'] += 1
            continue

        # The first 5 columns of the Verbosity output file are:
        #
        #   left: the word being clued
        #   relation: the relation between the word and the clue that the
        #             clue-giver chose, in a form such as "it is part of"
        #   right: the one or two words used as the clue
        #   freq: the number of different times this clue was given
        #   orderscore: the average position in the list of clues
        #
        # 'orderscore' is a number from 0 to 999, representing the average
        # quantile of its position in the list of clues. (It's like a
        # percentile, except there are 1000 of them, not 100.)
        #
        # A clue that's always given first has an orderscore of 0. A clue
        # that always appears halfway through the list has an orderscore of
        # 500.
        #
        # This may seem like a strange thing to measure, and I didn't come up
        # with it, but it actually turns out to be somewhat informative.
        # A clue with an orderscore of 0 is probably a good common-sense
        # relation, representing the first thing that comes to mind. A clue
        # with a high order score may be a move of desperation after several
        # other clues have failed. It causes the guesser to get the answer
        # soon afterward, but perhaps because it's a "cheating" move. So,
        # low orderscores represent better common sense relations.
        left, relation, right, freq, orderscore = parts[:5]
        freq = int(freq)
        orderscore = int(orderscore)

        # Test each word 
        flagged = False
        for rword in right.split():
            if BAD_CLUE_REGEX.match(rword):
                flagged = True
                break

        if flagged:
            outcomes['flag word'] += 1
            continue
        if len(right) < 3:
            outcomes['clue too short'] += 1
            continue
        if len(right.split()[-1]) == 1:
            outcomes['letter'] += 1
            continue

        # The Verbosity interface and gameplay did not particularly encourage
        # players to choose an appropriate relation. In practice, players seem
        # to have used them all interchangeably, except for the negative
        # relation "it is the opposite of", expressing /r/Antonym.
        #
        # Another way that players expressed negative relations was to use
        # 'not' as the first word of their clue; we make that into an instance
        # of /r/Antonym as well.
        #
        # In other cases, the relation is a positive relation, so we replace it
        # with the most general positive relation, /r/RelatedTo.
        rel = '/r/RelatedTo'
        reltext = 'is related to'
        if right.startswith('not '):
            rel = '/r/Antonym'
            right = right[4:]
            reltext = 'is not'
        if relation == 'it is the opposite of':
            rel = '/r/Antonym'
            reltext = 'is the opposite of'

        # The "sounds-like score" determines whether this clue seems to be a
        # pun or rhyme, rather than an actual common-sense relationship. If
        # the sounds-like score is over 0.35, skip the assertion.
        sls = sounds_like_score(left, right)
        if sls > 0.35:
            outcomes['text similarity'] += 1
            continue
        
        # Calculate a score for the assertion:
        #
        #   - The number of times it's been used as a clue
        #   - ...with a linear penalty for a high sounds-like score
        #   - ...and a linear penalty for high orderscores
        #
        # The penalties are multiplicative factors from 0 to 1, which decrease
        # linearly as the relevant penalties increase. If a clue is given N
        # times, with a sounds-like score of 0 and an orderscore of 0, it will
        # get an overall score of 2N - 1. This is a formula we should probably
        # revisit.
        #
        # The weight is the score divided by 100. All divisions are floating
        # point, as defined by the __future__ import at the top of this module.
        score = (freq * 2 - 1) * (1 - sls) * (1 - orderscore / 1000)
        if score <= 0.5:
            outcomes['low score'] += 1
            continue

        weight = score / 100

        # If the clue on the right is a two-word phrase, we make additional
        # connections to both words individually. We label them with the
        # rule-based source '/s/rule/split_words' to track that this happened.
        rightwords = [right]
        if ' ' in right:
            morewords = [word for word in right.split(' ') if word not in STOPWORDS]
            rightwords.extend(morewords)

        for i, rightword in enumerate(rightwords):
            edge_sources = list(sources)
            if i > 0:
                edge_sources.append('/s/rule/split_words')
            
            # Build the natural-language-ish surface text for this clue
            text = '[[%s]] %s [[%s]]' % (left, reltext, rightword)
            
            count += 1
            outcomes['success'] += 1
            
            leftc = normalized_concept_uri('en', left)
            rightc = normalized_concept_uri('en', rightword)
            edge = make_edge(rel, leftc, rightc, dataset='/d/verbosity',
                             license=Licenses.cc_attribution,
                             sources=edge_sources, surfaceText=text,
                             weight=weight)
            writer.write(edge)

    # Report the counts of the various outcomes. This can be used as a
    # sanity check; it was also used for a graph in a ConceptNet 5 paper.
    print("Verbosity outcomes: %s" % outcomes)
Code Example #18
File: wiktionary_en.py Project: artpar/conceptnet5
class FindTranslations(ContentHandler):
    def __init__(self, output_file='wiktionary.json'):
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None
        self.curTitle = ''
        self.curText = ''
        self.locales = []
        self.curRelation = None
        self.writer = JSONStreamWriter(output_file)

    def startElement(self, name, attrs):
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False

    def characters(self, text):
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out
                self.inArticle = False

    def handleArticle(self, title, text):
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)
        if line.startswith('===') and line.endswith('==='):
            pos = line.strip('= ')
            if pos == 'Synonyms':
                self.curRelation = 'Synonym'
            elif pos in ('Antonym', 'Antonyms'):
                self.curRelation = 'Antonym'
            elif pos == 'Related terms':
                self.curRelation = 'RelatedTo'
            elif pos == 'Derived terms':
                if not line.startswith('===='):
                    # this is at the same level as the part of speech;
                    # now we don't know what POS these apply to
                    self.pos = None
                self.curRelation = 'DerivedFrom'
            else:
                self.curRelation = None
                if pos in PARTS_OF_SPEECH:
                    self.pos = PARTS_OF_SPEECH[pos]
        elif language_match:
            self.lang = language_match.group(1)
            self.langcode = LANGUAGES.get(self.lang)
        elif chinese_match:
            scripttag = chinese_match.group(2)
            self.locales = []
            if 's' in scripttag:
                self.locales.append('_CN')
            if 't' in scripttag:
                self.locales.append('_TW')
        elif line[0:1] == '#' and self.lang != 'English' and self.lang is not None:
            defn = line[1:].strip()
            if defn[0:1] not in ':*#':
                for defn2 in filter_line(defn):
                    if not ascii_enough(defn2): continue
                    if 'Index:' in title: continue
                    if self.langcode == 'zh':
                        for locale in self.locales:
                            self.output_translation(title, defn2, locale)
                    elif self.langcode:
                        self.output_translation(title, defn2)
        elif line[0:4] == '----':
            self.pos = None
            self.lang = None
            self.langcode = None
            self.curRelation = None
        elif trans_top_match:
            pos = self.pos or 'n'
            sense = trans_top_match.group(1).split(';')[0].strip('.')
            if 'translations' in sense.lower():
                self.curSense = None
            else:
                self.curSense = (pos, sense)
        elif trans_tag_match:
            lang = trans_tag_match.group(1)
            translation = trans_tag_match.group(2)
            if self.curSense is not None and self.lang == 'English':
                # handle Chinese separately
                if lang not in ('cmn', 'yue', 'zh-yue', 'zh'):
                    self.output_sense_translation(lang, translation, title,
                                                  self.curSense)
        elif '{{trans-bottom}}' in line:
            self.curSense = None
        elif line.startswith('* ') and self.curRelation and self.langcode:
            relatedmatch = WIKILINK.search(line)
            if relatedmatch:
                related = relatedmatch.group(1)
                self.output_monolingual(self.langcode, self.curRelation,
                                        related, title)

    def output_monolingual(self, lang, relation, term1, term2):
        if term_is_bad(term1) or term_is_bad(term2):
            return
        source = normalized_concept_uri(lang, term1)
        if self.pos:
            target = normalized_concept_uri(lang, term2, self.pos)
        else:
            target = normalized_concept_uri(lang, term2)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)

        edge = make_edge('/r/'+relation, source, target, '/d/wiktionary/%s/%s' % (lang, lang),
                         license=Licenses.cc_sharealike,
                         sources=[SOURCE, MONOLINGUAL],
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, english, sense):
        pos, disambiguation = sense
        if 'Wik' in foreign or 'Wik' in english or term_is_bad(foreign) or term_is_bad(english):
            return
        # Quick fix that drops definitions written in Lojban syntax
        if lang == 'jbo' and re.search(r'x[1-5]', english):
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = normalized_concept_uri(
          lang, unicodedata.normalize('NFKC', foreign)
        )
        target = normalized_concept_uri(
          'en', english, pos, disambiguation
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (CODE_TO_ENGLISH_NAME[lang.split('_')[0]])
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        surfaceText = "[[%s]] %s [[%s (%s)]]" % (foreign, surfaceRel, english, disambiguation.split('/')[-1].replace('_', ' '))
        edge = make_edge(relation, source, target, '/d/wiktionary/en/%s' % lang,
                         license=Licenses.cc_sharealike,
                         sources=[SOURCE, TRANSLATE],
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_translation(self, foreign, english, locale=''):
        if term_is_bad(foreign) or term_is_bad(english):
            return
        # Quick fix that drops definitions written in Lojban syntax
        if self.langcode == 'jbo' and re.search(r'x[1-5]', english):
            return
        source = normalized_concept_uri(
            self.langcode + locale,
            foreign
        )
        target = normalized_concept_uri(
          'en', english
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (CODE_TO_ENGLISH_NAME[self.langcode.split('_')[0]])
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
        edge = make_edge(relation, source, target, '/d/wiktionary/en/%s' % self.langcode,
                         license=Licenses.cc_sharealike,
                         sources=[SOURCE, INTERLINGUAL],
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)
Code Example #19
def run_wordnet(input_dir, output_file, sw_map_file):
    out = JSONStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    synset_senses = defaultdict(list)
    sense_synsets = {}

    labels = {}
    glossary = {}
    concept_map = {}
    sense_to_synset = {}

    # Parse lines such as:
    #   wn30:synset-Aeolian-noun-2 rdfs:label "Aeolian"@en-us .
    for subj, rel, obj, objtag in reader.parse_file(
            os.path.join(input_dir, 'wordnet-synset.ttl')):
        if resource_name(rel) == 'label':
            # Everything in WordNet is in English
            assert objtag == 'en'
            labels[subj] = obj

    for subj, rel, obj, objtag in reader.parse_file(
            os.path.join(input_dir, 'wordnet-glossary.ttl')):
        if resource_name(rel) == 'gloss':
            assert objtag == 'en'

            # Take the definition up to the first semicolon
            text = obj.split(';')[0]

            # Remove introductory phrases with a colon
            text = text.split(': ', 1)[-1]

            # Remove parenthesized expressions
            while True:
                newtext = re.sub(r'\(.+?\) ?', '', text).strip()
                if newtext == text or newtext == '':
                    break
                else:
                    text = newtext

            glossary[subj] = text.replace('/', '_')

    # Get the list of word senses in each synset, and make a bidirectional mapping.
    #
    # Example line:
    #   wn30:synset-Aeolian-noun-2 wn20schema:containsWordSense wn30:wordsense-Aeolian-noun-2 .
    for subj, rel, obj, objtag in reader.parse_file(
            os.path.join(input_dir,
                         'full/wordnet-wordsense-synset-relations.ttl')):
        if resource_name(rel) == 'containsWordSense':
            synset_senses[subj].append(obj)
            sense_synsets[obj] = subj

    # Assign every synset to a disambiguated concept.
    for synset in synset_senses:
        synset_name = labels[synset]
        synset_pos = synset.split('-')[-2]
        pos = PARTS_OF_SPEECH[synset_pos]
        disambig = glossary[synset]

        concept = normalized_concept_uri('en', synset_name, pos, disambig)
        concept_map[synset] = concept

    # Map senses to their synsets.
    for sense, synset in sense_synsets.items():
        sense_to_synset[sense] = synset

    for filename in ('wordnet-attribute.ttl', 'wordnet-causes.ttl',
                     'wordnet-classifiedby.ttl', 'wordnet-entailment.ttl',
                     'wordnet-hyponym.ttl', 'wordnet-instances.ttl',
                     'wordnet-membermeronym.ttl', 'wordnet-partmeronym.ttl',
                     'wordnet-sameverbgroupas.ttl', 'wordnet-similarity.ttl',
                     'wordnet-substancemeronym.ttl',
                     'full/wordnet-antonym.ttl',
                     'full/wordnet-derivationallyrelated.ttl',
                     'full/wordnet-participleof.ttl',
                     'full/wordnet-pertainsto.ttl',
                     'full/wordnet-seealso.ttl'):
        filepath = os.path.join(input_dir, filename)
        if os.path.exists(filepath):
            for web_subj, web_rel, web_obj, objtag in reader.parse_file(
                    filepath):
                # If this relation involves word senses, map them to their synsets
                # first.
                if web_subj in sense_to_synset:
                    web_subj = sense_to_synset[web_subj]
                if web_obj in sense_to_synset:
                    web_obj = sense_to_synset[web_obj]
                subj = concept_map[web_subj]
                obj = concept_map[web_obj]
                pred_label = resource_name(web_rel)
                if pred_label in REL_MAPPING:
                    mapped_rel = REL_MAPPING[pred_label]

                    # Handle WordNet relations that are the reverse of ConceptNet
                    # relations. Change the word 'meronym' to 'holonym' if
                    # necessary.
                    if mapped_rel.startswith('~'):
                        subj, obj = obj, subj
                        web_subj, web_obj = web_obj, web_subj
                        web_rel = web_rel.replace('meronym', 'holonym')
                        mapped_rel = mapped_rel[1:]
                    rel = join_uri('r', mapped_rel)
                else:
                    rel = join_uri('r', 'wordnet', pred_label)

                map_out.write_link(web_rel, full_conceptnet_url(rel))
                map_out.write_link(web_subj, full_conceptnet_url(subj))
                map_out.write_link(web_obj, full_conceptnet_url(obj))
                edge = make_edge(rel,
                                 subj,
                                 obj,
                                 dataset='/d/wordnet/3.0',
                                 license='/l/CC/By',
                                 sources=SOURCE,
                                 weight=2.0)
                out.write(edge)
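
The '~' prefix handled above marks WordNet relations whose direction is the reverse of the corresponding ConceptNet relation, so subject and object are swapped before the edge is written. A hypothetical fragment of REL_MAPPING illustrating that convention (the keys and values here are invented for illustration and may not match the real table):

# Hypothetical illustration only; the real REL_MAPPING may differ.
REL_MAPPING_EXAMPLE = {
    'hyponymOf': 'IsA',            # same direction: subj IsA obj
    'partMeronymOf': '~PartOf',    # reversed: the object is part of the subject,
                                   # so subj and obj are swapped before emitting /r/PartOf
}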
Code Example #20
File: wordnet.py Project: artpar/conceptnet5
def run_wordnet(input_dir, output_file, sw_map_file):
    out = JSONStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    synset_senses = defaultdict(list)
    sense_synsets = {}

    labels = {}
    glossary = {}
    concept_map = {}
    sense_to_synset = {}

    # Parse lines such as:
    #   wn30:synset-Aeolian-noun-2 rdfs:label "Aeolian"@en-us .
    for subj, rel, obj, objtag in reader.parse_file(os.path.join(input_dir, 'wordnet-synset.ttl')):
        if resource_name(rel) == 'label':
            # Everything in WordNet is in English
            assert objtag == 'en'
            labels[subj] = obj

    for subj, rel, obj, objtag in reader.parse_file(os.path.join(input_dir, 'wordnet-glossary.ttl')):
        if resource_name(rel) == 'gloss':
            assert objtag == 'en'

            # Take the definition up to the first semicolon
            text = obj.split(';')[0]

            # Remove introductory phrases with a colon
            text = text.split(': ', 1)[-1]

            # Remove parenthesized expressions
            while True:
                newtext = re.sub(r'\(.+?\) ?', '', text).strip()
                if newtext == text or newtext == '':
                    break
                else:
                    text = newtext

            glossary[subj] = text.replace('/', '_')

    # Get the list of word senses in each synset, and make a bidirectional mapping.
    #
    # Example line:
    #   wn30:synset-Aeolian-noun-2 wn20schema:containsWordSense wn30:wordsense-Aeolian-noun-2 .
    for subj, rel, obj, objtag in reader.parse_file(os.path.join(input_dir, 'full/wordnet-wordsense-synset-relations.ttl')):
        if resource_name(rel) == 'containsWordSense':
            synset_senses[subj].append(obj)
            sense_synsets[obj] = subj

    # Assign every synset to a disambiguated concept.
    for synset in synset_senses:
        synset_name = labels[synset]
        synset_pos = synset.split('-')[-2]
        pos = PARTS_OF_SPEECH[synset_pos]
        disambig = glossary[synset]

        concept = normalized_concept_uri('en', synset_name, pos, disambig)
        concept_map[synset] = concept

    # Map senses to their synsets.
    for sense, synset in sense_synsets.items():
        sense_to_synset[sense] = synset

    for filename in (
        'wordnet-attribute.ttl', 'wordnet-causes.ttl',
        'wordnet-classifiedby.ttl', 'wordnet-entailment.ttl',
        'wordnet-hyponym.ttl', 'wordnet-instances.ttl',
        'wordnet-membermeronym.ttl', 'wordnet-partmeronym.ttl',
        'wordnet-sameverbgroupas.ttl', 'wordnet-similarity.ttl',
        'wordnet-substancemeronym.ttl', 'full/wordnet-antonym.ttl',
        'full/wordnet-derivationallyrelated.ttl',
        'full/wordnet-participleof.ttl',
        'full/wordnet-pertainsto.ttl',
        'full/wordnet-seealso.ttl'
    ):
        filepath = os.path.join(input_dir, filename)
        if os.path.exists(filepath):
            for web_subj, web_rel, web_obj, objtag in reader.parse_file(filepath):
                # If this relation involves word senses, map them to their synsets
                # first.
                if web_subj in sense_to_synset:
                    web_subj = sense_to_synset[web_subj]
                if web_obj in sense_to_synset:
                    web_obj = sense_to_synset[web_obj]
                subj = concept_map[web_subj]
                obj = concept_map[web_obj]
                pred_label = resource_name(web_rel)
                if pred_label in REL_MAPPING:
                    mapped_rel = REL_MAPPING[pred_label]

                    # Handle WordNet relations that are the reverse of ConceptNet
                    # relations. Change the word 'meronym' to 'holonym' if
                    # necessary.
                    if mapped_rel.startswith('~'):
                        subj, obj = obj, subj
                        web_subj, web_obj = web_obj, web_subj
                        web_rel = web_rel.replace('meronym', 'holonym')
                        mapped_rel = mapped_rel[1:]
                    rel = join_uri('r', mapped_rel)
                else:
                    rel = join_uri('r', 'wordnet', pred_label)

                map_out.write_link(web_rel, full_conceptnet_url(rel))
                map_out.write_link(web_subj, full_conceptnet_url(subj))
                map_out.write_link(web_obj, full_conceptnet_url(obj))
                edge = make_edge(
                    rel, subj, obj, dataset='/d/wordnet/3.0',
                    license='/l/CC/By', sources=SOURCE, weight=2.0
                )
                out.write(edge)
Code Example #21
def build_from_dir(dirname, output_file):
    """
    Read a GlobalMind database exported in YAML files, translate
    it into ConceptNet 5 edges, and write those edges to disk using
    a JSONStreamWriter.
    """
    out = JSONStreamWriter(output_file)
    userdata = yaml.load_all(open(dirname + '/GMUser.yaml'))
    users = {}

    for userinfo in userdata:
        users[userinfo['pk']] = userinfo

    frame_data = yaml.load_all(open(dirname + '/GMFrame.yaml'))
    frames = {}
    for frame in frame_data:
        frames[frame['pk']] = frame['fields']

    assertiondata = yaml.load_all(open(dirname + '/GMAssertion.yaml'))
    assertions = {}
    for assertion in assertiondata:
        obj = assertion['fields']
        frame = frames[obj['frame']]
        frametext = frame['text']
        userinfo = users[obj['author']]
        username = userinfo['fields']['username']

        # GlobalMind provides information about what country the user is from, which
        # we can preserve in the contributor URI.
        #
        # If I got to re-choose these URIs, I would distinguish usernames with
        # a country code from those without a country code by something more
        # than the number of slashes, and I would write the country code in
        # capital letters.
        userlocale = userinfo['fields']['ccode'].lower()
        if userlocale:
            user_source = "/s/contributor/globalmind/%s/%s" % (userlocale,
                                                               username)
        else:
            user_source = "/s/contributor/globalmind/%s" % username

        sources = [user_source, "/s/activity/globalmind/assert"]

        lang = LANG_CODES[obj['lcode']]
        start = normalized_concept_uri(lang, obj['node1'])
        end = normalized_concept_uri(lang, obj['node2'])
        rel = '/r/' + RELATION_MAP.get(frame['relation'], frame['relation'])

        # fix messy english "around in"
        if ' around ' in frametext:
            if obj['node2'].startswith('in '):
                frametext = frametext.replace(' around ', ' in ')
                obj['node2'] = obj['node2'][3:]
            else:
                frametext = frametext.replace(' around ', ' near ')
                rel = '/r/LocatedNear'

        # fix more awkward English. I wonder how bad the other languages are.
        frametext = frametext.replace('hits your head', 'comes to mind')
        frametext = frametext.replace(': [node1], [node2]',
                                      ' [node1] and [node2]')

        node1 = u'[[' + obj['node1'] + u']]'
        node2 = u'[[' + obj['node2'] + u']]'
        surfaceText = frametext.replace('//', '').replace('[node1]', node1).replace('[node2]', node2)
        edge = make_edge(rel,
                         start,
                         end,
                         dataset='/d/globalmind',
                         license='/l/CC/By',
                         sources=sources,
                         surfaceText=surfaceText,
                         weight=1)
        out.write(edge)
        assertions[assertion['pk']] = edge

    translationdata = yaml.load_all(open(dirname + '/GMTranslation.yaml'))
    for translation in translationdata:
        obj = translation['fields']
        assertion1 = assertions[obj['assertion1']]
        assertion2 = assertions[obj['assertion2']]
        start = assertion1['uri']
        end = assertion2['uri']
        rel = '/r/TranslationOf'
        text1 = assertion1['surfaceText'].replace('[[', '').replace(']]', '')
        text2 = assertion2['surfaceText'].replace('[[', '').replace(']]', '')
        lang1 = LANG_NAMES[get_lang(assertion1)]
        lang2 = LANG_NAMES[get_lang(assertion2)]
        surfaceText = u"[[%s]] in %s means [[%s]] in %s." % (text1, lang1,
                                                             text2, lang2)
        userinfo = users[obj['author']]
        username = userinfo['fields']['username']

        userlocale = userinfo['fields']['ccode'].lower()
        if userlocale:
            user_source = "/s/contributor/globalmind/%s/%s" % (userlocale,
                                                               username)
        else:
            user_source = "/s/contributor/globalmind/%s" % username

        sources = [user_source, "/s/activity/globalmind/translate"]
        edge = make_edge(rel,
                         start,
                         end,
                         dataset='/d/globalmind',
                         license=Licenses.cc_attribution,
                         sources=sources,
                         surfaceText=surfaceText,
                         weight=1)
        out.write(edge)
Code Example #22
File: jmdict.py Project: vamsijkrishna/conceptnet5
def handle_file(filename, output_file):
    """
    JMdict is a Japanese translation dictionary, targeting multiple languages,
    released under a Creative Commons Attribution-ShareAlike license. That's
    awesome.

    It's released as a kind of annoying XML structure, using fancy XML features
    like entities, so in order to read it we need a full-blown XML parser. Python's
    built-in XML parsers are horrifying, so here we use the 'xmltodict' module, which
    is also horrifying but gets the job done.

    The majorly weird thing about xmltodict that we have to work around is that
    it gives results of different *types* when you get 0, 1, or many child nodes.
    This is what get_list is for.
    """
    # Read the XML file as UTF-8, and parse it into a dictionary.
    file = codecs.open(filename, encoding='utf-8')
    out = JSONStreamWriter(output_file)
    data = file.read()
    file.close()
    xml = xmltodict.parse(data)

    # The dictionary contains a list of <entry> tags.
    root_node = xml['JMdict']
    for entry in get_list(root_node, 'entry'):
        # From JMdict's documentation: "The kanji element, or in its absence,
        # the reading element, is the defining component of each entry."
        #
        # Quick summary of what's going on here: most Japanese words can be
        # written using kanji or kana.
        #
        # Kana are phonetic characters. Every word can be written in kana, in
        # one of two alphabets (hiragana or katakana). Words that are homonyms
        # have the same kana, unless they're written in different alphabets.
        #
        # Kanji are Chinese-based characters that are related to the meaning of
        # the word. They're compact and good at disambiguating homonyms, so
        # kanji are usually used as the canonical representation of a word.
        # However, some words have no kanji.
        #
        # The kana version of a word written in kanji is called its 'reading'.
        # Words that are pronounced differently in different contexts have
        # multiple readings.
        #
        # Okay, done with the intro to Japanese orthography. In JMdict, if
        # a word can be written in kanji, it has a <k_ele> element, containing
        # a <keb> element that contains the text. Every word also has an
        # <r_ele> element, containing one or more <reb> elements that are phonetic
        # readings of the word.
        #
        # We get the "defining text" of a word by taking its <keb> if it exists,
        # or all of its <reb>s if not. There's no way to tell which <reb> is the
        # most "defining" in the case where there's no <keb>.
        headwords = [word['keb'] for word in get_list(entry, 'k_ele')]
        if not headwords:
            headwords = [word['reb'] for word in get_list(entry, 'r_ele')]

        # An entry might have different word senses that are translated
        # differently to other languages. Ideally, we'd remember that they're
        # different senses. However, we have no way to refer to the different
        # senses. So for now, we disregard word senses. One day we might have
        # a better overall plan for word senses in ConceptNet.
        for sense in get_list(entry, 'sense'):
            # Glosses are translations of the word to different languages.
            # If the word is a loan-word, the foreign word it was derived from
            # will be marked with the <lsource> tag instead of <gloss>.
            #
            # Get all the glosses, including the lsource if it's there.
            glosses = get_list(sense, 'gloss') + get_list(sense, 'lsource')
            for gloss in glosses:
                text = lang = None
                if '#text' in gloss:
                    # A gloss node might be marked with a 'lang' attribute. If so,
                    # xmltodict represents it as a dictionary with '#text' and
                    # '@xml:lang' elements.
                    text = parse_gloss(gloss['#text'])
                    lang = convert_lang_code(gloss['@xml:lang'])
                elif isinstance(gloss, STRING_TYPE):
                    # If there's no 'lang' attribute, the gloss is in English,
                    # and xmltodict gives it to us as a plain Unicode string.
                    lang = 'en'
                    text = parse_gloss(gloss)

                # If we parsed the node at all and the text looks good, then we can
                # add edges to ConceptNet.
                #
                # We don't want to deal with texts with periods (these might be
                # dictionary-style abbreviations, which are sort of unhelpful when
                # we can't expand them), and we also don't want to deal with texts
                # that are more than five words long.
                if (text is not None and '.' not in text
                        and text.count(' ') <= 4
                        and text not in BAD_NAMES_FOR_THINGS):
                    for head in headwords:
                        ja_concept = normalized_concept_uri('ja', head)
                        other_concept = normalized_concept_uri(lang, text)
                        output_edge(out, ja_concept, other_concept)
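
get_list, used throughout this function, is the helper the docstring alludes to: it normalizes xmltodict's habit of returning nothing, a single dict, or a list depending on how many child nodes were present. A minimal sketch of that behaviour follows (the helper in the actual module may differ in detail):

def get_list(node, tag):
    """Return node's children under `tag` as a list, whether xmltodict
    parsed zero, one, or many of them."""
    found = node.get(tag, [])
    if isinstance(found, list):
        return found
    return [found]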