Example no. 1
def handle_file(input_filename, output_file):
    out = JSONStreamWriter(output_file)
    for line in codecs.open(input_filename, encoding='utf-8'):
        line = line.strip()
        if line:
            for new_obj in handle_raw_assertion(line):
                out.write(new_obj)
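Every example in this section writes its output through JSONStreamWriter, whose implementation is not included here. As a point of reference, here is a minimal sketch consistent with how the examples use it (a filename or stream passed to the constructor, write() per object, close() at the end); this is an assumption for illustration, not the actual ConceptNet class:

import codecs
import json

class JSONStreamWriter(object):
    # Minimal sketch (assumed): write one JSON object per line.
    def __init__(self, filename_or_stream):
        if hasattr(filename_or_stream, 'write'):
            self.stream = filename_or_stream
        else:
            self.stream = codecs.open(filename_or_stream, 'w', encoding='utf-8')

    def write(self, obj):
        self.stream.write(json.dumps(obj, ensure_ascii=False) + '\n')

    def close(self):
        self.stream.close()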
Example no. 2
def msgpack_to_json(input_filename, output_filename):
    """
    Convert a msgpack stream to a JSON stream (with one object per line).
    """
    out_stream = JSONStreamWriter(output_filename)
    for obj in read_msgpack_stream(input_filename):
        out_stream.write(obj)
    out_stream.close()
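read_msgpack_stream is the reading counterpart used throughout these examples but is not shown. Assuming the input is a standard concatenated msgpack stream, a sketch based on the msgpack-python library's Unpacker would be:

import msgpack

def read_msgpack_stream(filename):
    # Assumed reader: yield each object in a concatenated msgpack stream.
    # raw=False decodes msgpack strings to Python str.
    with open(filename, 'rb') as stream:
        for obj in msgpack.Unpacker(stream, raw=False):
            yield obj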
Example no. 3
def test_json_to_msgpack():
    with TemporaryDirectory(prefix='conceptnet-test') as tmpdir:
        json_path = os.path.join(tmpdir, 'test.jsons')
        msgpack_path = os.path.join(tmpdir, 'test.msgpack')

        writer = JSONStreamWriter(json_path)
        for item in DATA:
            writer.write(item)
        writer.close()

        json_to_msgpack(json_path, msgpack_path)
        reader = read_msgpack_stream(msgpack_path)
        for known, read in zip_longest(DATA, reader):
            eq_(known, read)
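Here eq_ comes from nose.tools and zip_longest from itertools; zip_longest makes a length mismatch between DATA and the reader fail loudly, because the shorter side gets padded with None. The json_to_msgpack function under test is not shown; a sketch of what it presumably does, with msgpack.packb standing in for whatever writer the real code uses, is:

import json
import msgpack

def json_to_msgpack(json_path, msgpack_path):
    # Assumed inverse of msgpack_to_json: one JSON object per input line,
    # each one packed onto the msgpack stream.
    with open(json_path, encoding='utf-8') as json_file, \
         open(msgpack_path, 'wb') as msgpack_file:
        for line in json_file:
            line = line.strip()
            if line:
                msgpack_file.write(msgpack.packb(json.loads(line), use_bin_type=True))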
Example no. 4
def build_from_dir(dirname, output_file):
    """
    Read a GlobalMind database exported in YAML files, translate
    it into ConceptNet 5 edges, and write those edges to disk using
    a JSONStreamWriter.
    """
    out = JSONStreamWriter(output_file)
    userdata = yaml.load_all(open(dirname + '/GMUser.yaml'))
    users = {}

    for userinfo in userdata:
        users[userinfo['pk']] = userinfo

    frame_data = yaml.load_all(open(dirname + '/GMFrame.yaml'))
    frames = {}
    for frame in frame_data:
        frames[frame['pk']] = frame['fields']

    assertiondata = yaml.load_all(open(dirname + '/GMAssertion.yaml'))
    assertions = {}
    for assertion in assertiondata:
        obj = assertion['fields']
        frame = frames[obj['frame']]
        frametext = frame['text']
        userinfo = users[obj['author']]
        username = userinfo['fields']['username']

        # GlobalMind provides information about what country the user is from, which
        # we can preserve in the contributor URI.
        #
        # If I got to re-choose these URIs, I would distinguish usernames with
        # a country code from those without a country code by something more
        # than the number of slashes, and I would write the country code in
        # capital letters.
        userlocale = userinfo['fields']['ccode'].lower()
        if userlocale:
            user_source = "/s/contributor/globalmind/%s/%s" % (userlocale, username)
        else:
            user_source = "/s/contributor/globalmind/%s" % username

        sources = [
            user_source,
            "/s/activity/globalmind/assert"
        ]

        lang = LANG_CODES[obj['lcode']]
        start = normalized_concept_uri(lang, obj['node1'])
        end = normalized_concept_uri(lang, obj['node2'])
        rel = '/r/' + RELATION_MAP.get(frame['relation'], frame['relation'])

        # fix messy English "around in"
        if ' around ' in frametext:
            if obj['node2'].startswith('in '):
                frametext = frametext.replace(' around ', ' in ')
                obj['node2'] = obj['node2'][3:]
            else:
                frametext = frametext.replace(' around ', ' near ')
                rel = '/r/LocatedNear'

        # fix more awkward English. I wonder how bad the other languages are.
        frametext = frametext.replace('hits your head', 'comes to mind')
        frametext = frametext.replace(': [node1], [node2]', ' [node1] and [node2]')

        node1 = u'[[' + obj['node1'] + u']]'
        node2 = u'[[' + obj['node2'] + u']]'
        surfaceText = frametext.replace('//', '').replace('[node1]', node1).replace('[node2]', node2)
        edge = make_edge(rel, start, end,
                         dataset='/d/globalmind',
                         license='/l/CC/By',
                         sources=sources,
                         surfaceText=surfaceText,
                         weight=1)
        out.write(edge)
        assertions[assertion['pk']] = edge

    translationdata = yaml.load_all(open(dirname + '/GMTranslation.yaml'))
    for translation in translationdata:
        obj = translation['fields']
        assertion1 = assertions[obj['assertion1']]
        assertion2 = assertions[obj['assertion2']]
        start = assertion1['uri']
        end = assertion2['uri']
        rel = '/r/TranslationOf'
        text1 = assertion1['surfaceText'].replace('[[', '').replace(']]', '')
        text2 = assertion2['surfaceText'].replace('[[', '').replace(']]', '')
        lang1 = LANG_NAMES[get_lang(assertion1)]
        lang2 = LANG_NAMES[get_lang(assertion2)]
        surfaceText = u"[[%s]] in %s means [[%s]] in %s." % (text1, lang1, text2, lang2)
        userinfo = users[obj['author']]
        username = userinfo['fields']['username']

        userlocale = userinfo['fields']['ccode'].lower()
        if userlocale:
            user_source = "/s/contributor/globalmind/%s/%s" % (userlocale, username)
        else:
            user_source = "/s/contributor/globalmind/%s" % username

        sources = [
            user_source,
            "/s/activity/globalmind/translate"
        ]
        edge = make_edge(rel, start, end,
                         dataset='/d/globalmind',
                         license=Licenses.cc_attribution,
                         sources=sources,
                         surfaceText=surfaceText,
                         weight=1)
        out.write(edge)
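The translation loop calls get_lang, which is not defined in this excerpt. Assuming make_edge produces a dictionary whose 'start' field is a concept URI of the form /c/<lang>/<term>, a plausible sketch is:

def get_lang(assertion):
    # Hypothetical helper: take the language code from the start concept,
    # e.g. '/c/ja/...' -> 'ja'. The 'start' field is an assumption.
    return assertion['start'].split('/')[2]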
Example no. 5
def convert_to_json(input_filename, output_filename):
    out_stream = JSONStreamWriter(output_filename)
    for obj in read_msgpack_stream(input_filename):
        out_stream.write(obj)
    out_stream.close()
Example no. 6
class FindTranslations(ContentHandler):
    def __init__(self, output_file='wiktionary.json'):
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None
        self.curTitle = ''
        self.curText = ''
        self.locales = []
        self.curRelation = None
        self.writer = JSONStreamWriter(output_file)

    def startElement(self, name, attrs):
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False

    def characters(self, text):
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out
                self.inArticle = False

    def handleArticle(self, title, text):
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)
        if line.startswith('===') and line.endswith('==='):
            pos = line.strip('= ')
            if pos == 'Synonyms':
                self.curRelation = 'Synonym'
            elif pos == 'Antonyms':
                self.curRelation = 'Antonym'
            elif pos == 'Related terms':
                self.curRelation = 'RelatedTo'
            elif pos == 'Derived terms':
                if not line.startswith('===='):
                    # this is at the same level as the part of speech;
                    # now we don't know what POS these apply to
                    self.pos = None
                self.curRelation = 'DerivedFrom'
            else:
                self.curRelation = None
                if pos in PARTS_OF_SPEECH:
                    self.pos = PARTS_OF_SPEECH[pos]
        elif language_match:
            self.lang = language_match.group(1)
            self.langcode = LANGUAGES.get(self.lang)
        elif chinese_match:
            scripttag = chinese_match.group(2)
            self.locales = []
            if 's' in scripttag:
                self.locales.append('_CN')
            if 't' in scripttag:
                self.locales.append('_TW')
        elif line[0:1] == '#' and self.lang != 'English' and self.lang is not None:
            defn = line[1:].strip()
            if defn[0:1] not in ':*#':
                for defn2 in filter_line(defn):
                    if not ascii_enough(defn2): continue
                    if 'Index:' in title: continue
                    if self.langcode == 'zh':
                        for locale in self.locales:
                            self.output_translation(title, defn2, locale)
                    elif self.langcode:
                        self.output_translation(title, defn2)
        elif line[0:4] == '----':
            self.pos = None
            self.lang = None
            self.langcode = None
            self.curRelation = None
        elif trans_top_match:
            pos = self.pos or 'n'
            sense = trans_top_match.group(1).split(';')[0].strip('.')
            if 'translations' in sense.lower():
                self.curSense = None
            else:
                self.curSense = (pos, sense)
        elif trans_tag_match:
            lang = trans_tag_match.group(1)
            translation = trans_tag_match.group(2)
            if self.curSense is not None and self.lang == 'English':
                # handle Chinese separately
                if lang not in ('cmn', 'yue', 'zh-yue', 'zh'):
                    self.output_sense_translation(lang, translation, title,
                                                  self.curSense)
        elif '{{trans-bottom}}' in line:
            self.curSense = None
        elif line.startswith('* ') and self.curRelation and self.langcode:
            relatedmatch = WIKILINK.search(line)
            if relatedmatch:
                related = relatedmatch.group(1)
                self.output_monolingual(self.langcode, self.curRelation,
                                        related, title)

    def output_monolingual(self, lang, relation, term1, term2):
        if term_is_bad(term1) or term_is_bad(term2):
            return
        source = normalized_concept_uri(lang, term1)
        if self.pos:
            target = normalized_concept_uri(lang, term2, self.pos)
        else:
            target = normalized_concept_uri(lang, term2)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)

        edge = make_edge('/r/' + relation,
                         source,
                         target,
                         '/d/wiktionary/%s/%s' % (lang, lang),
                         license=Licenses.cc_sharealike,
                         sources=[SOURCE, MONOLINGUAL],
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, english, sense):
        pos, disambiguation = sense
        if ('Wik' in foreign or 'Wik' in english
                or term_is_bad(foreign) or term_is_bad(english)):
            return
        # Quick fix that drops definitions written in Lojban syntax
        if lang == 'jbo' and re.search(r'x[1-5]', english):
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = normalized_concept_uri(lang,
                                        unicodedata.normalize('NFKC', foreign))
        target = normalized_concept_uri('en', english, pos, disambiguation)
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (
                CODE_TO_ENGLISH_NAME[lang.split('_')[0]])
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        surfaceText = "[[%s]] %s [[%s (%s)]]" % (
            foreign, surfaceRel, english,
            disambiguation.split('/')[-1].replace('_', ' '))
        edge = make_edge(relation,
                         source,
                         target,
                         '/d/wiktionary/en/%s' % lang,
                         license=Licenses.cc_sharealike,
                         sources=[SOURCE, TRANSLATE],
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_translation(self, foreign, english, locale=''):
        if term_is_bad(foreign) or term_is_bad(english):
            return
        # Quick fix that drops definitions written in Lojban syntax
        if self.langcode == 'jbo' and re.search(r'x[1-5]', english):
            return
        source = normalized_concept_uri(self.langcode + locale, foreign)
        target = normalized_concept_uri('en', english)
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (
                CODE_TO_ENGLISH_NAME[self.langcode.split('_')[0]])
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
        edge = make_edge(relation,
                         source,
                         target,
                         '/d/wiktionary/en/%s' % self.langcode,
                         license=Licenses.cc_sharealike,
                         sources=[SOURCE, INTERLINGUAL],
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)
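FindTranslations is a SAX ContentHandler, so it is driven by an XML parser rather than called directly. A typical invocation over a Wiktionary dump (the filename here is illustrative) would be:

import xml.sax

handler = FindTranslations(output_file='wiktionary.json')
xml.sax.parse('enwiktionary-pages-articles.xml', handler)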
Example no. 7
class FindTranslations(ContentHandler):
    def __init__(self, output_file='wiktionary.json'):
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None
        self.curTitle = ''
        self.curText = ''
        self.locales = []
        self.curRelation = None
        self.writer = JSONStreamWriter(output_file)

    def startElement(self, name, attrs):
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False

    def characters(self, text):
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out
                self.inArticle = False

    def handleArticle(self, title, text):
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)
        if line.startswith('===') and line.endswith('==='):
            pos = line.strip('= ')
            if pos == 'Synonyms':
                self.curRelation = 'Synonym'
            elif pos == 'Antonyms':
                self.curRelation = 'Antonym'
            elif pos == 'Related terms':
                self.curRelation = 'RelatedTo'
            elif pos == 'Derived terms':
                if not line.startswith('===='):
                    # this is at the same level as the part of speech;
                    # now we don't know what POS these apply to
                    self.pos = None
                self.curRelation = 'DerivedFrom'
            else:
                self.curRelation = None
                if pos in PARTS_OF_SPEECH:
                    self.pos = PARTS_OF_SPEECH[pos]
        elif language_match:
            self.lang = language_match.group(1)
            self.langcode = LANGUAGES.get(self.lang)
        elif chinese_match:
            scripttag = chinese_match.group(2)
            self.locales = []
            if 's' in scripttag:
                self.locales.append('_CN')
            if 't' in scripttag:
                self.locales.append('_TW')
        elif line[0:1] == '#' and self.lang != 'English' and self.lang is not None:
            defn = line[1:].strip()
            if defn[0:1] not in ':*#':
                for defn2 in filter_line(defn):
                    if not ascii_enough(defn2): continue
                    if 'Index:' in title: continue
                    if self.langcode == 'zh':
                        for locale in self.locales:
                            self.output_translation(title, defn2, locale)
                    elif self.langcode:
                        self.output_translation(title, defn2)
        elif line[0:4] == '----':
            self.pos = None
            self.lang = None
            self.langcode = None
            self.curRelation = None
        elif trans_top_match:
            pos = self.pos or 'n'
            sense = trans_top_match.group(1).split(';')[0].strip('.')
            if 'translations' in sense.lower():
                self.curSense = None
            else:
                self.curSense = (pos, sense)
        elif trans_tag_match:
            lang = trans_tag_match.group(1)
            translation = trans_tag_match.group(2)
            if self.curSense is not None and self.lang == 'English':
                # handle Chinese separately
                if lang not in ('cmn', 'yue', 'zh-yue', 'zh'):
                    self.output_sense_translation(lang, translation, title,
                                                  self.curSense)
        elif '{{trans-bottom}}' in line:
            self.curSense = None
        elif line.startswith('* ') and self.curRelation and self.langcode:
            relatedmatch = WIKILINK.search(line)
            if relatedmatch:
                related = relatedmatch.group(1)
                self.output_monolingual(self.langcode, self.curRelation,
                                        related, title)

    def output_monolingual(self, lang, relation, term1, term2):
        if term_is_bad(term1) or term_is_bad(term2):
            return
        source = normalized_concept_uri(lang, term1)
        if self.pos:
            target = normalized_concept_uri(lang, term2, self.pos)
        else:
            target = normalized_concept_uri(lang, term2)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)

        edge = make_edge('/r/'+relation, source, target, '/d/wiktionary/%s/%s' % (lang, lang),
                         license=Licenses.cc_sharealike,
                         sources=[SOURCE, MONOLINGUAL],
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, english, sense):
        pos, disambiguation = sense
        if 'Wik' in foreign or 'Wik' in english or term_is_bad(foreign) or term_is_bad(english):
            return
        # Quick fix that drops definitions written in Lojban syntax
        if lang == 'jbo' and re.search(r'x[1-5]', english):
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = normalized_concept_uri(
          lang, unicodedata.normalize('NFKC', foreign)
        )
        target = normalized_concept_uri(
          'en', english, pos, disambiguation
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (CODE_TO_ENGLISH_NAME[lang.split('_')[0]])
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        surfaceText = "[[%s]] %s [[%s (%s)]]" % (foreign, surfaceRel, english, disambiguation.split('/')[-1].replace('_', ' '))
        edge = make_edge(relation, source, target, '/d/wiktionary/en/%s' % lang,
                         license=Licenses.cc_sharealike,
                         sources=[SOURCE, TRANSLATE],
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_translation(self, foreign, english, locale=''):
        if term_is_bad(foreign) or term_is_bad(english):
            return
        # Quick fix that drops definitions written in Lojban syntax
        if self.langcode == 'jbo' and re.search(r'x[1-5]', english):
            return
        source = normalized_concept_uri(
            self.langcode + locale,
            foreign
        )
        target = normalized_concept_uri(
          'en', english
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (CODE_TO_ENGLISH_NAME[self.langcode.split('_')[0]])
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
        edge = make_edge(relation, source, target, '/d/wiktionary/en/%s' % self.langcode,
                         license=Licenses.cc_sharealike,
                         sources=[SOURCE, INTERLINGUAL],
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)
Example no. 8
def handle_file(infile, outfile):
    count = 0
    outcomes = defaultdict(int)

    sources = ['/s/site/verbosity']

    writer = JSONStreamWriter(outfile)

    for line in open(infile):
        parts = line.strip().split('\t')
        if not parts:
            outcomes['blank'] += 1
            continue

        # The first 5 columns of the Verbosity output file are:
        #
        #   left: the word being clued
        #   relation: the relation between the word and the clue that the
        #             clue-giver chose, in a form such as "it is part of"
        #   right: the one or two words used as the clue
        #   freq: the number of different times this clue was given
        #   orderscore: the average position in the list of clues
        #
        # 'orderscore' is a number from 0 to 999, representing the average
        # quantile of its position in the list of clues. (It's like a
        # percentile, except there are 1000 of them, not 100.)
        #
        # A clue that's always given first has an orderscore of 0. A clue
        # that always appears halfway through the list has an orderscore of
        # 500.
        #
        # This may seem like a strange thing to measure, and I didn't come up
        # with it, but it actually turns out to be somewhat informative.
        # A clue with an orderscore of 0 is probably a good common-sense
        # relation, representing the first thing that comes to mind. A clue
        # with a high order score may be a move of desperation after several
        # other clues have failed. It causes the guesser to get the answer
        # soon afterward, but perhaps because it's a "cheating" move. So,
        # low orderscores represent better common sense relations.
        left, relation, right, freq, orderscore = parts[:5]
        freq = int(freq)
        orderscore = int(orderscore)

        # Test each word of the clue against the bad-clue pattern
        flagged = False
        for rword in right.split():
            if BAD_CLUE_REGEX.match(rword):
                flagged = True
                break

        if flagged:
            outcomes['flag word'] += 1
            continue
        if len(right) < 3:
            outcomes['clue too short'] += 1
            continue
        if len(right.split()[-1]) == 1:
            outcomes['letter'] += 1
            continue

        # The Verbosity interface and gameplay did not particularly encourage
        # players to choose an appropriate relation. In practice, players seem
        # to have used them all interchangeably, except for the negative
        # relation "it is the opposite of", expressing /r/Antonym.
        #
        # Another way that players expressed negative relations was to use
        # 'not' as the first word of their clue; we make that into an instance
        # of /r/Antonym as well.
        #
        # In other cases, the relation is a positive relation, so we replace it
        # with the most general positive relation, /r/RelatedTo.
        rel = '/r/RelatedTo'
        reltext = 'is related to'
        if right.startswith('not '):
            rel = '/r/Antonym'
            right = right[4:]
            reltext = 'is not'
        if relation == 'it is the opposite of':
            rel = '/r/Antonym'
            reltext = 'is the opposite of'

        # The "sounds-like score" determines whether this clue seems to be a
        # pun or rhyme, rather than an actual common-sense relationship. If
        # the sounds-like score is over 0.35, skip the assertion.
        sls = sounds_like_score(left, right)
        if sls > 0.35:
            outcomes['text similarity'] += 1
            continue
        
        # Calculate a score for the assertion:
        #
        #   - The number of times it's been used as a clue
        #   - ...with a linear penalty for a high sounds-like score
        #   - ...and a linear penalty for high orderscores
        #
        # The penalties are multiplicative factors from 0 to 1, which decrease
        # linearly as the relevant penalties increase. If a clue is given N
        # times, with a sounds-like score of 0 and an orderscore of 0, it will
        # get an overall score of 2N - 1. This is a formula we should probably
        # revisit.
        #
        # The weight is the score divided by 100. All divisions are floating
        # point, as defined by the __future__ import at the top of this module.
        score = (freq * 2 - 1) * (1 - sls) * (1 - orderscore / 1000)
        if score <= 0.5:
            outcomes['low score'] += 1
            continue

        weight = score / 100

        # If the clue on the right is a two-word phrase, we make additional
        # connections to both words individually. We label them with the
        # rule-based source '/s/rule/split_words' to track that this happened.
        rightwords = [right]
        if ' ' in right:
            morewords = [word for word in right.split(' ') if word not in STOPWORDS]
            rightwords.extend(morewords)

        for i, rightword in enumerate(rightwords):
            edge_sources = list(sources)
            if i > 0:
                edge_sources.append('/s/rule/split_words')
            
            # Build the natural-language-ish surface text for this clue
            text = '[[%s]] %s [[%s]]' % (left, reltext, rightword)
            
            count += 1
            outcomes['success'] += 1
            
            leftc = normalized_concept_uri('en', left)
            rightc = normalized_concept_uri('en', rightword)
            edge = make_edge(rel, leftc, rightc, dataset='/d/verbosity',
                             license=Licenses.cc_attribution,
                             sources=edge_sources, surfaceText=text,
                             weight=weight)
            writer.write(edge)

    # Count the various outcomes. This can be used as a sanity-check. It
    # also was used for a graph in a ConceptNet 5 paper.
    print("Verbosity outcomes: %s" % outcomes)
Example no. 9
def run_wordnet(input_dir, output_file, sw_map_file):
    out = JSONStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    synset_senses = defaultdict(list)
    sense_synsets = {}

    labels = {}
    glossary = {}
    concept_map = {}
    sense_to_synset = {}

    # Parse lines such as:
    #   wn30:synset-Aeolian-noun-2 rdfs:label "Aeolian"@en-us .
    for subj, rel, obj, objtag in reader.parse_file(os.path.join(input_dir, 'wordnet-synset.ttl')):
        if resource_name(rel) == 'label':
            # Everything in WordNet is in English
            assert objtag == 'en'
            labels[subj] = obj

    for subj, rel, obj, objtag in reader.parse_file(os.path.join(input_dir, 'wordnet-glossary.ttl')):
        if resource_name(rel) == 'gloss':
            assert objtag == 'en'

            # Take the definition up to the first semicolon
            text = obj.split(';')[0]

            # Remove introductory phrases with a colon
            text = text.split(': ', 1)[-1]

            # Remove parenthesized expressions
            while True:
                newtext = re.sub(r'\(.+?\) ?', '', text).strip()
                if newtext == text or newtext == '':
                    break
                else:
                    text = newtext

            glossary[subj] = text.replace('/', '_')

    # Get the list of word senses in each synset, and make a bidirectional mapping.
    #
    # Example line:
    #   wn30:synset-Aeolian-noun-2 wn20schema:containsWordSense wn30:wordsense-Aeolian-noun-2 .
    for subj, rel, obj, objtag in reader.parse_file(os.path.join(input_dir, 'full/wordnet-wordsense-synset-relations.ttl')):
        if resource_name(rel) == 'containsWordSense':
            synset_senses[subj].append(obj)
            sense_synsets[obj] = subj

    # Assign every synset to a disambiguated concept.
    for synset in synset_senses:
        synset_name = labels[synset]
        synset_pos = synset.split('-')[-2]
        pos = PARTS_OF_SPEECH[synset_pos]
        disambig = glossary[synset]

        concept = normalized_concept_uri('en', synset_name, pos, disambig)
        concept_map[synset] = concept

    # Map senses to their synsets.
    for sense, synset in sense_synsets.items():
        sense_to_synset[sense] = synset

    for filename in (
        'wordnet-attribute.ttl', 'wordnet-causes.ttl',
        'wordnet-classifiedby.ttl', 'wordnet-entailment.ttl',
        'wordnet-hyponym.ttl', 'wordnet-instances.ttl',
        'wordnet-membermeronym.ttl', 'wordnet-partmeronym.ttl',
        'wordnet-sameverbgroupas.ttl', 'wordnet-similarity.ttl',
        'wordnet-substancemeronym.ttl', 'full/wordnet-antonym.ttl',
        'full/wordnet-derivationallyrelated.ttl',
        'full/wordnet-participleof.ttl',
        'full/wordnet-pertainsto.ttl',
        'full/wordnet-seealso.ttl'
    ):
        filepath = os.path.join(input_dir, filename)
        if os.path.exists(filepath):
            for web_subj, web_rel, web_obj, objtag in reader.parse_file(filepath):
                # If this relation involves word senses, map them to their synsets
                # first.
                if web_subj in sense_to_synset:
                    web_subj = sense_to_synset[web_subj]
                if web_obj in sense_to_synset:
                    web_obj = sense_to_synset[web_obj]
                subj = concept_map[web_subj]
                obj = concept_map[web_obj]
                pred_label = resource_name(web_rel)
                if pred_label in REL_MAPPING:
                    mapped_rel = REL_MAPPING[pred_label]

                    # Handle WordNet relations that are the reverse of ConceptNet
                    # relations. Change the word 'meronym' to 'holonym' if
                    # necessary.
                    if mapped_rel.startswith('~'):
                        subj, obj = obj, subj
                        web_subj, web_obj = web_obj, web_subj
                        web_rel = web_rel.replace('meronym', 'holonym')
                        mapped_rel = mapped_rel[1:]
                    rel = join_uri('r', mapped_rel)
                else:
                    rel = join_uri('r', 'wordnet', pred_label)

                map_out.write_link(web_rel, full_conceptnet_url(rel))
                map_out.write_link(web_subj, full_conceptnet_url(subj))
                map_out.write_link(web_obj, full_conceptnet_url(obj))
                edge = make_edge(
                    rel, subj, obj, dataset='/d/wordnet/3.0',
                    license='/l/CC/By', sources=SOURCE, weight=2.0
                )
                out.write(edge)
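resource_name, used above to match predicates such as 'label' and 'containsWordSense', is not defined in this excerpt. Assuming it extracts the local name of an RDF resource URI, a sketch would be:

def resource_name(url):
    # Hypothetical helper: keep only the text after the last '#' or '/',
    # e.g. 'http://www.w3.org/2000/01/rdf-schema#label' -> 'label'.
    return url.split('#')[-1].split('/')[-1]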
Example no. 10
def build_from_dir(dirname, output_file):
    """
    Read a GlobalMind database exported in YAML files, translate
    it into ConceptNet 5 edges, and write those edges to disk using
    a JSONStreamWriter.
    """
    out = JSONStreamWriter(output_file)
    userdata = yaml.load_all(open(dirname + '/GMUser.yaml'))
    users = {}

    for userinfo in userdata:
        users[userinfo['pk']] = userinfo

    frame_data = yaml.load_all(open(dirname + '/GMFrame.yaml'))
    frames = {}
    for frame in frame_data:
        frames[frame['pk']] = frame['fields']

    assertiondata = yaml.load_all(open(dirname + '/GMAssertion.yaml'))
    assertions = {}
    for assertion in assertiondata:
        obj = assertion['fields']
        frame = frames[obj['frame']]
        frametext = frame['text']
        userinfo = users[obj['author']]
        username = userinfo['fields']['username']

        # GlobalMind provides information about what country the user is from, which
        # we can preserve in the contributor URI.
        #
        # If I got to re-choose these URIs, I would distinguish usernames with
        # a country code from those without a country code by something more
        # than the number of slashes, and I would write the country code in
        # capital letters.
        userlocale = userinfo['fields']['ccode'].lower()
        if userlocale:
            user_source = "/s/contributor/globalmind/%s/%s" % (userlocale,
                                                               username)
        else:
            user_source = "/s/contributor/globalmind/%s" % username

        sources = [user_source, "/s/activity/globalmind/assert"]

        lang = LANG_CODES[obj['lcode']]
        start = normalized_concept_uri(lang, obj['node1'])
        end = normalized_concept_uri(lang, obj['node2'])
        rel = '/r/' + RELATION_MAP.get(frame['relation'], frame['relation'])

        # fix messy English "around in"
        if ' around ' in frametext:
            if obj['node2'].startswith('in '):
                frametext = frametext.replace(' around ', ' in ')
                obj['node2'] = obj['node2'][3:]
            else:
                frametext = frametext.replace(' around ', ' near ')
                rel = '/r/LocatedNear'

        # fix more awkward English. I wonder how bad the other languages are.
        frametext = frametext.replace('hits your head', 'comes to mind')
        frametext = frametext.replace(': [node1], [node2]',
                                      ' [node1] and [node2]')

        node1 = u'[[' + obj['node1'] + u']]'
        node2 = u'[[' + obj['node2'] + u']]'
        surfaceText = frametext.replace('//', '').replace('[node1]',
                                                          node1).replace(
                                                              '[node2]', node2)
        edge = make_edge(rel,
                         start,
                         end,
                         dataset='/d/globalmind',
                         license='/l/CC/By',
                         sources=sources,
                         surfaceText=surfaceText,
                         weight=1)
        out.write(edge)
        assertions[assertion['pk']] = edge

    translationdata = yaml.load_all(open(dirname + '/GMTranslation.yaml'))
    for translation in translationdata:
        obj = translation['fields']
        assertion1 = assertions[obj['assertion1']]
        assertion2 = assertions[obj['assertion2']]
        start = assertion1['uri']
        end = assertion2['uri']
        rel = '/r/TranslationOf'
        text1 = assertion1['surfaceText'].replace('[[', '').replace(']]', '')
        text2 = assertion2['surfaceText'].replace('[[', '').replace(']]', '')
        lang1 = LANG_NAMES[get_lang(assertion1)]
        lang2 = LANG_NAMES[get_lang(assertion2)]
        surfaceText = u"[[%s]] in %s means [[%s]] in %s." % (text1, lang1,
                                                             text2, lang2)
        userinfo = users[obj['author']]
        username = userinfo['fields']['username']

        userlocale = userinfo['fields']['ccode'].lower()
        if userlocale:
            user_source = "/s/contributor/globalmind/%s/%s" % (userlocale,
                                                               username)
        else:
            user_source = "/s/contributor/globalmind/%s" % username

        sources = [user_source, "/s/activity/globalmind/translate"]
        edge = make_edge(rel,
                         start,
                         end,
                         dataset='/d/globalmind',
                         license=Licenses.cc_attribution,
                         sources=sources,
                         surfaceText=surfaceText,
                         weight=1)
        out.write(edge)
Example no. 11
def transform_file(self, input_filename, output_file):
    out = JSONStreamWriter(output_file)
    for obj in read_json_stream(input_filename):
        for new_obj in self.handle_assertion(obj):
            out.write(new_obj)
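read_json_stream, the reading side of this pipeline, is not shown. Assuming it mirrors JSONStreamWriter's one-object-per-line format, a minimal version would be:

import json

def read_json_stream(filename):
    # Assumed reader: yield one decoded object per non-blank line.
    with open(filename, encoding='utf-8') as stream:
        for line in stream:
            line = line.strip()
            if line:
                yield json.loads(line)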
Example no. 12
def msgpack_to_json(input_filename, output_filename):
    out_stream = JSONStreamWriter(output_filename)
    for obj in read_msgpack_stream(input_filename):
        out_stream.write(obj)
    out_stream.close()
Example no. 13
def handle_file(infile, outfile):
    count = 0
    outcomes = defaultdict(int)

    sources = ['/s/site/verbosity']

    writer = JSONStreamWriter(outfile)

    for line in open(infile):
        parts = line.strip().split('\t')
        if not parts:
            outcomes['blank'] += 1
            continue

        # The first 5 columns of the Verbosity output file are:
        #
        #   left: the word being clued
        #   relation: the relation between the word and the clue that the
        #             clue-giver chose, in a form such as "it is part of"
        #   right: the one or two words used as the clue
        #   freq: the number of different times this clue was given
        #   orderscore: the average position in the list of clues
        #
        # 'orderscore' is a number from 0 to 999, representing the average
        # quantile of its position in the list of clues. (It's like a
        # percentile, except there are 1000 of them, not 100.)
        #
        # A clue that's always given first has an orderscore of 0. A clue
        # that always appears halfway through the list has an orderscore of
        # 500.
        #
        # This may seem like a strange thing to measure, and I didn't come up
        # with it, but it actually turns out to be somewhat informative.
        # A clue with an orderscore of 0 is probably a good common-sense
        # relation, representing the first thing that comes to mind. A clue
        # with a high order score may be a move of desperation after several
        # other clues have failed. It causes the guesser to get the answer
        # soon afterward, but perhaps because it's a "cheating" move. So,
        # low orderscores represent better common sense relations.
        left, relation, right, freq, orderscore = parts[:5]
        freq = int(freq)
        orderscore = int(orderscore)

        # Test each word of the clue against the bad-clue pattern
        flagged = False
        for rword in right.split():
            if BAD_CLUE_REGEX.match(rword):
                flagged = True
                break

        if flagged:
            outcomes['flag word'] += 1
            continue
        if len(right) < 3:
            outcomes['clue too short'] += 1
            continue
        if len(right.split()[-1]) == 1:
            outcomes['letter'] += 1
            continue

        # The Verbosity interface and gameplay did not particularly encourage
        # players to choose an appropriate relation. In practice, players seem
        # to have used them all interchangeably, except for the negative
        # relation "it is the opposite of", expressing /r/Antonym.
        #
        # Another way that players expressed negative relations was to use
        # 'not' as the first word of their clue; we make that into an instance
        # of /r/Antonym as well.
        #
        # In other cases, the relation is a positive relation, so we replace it
        # with the most general positive relation, /r/RelatedTo.
        rel = '/r/RelatedTo'
        reltext = 'is related to'
        if right.startswith('not '):
            rel = '/r/Antonym'
            right = right[4:]
            reltext = 'is not'
        if relation == 'it is the opposite of':
            rel = '/r/Antonym'
            reltext = 'is the opposite of'

        # The "sounds-like score" determines whether this clue seems to be a
        # pun or rhyme, rather than an actual common-sense relationship. If
        # the sounds-like score is over 0.35, skip the assertion.
        sls = sounds_like_score(left, right)
        if sls > 0.35:
            outcomes['text similarity'] += 1
            continue

        # Calculate a score for the assertion:
        #
        #   - The number of times it's been used as a clue
        #   - ...with a linear penalty for a high sounds-like score
        #   - ...and a linear penalty for high orderscores
        #
        # The penalties are multiplicative factors from 0 to 1, which decrease
        # linearly as the relevant penalties increase. If a clue is given N
        # times, with a sounds-like score of 0 and an orderscore of 0, it will
        # get an overall score of 2N - 1. This is a formula we should probably
        # revisit.
        #
        # The weight is the score divided by 100. All divisions are floating
        # point, as defined by the __future__ import at the top of this module.
        score = (freq * 2 - 1) * (1 - sls) * (1 - orderscore / 1000)
        if score <= 0.5:
            outcomes['low score'] += 1
            continue

        weight = score / 100

        # If the clue on the right is a two-word phrase, we make additional
        # connections to both words individually. We label them with the
        # rule-based source '/s/rule/split_words' to track that this happened.
        rightwords = [right]
        if ' ' in right:
            morewords = [
                word for word in right.split(' ') if word not in STOPWORDS
            ]
            rightwords.extend(morewords)

        for i, rightword in enumerate(rightwords):
            edge_sources = list(sources)
            if i > 0:
                edge_sources.append('/s/rule/split_words')

            # Build the natural-language-ish surface text for this clue
            text = '[[%s]] %s [[%s]]' % (left, reltext, rightword)

            count += 1
            outcomes['success'] += 1

            leftc = normalized_concept_uri('en', left)
            rightc = normalized_concept_uri('en', rightword)
            edge = make_edge(rel,
                             leftc,
                             rightc,
                             dataset='/d/verbosity',
                             license=Licenses.cc_attribution,
                             sources=edge_sources,
                             surfaceText=text,
                             weight=weight)
            writer.write(edge)

    # Count the various outcomes. This can be used as a sanity-check. It
    # also was used for a graph in a ConceptNet 5 paper.
    print("Verbosity outcomes: %s" % outcomes)
Example no. 14
def run_wordnet(input_dir, output_file, sw_map_file):
    out = JSONStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    synset_senses = defaultdict(list)
    sense_synsets = {}

    labels = {}
    glossary = {}
    concept_map = {}
    sense_to_synset = {}

    # Parse lines such as:
    #   wn30:synset-Aeolian-noun-2 rdfs:label "Aeolian"@en-us .
    for subj, rel, obj, objtag in reader.parse_file(
            os.path.join(input_dir, 'wordnet-synset.ttl')):
        if resource_name(rel) == 'label':
            # Everything in WordNet is in English
            assert objtag == 'en'
            labels[subj] = obj

    for subj, rel, obj, objtag in reader.parse_file(
            os.path.join(input_dir, 'wordnet-glossary.ttl')):
        if resource_name(rel) == 'gloss':
            assert objtag == 'en'

            # Take the definition up to the first semicolon
            text = obj.split(';')[0]

            # Remove introductory phrases with a colon
            text = text.split(': ', 1)[-1]

            # Remove parenthesized expressions
            while True:
                newtext = re.sub(r'\(.+?\) ?', '', text).strip()
                if newtext == text or newtext == '':
                    break
                else:
                    text = newtext

            glossary[subj] = text.replace('/', '_')

    # Get the list of word senses in each synset, and make a bidirectional mapping.
    #
    # Example line:
    #   wn30:synset-Aeolian-noun-2 wn20schema:containsWordSense wn30:wordsense-Aeolian-noun-2 .
    for subj, rel, obj, objtag in reader.parse_file(
            os.path.join(input_dir,
                         'full/wordnet-wordsense-synset-relations.ttl')):
        if resource_name(rel) == 'containsWordSense':
            synset_senses[subj].append(obj)
            sense_synsets[obj] = subj

    # Assign every synset to a disambiguated concept.
    for synset in synset_senses:
        synset_name = labels[synset]
        synset_pos = synset.split('-')[-2]
        pos = PARTS_OF_SPEECH[synset_pos]
        disambig = glossary[synset]

        concept = normalized_concept_uri('en', synset_name, pos, disambig)
        concept_map[synset] = concept

    # Map senses to their synsets.
    for sense, synset in sense_synsets.items():
        sense_to_synset[sense] = synset

    for filename in ('wordnet-attribute.ttl', 'wordnet-causes.ttl',
                     'wordnet-classifiedby.ttl', 'wordnet-entailment.ttl',
                     'wordnet-hyponym.ttl', 'wordnet-instances.ttl',
                     'wordnet-membermeronym.ttl', 'wordnet-partmeronym.ttl',
                     'wordnet-sameverbgroupas.ttl', 'wordnet-similarity.ttl',
                     'wordnet-substancemeronym.ttl',
                     'full/wordnet-antonym.ttl',
                     'full/wordnet-derivationallyrelated.ttl',
                     'full/wordnet-participleof.ttl',
                     'full/wordnet-pertainsto.ttl',
                     'full/wordnet-seealso.ttl'):
        filepath = os.path.join(input_dir, filename)
        if os.path.exists(filepath):
            for web_subj, web_rel, web_obj, objtag in reader.parse_file(
                    filepath):
                # If this relation involves word senses, map them to their synsets
                # first.
                if web_subj in sense_to_synset:
                    web_subj = sense_to_synset[web_subj]
                if web_obj in sense_to_synset:
                    web_obj = sense_to_synset[web_obj]
                subj = concept_map[web_subj]
                obj = concept_map[web_obj]
                pred_label = resource_name(web_rel)
                if pred_label in REL_MAPPING:
                    mapped_rel = REL_MAPPING[pred_label]

                    # Handle WordNet relations that are the reverse of ConceptNet
                    # relations. Change the word 'meronym' to 'holonym' if
                    # necessary.
                    if mapped_rel.startswith('~'):
                        subj, obj = obj, subj
                        web_subj, web_obj = web_obj, web_subj
                        web_rel = web_rel.replace('meronym', 'holonym')
                        mapped_rel = mapped_rel[1:]
                    rel = join_uri('r', mapped_rel)
                else:
                    rel = join_uri('r', 'wordnet', pred_label)

                map_out.write_link(web_rel, full_conceptnet_url(rel))
                map_out.write_link(web_subj, full_conceptnet_url(subj))
                map_out.write_link(web_obj, full_conceptnet_url(obj))
                edge = make_edge(rel,
                                 subj,
                                 obj,
                                 dataset='/d/wordnet/3.0',
                                 license='/l/CC/By',
                                 sources=SOURCE,
                                 weight=2.0)
                out.write(edge)