Esempio n. 1
0
def handle_file(input_filename, output_file):
    """
    Build objects from each line of `input_filename` using a CN4Builder and
    write them to `output_file` as a msgpack stream.
    """
    builder = CN4Builder(weight=0.05)
    out = MsgpackStreamWriter(output_file)
    # Context manager ensures the input file is closed even on error; the
    # original leaked both the file handle and the writer.
    with open(input_filename, encoding='utf-8') as stream:
        for line in stream:
            for new_obj in handle_line(line, builder):
                out.write(new_obj)
    out.close()
Esempio n. 2
0
def handle_file(input_filename, output_file):
    """
    Parse raw assertions, one per non-blank line of `input_filename`, and
    write the resulting objects to `output_file` as a msgpack stream.
    """
    out = MsgpackStreamWriter(output_file)
    # Context manager closes the input file; the original leaked both the
    # file handle and the writer.
    with codecs.open(input_filename, encoding='utf-8') as stream:
        for line in stream:
            line = line.strip()
            if line:
                for new_obj in handle_raw_assertion(line):
                    out.write(new_obj)
    out.close()
Esempio n. 3
0
def handle_file(input_filename, output_file):
    """
    Build objects from each line of `input_filename` using a CN4Builder and
    write them to `output_file` as a msgpack stream.
    """
    builder = CN4Builder()
    out = MsgpackStreamWriter(output_file)
    # Context manager ensures the input file is closed even on error; the
    # original leaked both the file handle and the writer.
    with open(input_filename, encoding='utf-8') as stream:
        for line in stream:
            for new_obj in handle_line(line, builder):
                out.write(new_obj)
    out.close()
Esempio n. 4
0
def handle_file(input_filename, output_file):
    """
    Parse raw assertions, one per non-blank line of `input_filename`, and
    write the resulting objects to `output_file` as a msgpack stream.
    """
    out = MsgpackStreamWriter(output_file)
    # Context manager closes the input file; the original leaked both the
    # file handle and the writer.
    with codecs.open(input_filename, encoding='utf-8') as stream:
        for line in stream:
            line = line.strip()
            if line:
                for new_obj in handle_raw_assertion(line):
                    out.write(new_obj)
    out.close()
Esempio n. 5
0
def json_to_msgpack(input_filename, output_filename):
    """
    Convert a JSON stream (with one object per line) to a msgpack stream.
    """
    writer = MsgpackStreamWriter(output_filename)
    for item in read_json_stream(input_filename):
        writer.write(item)
    writer.close()
Esempio n. 6
0
def handle_file(input_file, output_file):
    """
    Read an XML annotation file and write one edge per (codepoint, word)
    pair to `output_file` as a msgpack stream.
    """
    tree = ET.parse(input_file)
    out = MsgpackStreamWriter(output_file)
    root = tree.getroot()
    # The language code is stored at position [1] within child node [0].
    lang = root[0][1].attrib['type']
    for annotation in root[1]:
        for word in strip_words(annotation.text):
            start = standardized_concept_uri('mul', annotation.attrib['cp'])
            end = standardized_concept_uri(lang, word)
            edge = make_edge(REL, start, end, DATASET, LICENSE, SOURCE)
            out.write(edge)
    # Close the stream; the original leaked the writer (a sibling version of
    # this function in the same codebase closes it).
    out.close()
Esempio n. 7
0
def test_msgpack_to_json():
    """Round-trip check: write DATA as msgpack, convert to JSON, read back."""
    with TemporaryDirectory(prefix='conceptnet-test') as tmpdir:
        # NOTE(review): the extensions look swapped relative to the contents
        # (msgpack is written to the .jsons path) — harmless for temp files.
        source_path = os.path.join(tmpdir, 'test.jsons')
        converted_path = os.path.join(tmpdir, 'test.msgpack')

        writer = MsgpackStreamWriter(source_path)
        for entry in DATA:
            writer.write(entry)
        writer.close()

        msgpack_to_json(source_path, converted_path)
        for expected, actual in zip_longest(DATA, read_json_stream(converted_path)):
            eq_(expected, actual)
Esempio n. 8
0
def run_wiktionary(input_file, output_file, titledb=None, language='en',
                   verbosity=0, logger=None):
    """
    Parse structured Wiktionary entries from a msgpack stream and write the
    edges extracted by the language-specific semantics to `output_file`.

    If `titledb` is None it defaults to 'titles.db' next to the input file.
    With verbosity >= 1 each edge is printed; >= 2 also enables tracing.
    """
    if titledb is None:
        titledb = os.path.dirname(input_file) + '/titles.db'

    trace = (verbosity >= 2)
    sem = SEMANTICS[language](language, titledb=titledb, trace=trace,
                              logger=logger)
    output = MsgpackStreamWriter(output_file)
    for structure in read_msgpack_stream(input_file):
        for edge in sem.parse_structured_entry(structure):
            if verbosity >= 1:
                print(edge['rel'], edge['start'], edge['end'])
            output.write(edge)
    # Flush and close the output stream; the original leaked the writer.
    output.close()
Esempio n. 9
0
def combine_assertions(input_filename, output_file):
    """
    Group the rows of the sorted, tab-separated file `input_filename` into
    assertions and emit them as a msgpack stream to `output_file`.

    The input must be sorted so that all edges belonging to one assertion
    are adjacent; a single pass with itertools.groupby then suffices.
    Assertions with non-positive weight go to a parallel '.reject' stream.
    Each combined assertion keeps the dataset of its first edge and the
    strongest license being combined.
    """
    def uri_of(line):
        "The grouping key: the URI in the first tab-separated column."
        return line.split('\t', 1)[0]

    out = MsgpackStreamWriter(output_file)
    out_bad = MsgpackStreamWriter(output_file + '.reject')

    with open(input_filename, encoding='utf-8') as stream:
        for _uri, edge_lines in itertools.groupby(stream, uri_of):
            assertion = make_assertion(edge_lines)
            if assertion is not None:
                target = out if assertion['weight'] > 0 else out_bad
                target.write(assertion)

    out.close()
    out_bad.close()
def combine_assertions(input_filename, core_filename, output_filename):
    """
    Take in a tab-separated, sorted "CSV" files, indicated by
    `input_filename`, that should be grouped together into assertions.
    Output a msgpack stream of assertions to the file indicated by
    `output_filename`.

    The input file should be made from multiple sources of assertions by
    concatenating and sorting them.

    The combined assertions will all have the dataset of the first edge that
    produces them, and the license of the strongest license being combined.

    This process requires its input to be a sorted CSV so that all edges for
    the same assertion will appear consecutively.
    """

    def group_func(line):
        "Group lines by their URI (their first column)."
        return line.split('\t', 1)[0]

    out = MsgpackStreamWriter(output_filename)
    out_bad = MsgpackStreamWriter(output_filename + '.reject')

    # URIs (truncated to 3 components) considered core to ConceptNet;
    # ExternalURL edges whose start is outside this set are rejected below.
    core_prefixes = set()
    with open(core_filename, encoding='utf-8') as core_stream:
        for line in core_stream:
            core_prefixes.add(uri_prefix(line.strip(), 3))

    # Scan through the assertions twice to add derived words to the blocklist
    blocklist = Blocklist.load(get_support_data_filename(BLOCK_FILENAME))
    for _pass in range(2):  # renamed from 'iter', which shadowed the builtin
        with open(input_filename, encoding='utf-8') as stream:
            for line in stream:
                tmp_assertion = _make_assertion([line.strip()])
                if tmp_assertion is None:
                    continue
                blocklist.propagate_blocks(tmp_assertion)

    with open(input_filename, encoding='utf-8') as stream:
        for key, line_group in itertools.groupby(stream, group_func):
            assertion = _make_assertion(line_group)
            destination = out
            if assertion is None:
                continue
            if assertion['weight'] <= 0:
                destination = out_bad
            if blocklist.is_blocked(assertion):
                destination = out_bad
            if assertion['rel'] == 'ExternalURL':
                # discard ExternalURL edges for things that aren't otherwise
                # in ConceptNet
                prefix = uri_prefix(assertion['start'], 3)
                if prefix not in core_prefixes:
                    destination = out_bad
            destination.write(assertion)

    out.close()
    out_bad.close()
Esempio n. 11
0
 def __init__(self, output_dir, nfiles=20):
     """Open `nfiles` numbered msgpack shard writers plus a title database."""
     self.nfiles = nfiles
     self.writers = []
     for shard in range(nfiles):
         path = output_dir + '/wiktionary_%02d.msgpack' % shard
         self.writers.append(MsgpackStreamWriter(path))
     self.title_db = TitleDBWriter(output_dir + '/titles.db', clear=True)
Esempio n. 12
0
def handle_file(filename, output_file, sw_map_file):
    """
    Convert an N-Triples file into edges (written to `output_file` as a
    msgpack stream) and a Semantic Web mapping file (`sw_map_file`).
    """
    reader = NTriplesReader()
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    # Read binary and decode per line so '#' comment lines are skipped
    # cheaply; the context manager closes the file (the original leaked it).
    with open(filename, 'rb') as stream:
        for line in stream:
            if not line.startswith(b'#'):
                handle_triple(line.decode('utf-8').strip(), reader, out, map_out)
    out.close()
    # NOTE(review): map_out is left open, as in the original — confirm
    # whether NTriplesWriter needs an explicit close.
Esempio n. 13
0
def handle_file(input_filename, output_file):
    """
    Convert a tab-separated edge file into a msgpack stream of edges.
    """
    out = MsgpackStreamWriter(output_file)
    for line in open(input_filename, encoding='utf-8'):
        uri, start, rel, end, weight, source = line.rstrip('\n').split('\t')
        # Skip the header row, recognizable by its literal 'uri' column.
        if uri == 'uri':
            continue

        out.write(make_edge(
            rel=rel,
            start=start,
            end=end,
            dataset=DATASET,
            sources=[{'activity': SOURCE}],
            license=Licenses.cc_attribution,
            weight=WEIGHT_TABLE[weight],
        ))
Esempio n. 14
0
def handle_file(input_filename, output_file):
    """
    Convert a tab-separated edge file into a msgpack stream of edges.
    """
    out = MsgpackStreamWriter(output_file)
    for line in open(input_filename, encoding='utf-8'):
        parts = line.rstrip('\n').split('\t')
        uri, start, rel, end, weight, source = parts
        # Skip the header row. The original used `return` here, which
        # aborted the whole file at the header and dropped every edge;
        # the sibling implementation of this function uses `continue`.
        if uri == 'uri':
            continue

        edge = make_edge(rel=rel,
                         start=start,
                         end=end,
                         dataset=DATASET,
                         sources=[{
                             'activity': SOURCE
                         }],
                         license=Licenses.cc_attribution,
                         weight=WEIGHT_TABLE[weight])
        out.write(edge)
Esempio n. 15
0
def run_wiktionary(input_file,
                   output_file,
                   titledb=None,
                   language='en',
                   verbosity=0,
                   logger=None):
    """
    Parse structured Wiktionary entries from a msgpack stream and write the
    edges extracted by the language-specific semantics to `output_file`.

    If `titledb` is None it defaults to 'titles.db' next to the input file.
    With verbosity >= 1 each edge is printed; >= 2 also enables tracing.
    """
    if titledb is None:
        titledb = os.path.dirname(input_file) + '/titles.db'

    trace = (verbosity >= 2)
    sem = SEMANTICS[language](language,
                              titledb=titledb,
                              trace=trace,
                              logger=logger)
    output = MsgpackStreamWriter(output_file)
    for structure in read_msgpack_stream(input_file):
        for edge in sem.parse_structured_entry(structure):
            if verbosity >= 1:
                print(edge['rel'], edge['start'], edge['end'])
            output.write(edge)
    # Flush and close the output stream; the original leaked the writer.
    output.close()
Esempio n. 16
0
def combine_assertions(input_filename, output_file):
    """
    Group the rows of the sorted, tab-separated file `input_filename` into
    assertions and emit them as a msgpack stream to `output_file`.

    The input must be sorted so that all edges belonging to one assertion
    are adjacent; a single pass with itertools.groupby then suffices.
    Assertions with non-positive weight go to a parallel '.reject' stream.
    Each combined assertion keeps the dataset of its first edge and the
    strongest license being combined.
    """
    def uri_of(line):
        "The grouping key: the URI in the first tab-separated column."
        return line.split('\t', 1)[0]

    out = MsgpackStreamWriter(output_file)
    out_bad = MsgpackStreamWriter(output_file + '.reject')

    with open(input_filename, encoding='utf-8') as stream:
        for _uri, edge_lines in itertools.groupby(stream, uri_of):
            assertion = make_assertion(edge_lines)
            if assertion is not None:
                target = out if assertion['weight'] > 0 else out_bad
                target.write(assertion)

    out.close()
    out_bad.close()
Esempio n. 17
0
def subwords_to_edges(language, input, output):
    """
    Morfessor hypothesizes ways to break words into sub-word chunks. Produce
    edges from these sub-words that can be used in retrofitting.
    """
    writer = MsgpackStreamWriter(output)
    for raw_line in input:
        raw_line = raw_line.rstrip()
        # Blank lines and '#' comments carry no segmentation data.
        if not raw_line or raw_line.startswith('#'):
            continue

        # Remove the unnecessary count ("1 ") from the start of each line
        segmentation = raw_line.split(' ', 1)[1]
        pieces = segmentation.split(' + ')

        # Strip a possible trailing underscore, which would particularly show
        # up in the way we segment ATOMIC_SPACE_LANGUAGES (Vietnamese)
        whole_word = ''.join(pieces).strip('_')
        end = join_uri('c', language, whole_word)
        for piece in pieces:
            if piece == '_':
                continue
            start = join_uri('x', language, piece.strip('_'))
            writer.write(make_edge(
                '/r/SubwordOf',
                start,
                end,
                dataset='/d/morphology',
                license=Licenses.cc_attribution,
                sources=MORPH_SOURCES,
                weight=0.01,
            ))
    writer.close()
Esempio n. 18
0
def json_to_msgpack(input_filename, output_filename):
    """
    Convert a JSON stream (with one object per line) to a msgpack stream.
    """
    sink = MsgpackStreamWriter(output_filename)
    for record in read_json_stream(input_filename):
        sink.write(record)
    sink.close()
Esempio n. 19
0
def test_msgpack_to_json():
    """Round-trip check: write DATA as msgpack, convert to JSON, read back."""
    with TemporaryDirectory(prefix='conceptnet-test') as tmpdir:
        # NOTE(review): the extensions look swapped relative to the contents
        # (msgpack is written to the .jsons path) — harmless for temp files.
        written_path = os.path.join(tmpdir, 'test.jsons')
        result_path = os.path.join(tmpdir, 'test.msgpack')

        stream = MsgpackStreamWriter(written_path)
        for record in DATA:
            stream.write(record)
        stream.close()

        msgpack_to_json(written_path, result_path)
        for original, roundtripped in zip_longest(DATA, read_json_stream(result_path)):
            eq_(original, roundtripped)
Esempio n. 20
0
def handle_file(input_file, output_file):
    """
    Read an XML annotation file and write one edge per (codepoint, word)
    pair to `output_file` as a msgpack stream.
    """
    tree = ET.parse(input_file)
    out = MsgpackStreamWriter(output_file)
    root = tree.getroot()
    # language is at position [1] within the child node [0]
    lang = root[0][1].attrib['type']

    if len(root) < 2:
        # Nothing to annotate: the file has no second child node.
        print("No emoji data in {!r}".format(input_file))
    else:
        for annotation in root[1]:
            for word in strip_words(annotation.text):
                start = standardized_concept_uri('mul', annotation.attrib['cp'])
                end = standardized_concept_uri(lang, word)
                out.write(make_edge(REL, start, end, DATASET, LICENSE, SOURCE))

    out.close()
Esempio n. 21
0
def run_opencyc(input_file, output_file):
    """
    Read an .nq file containing OpenCyc data, outputting a file of
    ConceptNet edges and a file of mappings between the Semantic Web and
    ConceptNet.
    """
    out = MsgpackStreamWriter(output_file)

    # labels: resource URL -> its preferred (RDF_LABEL) text.
    # unlabels: inverse map, label text -> set of URLs carrying that label.
    labels = {}
    unlabels = defaultdict(set)
    # (concept URI, URL) pairs already emitted, so each ExternalURL edge is
    # written at most once across both branches below.
    seen_external_urls = set()

    # Read through the file once, finding all the "preferred labels". We will
    # use these as the surface texts for the nodes.
    # NOTE(review): the file objects opened here and below are never
    # explicitly closed; they are reclaimed only by garbage collection.
    for subj, pred, obj, _graph in parse_nquads(open(input_file, encoding='utf-8')):
        if pred['url'] == RDF_LABEL:
            labels[subj['url']] = obj['text']
            unlabels[obj['text']].add(subj['url'])

    # Read through the file again and extract ConceptNet edges.
    for subj, pred, obj, _graph in parse_nquads(open(input_file, encoding='utf-8')):
        rel_name = resource_name(pred['url'])
        web_subj = subj.get('url')
        web_obj = obj.get('url')
        if (
            rel_name == 'subClassOf'
            and web_obj is not None
            and web_subj in labels
            and web_obj in labels
        ):
            subj_label = labels[web_subj]
            obj_label = labels[web_obj]
            # Filter out internal-looking labels (underscores), XSD datatype
            # labels, blacklisted vocabulary, and labels over four words.
            if '_' in subj_label or '_' in obj_label:
                continue
            if subj_label.startswith('xsd:') or obj_label.startswith('xsd:'):
                continue
            subj_words = set(simple_tokenize(subj_label))
            obj_words = set(simple_tokenize(obj_label))
            if (subj_words & BLACKLIST_WORDS) or (obj_words & BLACKLIST_WORDS):
                continue
            if len(subj_words) > 4 or len(obj_words) > 4:
                continue

            subj_uri = cyc_to_conceptnet_uri(labels, unlabels, web_subj)
            obj_uri = cyc_to_conceptnet_uri(labels, unlabels, web_obj)
            out.write(opencyc_edge('/r/IsA', subj_uri, obj_uri, subj_label, obj_label))
            # Link each concept back to its OpenCyc URL, deduplicated.
            if (subj_uri, web_subj) not in seen_external_urls:
                out.write(external_url_edge(subj_uri, web_subj))
                seen_external_urls.add((subj_uri, web_subj))
            if (obj_uri, web_obj) not in seen_external_urls:
                out.write(external_url_edge(obj_uri, web_obj))
                seen_external_urls.add((obj_uri, web_obj))
        elif (
            rel_name == 'sameAs'
            and web_subj in labels
            and web_obj.startswith('http://umbel.org/')
        ):
            # sameAs links to Umbel become ExternalURL edges as well.
            subj_label = labels[web_subj]
            subj_uri = standardized_concept_uri('en', subj_label)
            if (subj_uri, web_obj) not in seen_external_urls:
                out.write(external_url_edge(subj_uri, web_obj))
                seen_external_urls.add((subj_uri, web_obj))

    out.close()
Esempio n. 22
0
def build_from_dir(dirname, output_file):
    """
    Read a GlobalMind database exported in YAML files, translate
    it into ConceptNet 5 edges, and write those edges to disk using
    a MsgpackStreamWriter.
    """
    out = MsgpackStreamWriter(output_file)
    # NOTE(review): yaml.load_all without an explicit Loader is unsafe on
    # untrusted input; these exports are presumably trusted local files —
    # confirm before changing. The opened files are never closed explicitly.
    userdata = yaml.load_all(open(dirname + '/GMUser.yaml'))
    users = {}

    # Index users by primary key for the author lookups below.
    for userinfo in userdata:
        users[userinfo['pk']] = userinfo

    # Index frames (sentence templates) by primary key.
    frame_data = yaml.load_all(open(dirname + '/GMFrame.yaml'))
    frames = {}
    for frame in frame_data:
        frames[frame['pk']] = frame['fields']

    assertiondata = yaml.load_all(open(dirname + '/GMAssertion.yaml'))
    # Maps assertion primary key -> edge, so translations can refer back.
    assertions = {}
    for assertion in assertiondata:
        obj = assertion['fields']
        frame = frames[obj['frame']]
        frametext = frame['text']
        userinfo = users[obj['author']]
        username = userinfo['fields']['username']

        # As far as I can tell, GlobalMind used the same namespace of
        # usernames as the original Open Mind.
        user_source = "/s/contributor/omcs/%s" % username

        sources = [
            user_source,
            "/s/activity/globalmind/assert"
        ]

        lang = LANG_CODES[obj['lcode']]
        start = normalized_concept_uri(lang, obj['node1'])
        end = normalized_concept_uri(lang, obj['node2'])
        rel = '/r/' + RELATION_MAP.get(frame['relation'], frame['relation'])

        # fix messy english "around in"
        if ' around ' in frametext:
            if obj['node2'].startswith('in '):
                frametext = frametext.replace(' around ', ' in ')
                obj['node2'] = obj['node2'][3:]
            else:
                frametext = frametext.replace(' around ', ' near ')
                rel = '/r/LocatedNear'

        # fix more awkward English. I wonder how bad the other languages are.
        frametext = frametext.replace('hits your head', 'comes to mind')
        frametext = frametext.replace(': [node1], [node2]', ' [node1] and [node2]')

        # Substitute the node texts, wrapped in [[...]], into the frame.
        node1 = u'[[' + obj['node1'] + u']]'
        node2 = u'[[' + obj['node2'] + u']]'
        surfaceText = frametext.replace('//', '').replace('[node1]', node1).replace('[node2]', node2)
        # NOTE(review): this uses the literal '/l/CC/By' while the
        # translation edges below use Licenses.cc_attribution — confirm the
        # two spellings resolve to the same license URI.
        edge = make_edge(rel, start, end,
                         dataset='/d/globalmind',
                         license='/l/CC/By',
                         sources=sources,
                         surfaceText=surfaceText,
                         weight=1)

        # Avoid duplication with the ConceptNet reader, but still save every edge so that we can
        # handle translations.
        if username != 'openmind':
            out.write(edge)

        assertions[assertion['pk']] = edge

    translationdata = yaml.load_all(open(dirname + '/GMTranslation.yaml'))
    for translation in translationdata:
        obj = translation['fields']
        # A translation links two of the assertions built above by their URIs.
        assertion1 = assertions[obj['assertion1']]
        assertion2 = assertions[obj['assertion2']]
        start = assertion1['uri']
        end = assertion2['uri']
        rel = '/r/TranslationOf'
        text1 = assertion1['surfaceText'].replace('[[', '').replace(']]', '')
        text2 = assertion2['surfaceText'].replace('[[', '').replace(']]', '')
        lang1 = LANG_NAMES[get_lang(assertion1)]
        lang2 = LANG_NAMES[get_lang(assertion2)]
        surfaceText = u"[[%s]] in %s means [[%s]] in %s." % (text1, lang1, text2, lang2)
        userinfo = users[obj['author']]
        username = userinfo['fields']['username']

        # Qualify the contributor URI with the country code when known.
        userlocale = userinfo['fields']['ccode'].lower()
        if userlocale:
            user_source = "/s/contributor/globalmind/%s/%s" % (userlocale, username)
        else:
            user_source = "/s/contributor/globalmind/%s" % username

        sources = [
            user_source,
            "/s/activity/globalmind/translate"
        ]
        edge = make_edge(rel, start, end,
                         dataset='/d/globalmind',
                         license=Licenses.cc_attribution,
                         sources=sources,
                         surfaceText=surfaceText,
                         weight=1)
        out.write(edge)
Esempio n. 23
0
def json_to_msgpack(input_filename, output_filename):
    """
    Copy every object from a line-delimited JSON stream into a msgpack
    stream.
    """
    target = MsgpackStreamWriter(output_filename)
    for parsed in read_json_stream(input_filename):
        target.write(parsed)
    target.close()
Esempio n. 24
0
def combine_assertions(csv_filename, output_file, license):
    """
    Take in a tab-separated, sorted "CSV" file, named `csv_filename`, of
    distinct edges which should be grouped together into assertions. Output a
    Msgpack stream of assertions to `output_file`.

    The combined assertions will all have the dataset of the first edge that
    produces them, and the license of the strongest license being combined
    (which should be passed in as `license`).

    This process requires its input to be a sorted CSV so that all edges for
    the same assertion will appear consecutively.
    """
    # The current_... variables accumulate information about the current
    # assertion. When the URI changes, we can output this assertion.
    current_uri = None
    current_data = {}
    current_contributors = set()
    current_surface = None
    current_dataset = None
    current_weight = 0.
    current_sources = []

    out = MsgpackStreamWriter(output_file)
    for line in codecs.open(csv_filename, encoding='utf-8'):
        line = line.rstrip('\n')
        if not line:
            continue
        # Interpret the columns of the file.
        parts = line.split('\t')
        (uri, rel, start, end, context, weight, source_uri, id, this_dataset,
         surface) = parts[:10]
        surface = surface.strip()
        weight = float(weight)

        # If the uri is 'uri', this was a header line, which isn't supposed
        # to be there.
        # NOTE(review): assert is stripped under `python -O`; raising
        # ValueError would be more robust if headers can actually occur.
        assert uri != 'uri'

        # If the uri is the same as current_uri, accumulate more information.
        if uri == current_uri:
            current_weight += weight
            # Only count a source if it brings in no contributor we have
            # already seen, so one contributor's votes don't stack.
            if source_uri not in current_sources:
                contributors = extract_contributors(source_uri)
                if not contributors & current_contributors:
                    current_sources.append(source_uri)
                    current_contributors |= contributors
            # We use the first surface form we see as the surface form for
            # the whole assertion.
            if (current_surface is None) and surface:
                current_surface = surface

        # Otherwise, it's a new assertion.
        else:
            # Flush the previous assertion (if any) before starting fresh.
            if current_uri is not None:
                output_assertion(out,
                                 dataset=current_dataset,
                                 license=license,
                                 sources=current_sources,
                                 surfaceText=current_surface,
                                 weight=weight_scale(current_weight),
                                 uri=current_uri,
                                 **current_data)
            current_uri = uri
            current_data = {'rel': rel, 'start': start, 'end': end}
            current_weight = weight
            current_sources = [source_uri]
            current_contributors = extract_contributors(source_uri)
            current_surface = surface or None
            current_dataset = this_dataset

    # Flush the final accumulated assertion, if any line was read.
    # NOTE(review): this passes rel/start/end from the last line read rather
    # than from current_data; that is equivalent only if every line sharing a
    # URI also shares rel/start/end — confirm against the input format.
    if current_uri is not None:
        output_assertion(out,
                         rel=rel,
                         start=start,
                         end=end,
                         dataset=current_dataset,
                         license=license,
                         sources=current_sources,
                         surfaceText=current_surface,
                         weight=weight_scale(current_weight),
                         uri=current_uri)
Esempio n. 25
0
def process_dbpedia(input_dir, output_file, concept_file):
    """
    Read through multiple DBPedia files and output filtered assertions to
    `output_file`.
    """
    ok_concepts = read_concept_file(concept_file)

    input_path = pathlib.Path(input_dir)
    interlang_path = input_path / 'interlanguage_links_en.tql.bz2'
    mapped_urls = interlanguage_mapping(interlang_path, ok_concepts)

    out = MsgpackStreamWriter(output_file)

    # First pass: IsA edges from the instance-types file, plus ExternalURL
    # and Synonym edges derived from the interlanguage mapping.
    types_path = input_path / 'instance_types_en.tql.bz2'
    quads = parse_nquads(bz2.open(str(types_path), 'rt'))
    for subj, pred, obj, _graph in quads:
        subj_url = subj['url']
        # Skip meta-pages (categories, files, lists, templates) and titles
        # containing double underscores.
        if (
            'Category:' in subj_url or 'File:' in subj_url or
            'List_of' in subj_url or '__' in subj_url or
            'Template:' in subj_url
        ):
            continue
        if subj_url in mapped_urls:
            subj_concept = translate_dbpedia_url(subj_url)
            obj_type = un_camel_case(resource_name(obj['url']))
            if obj_type not in TYPE_BLACKLIST:
                obj_concept = standardized_concept_uri('en', obj_type, 'n')
                if obj_concept not in CONCEPT_BLACKLIST:
                    edge = make_edge(
                        '/r/IsA', subj_concept, obj_concept,
                        dataset='/d/dbpedia/en',
                        license=Licenses.cc_sharealike,
                        sources=[{'contributor': '/s/resource/dbpedia/2015/en'}],
                        weight=0.5,
                        surfaceStart=url_to_label(subj['url']),
                        surfaceEnd=url_to_label(obj['url'])
                    )
                    out.write(edge)
                # Wikidata URLs get only an ExternalURL edge; other mapped
                # pages also get a Synonym edge to the English concept.
                for other_url in mapped_urls[subj_url]:
                    if other_url.startswith('http://wikidata.dbpedia.org/'):
                        urledge = make_edge(
                            '/r/ExternalURL',
                            subj_concept, other_url,
                            dataset='/d/dbpedia/en',
                            license=Licenses.cc_sharealike,
                            sources=[{'contributor': '/s/resource/dbpedia/2015/en'}],
                            weight=1.0
                        )
                        out.write(urledge)
                    else:
                        other_concept = translate_dbpedia_url(other_url)
                        if other_concept:
                            urledge = make_edge(
                                '/r/ExternalURL',
                                other_concept, other_url,
                                dataset='/d/dbpedia/en',
                                license=Licenses.cc_sharealike,
                                sources=[{'contributor': '/s/resource/dbpedia/2015/en'}],
                                weight=1.0
                            )
                            out.write(urledge)
                            edge = make_edge(
                                '/r/Synonym',
                                other_concept, subj_concept,
                                dataset='/d/dbpedia/en',
                                license=Licenses.cc_sharealike,
                                sources=[{'contributor': '/s/resource/dbpedia/2015/en'}],
                                weight=0.5,
                                surfaceStart=url_to_label(other_url),
                                surfaceEnd=url_to_label(subj_url)
                            )
                            out.write(edge)

    # Second pass: relation edges, kept only when the relation name is in
    # RELATIONS and both endpoints survived the interlanguage mapping.
    relations_path = input_path / 'mappingbased_objects_en.tql.bz2'
    quads = parse_nquads(bz2.open(str(relations_path), 'rt'))
    for subj, pred, obj, _graph in quads:
        subj_concept = translate_dbpedia_url(subj['url'])
        obj_concept = translate_dbpedia_url(obj['url'])
        rel_name = resource_name(pred['url'])
        if (
            subj_concept and obj_concept and
            subj['url'] in mapped_urls and obj['url'] in mapped_urls
        ):
            if rel_name in RELATIONS:
                rel = RELATIONS[rel_name]
                edge = make_edge(
                    rel, subj_concept, obj_concept,
                    dataset='/d/dbpedia/en',
                    license=Licenses.cc_sharealike,
                    sources=[{'contributor': '/s/resource/dbpedia/2015/en'}],
                    weight=0.5,
                    surfaceStart=url_to_label(subj['url']),
                    surfaceEnd=url_to_label(obj['url'])
                )
                out.write(edge)

    out.close()
Esempio n. 26
0
def handle_file(filename, output_file):
    out = MsgpackStreamWriter(output_file)

    for line in gzip.open(filename, 'rt'):

        # skip the intro information
        if line.startswith('#'):
            continue

        # parse the data to extract the traditional form, simplified form and the English definition
        traditional, simplified, definitions = re.match(LINE_REGEX,
                                                        line).groups()

        # Make an edge between the traditional and simplified version
        edge = make_edge(
            rel='/r/Synonym',
            start=standardized_concept_uri('zh-Hant', traditional),
            end=standardized_concept_uri('zh-Hans', simplified),
            dataset=DATASET,
            license=LICENSE,
            sources=SOURCE,
        )
        out.write(edge)

        for definition in re.split(DEFINITIONS_REGEX, definitions):

            # Skip pronunciation information
            if 'Taiwan pr.' in definition or 'also pr.' in definition:
                continue

            # Check if it's the definition matches a person syntax, i.e. includes a date range
            person_match = re.match(DATE_RANGE_REGEX, definition)
            if person_match:
                persons = extract_person(person_match)
                for person in persons:
                    edge = make_edge(
                        rel='/r/Synonym',
                        start=standardized_concept_uri('zh-Hant', traditional),
                        end=standardized_concept_uri('en', person),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)

                    edge = make_edge(
                        rel='/r/Synonym',
                        start=standardized_concept_uri('zh-Hans', simplified),
                        end=standardized_concept_uri('en', person),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)
                continue

            # Check if a word is a measure word
            if definition.startswith('CL:'):
                related_words = extract_measure_words(definition)
                for word in related_words:
                    edge = make_edge(
                        rel='/r/RelatedTo',
                        start=standardized_concept_uri('zh-Hant', traditional),
                        end=standardized_concept_uri('zh', word),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)

                    edge = make_edge(
                        rel='/r/RelatedTo',
                        start=standardized_concept_uri('zh-Hans', simplified),
                        end=standardized_concept_uri('zh', word),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)
                continue

            # Remove clarifying information in parenthesis
            definition = PAREN_REGEX.sub('', definition)

            # Handle variants/word forms and abbreviations
            if re.match(VARIANT_REGEX, definition) or re.match(
                    ABBR_REGEX, definition):
                variants = extract_han_characters(definition)
                for variant in variants:
                    edge = make_edge(
                        rel='/r/Synonym',
                        start=standardized_concept_uri('zh-Hant', traditional),
                        end=standardized_concept_uri('zh', variant),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)

                    edge = make_edge(
                        rel='/r/Synonym',
                        start=standardized_concept_uri('zh-Hans', simplified),
                        end=standardized_concept_uri('zh', variant),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)
                continue

            if re.match(SEE_ALSO_REGEX, definition):
                references = extract_han_characters(definition)
                for reference in references:
                    edge = make_edge(
                        rel='/r/RelatedTo',
                        start=standardized_concept_uri('zh-Hant', traditional),
                        end=standardized_concept_uri('zh', reference),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)

                    edge = make_edge(
                        rel='/r/RelatedTo',
                        start=standardized_concept_uri('zh-Hans', simplified),
                        end=standardized_concept_uri('zh', reference),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)

            # Remove 'lit.', 'fig.'
            definition = LIT_FIG_REGEX.sub('', definition)

            # Expand sth and sb
            definition = SB_REGEX.sub('someone', definition)
            definition = STH_REGEX.sub('something', definition)

            # Additional cleanups
            definition = remove_reference_syntax(definition)
            definition = remove_additional_info(definition)

            # Skip long definitions and make an edge out of remaining information
            if len(definition.split()) < 6:
                edge = make_edge(
                    rel='/r/Synonym',
                    start=standardized_concept_uri('zh-Hant', traditional),
                    end=standardized_concept_uri('en', definition),
                    dataset=DATASET,
                    license=LICENSE,
                    sources=SOURCE,
                )
                out.write(edge)

                edge = make_edge(
                    rel='/r/Synonym',
                    start=standardized_concept_uri('zh-Hans', simplified),
                    end=standardized_concept_uri('en', definition),
                    dataset=DATASET,
                    license=LICENSE,
                    sources=SOURCE,
                )
                out.write(edge)
Esempio n. 27
0
 def transform_file(self, input_filename, output_file):
     out = MsgpackStreamWriter(output_file)
     for obj in read_json_stream(input_filename):
         for new_obj in self.handle_assertion(obj):
             out.write(new_obj)
Esempio n. 28
0
def handle_file(filename, output_file):
    """
    JMdict is a Japanese translation dictionary, targeting multiple languages,
    released under a Creative Commons Attribution-ShareAlike license. That's
    awesome.

    It's released as a kind of annoying XML structure, using fancy XML features
    like entities, so in order to read it we need a full-blown XML parser. Python's
    built-in XML parsers are horrifying, so here we use the 'xmltodict' module, which
    is also horrifying but gets the job done.

    The majorly weird thing about xmltodict that we have to work around is that
    it gives results of different *types* when you get 0, 1, or many child nodes.
    This is what get_list is for.
    """
    # Read the XML file as UTF-8, and parse it into a dictionary.
    file = codecs.open(filename, encoding='utf-8')
    out = MsgpackStreamWriter(output_file)
    data = file.read()
    file.close()
    xml = xmltodict.parse(data)

    # The dictionary contains a list of <entry> tags.
    root_node = xml['JMdict']
    for entry in get_list(root_node, 'entry'):
        # From JMdict's documentation: "The kanji element, or in its absence,
        # the reading element, is the defining component of each entry."
        #
        # Quick summary of what's going on here: most Japanese words can be
        # written using kanji or kana.
        #
        # Kana are phonetic characters. Every word can be written in kana, in
        # one of two alphabets (hiragana or katakana). Words that are homonyms
        # have the same kana, unless they're written in different alphabets.
        #
        # Kanji are Chinese-based characters that are related to the meaning of
        # the word. They're compact and good at disambiguating homonyms, so
        # kanji are usually used as the canonical representation of a word.
        # However, some words have no kanji.
        #
        # The kana version of a word written in kanji is called its 'reading'.
        # Words that are pronounced differently in different contexts have
        # multiple readings.
        #
        # Okay, done with the intro to Japanese orthography. In JMdict, if
        # a word can be written in kanji, it has a <k_ele> element, containing
        # a <keb> element that contains the text. Every word also has an
        # <r_ele> element, containing one or more <reb> elements that are phonetic
        # readings of the word.
        #
        # We get the "defining text" of a word by taking its <keb> if it exists,
        # or all of its <reb>s if not. There's no way to tell which <reb> is the
        # most "defining" in the case where there's no <keb>.
        headwords = [word['keb'] for word in get_list(entry, 'k_ele')]
        if not headwords:
            headwords = [word['reb'] for word in get_list(entry, 'r_ele')]

        # An entry might have different word senses that are translated
        # differently to other languages. Ideally, we'd remember that they're
        # different senses. However, we have no way to refer to the different
        # senses. So for now, we disregard word senses. One day we might have
        # a better overall plan for word senses in ConceptNet.
        senses = get_list(entry, 'sense')
        for sense_num, sense in enumerate(senses):
            # Glosses are translations of the word to different languages.
            # If the word is a loan-word, the foreign word it was derived from
            # will be marked with the <lsource> tag instead of <gloss>.
            #
            # Get all the glosses, including the lsource if it's there.
            glosses = get_list(sense, 'gloss') + get_list(sense, 'lsource')
            contexts = [
                fix_context(context)
                for context in get_list(sense, 'field')
            ]
            pos = '_'
            for pos_tag in get_list(sense, 'pos'):
                if pos_tag[:10] in NOUN_TYPES:
                    pos = 'n'
                elif pos_tag[:10] in VERB_TYPES:
                    pos = 'v'
                elif pos_tag[:10] in ADJ_TYPES:
                    pos = 'a'
                elif pos_tag[:10] in ADV_TYPES:
                    pos = 'r'

            for gloss in glosses:
                if '#text' in gloss:
                    # A gloss node might be marked with a 'lang' attribute. If so,
                    # xmltodict represents it as a dictionary with '#text' and
                    # '@xml:lang' elements.
                    text = parse_gloss(gloss['#text'])
                    lang = convert_lang_code(gloss['@xml:lang'])
                elif isinstance(gloss, STRING_TYPE):
                    # If there's no 'lang' attribute, the gloss is in English,
                    # and xmltodict gives it to us as a plain Unicode string.
                    lang = 'en'
                    text = parse_gloss(gloss)

                # If we parsed the node at all and the text looks good, then we can
                # add edges to ConceptNet.
                #
                # We don't want to deal with texts with periods (these might be
                # dictionary-style abbreviations, which are sort of unhelpful when
                # we can't expand them), and we also don't want to deal with texts
                # that are more than five words long.
                if (
                    text is not None and '.' not in text and
                    text.count(' ') <= 4 and valid_concept_name(text)
                ):
                    for head in headwords:
                        if len(senses) >= 2:
                            sensekey = '%d' % (sense_num + 1)
                            ja_concept = standardized_concept_uri('ja', head, pos, 'jmdict', sensekey)
                        else:
                            ja_concept = standardized_concept_uri('ja', head, pos)
                        other_concept = standardized_concept_uri(lang, text)
                        output_edge(out, '/r/Synonym', ja_concept, other_concept)

                        for context in contexts:
                            context_node = standardized_concept_uri('en', context)
                            output_edge(out, '/r/HasContext', ja_concept, context_node)
Esempio n. 29
0
def run_wordnet(input_file, output_file):
    out = MsgpackStreamWriter(output_file)

    synset_senses = defaultdict(list)
    sense_synsets = {}
    synset_labels = defaultdict(list)
    synset_canonical_labels = {}
    synset_categories = {}
    synset_domains = {}
    synset_glosses = {}
    synset_disambig = {}
    synset_uris = {}

    # First pass: find data about synsets
    quads = parse_nquads(open(input_file, encoding='utf-8'))
    for subj_dict, rel_dict, obj_dict, _graph in quads:
        if 'url' not in subj_dict or 'url' not in rel_dict:
            continue
        subj = subj_dict['url']
        rel = rel_dict['url']
        obj = obj_dict.get('url')
        objtext = obj_dict.get('text')

        relname = resource_name(rel)
        if relname == 'label':
            if obj_dict['lang'] == 'en':
                synset_labels[subj].append(objtext)
        elif relname == 'sameAs':
            if obj.startswith(WN20_URL):
                # If we have a link to RDF WordNet 2.0, the URL (URI? IRI?)
                # will contain a standardized label for this concept, which
                # we should use when we want to use this synset as the name of
                # a disambiguation category. RDF WordNet 3.1 assigns synsets
                # a number of labels in no particular order, making it hard to
                # determine from 3.1 alone what to name a category.
                objname = resource_name(obj)
                parts = objname.split('-')[1:-2]

                # Handle missing apostrophes
                label = '-'.join(parts).replace('_s_', "'s_").replace(
                    '_s-',
                    "'s_").replace("s__",
                                   "s'_").replace("s_-",
                                                  "s'-").replace('_', ' ')
                synset_canonical_labels[subj] = label

        elif relname == 'domain_category':
            synset_categories[subj] = obj
        elif relname == 'lexical_domain':
            target = resource_name(obj)
            if '.' in target:
                domain = target.split('.')[1]
                synset_domains[subj] = domain
        elif relname == 'gloss':
            synset_glosses[subj] = objtext
        elif relname == 'reference':
            lemma = resource_name(subj)
            synset = obj
            synset_senses[synset].append(lemma)
            sense_synsets[lemma] = synset

    used_labels = set(synset_canonical_labels.values())
    for synset, values in synset_labels.items():
        values.sort(
            key=lambda label: (label in used_labels, ) + label_sort_key(label))
        if (synset not in synset_canonical_labels
                or synset_canonical_labels[synset][0].isupper()
                and synset_domains.get(synset) == 'person'):
            label = values[0]
            synset_canonical_labels[synset] = label
            used_labels.add(label)

    for synset, labels in synset_labels.items():
        if synset in synset_categories:
            category_name = synset_canonical_labels[synset_categories[synset]]
        else:
            category_name = synset_domains.get(synset, None)
        synset_no_fragment = synset.split('#')[0]
        pos = synset_no_fragment[-1].lower()
        assert pos in 'nvarsp', synset
        if pos == 's':
            pos = 'a'
        elif pos == 'p':
            pos = '-'
        if category_name in ('pert', 'all', 'tops'):
            category_name = None
        synset_disambig[synset] = (pos, category_name)

        canon = synset_canonical_labels[synset]
        canon_uri = standardized_concept_uri('en', canon, pos, 'wn',
                                             category_name)
        synset_uris[synset] = canon_uri

        for label in labels:
            if label != canon:
                other_uri = standardized_concept_uri('en', label, pos, 'wn',
                                                     category_name)
                rel_uri = '/r/Synonym'
                surface = '[[{0}]] is a synonym of [[{1}]]'.format(
                    label, canon)
                edge = make_edge(rel_uri,
                                 other_uri,
                                 canon_uri,
                                 dataset=DATASET,
                                 surfaceText=surface,
                                 license=Licenses.cc_attribution,
                                 sources=[SOURCE],
                                 weight=2.0)
                out.write(edge)

    quads = parse_nquads(open(input_file, encoding='utf-8'))
    for subj_dict, rel_dict, obj_dict, _graph in quads:
        if 'url' not in subj_dict or 'url' not in rel_dict:
            continue
        subj = subj_dict['url']
        rel = rel_dict['url']
        obj = obj_dict.get('url')
        relname = resource_name(rel)
        if relname in REL_MAPPING:
            pos, sense = synset_disambig.get(subj, (None, None))
            if relname == 'hypernym' and pos == 'v':
                relname = 'hypernym-v'
            rel, frame = REL_MAPPING[relname]
            reversed_frame = False
            if rel.startswith('~'):
                rel = rel[1:]
                reversed_frame = True
            rel_uri = '/r/' + rel
            if obj is not None:
                obj_uri = synset_uris.get(obj)
                if obj not in synset_canonical_labels:
                    continue
                obj_label = synset_canonical_labels[obj]
            else:
                text = obj_dict['text']
                # Some WordNets use strings with "!" in them to indicate
                # out-of-band information, such as a missing translation
                if (not text) or '!' in text:
                    continue
                lang = obj_dict['lang']
                obj_uri = standardized_concept_uri(lang, text, pos, 'wn',
                                                   sense)
                obj_label = text

            if subj not in synset_uris or subj not in synset_canonical_labels:
                continue
            subj_uri = synset_uris[subj]
            subj_label = synset_canonical_labels[subj]
            license = Licenses.cc_attribution
            langcode = subj_uri.split('/')[2]
            if langcode in SHAREALIKE_LANGUAGES:
                license = Licenses.cc_sharealike

            if reversed_frame:
                subj_uri, obj_uri = obj_uri, subj_uri
                subj_label, obj_label = obj_label, subj_label

            surface = frame.format('[[%s]]' % subj_label, '[[%s]]' % obj_label)

            edge = make_edge(rel_uri,
                             subj_uri,
                             obj_uri,
                             dataset=DATASET,
                             surfaceText=surface,
                             license=license,
                             sources=[SOURCE],
                             weight=2.0)
            out.write(edge)

    for wn_url in sorted(synset_uris):
        cn_uri = synset_uris[wn_url]
        edge = make_edge('/r/ExternalURL',
                         cn_uri,
                         wn_url,
                         dataset=DATASET,
                         license=Licenses.cc_sharealike,
                         sources=[SOURCE],
                         weight=1.0)
        out.write(edge)

    out.close()
Esempio n. 30
0
def read_wiktionary(input_file, db_file, output_file):
    """
    Convert a stream of parsed Wiktionary data into ConceptNet edges.

    A `db_file` containing all known words in all languages must have already
    been prepared from the same data.
    """
    db = sqlite3.connect(db_file)
    out = MsgpackStreamWriter(output_file)
    for heading, items in segmented_stream(input_file):
        language = heading['language']
        title = heading['title']
        dataset = '/d/wiktionary/{}'.format(language)
        url_title = heading['title'].replace(' ', '_')
        web_url = 'http://{}.wiktionary.org/wiki/{}'.format(
            language, url_title)
        web_source = '/s/resource/wiktionary/{}'.format(language)

        source = {'contributor': web_source, 'process': PARSER_RULE}

        # Scan through the 'from' items, such as the start nodes of
        # translations, looking for distinct etymologies. If we get more than
        # one etymology for a language, we need to distinguish them as
        # different senses in that language.
        all_etyms = {
            (item['from']['language'], etym_label(language, item['from']))
            for item in items
            if 'language' in item['from'] and item['from']['text'] == title
            and etym_label(language, item['from']) is not None
        }
        word_languages = {wlang for (wlang, _) in all_etyms}
        for wlang in sorted(word_languages):
            if valid_language(wlang):
                cpage = standardized_concept_uri(wlang, title)
                ld_edge = make_edge('/r/ExternalURL',
                                    cpage,
                                    web_url,
                                    dataset=dataset,
                                    weight=0.25,
                                    sources=[source],
                                    license=Licenses.cc_sharealike)
                out.write(ld_edge)
        etym_to_translation_sense = {}
        language_etym_counts = Counter(lang for (lang, etym) in all_etyms)
        polysemous_languages = {
            lang
            for lang in language_etym_counts if language_etym_counts[lang] > 1
        }

        for item in items:
            tfrom = item['from']
            tto = item['to']
            assumed_languages = [language]
            lang1 = tfrom.get('language')
            lang2 = tto.get('language')
            if lang1 and (lang1
                          not in assumed_languages) and valid_language(lang1):
                assumed_languages.append(lang1)
            if lang2 and (lang2
                          not in assumed_languages) and valid_language(lang2):
                assumed_languages.append(lang2)

            cfrom = transform_term(language,
                                   tfrom,
                                   assumed_languages,
                                   db,
                                   use_etyms=(lang1 in polysemous_languages))
            cpage = cfrom
            cto = transform_term(language,
                                 tto,
                                 assumed_languages,
                                 db,
                                 use_etyms=(lang2 in polysemous_languages))

            if cfrom is None or cto is None:
                continue
            if uri_prefix(cfrom, 3) == uri_prefix(cto, 3):
                continue

            rel, switch = transform_relation(item['rel'])
            if rel is None:
                continue
            if switch:
                cfrom, cto = cto, cfrom

            # When translations are separated by sense, use only the first
            # sense we see for each etymology. That will have the most
            # representative translations.
            if item['rel'] == 'translation':
                etym_key = (tfrom['language'], etym_label(language, tfrom))
                sense = tfrom.get('sense', '')
                if etym_key in etym_to_translation_sense:
                    if etym_to_translation_sense[etym_key] != sense:
                        continue
                else:
                    etym_to_translation_sense[etym_key] = sense

            weight = 1.
            if rel == '/r/EtymologicallyRelatedTo':
                weight = 0.25
            edge = make_edge(rel,
                             cfrom,
                             cto,
                             dataset=dataset,
                             weight=weight,
                             sources=[source],
                             surfaceStart=tfrom['text'],
                             surfaceEnd=tto['text'],
                             license=Licenses.cc_sharealike)
            out.write(edge)

    out.close()
Esempio n. 31
0
def json_to_msgpack(input_filename, output_filename):
	out_stream = MsgpackStreamWriter(output_filename)
	for obj in read_json_stream(input_filename):
		out_stream.write(obj)
	out_stream.close()
Esempio n. 32
0
def run_wordnet(input_file, output_file):
    out = MsgpackStreamWriter(output_file)

    synset_senses = defaultdict(list)
    sense_synsets = {}
    synset_labels = defaultdict(list)
    synset_canonical_labels = {}
    synset_categories = {}
    synset_domains = {}
    synset_glosses = {}
    synset_disambig = {}
    synset_uris = {}

    # First pass: find data about synsets
    quads = parse_nquads(open(input_file, encoding="utf-8"))
    for subj_dict, rel_dict, obj_dict, _graph in quads:
        if "url" not in subj_dict or "url" not in rel_dict:
            continue
        subj = subj_dict["url"]
        rel = rel_dict["url"]
        obj = obj_dict.get("url")
        objtext = obj_dict.get("text")

        relname = resource_name(rel)
        if relname == "label":
            if obj_dict["lang"] == "en":
                synset_labels[subj].append(objtext)
        elif relname == "sameAs":
            if obj.startswith(WN20_URL):
                # If we have a link to RDF WordNet 2.0, the URL (URI? IRI?)
                # will contain a standardized label for this concept, which
                # we should use when we want to use this synset as the name of
                # a disambiguation category. RDF WordNet 3.1 assigns synsets
                # a number of labels in no particular order, making it hard to
                # determine from 3.1 alone what to name a category.
                objname = resource_name(obj)
                parts = objname.split("-")[1:-2]

                # Handle missing apostrophes
                label = (
                    "-".join(parts)
                    .replace("_s_", "'s_")
                    .replace("_s-", "'s_")
                    .replace("s__", "s'_")
                    .replace("s_-", "s'-")
                    .replace("_", " ")
                )
                synset_canonical_labels[subj] = label

        elif relname == "domain_category":
            synset_categories[subj] = obj
        elif relname == "lexical_domain":
            target = resource_name(obj)
            if "." in target:
                domain = target.split(".")[1]
                synset_domains[subj] = domain
        elif relname == "gloss":
            synset_glosses[subj] = objtext
        elif relname == "reference":
            lemma = resource_name(subj)
            synset = obj
            synset_senses[synset].append(lemma)
            sense_synsets[lemma] = synset

    used_labels = set(synset_canonical_labels.values())
    for synset, values in synset_labels.items():
        values.sort(key=lambda label: (label in used_labels,) + label_sort_key(label))
        if (
            synset not in synset_canonical_labels
            or synset_canonical_labels[synset][0].isupper()
            and synset_domains.get(synset) == "person"
        ):
            label = values[0]
            synset_canonical_labels[synset] = label
            used_labels.add(label)

    for synset, labels in synset_labels.items():
        if synset in synset_categories:
            category_name = synset_canonical_labels[synset_categories[synset]]
        else:
            category_name = synset_domains.get(synset, None)
        synset_no_fragment = synset.split("#")[0]
        pos = synset_no_fragment[-1].lower()
        assert pos in "nvarsp", synset
        if pos == "s":
            pos = "a"
        elif pos == "p":
            pos = "-"
        if category_name in ("pert", "all", "tops"):
            category_name = None
        synset_disambig[synset] = (pos, category_name)

        canon = synset_canonical_labels[synset]
        canon_uri = standardized_concept_uri("en", canon, pos, "wn", category_name)
        synset_uris[synset] = canon_uri

        for label in labels:
            if label != canon:
                other_uri = standardized_concept_uri(
                    "en", label, pos, "wn", category_name
                )
                rel_uri = "/r/Synonym"
                surface = "[[{0}]] is a synonym of [[{1}]]".format(label, canon)
                edge = make_edge(
                    rel_uri,
                    other_uri,
                    canon_uri,
                    dataset=DATASET,
                    surfaceText=surface,
                    license=Licenses.cc_attribution,
                    sources=[SOURCE],
                    weight=2.0,
                )
                out.write(edge)

    quads = parse_nquads(open(input_file, encoding="utf-8"))
    for subj_dict, rel_dict, obj_dict, _graph in quads:
        if "url" not in subj_dict or "url" not in rel_dict:
            continue
        subj = subj_dict["url"]
        rel = rel_dict["url"]
        obj = obj_dict.get("url")
        relname = resource_name(rel)
        if relname in REL_MAPPING:
            pos, sense = synset_disambig.get(subj, (None, None))
            if relname == "hypernym" and pos == "v":
                relname = "hypernym-v"
            rel, frame = REL_MAPPING[relname]
            reversed_frame = False
            if rel.startswith("~"):
                rel = rel[1:]
                reversed_frame = True
            rel_uri = "/r/" + rel
            if obj is not None:
                obj_uri = synset_uris.get(obj)
                if obj not in synset_canonical_labels:
                    continue
                obj_label = synset_canonical_labels[obj]
            else:
                text = obj_dict["text"]
                # Some WordNets use strings with "!" in them to indicate
                # out-of-band information, such as a missing translation
                if (not text) or "!" in text:
                    continue
                lang = obj_dict["lang"]
                obj_uri = standardized_concept_uri(lang, text, pos, "wn", sense)
                obj_label = text

            if subj not in synset_uris or subj not in synset_canonical_labels:
                continue
            subj_uri = synset_uris[subj]
            subj_label = synset_canonical_labels[subj]
            license = Licenses.cc_attribution
            langcode = subj_uri.split("/")[2]
            if langcode in SHAREALIKE_LANGUAGES:
                license = Licenses.cc_sharealike

            if reversed_frame:
                subj_uri, obj_uri = obj_uri, subj_uri
                subj_label, obj_label = obj_label, subj_label

            surface = frame.format("[[%s]]" % subj_label, "[[%s]]" % obj_label)

            edge = make_edge(
                rel_uri,
                subj_uri,
                obj_uri,
                dataset=DATASET,
                surfaceText=surface,
                license=license,
                sources=[SOURCE],
                weight=2.0,
            )
            out.write(edge)

    for wn_url in sorted(synset_uris):
        cn_uri = synset_uris[wn_url]
        edge = make_edge(
            "/r/ExternalURL",
            cn_uri,
            wn_url,
            dataset=DATASET,
            license=Licenses.cc_sharealike,
            sources=[SOURCE],
            weight=1.0,
        )
        out.write(edge)

    out.close()
Esempio n. 33
0
def read_wiktionary(input_file, db_file, output_file):
    """
    Convert a stream of parsed Wiktionary data into ConceptNet edges.

    A `db_file` containing all known words in all languages must have already
    been prepared from the same data.
    """
    # NOTE(review): `out` is closed at the end, but the sqlite connection is
    # never explicitly closed — presumably fine for a one-shot batch job;
    # confirm.
    db = sqlite3.connect(db_file)
    out = MsgpackStreamWriter(output_file)
    for heading, items in segmented_stream(input_file):
        language = heading['language']
        title = heading['title']
        dataset = '/d/wiktionary/{}'.format(language)
        url_title = heading['title'].replace(' ', '_')
        web_url = 'http://{}.wiktionary.org/wiki/{}'.format(language, url_title)
        web_source = '/s/resource/wiktionary/{}'.format(language)

        # Provenance attached to every edge derived from this page.
        source = {
            'contributor': web_source,
            'process': PARSER_RULE
        }

        # Scan through the 'from' items, such as the start nodes of
        # translations, looking for distinct etymologies. If we get more than
        # one etymology for a language, we need to distinguish them as
        # different senses in that language.
        all_etyms = {
            (item['from']['language'], etym_label(language, item['from']))
            for item in items
            if 'language' in item['from'] and item['from']['text'] == title
            and etym_label(language, item['from']) is not None
        }
        word_languages = {wlang for (wlang, _) in all_etyms}
        # Link the word, in each language it appears in, to its Wiktionary
        # page with a low-weight ExternalURL edge.
        for wlang in sorted(word_languages):
            cpage = standardized_concept_uri(wlang, title)
            ld_edge = make_edge(
                '/r/ExternalURL', cpage, web_url,
                dataset=dataset, weight=0.25, sources=[source],
                license=Licenses.cc_sharealike
            )
            out.write(ld_edge)
        etym_to_translation_sense = {}
        language_etym_counts = Counter(lang for (lang, etym) in all_etyms)
        # Languages in which this title has more than one etymology; their
        # terms must carry etymology-based disambiguation below.
        polysemous_languages = {
            lang for lang in language_etym_counts
            if language_etym_counts[lang] > 1
        }

        for item in items:
            tfrom = item['from']
            tto = item['to']
            # Languages we may assume for under-specified terms: the page's
            # own language, plus any valid languages stated on the item.
            assumed_languages = [language]
            lang1 = tfrom.get('language')
            lang2 = tto.get('language')
            if lang1 and (lang1 not in assumed_languages) and valid_language(lang1):
                assumed_languages.append(lang1)
            if lang2 and (lang2 not in assumed_languages) and valid_language(lang2):
                assumed_languages.append(lang2)

            cfrom = transform_term(
                language, tfrom, assumed_languages, db,
                use_etyms=(lang1 in polysemous_languages)
            )
            cpage = cfrom
            cto = transform_term(
                language, tto, assumed_languages, db,
                use_etyms=(lang2 in polysemous_languages)
            )

            # Skip items where either term could not be resolved, or where
            # both sides reduce to the same concept prefix (a self-loop).
            if cfrom is None or cto is None:
                continue
            if uri_prefix(cfrom, 3) == uri_prefix(cto, 3):
                continue

            rel, switch = transform_relation(item['rel'])
            if rel is None:
                continue
            if switch:
                cfrom, cto = cto, cfrom

            # When translations are separated by sense, use only the first
            # sense we see for each etymology. That will have the most
            # representative translations.
            if item['rel'] == 'translation':
                etym_key = (tfrom['language'], etym_label(language, tfrom))
                sense = tfrom.get('sense', '')
                if etym_key in etym_to_translation_sense:
                    if etym_to_translation_sense[etym_key] != sense:
                        continue
                else:
                    etym_to_translation_sense[etym_key] = sense

            # Etymology links are weaker evidence than the other relations.
            weight = 1.
            if rel == '/r/EtymologicallyRelatedTo':
                weight = 0.25
            edge = make_edge(rel, cfrom, cto, dataset=dataset, weight=weight,
                             sources=[source],
                             surfaceStart=tfrom['text'],
                             surfaceEnd=tto['text'],
                             license=Licenses.cc_sharealike)
            out.write(edge)

    out.close()
Esempio n. 34
0
def run_wordnet(input_file, output_file):
    """
    Convert an RDF WordNet file (N-quads format) into ConceptNet edges,
    written to `output_file` as a msgpack stream.

    The input is parsed twice: a first pass collects per-synset metadata
    (labels, categories, domains, glosses, senses), and a second pass uses
    that metadata to emit the relation edges.
    """
    out = MsgpackStreamWriter(output_file)

    synset_senses = defaultdict(list)
    sense_synsets = {}
    synset_labels = defaultdict(list)
    synset_canonical_labels = {}
    synset_categories = {}
    synset_domains = {}
    synset_glosses = {}
    synset_disambig = {}
    synset_uris = {}

    # First pass: find data about synsets
    quads = parse_nquads(open(input_file, encoding='utf-8'))
    for subj_dict, rel_dict, obj_dict, _graph in quads:
        if 'url' not in subj_dict or 'url' not in rel_dict:
            continue
        subj = subj_dict['url']
        rel = rel_dict['url']
        obj = obj_dict.get('url')
        objtext = obj_dict.get('text')

        relname = resource_name(rel)
        if relname == 'label':
            if obj_dict['lang'] == 'en':
                synset_labels[subj].append(objtext)
        elif relname == 'sameAs':
            if obj.startswith(WN20_URL):
                # If we have a link to RDF WordNet 2.0, the URL (URI? IRI?)
                # will contain a standardized label for this concept, which
                # we should use when we want to use this synset as the name of
                # a disambiguation category. RDF WordNet 3.1 assigns synsets
                # a number of labels in no particular order, making it hard to
                # determine from 3.1 alone what to name a category.
                objname = resource_name(obj)
                parts = objname.split('-')[1:-2]

                # Handle missing apostrophes
                label = '-'.join(parts).replace('_s_', "'s_").replace('_s-', "'s_").replace("s__", "s'_").replace("s_-", "s'-").replace('_', ' ')
                synset_canonical_labels[subj] = label

        elif relname == 'domain_category':
            synset_categories[subj] = obj
        elif relname == 'lexical_domain':
            target = resource_name(obj)
            if '.' in target:
                domain = target.split('.')[1]
                synset_domains[subj] = domain
        elif relname == 'gloss':
            synset_glosses[subj] = objtext
        elif relname == 'reference':
            lemma = resource_name(subj)
            synset = obj
            synset_senses[synset].append(lemma)
            sense_synsets[lemma] = synset

    # Pick a canonical label for synsets that still lack one (or whose
    # canonical label is capitalized in the 'person' domain, i.e. looks like
    # a proper name), preferring labels not already used by another synset.
    used_labels = set(synset_canonical_labels.values())
    for synset, values in synset_labels.items():
        values.sort(key=lambda label: (label in used_labels,) + label_sort_key(label))
        if (
            synset not in synset_canonical_labels or
            synset_canonical_labels[synset][0].isupper() and synset_domains.get(synset) == 'person'
        ):
            label = values[0]
            synset_canonical_labels[synset] = label
            used_labels.add(label)

    # Derive each synset's part of speech and disambiguation category, assign
    # it a ConceptNet URI, and link its non-canonical labels as synonyms.
    for synset, labels in synset_labels.items():
        if synset in synset_categories:
            category_name = synset_canonical_labels[synset_categories[synset]]
        else:
            category_name = synset_domains.get(synset, None)
        # The part of speech is the last letter of the synset ID (before any
        # '#' fragment): n/v/a/r, plus 's' (satellite adjective, folded into
        # 'a') and 'p', which maps to '-'.
        synset_no_fragment = synset.split('#')[0]
        pos = synset_no_fragment[-1].lower()
        assert pos in 'nvarsp', synset
        if pos == 's':
            pos = 'a'
        elif pos == 'p':
            pos = '-'
        # These category names are too generic to disambiguate anything.
        if category_name in ('pert', 'all', 'tops'):
            category_name = None
        synset_disambig[synset] = (pos, category_name)

        canon = synset_canonical_labels[synset]
        canon_uri = standardized_concept_uri('en', canon, pos, 'wn', category_name)
        synset_uris[synset] = canon_uri

        for label in labels:
            if label != canon:
                other_uri = standardized_concept_uri('en', label, pos, 'wn', category_name)
                rel_uri = '/r/Synonym'
                surface = '[[{0}]] is a synonym of [[{1}]]'.format(label, canon)
                edge = make_edge(
                    rel_uri, other_uri, canon_uri, dataset=DATASET, surfaceText=surface,
                    license=Licenses.cc_attribution, sources=[SOURCE], weight=2.0
                )
                out.write(edge)

    # Second pass: emit edges for the WordNet relations we know how to map.
    quads = parse_nquads(open(input_file, encoding='utf-8'))
    for subj_dict, rel_dict, obj_dict, _graph in quads:
        if 'url' not in subj_dict or 'url' not in rel_dict:
            continue
        subj = subj_dict['url']
        rel = rel_dict['url']
        obj = obj_dict.get('url')
        relname = resource_name(rel)
        if relname in REL_MAPPING:
            rel, frame = REL_MAPPING[relname]
            reversed_frame = False
            # A leading '~' in the mapping marks a relation whose subject and
            # object should be swapped when the edge is written.
            if rel.startswith('~'):
                rel = rel[1:]
                reversed_frame = True
            rel_uri = '/r/' + rel
            if obj is not None:
                obj_uri = synset_uris.get(obj)
                if obj not in synset_canonical_labels:
                    continue
                obj_label = synset_canonical_labels[obj]
            else:
                text = obj_dict['text']
                # Some WordNets use strings with "!" in them to indicate
                # out-of-band information, such as a missing translation
                if (not text) or '!' in text:
                    continue
                lang = obj_dict['lang']
                pos, sense = synset_disambig.get(subj, (None, None))
                obj_uri = standardized_concept_uri(lang, text, pos, 'wn', sense)
                obj_label = text

            if subj not in synset_uris or subj not in synset_canonical_labels:
                continue
            subj_uri = synset_uris[subj]
            subj_label = synset_canonical_labels[subj]
            # The language code is the third segment of the concept URI;
            # some languages' WordNets require a ShareAlike license.
            license = Licenses.cc_attribution
            langcode = subj_uri.split('/')[2]
            if langcode in SHAREALIKE_LANGUAGES:
                license = Licenses.cc_sharealike

            if reversed_frame:
                subj_uri, obj_uri = obj_uri, subj_uri
                subj_label, obj_label = obj_label, subj_label

            surface = frame.format('[[%s]]' % subj_label, '[[%s]]' % obj_label)

            edge = make_edge(
                rel_uri, subj_uri, obj_uri, dataset=DATASET, surfaceText=surface,
                license=license, sources=[SOURCE], weight=2.0
            )
            out.write(edge)

    # Finally, link each synset's ConceptNet URI back to its WordNet URL.
    for wn_url in sorted(synset_uris):
        cn_uri = synset_uris[wn_url]
        edge = make_edge(
            '/r/ExternalURL', cn_uri, wn_url, dataset=DATASET,
            license=Licenses.cc_sharealike, sources=[SOURCE], weight=1.0
        )
        out.write(edge)

    out.close()
Esempio n. 35
0
def _write_zh_pair_edges(out, rel, traditional, simplified, end_uri):
    """
    Write two parallel edges to `out`: one from the traditional-Chinese form
    and one from the simplified-Chinese form, both pointing at `end_uri`
    with relation `rel`. Factors out the duplicated edge-pair construction
    used for every definition category below.
    """
    for lang, text in (('zh-Hant', traditional), ('zh-Hans', simplified)):
        edge = make_edge(
            rel=rel,
            start=standardized_concept_uri(lang, text),
            end=end_uri,
            dataset=DATASET,
            license=LICENSE,
            sources=SOURCE,
        )
        out.write(edge)


def handle_file(filename, output_file):
    """
    Read a gzipped CC-CEDICT-style dictionary from `filename` and convert its
    entries into ConceptNet edges, written to `output_file` as a msgpack
    stream.

    Each data line yields a Synonym edge between the traditional and
    simplified forms, plus edges derived from the English definitions:
    person names (date-range entries), measure words ("CL:" entries),
    variants/abbreviations, "see also" cross-references, and short cleaned-up
    definitions treated as English synonyms.
    """
    out = MsgpackStreamWriter(output_file)

    for line in gzip.open(filename, 'rt'):

        # skip the intro information
        if line.startswith('#'):
            continue

        # parse the data to extract the traditional form, simplified form and
        # the English definition. Skip lines that don't have the expected
        # shape (e.g. blank lines) instead of crashing on a failed match.
        match = re.match(LINE_REGEX, line)
        if match is None:
            continue
        traditional, simplified, definitions = match.groups()

        # Make an edge between the traditional and simplified version
        edge = make_edge(
            rel='/r/Synonym',
            start=standardized_concept_uri('zh-Hant', traditional),
            end=standardized_concept_uri('zh-Hans', simplified),
            dataset=DATASET,
            license=LICENSE,
            sources=SOURCE,
        )
        out.write(edge)

        for definition in re.split(DEFINITIONS_REGEX, definitions):

            # Skip pronunciation information
            if 'Taiwan pr.' in definition or 'also pr.' in definition:
                continue

            # Check if the definition matches a person syntax, i.e. includes
            # a date range
            person_match = re.match(DATE_RANGE_REGEX, definition)
            if person_match:
                for person in extract_person(person_match):
                    _write_zh_pair_edges(
                        out, '/r/Synonym', traditional, simplified,
                        standardized_concept_uri('en', person)
                    )
                continue

            # Check if a word is a measure word
            if definition.startswith('CL:'):
                for word in extract_measure_words(definition):
                    _write_zh_pair_edges(
                        out, '/r/RelatedTo', traditional, simplified,
                        standardized_concept_uri('zh', word)
                    )
                continue

            # Remove clarifying information in parenthesis
            definition = PAREN_REGEX.sub('', definition)

            # Handle variants/word forms and abbreviations
            if re.match(VARIANT_REGEX, definition) or re.match(ABBR_REGEX, definition):
                for variant in extract_han_characters(definition):
                    _write_zh_pair_edges(
                        out, '/r/Synonym', traditional, simplified,
                        standardized_concept_uri('zh', variant)
                    )
                continue

            # "See also" cross-references. Note: this branch deliberately does
            # NOT `continue`, so the definition also flows into the cleanup
            # steps below (matching the original behavior).
            if re.match(SEE_ALSO_REGEX, definition):
                for reference in extract_han_characters(definition):
                    _write_zh_pair_edges(
                        out, '/r/RelatedTo', traditional, simplified,
                        standardized_concept_uri('zh', reference)
                    )

            # Remove 'lit.', 'fig.'
            definition = LIT_FIG_REGEX.sub('', definition)

            # Expand sth and sb
            definition = SB_REGEX.sub('someone', definition)
            definition = STH_REGEX.sub('something', definition)

            # Additional cleanups
            definition = remove_reference_syntax(definition)
            definition = remove_additional_info(definition)

            # Skip long definitions and make an edge out of the remaining
            # information. NOTE(review): a definition that cleans up to the
            # empty string still passes this check, as in the original code —
            # confirm whether empty-text concepts should be filtered.
            if len(definition.split()) < 6:
                _write_zh_pair_edges(
                    out, '/r/Synonym', traditional, simplified,
                    standardized_concept_uri('en', definition)
                )
Esempio n. 36
0
 def transform_file(self, input_filename, output_file):
     """
     Stream JSON objects (one per line) from `input_filename`, run each one
     through this reader's `handle_assertion`, and write every resulting
     object to `output_file` as a msgpack stream.
     """
     writer = MsgpackStreamWriter(output_file)
     for assertion in read_json_stream(input_filename):
         for transformed in self.handle_assertion(assertion):
             writer.write(transformed)
Esempio n. 37
0
def run_wordnet(input_dir, output_file, sw_map_file):
    """
    Convert RDF WordNet 3.0 Turtle files in `input_dir` into ConceptNet
    edges, written to `output_file` as a msgpack stream, while also writing
    Semantic Web equivalence links to `sw_map_file` as N-Triples.

    NOTE(review): neither `out` nor `map_out` is explicitly closed here —
    presumably the writers flush on destruction or the caller handles it;
    confirm.
    """
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    synset_senses = defaultdict(list)
    sense_synsets = {}

    labels = {}
    glossary = {}
    concept_map = {}
    sense_to_synset = {}

    # Parse lines such as:
    #   wn30:synset-Aeolian-noun-2 rdfs:label "Aeolian"@en-us .
    for subj, rel, obj, objtag in reader.parse_file(
            os.path.join(input_dir, 'wordnet-synset.ttl')):
        if resource_name(rel) == 'label':
            # Everything in WordNet is in English
            assert objtag == 'en'
            labels[subj] = obj

    # Build a short disambiguation text for each synset from its gloss.
    for subj, rel, obj, objtag in reader.parse_file(
            os.path.join(input_dir, 'wordnet-glossary.ttl')):
        if resource_name(rel) == 'gloss':
            assert objtag == 'en'

            # Take the definition up to the first semicolon
            text = obj.split(';')[0]

            # Remove introductory phrases with a colon
            text = text.split(': ', 1)[-1]

            # Remove parenthesized expressions
            while True:
                newtext = re.sub(r'\(.+?\) ?', '', text).strip()
                if newtext == text or newtext == '':
                    break
                else:
                    text = newtext

            # '/' is a URI path separator, so it can't appear in the text.
            glossary[subj] = text.replace('/', '_')

    # Get the list of word senses in each synset, and make a bidirectional mapping.
    #
    # Example line:
    #   wn30:synset-Aeolian-noun-2 wn20schema:containsWordSense wn30:wordsense-Aeolian-noun-2 .
    for subj, rel, obj, objtag in reader.parse_file(
            os.path.join(input_dir,
                         'full/wordnet-wordsense-synset-relations.ttl')):
        if resource_name(rel) == 'containsWordSense':
            synset_senses[subj].append(obj)
            sense_synsets[obj] = subj

    # Assign every synset to a disambiguated concept.
    for synset in synset_senses:
        synset_name = labels[synset]
        # The part of speech is encoded in the synset ID, e.g.
        # 'synset-Aeolian-noun-2'.
        synset_pos = synset.split('-')[-2]
        pos = PARTS_OF_SPEECH[synset_pos]
        disambig = glossary[synset]

        concept = standardized_concept_uri('en', synset_name, pos, disambig)
        concept_map[synset] = concept

    # Map senses to their synsets.
    for sense, synset in sense_synsets.items():
        sense_to_synset[sense] = synset

    # Read each relation file and emit an edge per triple, mapping WordNet
    # relations to ConceptNet relations where possible.
    for filename in ('wordnet-attribute.ttl', 'wordnet-causes.ttl',
                     'wordnet-classifiedby.ttl', 'wordnet-entailment.ttl',
                     'wordnet-hyponym.ttl', 'wordnet-instances.ttl',
                     'wordnet-membermeronym.ttl', 'wordnet-partmeronym.ttl',
                     'wordnet-sameverbgroupas.ttl', 'wordnet-similarity.ttl',
                     'wordnet-substancemeronym.ttl',
                     'full/wordnet-antonym.ttl',
                     'full/wordnet-derivationallyrelated.ttl',
                     'full/wordnet-participleof.ttl',
                     'full/wordnet-pertainsto.ttl',
                     'full/wordnet-seealso.ttl'):
        filepath = os.path.join(input_dir, filename)
        if os.path.exists(filepath):
            for web_subj, web_rel, web_obj, objtag in reader.parse_file(
                    filepath):
                # If this relation involves word senses, map them to their synsets
                # first.
                if web_subj in sense_to_synset:
                    web_subj = sense_to_synset[web_subj]
                if web_obj in sense_to_synset:
                    web_obj = sense_to_synset[web_obj]
                subj = concept_map[web_subj]
                obj = concept_map[web_obj]
                pred_label = resource_name(web_rel)
                if pred_label in REL_MAPPING:
                    mapped_rel = REL_MAPPING[pred_label]

                    # Handle WordNet relations that are the reverse of ConceptNet
                    # relations. Change the word 'meronym' to 'holonym' if
                    # necessary.
                    if mapped_rel.startswith('~'):
                        subj, obj = obj, subj
                        web_subj, web_obj = web_obj, web_subj
                        web_rel = web_rel.replace('meronym', 'holonym')
                        mapped_rel = mapped_rel[1:]
                    rel = join_uri('r', mapped_rel)
                else:
                    # Unmapped relations are kept under a /r/wordnet namespace.
                    rel = join_uri('r', 'wordnet', pred_label)

                # Record the Semantic Web equivalences for this triple.
                map_out.write_link(web_rel, full_conceptnet_url(rel))
                map_out.write_link(web_subj, full_conceptnet_url(subj))
                map_out.write_link(web_obj, full_conceptnet_url(obj))
                edge = make_edge(rel,
                                 subj,
                                 obj,
                                 dataset='/d/wordnet/3.0',
                                 license='/l/CC/By',
                                 sources=SOURCE,
                                 weight=2.0)
                out.write(edge)
Esempio n. 38
0
def handle_file(infile, outfile):
    """
    Read Verbosity game data (tab-separated clues) from `infile` and write
    the acceptable clues to `outfile` as ConceptNet edges in a msgpack
    stream.

    Lines are filtered by several quality heuristics (flagged words, short
    clues, sound-alike clues, low scores). `outcomes` tallies the reason each
    line was kept or discarded; NOTE(review): `count` and `outcomes` are
    accumulated but never returned or logged in this block — presumably for
    debugging; confirm.
    """
    count = 0
    outcomes = defaultdict(int)

    writer = MsgpackStreamWriter(outfile)

    for line in open(infile):
        parts = line.strip().split('\t')
        if not parts:
            outcomes['blank'] += 1
            continue

        # The first 5 columns of the Verbosity output file are:
        #
        #   left: the word being clued
        #   relation: the relation between the word and the clue that the
        #             clue-giver chose, in a form such as "it is part of"
        #   right: the one or two words used as the clue
        #   freq: the number of different times this clue was given
        #   orderscore: the average position in the list of clues
        #
        # 'orderscore' is a number from 0 to 999, representing the average
        # quantile of its position in the list of clues. (It's like a
        # percentile, except there are 1000 of them, not 100.)
        #
        # A clue that's always given first has an orderscore of 0. A clue
        # that always appears halfway through the list has an orderscore of
        # 500.
        #
        # This may seem like a strange thing to measure, and I didn't come up
        # with it, but it actually turns out to be somewhat informative.
        # A clue with an orderscore of 0 is probably a good common-sense
        # relation, representing the first thing that comes to mind. A clue
        # with a high order score may be a move of desperation after several
        # other clues have failed. It causes the guesser to get the answer
        # soon afterward, but perhaps because it's a "cheating" move. So,
        # low orderscores represent better common sense relations.
        left, relation, right, freq, orderscore = parts[:5]
        freq = int(freq)
        orderscore = int(orderscore)

        # Test each word of the clue against the bad-clue pattern
        flagged = False
        for rword in right.split():
            if BAD_CLUE_REGEX.match(rword):
                flagged = True
                break

        if flagged:
            outcomes['flag word'] += 1
            continue
        if len(right) < 3:
            outcomes['clue too short'] += 1
            continue
        if len(right.split()[-1]) == 1:
            outcomes['letter'] += 1
            continue

        # The Verbosity interface and gameplay did not particularly encourage
        # players to choose an appropriate relation. In practice, players seem
        # to have used them all interchangeably, except for the negative
        # relation "it is the opposite of", expressing /r/Antonym.
        #
        # Another way that players expressed negative relations was to use
        # 'not' as the first word of their clue; we make that into an instance
        # of /r/Antonym as well.
        #
        # In other cases, the relation is a positive relation, so we replace it
        # with the most general positive relation, /r/RelatedTo.
        rel = '/r/RelatedTo'
        reltext = 'is related to'
        if right.startswith('not '):
            rel = '/r/DistinctFrom'
            right = right[4:]
            reltext = 'is not'
        if relation == 'it is the opposite of':
            rel = '/r/Antonym'
            reltext = 'is the opposite of'

        # The "sounds-like score" determines whether this clue seems to be a
        # pun or rhyme, rather than an actual common-sense relationship. If
        # the sounds-like score is over 0.35, skip the assertion.
        sls = sounds_like_score(left, right)
        if sls > 0.35:
            outcomes['text similarity'] += 1
            continue

        # Calculate a score for the assertion:
        #
        #   - The number of times it's been used as a clue
        #   - ...with a linear penalty for a high sounds-like score
        #   - ...and a linear penalty for high orderscores
        #
        # The penalties are multiplicative factors from 0 to 1, which decrease
        # linearly as the relevant penalties increase. If a clue is given N
        # times, with a sounds-like score of 0 and an orderscore of 0, it will
        # get an overall score of 2N - 1. This is a formula we should probably
        # revisit.
        #
        # The weight is the square root of the score, divided by 10 — that
        # is, sqrt(score / 100). All divisions are floating point.
        score = (freq * 2 - 1) * (1 - sls) * (1 - orderscore / 1000)
        if score <= 1.:
            outcomes['low score'] += 1
            continue

        weight = score ** .5 / 10

        # If the clue on the right is a two-word phrase, we make additional
        # connections to both words individually. We label them with the
        # rule-based source '/s/process/split_words' to track that this
        # happened.
        rightwords = [right]
        if ' ' in right:
            morewords = [word for word in right.split(' ') if word not in STOPWORDS]
            rightwords.extend(morewords)

        for i, rightword in enumerate(rightwords):
            source = {'contributor': '/s/resource/verbosity'}
            if i > 0:
                source['process'] = '/s/process/split_words'

            # Build the natural-language-ish surface text for this clue
            text = '[[%s]] %s [[%s]]' % (left, reltext, rightword)

            count += 1
            outcomes['success'] += 1
            leftc = standardized_concept_uri('en', left)
            rightc = standardized_concept_uri('en', rightword)
            edge = make_edge(
                rel,
                leftc,
                rightc,
                dataset='/d/verbosity',
                license=Licenses.cc_attribution,
                sources=[source],
                surfaceText=text,
                weight=weight,
            )
            writer.write(edge)
Esempio n. 39
0
def build_from_dir(dirname, output_file):
    """
    Read a GlobalMind database exported in YAML files, translate
    it into ConceptNet 5 edges, and write those edges to disk using
    a MsgpackStreamWriter.
    """
    out = MsgpackStreamWriter(output_file)

    # NOTE(review): yaml.load_all uses PyYAML's default (full) loader, which
    # can construct arbitrary Python objects. This is only acceptable because
    # the GlobalMind export is trusted local data; if the files contain no
    # custom tags, yaml.safe_load_all would be preferable.
    #
    # Each YAML file is opened with a context manager so the file handle is
    # closed deterministically (the original left all four files open).
    # Because yaml.load_all is lazy, its stream must be consumed inside the
    # `with` block.
    with open(dirname + '/GMUser.yaml') as user_file:
        # Map each user's primary key to the full user record.
        users = {userinfo['pk']: userinfo for userinfo in yaml.load_all(user_file)}

    with open(dirname + '/GMFrame.yaml') as frame_file:
        # Map each frame's primary key to its fields.
        frames = {frame['pk']: frame['fields'] for frame in yaml.load_all(frame_file)}

    # Map each assertion's primary key to the edge built from it, so the
    # translation pass below can refer back to them.
    assertions = {}
    with open(dirname + '/GMAssertion.yaml') as assertion_file:
        for assertion in yaml.load_all(assertion_file):
            obj = assertion['fields']
            frame = frames[obj['frame']]
            frametext = frame['text']
            userinfo = users[obj['author']]
            username = userinfo['fields']['username']

            # As far as I can tell, GlobalMind used the same namespace of
            # usernames as the original Open Mind.
            user_source = "/s/contributor/omcs/%s" % username

            sources = [user_source, "/s/activity/globalmind/assert"]

            lang = LANG_CODES[obj['lcode']]
            start = standardized_concept_uri(lang, obj['node1'])
            end = standardized_concept_uri(lang, obj['node2'])
            rel = '/r/' + RELATION_MAP.get(frame['relation'], frame['relation'])

            # fix messy english "around in"
            if ' around ' in frametext:
                if obj['node2'].startswith('in '):
                    frametext = frametext.replace(' around ', ' in ')
                    obj['node2'] = obj['node2'][3:]
                else:
                    frametext = frametext.replace(' around ', ' near ')
                    rel = '/r/LocatedNear'

            # fix more awkward English. I wonder how bad the other languages
            # are.
            frametext = frametext.replace('hits your head', 'comes to mind')
            frametext = frametext.replace(': [node1], [node2]',
                                          ' [node1] and [node2]')

            # Substitute the actual terms into the frame text to make the
            # surface text, marking them with [[...]].
            node1 = u'[[' + obj['node1'] + u']]'
            node2 = u'[[' + obj['node2'] + u']]'
            surfaceText = frametext.replace('//', '').replace('[node1]',
                                                              node1).replace(
                                                                  '[node2]', node2)
            edge = make_edge(rel,
                             start,
                             end,
                             dataset='/d/globalmind',
                             license='/l/CC/By',
                             sources=sources,
                             surfaceText=surfaceText,
                             weight=1)

            # Avoid duplication with the ConceptNet reader, but still save
            # every edge so that we can handle translations.
            if username != 'openmind':
                out.write(edge)

            assertions[assertion['pk']] = edge

    with open(dirname + '/GMTranslation.yaml') as translation_file:
        for translation in yaml.load_all(translation_file):
            obj = translation['fields']
            assertion1 = assertions[obj['assertion1']]
            assertion2 = assertions[obj['assertion2']]
            start = assertion1['uri']
            end = assertion2['uri']
            rel = '/r/TranslationOf'
            text1 = assertion1['surfaceText'].replace('[[', '').replace(']]', '')
            text2 = assertion2['surfaceText'].replace('[[', '').replace(']]', '')
            lang1 = LANG_NAMES[get_lang(assertion1)]
            lang2 = LANG_NAMES[get_lang(assertion2)]
            surfaceText = u"[[%s]] in %s means [[%s]] in %s." % (text1, lang1,
                                                                 text2, lang2)
            userinfo = users[obj['author']]
            username = userinfo['fields']['username']

            # Contributor URIs are namespaced by country code when one is
            # known.
            userlocale = userinfo['fields']['ccode'].lower()
            if userlocale:
                user_source = "/s/contributor/globalmind/%s/%s" % (userlocale,
                                                                   username)
            else:
                user_source = "/s/contributor/globalmind/%s" % username

            sources = [user_source, "/s/activity/globalmind/translate"]
            edge = make_edge(rel,
                             start,
                             end,
                             dataset='/d/globalmind',
                             license=Licenses.cc_attribution,
                             sources=sources,
                             surfaceText=surfaceText,
                             weight=1)
            out.write(edge)
Esempio n. 40
0
def process_dbpedia(input_dir, output_file, concept_file):
    """
    Read through multiple DBPedia files and output filtered assertions to
    `output_file`.

    `input_dir` must contain the bzip2-compressed .tql.bz2 dumps named
    below; `concept_file` lists the concepts of interest, and only
    DBPedia URLs that map onto those concepts (via the interlanguage
    links) produce output. Writes a msgpack stream of edges and closes it.
    """
    ok_concepts = read_concept_file(concept_file)

    # Interlanguage links identify which DBPedia URLs (across language
    # editions) correspond to the concepts we care about.
    input_path = pathlib.Path(input_dir)
    interlang_path = input_path / 'interlanguage_links_en.tql.bz2'
    mapped_urls = interlanguage_mapping(interlang_path, ok_concepts)

    out = MsgpackStreamWriter(output_file)

    # First pass: rdf:type assertions become /r/IsA edges.
    types_path = input_path / 'instance_types_en.tql.bz2'
    quads = parse_nquads(bz2.open(str(types_path), 'rt'))
    for subj, pred, obj, _graph in quads:
        subj_url = subj['url']
        # Skip meta-pages (categories, files, lists, templates) and URLs
        # containing '__', which don't name ordinary articles.
        if ('Category:' in subj_url or 'File:' in subj_url
                or 'List_of' in subj_url or '__' in subj_url
                or 'Template:' in subj_url):
            continue
        if subj_url in mapped_urls:
            subj_concept = translate_dbpedia_url(subj_url)
            # The object's resource name (a CamelCase type) becomes an
            # English noun concept.
            obj_type = un_camel_case(resource_name(obj['url']))
            if obj_type not in TYPE_BLACKLIST:
                obj_concept = standardized_concept_uri('en', obj_type, 'n')
                if obj_concept not in CONCEPT_BLACKLIST:
                    edge = make_edge('/r/IsA',
                                     subj_concept,
                                     obj_concept,
                                     dataset='/d/dbpedia/en',
                                     license=Licenses.cc_sharealike,
                                     sources=[{
                                         'contributor':
                                         '/s/resource/dbpedia/2015/en'
                                     }],
                                     weight=0.5,
                                     surfaceStart=url_to_label(subj['url']),
                                     surfaceEnd=url_to_label(obj['url']))
                    out.write(edge)
                # NOTE(review): this loop is inside the per-type-assertion
                # loop, so the ExternalURL/Synonym edges below may be
                # emitted more than once for a subject with several types —
                # presumably deduplicated downstream; confirm.
                for other_url in mapped_urls[subj_url]:
                    if other_url.startswith('http://wikidata.dbpedia.org/'):
                        # Wikidata URLs only get an ExternalURL link.
                        urledge = make_edge('/r/ExternalURL',
                                            subj_concept,
                                            other_url,
                                            dataset='/d/dbpedia/en',
                                            license=Licenses.cc_sharealike,
                                            sources=[{
                                                'contributor':
                                                '/s/resource/dbpedia/2015/en'
                                            }],
                                            weight=1.0)
                        out.write(urledge)
                    else:
                        # Other language editions get an ExternalURL link
                        # plus a Synonym edge back to the English concept.
                        other_concept = translate_dbpedia_url(other_url)
                        if other_concept:
                            urledge = make_edge(
                                '/r/ExternalURL',
                                other_concept,
                                other_url,
                                dataset='/d/dbpedia/en',
                                license=Licenses.cc_sharealike,
                                sources=[{
                                    'contributor':
                                    '/s/resource/dbpedia/2015/en'
                                }],
                                weight=1.0)
                            out.write(urledge)
                            edge = make_edge(
                                '/r/Synonym',
                                other_concept,
                                subj_concept,
                                dataset='/d/dbpedia/en',
                                license=Licenses.cc_sharealike,
                                sources=[{
                                    'contributor':
                                    '/s/resource/dbpedia/2015/en'
                                }],
                                weight=0.5,
                                surfaceStart=url_to_label(other_url),
                                surfaceEnd=url_to_label(subj_url))
                            out.write(edge)

    # Second pass: object-property assertions become edges for the
    # relations whitelisted in RELATIONS.
    relations_path = input_path / 'mappingbased_objects_en.tql.bz2'
    quads = parse_nquads(bz2.open(str(relations_path), 'rt'))
    for subj, pred, obj, _graph in quads:
        subj_concept = translate_dbpedia_url(subj['url'])
        obj_concept = translate_dbpedia_url(obj['url'])
        rel_name = resource_name(pred['url'])
        # Both endpoints must translate to concepts AND be among the
        # interlanguage-mapped URLs.
        if (subj_concept and obj_concept and subj['url'] in mapped_urls
                and obj['url'] in mapped_urls):
            if rel_name in RELATIONS:
                rel = RELATIONS[rel_name]
                edge = make_edge(rel,
                                 subj_concept,
                                 obj_concept,
                                 dataset='/d/dbpedia/en',
                                 license=Licenses.cc_sharealike,
                                 sources=[{
                                     'contributor':
                                     '/s/resource/dbpedia/2015/en'
                                 }],
                                 weight=0.5,
                                 surfaceStart=url_to_label(subj['url']),
                                 surfaceEnd=url_to_label(obj['url']))
                out.write(edge)

    out.close()
# Esempio n. 41
# 0
def run_wordnet(input_dir, output_file, sw_map_file):
    """
    Read the WordNet RDF/TTL files in `input_dir` and write ConceptNet
    edges to `output_file` (a msgpack stream), plus an N-Triples file
    (`sw_map_file`) linking Semantic Web URLs to ConceptNet URIs.
    """
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    synset_senses = defaultdict(list)  # synset URL -> list of word-sense URLs
    sense_synsets = {}                 # word-sense URL -> synset URL

    labels = {}       # synset URL -> label text
    glossary = {}     # synset URL -> cleaned-up definition text
    concept_map = {}  # synset URL -> disambiguated ConceptNet concept URI

    # Parse lines such as:
    #   wn30:synset-Aeolian-noun-2 rdfs:label "Aeolian"@en-us .
    for subj, rel, obj, objtag in reader.parse_file(os.path.join(input_dir, 'wordnet-synset.ttl')):
        if resource_name(rel) == 'label':
            # Everything in WordNet is in English
            assert objtag == 'en'
            labels[subj] = obj

    # Build the glossary: a short definition per synset, used as the
    # disambiguation text in the concept URI.
    for subj, rel, obj, objtag in reader.parse_file(os.path.join(input_dir, 'wordnet-glossary.ttl')):
        if resource_name(rel) == 'gloss':
            assert objtag == 'en'

            # Take the definition up to the first semicolon
            text = obj.split(';')[0]

            # Remove introductory phrases with a colon
            text = text.split(': ', 1)[-1]

            # Remove parenthesized expressions, repeating until the text
            # stabilizes (nested parentheses need multiple passes)
            while True:
                newtext = re.sub(r'\(.+?\) ?', '', text).strip()
                if newtext == text or newtext == '':
                    break
                else:
                    text = newtext

            # '/' would be misread as a URI separator, so replace it
            glossary[subj] = text.replace('/', '_')

    # Get the list of word senses in each synset, and make a bidirectional mapping.
    #
    # Example line:
    #   wn30:synset-Aeolian-noun-2 wn20schema:containsWordSense wn30:wordsense-Aeolian-noun-2 .
    for subj, rel, obj, objtag in reader.parse_file(os.path.join(input_dir, 'full/wordnet-wordsense-synset-relations.ttl')):
        if resource_name(rel) == 'containsWordSense':
            synset_senses[subj].append(obj)
            sense_synsets[obj] = subj

    # Assign every synset to a disambiguated concept.
    for synset in synset_senses:
        synset_name = labels[synset]
        # The part of speech is the second-to-last hyphen-separated field
        # of the synset URL, e.g. '...-Aeolian-noun-2' -> 'noun'.
        synset_pos = synset.split('-')[-2]
        pos = PARTS_OF_SPEECH[synset_pos]
        disambig = glossary[synset]

        concept = standardized_concept_uri('en', synset_name, pos, disambig)
        concept_map[synset] = concept

    # `sense_synsets` already maps each sense to its synset; take a copy
    # under the name used below instead of rebuilding it item by item.
    sense_to_synset = dict(sense_synsets)

    for filename in (
        'wordnet-attribute.ttl', 'wordnet-causes.ttl',
        'wordnet-classifiedby.ttl', 'wordnet-entailment.ttl',
        'wordnet-hyponym.ttl', 'wordnet-instances.ttl',
        'wordnet-membermeronym.ttl', 'wordnet-partmeronym.ttl',
        'wordnet-sameverbgroupas.ttl', 'wordnet-similarity.ttl',
        'wordnet-substancemeronym.ttl', 'full/wordnet-antonym.ttl',
        'full/wordnet-derivationallyrelated.ttl',
        'full/wordnet-participleof.ttl',
        'full/wordnet-pertainsto.ttl',
        'full/wordnet-seealso.ttl'
    ):
        filepath = os.path.join(input_dir, filename)
        if os.path.exists(filepath):
            for web_subj, web_rel, web_obj, objtag in reader.parse_file(filepath):
                # If this relation involves word senses, map them to their synsets
                # first.
                if web_subj in sense_to_synset:
                    web_subj = sense_to_synset[web_subj]
                if web_obj in sense_to_synset:
                    web_obj = sense_to_synset[web_obj]
                subj = concept_map[web_subj]
                obj = concept_map[web_obj]
                pred_label = resource_name(web_rel)
                if pred_label in REL_MAPPING:
                    mapped_rel = REL_MAPPING[pred_label]

                    # Handle WordNet relations that are the reverse of ConceptNet
                    # relations. Change the word 'meronym' to 'holonym' if
                    # necessary.
                    if mapped_rel.startswith('~'):
                        subj, obj = obj, subj
                        web_subj, web_obj = web_obj, web_subj
                        web_rel = web_rel.replace('meronym', 'holonym')
                        mapped_rel = mapped_rel[1:]
                    rel = join_uri('r', mapped_rel)
                else:
                    # Unmapped relations are kept under a /r/wordnet/ namespace.
                    rel = join_uri('r', 'wordnet', pred_label)

                map_out.write_link(web_rel, full_conceptnet_url(rel))
                map_out.write_link(web_subj, full_conceptnet_url(subj))
                map_out.write_link(web_obj, full_conceptnet_url(obj))
                edge = make_edge(
                    rel, subj, obj, dataset='/d/wordnet/3.0',
                    license='/l/CC/By', sources=SOURCE, weight=2.0
                )
                out.write(edge)
# Esempio n. 42
# 0
def run_umbel(input_dir, output_file, sw_map_file):
    """
    Read N-Triples files containing Umbel data, outputting a file of
    ConceptNet edges and a file of mappings between the Semantic Web and
    ConceptNet.
    """
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    labels = {}
    label_sets = defaultdict(set)

    # Two inputs are parsed here:
    # - umbel.nt, a transformation of umbel.n3, available from
    #   https://github.com/structureddynamics/UMBEL/.
    # - umbel_links.nt, distributed with DBPedia 3.9.
    # Handling both in one function lets umbel_links reuse the concept
    # names extracted from umbel.nt.
    main_file = os.path.join(input_dir, 'umbel.nt')
    dbpedia_link_file = os.path.join(input_dir, 'umbel_links.nt')

    # First pass over umbel.nt: collect the "preferred labels", which will
    # serve as the surface texts for the nodes.
    for node, link, value, _tag in reader.parse_file(main_file):
        link_name = resource_name(link)
        if link_name == 'prefLabel':
            # 'CW' and 'PCW' are Cyc jargon for 'conceptual works'. A node
            # that can only be described as a CW probably isn't interesting.
            tokens = value.split()
            if 'CW' not in tokens and 'PCW' not in tokens:
                labels[node] = value
        if link_name.endswith('Label'):
            label_sets[standardize_text(value)].add(node)

    # Second pass over umbel.nt: extract ConceptNet edges.
    for node, link, value, tag in reader.parse_file(main_file):
        if tag == 'URL' and acceptable_node(value) and acceptable_node(node):
            # Only use nodes for which we've seen preferred labels.
            # (This skips some anonymous OWL-cruft nodes.)
            if node not in labels or value not in labels:
                continue
            subj_uri = standardized_concept_uri('en', labels[node])
            obj_uri = standardized_concept_uri('en', labels[value])
            link_name = resource_name(link)
            # Skip relations we don't have a mapping for.
            if link_name not in REL_MAPPING:
                continue
            # Write the ConceptNet edge and the Semantic Web URL mappings.
            rel_uri, frame = REL_MAPPING[link_name]
            surface = frame % (labels[node], labels[value])
            out.write(umbel_edge(rel_uri, subj_uri, obj_uri, surface, SOURCE))
            map_out.write_link(link, full_conceptnet_url(rel_uri))
            map_out.write_link(node, full_conceptnet_url(subj_uri))
            map_out.write_link(value, full_conceptnet_url(obj_uri))
        elif link.endswith('altLabel'):
            # altLabel relations assign different texts to the same node;
            # represent those in ConceptNet with Synonym relations. Make
            # sure we know what's being labeled first.
            if node not in labels:
                continue
            name = value
            if standardized_concept_name('en', name) == standardized_concept_name('en', labels[node]):
                continue
            if set(name.split(' ')) & IGNORED_WORDS:
                continue
            main_label = standardized_concept_uri('en', labels[node])
            name_text = standardize_text(name)
            if len(label_sets[name_text]) >= 2 or len(name_text) <= 3:
                # The text is ambiguous or very short, so disambiguate it
                # with the resource name. Cyc does not distinguish texts by
                # their part of speech, so use '_' as the POS symbol.
                disambig = un_camel_case(resource_name(node))
                alt_label = standardized_concept_uri('en', name, '_', disambig)
            else:
                alt_label = standardized_concept_uri('en', name)
            surface = SYN_FRAME % (name, labels[node])
            out.write(umbel_edge('/r/Synonym', alt_label, main_label, surface, SOURCE))

    # Pass over umbel_links.nt: connect DBPedia URLs to Umbel concepts.
    for node, link, value, tag in reader.parse_file(dbpedia_link_file):
        if tag != 'URL' or not acceptable_node(value) or not acceptable_node(node):
            continue
        if value not in labels:
            continue
        subj_label = resource_name(node).replace('_', ' ')
        subj_uri = translate_dbpedia_url(node)
        obj_label = labels[value]
        obj_uri = standardized_concept_uri('en', obj_label)
        link_name = resource_name(link)
        if link_name not in REL_MAPPING:
            continue
        rel_uri, frame = REL_MAPPING[link_name]
        surface = frame % (subj_label, obj_label)
        out.write(umbel_edge(rel_uri, subj_uri, obj_uri, surface, LINK_SOURCE))
        map_out.write_link(link, full_conceptnet_url(rel_uri))
        map_out.write_link(node, full_conceptnet_url(subj_uri))
        map_out.write_link(value, full_conceptnet_url(obj_uri))
# Esempio n. 43
# 0
def run_umbel(input_dir, output_file, sw_map_file):
    """
    Read N-Triples files containing Umbel data, outputting a file of
    ConceptNet edges and a file of mappings between the Semantic Web and
    ConceptNet.

    `output_file` receives a msgpack stream of edges; `sw_map_file`
    receives N-Triples links between Semantic Web URLs and ConceptNet URIs.
    """
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    labels = {}                   # node URL -> preferred label text
    label_sets = defaultdict(set)  # standardized label text -> set of node URLs

    # There are two files we want to parse:
    # - umbel.nt, a transformation of umbel.n3, which is available from
    #   https://github.com/structureddynamics/UMBEL/.
    # - umbel_links.nt, distributed with DBPedia 3.9.
    #
    # We parse them both in this file so that umbel_links can reuse the
    # concept names extracted from umbel.nt.
    main_file = os.path.join(input_dir, 'umbel.nt')
    dbpedia_link_file = os.path.join(input_dir, 'umbel_links.nt')

    # Read through umbel.nt once, finding all the "preferred labels". We will
    # use these as the surface texts for the nodes.
    for web_subj, web_rel, web_obj, objtag in reader.parse_file(main_file):
        if resource_name(web_rel) == 'prefLabel':
            # 'CW' and 'PCW' are Cyc jargon for 'conceptual works'. If a node
            # cannot be described except as a CW, we're probably not
            # interested in it.
            if 'CW' not in web_obj.split() and 'PCW' not in web_obj.split():
                labels[web_subj] = web_obj
        # Record every label (pref or alt) so we can later tell when a label
        # text is shared by multiple nodes and needs disambiguation.
        if resource_name(web_rel).endswith('Label'):
            text = standardize_text(web_obj)
            label_sets[text].add(web_subj)

    # Read through umbel.nt again and extract ConceptNet edges.
    for web_subj, web_rel, web_obj, objtag in reader.parse_file(main_file):
        if objtag == 'URL' and acceptable_node(web_obj) and acceptable_node(
                web_subj):
            # Only use nodes for which we've seen preferred labels.
            # (This skips some anonymous OWL-cruft nodes.)
            if web_subj in labels and web_obj in labels:
                subj_uri = standardized_concept_uri('en', labels[web_subj])
                obj_uri = standardized_concept_uri('en', labels[web_obj])
                rel_name = resource_name(web_rel)
                # Check if this is a relation we want to handle.
                if rel_name in REL_MAPPING:
                    # Write the ConceptNet edges and the mappings to Semantic Web URLs.
                    rel_uri, frame = REL_MAPPING[rel_name]
                    surface = frame % (labels[web_subj], labels[web_obj])
                    out.write(
                        umbel_edge(rel_uri, subj_uri, obj_uri, surface,
                                   SOURCE))
                    map_out.write_link(web_rel, full_conceptnet_url(rel_uri))
                    map_out.write_link(web_subj, full_conceptnet_url(subj_uri))
                    map_out.write_link(web_obj, full_conceptnet_url(obj_uri))

        # altLabel relations assign different texts to the same node. We'll
        # represent those in ConceptNet with Synonym relations.
        elif web_rel.endswith('altLabel'):
            # Make sure we know what's being labeled.
            if web_subj in labels:
                name = web_obj
                words = name.split(' ')
                # Skip alt labels that standardize to the same concept name
                # as the preferred label: they would be self-synonyms.
                if standardized_concept_name(
                        'en', name) != standardized_concept_name(
                            'en', labels[web_subj]):
                    if not set(words) & IGNORED_WORDS:
                        main_label = standardized_concept_uri(
                            'en', labels[web_subj])
                        name_text = standardize_text(name)
                        # A text shared by 2+ nodes, or a very short one,
                        # is ambiguous and needs a disambiguation suffix.
                        if len(label_sets[name_text]) >= 2 or len(
                                name_text) <= 3:
                            disambig = un_camel_case(resource_name(web_subj))

                            # Cyc does not distinguish texts by their part of speech, so use
                            # '_' as the part of speech symbol.
                            alt_label = standardized_concept_uri(
                                'en', name, '_', disambig)
                        else:
                            alt_label = standardized_concept_uri('en', name)
                        surface = SYN_FRAME % (name, labels[web_subj])
                        out.write(
                            umbel_edge('/r/Synonym', alt_label, main_label,
                                       surface, SOURCE))

    # Read the DBPedia link file, connecting DBPedia URLs to the Umbel
    # concepts whose labels we collected above.
    for web_subj, web_rel, web_obj, objtag in reader.parse_file(
            dbpedia_link_file):
        if objtag == 'URL' and acceptable_node(web_obj) and acceptable_node(
                web_subj):
            if web_obj in labels:
                subj_label = resource_name(web_subj).replace('_', ' ')
                subj_uri = translate_dbpedia_url(web_subj)
                obj_label = labels[web_obj]
                obj_uri = standardized_concept_uri('en', obj_label)
                rel_name = resource_name(web_rel)
                if rel_name in REL_MAPPING:
                    rel_uri, frame = REL_MAPPING[rel_name]
                    surface = frame % (subj_label, obj_label)
                    out.write(
                        umbel_edge(rel_uri, subj_uri, obj_uri, surface,
                                   LINK_SOURCE))
                    map_out.write_link(web_rel, full_conceptnet_url(rel_uri))
                    map_out.write_link(web_subj, full_conceptnet_url(subj_uri))
                    map_out.write_link(web_obj, full_conceptnet_url(obj_uri))