Example #1
0
def transform_for_linked_data(edge):
    """
    Modify an edge (assertion) in place to contain values that are appropriate
    for a Linked Data API.

    Although this code isn't actually responsible for what an API returns
    (see the conceptnet-web repository for that), it helps to deal with what
    edge dictionaries should contain here.

    The relevant changes are:

    - Remove the 'features' list
    - Rename 'uri' to '@id'
    - Make 'start', 'end', and 'rel' into dictionaries with an '@id' and
      'label', removing the separate 'surfaceStart' and 'surfaceEnd'
      attributes
    - All dictionaries should have an '@id'. For the edge itself, it's the
      URI. Without this, we get RDF blank nodes, which are awful.
    - Set '@type' on objects representing edges and sources. (Nodes get their
      @type from the `ld_node` function.)

    The 'features', 'surfaceStart', and 'surfaceEnd' keys are optional; a
    missing surface text is passed to `ld_node` as None. The 'uri', 'sources',
    'start', 'end', and 'rel' keys are required, and a KeyError is raised if
    any of them is absent.

    Returns the same `edge` dictionary that was passed in.
    """
    # 'features' may legitimately be absent, so remove it only if present.
    edge.pop('features', None)

    for source in edge['sources']:
        # The conjunction is computed from the source's values *before* the
        # '@id' and '@type' keys are added to it.
        source['@id'] = conjunction_uri(*sorted(source.values()))
        source['@type'] = 'Source'

    edge['@id'] = edge.pop('uri')
    edge['@type'] = 'Edge'

    start_uri = edge['start']
    end_uri = edge['end']
    rel_uri = edge['rel']

    # The surface texts are optional: pop them with a default instead of
    # reading with .get() and then deleting unconditionally, which raised
    # KeyError whenever one of them was missing.
    start_label = edge.pop('surfaceStart', None)
    end_label = edge.pop('surfaceEnd', None)

    edge['start'] = ld_node(start_uri, start_label)
    edge['end'] = ld_node(end_uri, end_label)
    edge['rel'] = ld_node(rel_uri, None)

    if 'other' in edge:
        # TODO: Find out when we use this, or remove it if we don't use it
        # Point 'other' at the node dictionary it refers to. When it matches
        # neither endpoint, it is left as it was.
        if edge['other'] == start_uri:
            edge['other'] = edge['start']
        elif edge['other'] == end_uri:
            edge['other'] = edge['end']

    return edge
Example #2
0
def transform_for_linked_data(edge):
    """
    Modify an edge (assertion) in place to contain values that are appropriate
    for a Linked Data API.

    Although this code isn't actually responsible for what an API returns
    (see the conceptnet-web repository for that), it helps to deal with what
    edge dictionaries should contain here.

    The relevant changes are:

    - Remove the 'features' list
    - Rename 'uri' to '@id'
    - Make 'start', 'end', and 'rel' into dictionaries with an '@id' and
      'label', removing the separate 'surfaceStart' and 'surfaceEnd'
      attributes
    - All dictionaries should have an '@id'. For the edge itself, it's the
      URI. Without this, we get RDF blank nodes, which are awful.
    - Set '@type' on objects representing edges and sources. (Nodes get their
      @type from the `ld_node` function.)

    The 'features', 'surfaceStart', and 'surfaceEnd' keys are optional; a
    missing surface text is passed to `ld_node` as None. The 'uri', 'sources',
    'start', 'end', and 'rel' keys are required, and a KeyError is raised if
    any of them is absent.

    Returns the same `edge` dictionary that was passed in.
    """
    # 'features' may legitimately be absent, so remove it only if present.
    edge.pop('features', None)

    for source in edge['sources']:
        # The conjunction is computed from the source's values *before* the
        # '@id' and '@type' keys are added to it.
        source['@id'] = conjunction_uri(*sorted(source.values()))
        source['@type'] = 'Source'

    edge['@id'] = edge.pop('uri')
    edge['@type'] = 'Edge'

    start_uri = edge['start']
    end_uri = edge['end']
    rel_uri = edge['rel']

    # The surface texts are optional: pop them with a default instead of
    # reading with .get() and then deleting unconditionally, which raised
    # KeyError whenever one of them was missing.
    start_label = edge.pop('surfaceStart', None)
    end_label = edge.pop('surfaceEnd', None)

    edge['start'] = ld_node(start_uri, start_label)
    edge['end'] = ld_node(end_uri, end_label)
    edge['rel'] = ld_node(rel_uri, None)

    if 'other' in edge:
        # TODO: Find out when we use this, or remove it if we don't use it
        # Point 'other' at the node dictionary it refers to. When it matches
        # neither endpoint, it is left as it was.
        if edge['other'] == start_uri:
            edge['other'] = edge['start']
        elif edge['other'] == end_uri:
            edge['other'] = edge['end']

    return edge
def _make_assertion(line_group):
    """
    When a generator of tab-separated lines has been grouped by their assertion
    URI, this function takes all the lines with the same URI and makes a single
    assertion out of them.
    """
    lines = [line.rstrip() for line in line_group]
    lines = [line for line in lines if line]
    if not lines:
        return None

    # FIXME: the steps leading up to this produce URIs that can differ based
    # on word senses. These don't get merged together, but they should.
    uri, rel, start, end, _ = lines[0].split('\t')

    if not (keep_concept(start) and keep_concept(end)):
        return None

    info_dicts = [json.loads(line.split('\t')[4]) for line in lines]
    unscaled_weight = sum(info['weight'] for info in info_dicts)
    licenses = {info['license'] for info in info_dicts}
    dataset = info_dicts[0]['dataset']
    surface_text = None
    sources = []
    seen_sources = set()
    for info in info_dicts:
        if surface_text is None and 'surfaceText' in info:
            surface_text = info['surfaceText']
        for subsource in info['sources']:
            conjunction = conjunction_uri(*sorted(subsource.values()))
            if conjunction not in seen_sources:
                sources.append(subsource)
                seen_sources.add(conjunction)

    weight = weight_scale(unscaled_weight)
    if Licenses.cc_sharealike in licenses:
        license = Licenses.cc_sharealike
    else:
        license = Licenses.cc_attribution

    return make_edge(
        rel=rel,
        start=start,
        end=end,
        weight=weight,
        dataset=dataset,
        license=license,
        sources=sources,
        surfaceText=surface_text,
    )
def make_assertion(line_group):
    """
    Build one edge from a group of tab-separated lines describing the same
    assertion.

    The start and end concepts are truncated with `uri_prefix(..., 4)` before
    they are checked with `keep_concept`. Returns None when the group has no
    non-blank lines or when either concept is not kept; otherwise returns the
    result of `make_edge`.
    """
    entries = []
    for raw_line in line_group:
        trimmed = raw_line.rstrip()
        if trimmed:
            entries.append(trimmed)
    if not entries:
        return None

    # FIXME: the steps leading up to this produce URIs that can differ based
    # on word senses. These don't get merged together, but they should.
    first_fields = entries[0].split('\t')
    _uri, rel, start, end, _info = first_fields

    # We can't distinguish word senses well enough yet, so only keep them
    # up to the part of speech
    start = uri_prefix(start, 4)
    end = uri_prefix(end, 4)

    concepts_ok = keep_concept(start) and keep_concept(end)
    if not concepts_ok:
        return None

    records = [json.loads(entry.split('\t')[4]) for entry in entries]
    weight_sum = sum(record['weight'] for record in records)
    found_licenses = {record['license'] for record in records}
    dataset = records[0]['dataset']

    surface_text = None
    kept_sources = []
    known = set()
    for record in records:
        # The first line that carries a surface text wins.
        if surface_text is None and 'surfaceText' in record:
            surface_text = record['surfaceText']
        for src in record['sources']:
            src_key = conjunction_uri(*sorted(src.values()))
            if src_key not in known:
                known.add(src_key)
                kept_sources.append(src)

    scaled = weight_scale(weight_sum)

    # Default to attribution; upgrade to share-alike if any line requires it.
    if Licenses.cc_sharealike in found_licenses:
        edge_license = Licenses.cc_sharealike
    else:
        edge_license = Licenses.cc_attribution

    edge_fields = dict(
        rel=rel,
        start=start,
        end=end,
        weight=scaled,
        dataset=dataset,
        license=edge_license,
        sources=kept_sources,
        surfaceText=surface_text,
    )
    return make_edge(**edge_fields)
def make_assertion(line_group):
    """
    Merge the tab-separated lines of one assertion into a single edge.

    The relation, start, and end are read from the first non-blank line. The
    JSON in the fifth field of every line contributes its weight, license,
    and sources; the dataset is taken from the first line only.

    Returns None for a group with no non-blank lines, or when `keep_concept`
    rejects the start or the end; otherwise returns what `make_edge` returns.
    """
    nonblank = list(filter(None, (text.rstrip() for text in line_group)))
    if not nonblank:
        return None

    # FIXME: the steps leading up to this produce URIs that can differ based
    # on word senses. These don't get merged together, but they should.
    _uri, rel, start, end, _json_text = nonblank[0].split('\t')

    if not keep_concept(start):
        return None
    if not keep_concept(end):
        return None

    infos = [json.loads(text.split('\t')[4]) for text in nonblank]
    raw_weight = sum(item['weight'] for item in infos)
    all_licenses = {item['license'] for item in infos}
    dataset = infos[0]['dataset']

    surface_text = None
    merged_sources = []
    seen_keys = set()
    for item in infos:
        if surface_text is None and 'surfaceText' in item:
            surface_text = item['surfaceText']
        for part in item['sources']:
            # Two sources with the same conjunction URI count as one.
            part_key = conjunction_uri(*sorted(part.values()))
            if part_key in seen_keys:
                continue
            merged_sources.append(part)
            seen_keys.add(part_key)

    final_weight = weight_scale(raw_weight)

    # Share-alike takes precedence over plain attribution.
    if Licenses.cc_sharealike in all_licenses:
        final_license = Licenses.cc_sharealike
    else:
        final_license = Licenses.cc_attribution

    return make_edge(
        rel=rel, start=start, end=end,
        weight=final_weight,
        dataset=dataset,
        license=final_license,
        sources=merged_sources,
        surfaceText=surface_text,
    )
Example #6
0
def make_assertion(line_group):
    """
    Turn a group of tab-separated lines for one assertion into an edge.

    Blank lines are ignored. The start and end concepts from the first line
    are shortened with `uri_prefix(..., 4)` and must both pass `keep_concept`,
    otherwise None is returned. None is also returned when there are no
    non-blank lines. In all other cases the result of `make_edge` is returned.
    """
    cleaned = [candidate.rstrip() for candidate in line_group]
    cleaned = [candidate for candidate in cleaned if candidate != '']
    if len(cleaned) == 0:
        return None

    # The first line must have exactly five fields; only three are used.
    _uri, rel, start, end, _rest = cleaned[0].split('\t')

    # We can't distinguish word senses well enough yet, so only keep them
    # up to the part of speech
    start = uri_prefix(start, 4)
    end = uri_prefix(end, 4)

    if not keep_concept(start) or not keep_concept(end):
        return None

    details = [json.loads(candidate.split('\t')[4]) for candidate in cleaned]
    summed_weight = sum(detail['weight'] for detail in details)
    license_values = {detail['license'] for detail in details}
    dataset = details[0]['dataset']

    surface_text = None
    collected = []
    already_seen = set()
    for detail in details:
        # Use the earliest available surface text.
        if surface_text is None and 'surfaceText' in detail:
            surface_text = detail['surfaceText']
        for entry in detail['sources']:
            identity = conjunction_uri(*sorted(entry.values()))
            if identity not in already_seen:
                collected.append(entry)
                already_seen.add(identity)

    weight = weight_scale(summed_weight)

    # The most restrictive license among the lines applies to the edge.
    license = (
        Licenses.cc_sharealike
        if Licenses.cc_sharealike in license_values
        else Licenses.cc_attribution
    )

    return make_edge(
        rel=rel,
        start=start,
        end=end,
        weight=weight,
        dataset=dataset,
        license=license,
        sources=collected,
        surfaceText=surface_text,
    )
Example #7
0
def make_edge(rel, start, end, dataset, license, sources,
              context='/ctx/all', surfaceText=None, weight=1.0):
    """
    Build the dictionary form of an edge (a justified assertion) from its
    component values.

    `sources` may be a list, in which case its conjunction becomes the
    'source_uri', or a single value that is used as the 'source_uri' directly
    and split on 'or' to obtain the individual sources.

        >>> e = make_edge(rel='/r/HasProperty',
        ...               start='/c/en/fire',
        ...               end='/c/en/hot',
        ...               dataset='/d/conceptnet/4/en',
        ...               license=Licenses.cc_attribution,
        ...               sources='/and/[/.../]',
        ...               surfaceText='[[Fire]] is [[hot]]',
        ...               weight=1.0)
        >>> pprint(e)
        {'context': '/ctx/all',
         'dataset': '/d/conceptnet/4/en',
         'end': '/c/en/hot',
         'features': ['/c/en/fire /r/HasProperty -',
                      '/c/en/fire - /c/en/hot',
                      '- /r/HasProperty /c/en/hot'],
         'id': '/e/ee13e234ee835eabfcf7c906b358cc2229366b42',
         'license': '/l/CC/By',
         'rel': '/r/HasProperty',
         'source_uri': '/and/[/.../]',
         'sources': ['/...'],
         'start': '/c/en/fire',
         'surfaceText': '[[Fire]] is [[hot]]',
         'uri': '/a/[/r/HasProperty/,/c/en/fire/,/c/en/hot/]',
         'weight': 1.0}
    """
    # Each feature is the assertion with one of its three slots blanked out.
    start_rel_feature = "%s %s -" % (start, rel)
    start_end_feature = "%s - %s" % (start, end)
    rel_end_feature = "- %s %s" % (rel, end)
    features = [start_rel_feature, start_end_feature, rel_end_feature]

    uri = assertion_uri(rel, start, end)

    if not isinstance(sources, list):
        source_tree = sources
        source_list = parse_possible_compound_uri('or', sources)
    else:
        source_tree = conjunction_uri(*sources)
        source_list = sources

    # Break every source into its 'and' components, then keep the distinct
    # components in sorted order.
    components_per_source = [
        parse_possible_compound_uri('and', source)
        for source in source_list
    ]
    distinct_components = set()
    for components in components_per_source:
        distinct_components.update(components)
    flat_sources = sorted(distinct_components)

    # Generate a unique ID for the edge. This is the only opaque ID
    # that appears in ConceptNet objects. You can use it as a
    # pseudo-random sort order over edges.
    unique_text = ' '.join([uri, context, source_tree])
    digest = sha1(unique_text.encode('utf-8')).hexdigest()

    return {
        'id': '/e/' + digest,
        'uri': uri,
        'rel': rel,
        'start': start,
        'end': end,
        'context': context,
        'dataset': dataset,
        'sources': flat_sources,
        'source_uri': source_tree,
        'features': features,
        'license': license,
        'weight': weight,
        'surfaceText': surfaceText
    }