Example #1
def subwords_to_edges(language, input, output):
    """
    Morfessor hypothesizes ways to break words into sub-word chunks. Produce
    edges from these sub-words that can be used in retrofitting.
    """
    writer = MsgpackStreamWriter(output)
    for line in input:
        line = line.rstrip()
        if not line or line.startswith('#'):
            continue

        # Remove the unnecessary count ("1 ") from the start of each line
        line = line.split(' ', 1)[1]
        chunks = line.split(' + ')

        # Strip a possible trailing underscore, which would particularly show
        # up in the way we segment ATOMIC_SPACE_LANGUAGES (Vietnamese)
        full_text = ''.join(chunks).strip('_')
        end = join_uri('c', language, full_text)
        for chunk in chunks:
            if chunk != '_':
                start = join_uri('x', language, chunk.strip('_'))
                edge = make_edge(
                    '/r/SubwordOf',
                    start,
                    end,
                    dataset='/d/morphology',
                    license=Licenses.cc_attribution,
                    sources=MORPH_SOURCES,
                    weight=0.01,
                )
                writer.write(edge)
    writer.close()
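A minimal, self-contained sketch of the parsing above, to show the shape of a Morfessor segmentation line and the URIs that the edges connect. The sample line and the slash-joining stand-in for join_uri are assumptions for illustration, not part of the original code.

def join_uri_standin(*pieces):
    # Assumed behavior of join_uri: join the pieces with slashes under a leading '/'.
    return '/' + '/'.join(piece.strip('/') for piece in pieces)

line = "1 un + expect + ed_"            # hypothetical Morfessor output line
line = line.split(' ', 1)[1]            # drop the count -> "un + expect + ed_"
chunks = line.split(' + ')              # ['un', 'expect', 'ed_']
full_text = ''.join(chunks).strip('_')  # 'unexpected'

end = join_uri_standin('c', 'en', full_text)
starts = [join_uri_standin('x', 'en', c.strip('_')) for c in chunks if c != '_']
print(end)     # /c/en/unexpected
print(starts)  # ['/x/en/un', '/x/en/expect', '/x/en/ed']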
Example #2
def build_sources(parts_dict, preposition_fix=False):
    """
    Create the 'source' information for an assertion.

    The output is a list of (conjunction, weight) tuples, where 'conjunction'
    is a list of sources that combined to produce this assertion. Later,
    inside the 'make_edge' function, these will be combined into an '/and'
    node.
    """
    activity = parts_dict["activity"]

    creator_node = join_uri(
        '/s/contributor/omcs',
        normalize_text(parts_dict["creator"], lowercase=False)
    )
    activity_node = join_uri('/s/activity/omcs', normalize_text(activity))
    if preposition_fix:
        conjunction = [creator_node, activity_node, '/s/rule/preposition_fix']
    else:
        conjunction = [creator_node, activity_node]
    weighted_sources = [(conjunction, 1)]

    for vote in parts_dict["votes"]:
        username = vote[0]
        vote_int = vote[1]
        conjunction = [
            join_uri('/s/contributor/omcs', username),
            '/s/activity/omcs/vote'
        ]
        weighted_sources.append((conjunction, vote_int))
    return weighted_sources
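To make the return shape concrete, here is a hypothetical parts_dict and, roughly, the (conjunction, weight) list that build_sources would produce for it, assuming normalize_text leaves these already-clean strings unchanged. The contributor names, activity name, and votes are made up.

parts_dict = {
    "creator": "rspeer",                     # hypothetical contributor
    "activity": "omcs1_possibly_free_text",  # hypothetical activity name
    "votes": [("dev", 1), ("somebody", -1)],
}

# Roughly the expected return value, written out by hand:
expected = [
    (['/s/contributor/omcs/rspeer',
      '/s/activity/omcs/omcs1_possibly_free_text'], 1),
    (['/s/contributor/omcs/dev', '/s/activity/omcs/vote'], 1),
    (['/s/contributor/omcs/somebody', '/s/activity/omcs/vote'], -1),
]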
Example #3
def build_sources(parts_dict, preposition_fix=False):
    """
    Create the 'source' information for an assertion.

    The output is a list of dicts, each describing one source that helped
    produce this assertion, with 'contributor' and 'activity' fields, an
    optional 'process' field, and a 'weight'. Later, inside the
    'make_edge' function, these become the edge's sources.
    """
    creator_source = {}
    creator_node = join_uri("/s/contributor/omcs", standardize_username(parts_dict["creator"]))
    creator_source["contributor"] = creator_node

    activity = parts_dict["activity"]
    activity_node = join_uri("/s/activity/omcs", standardize_text(activity))
    creator_source["activity"] = activity_node

    if preposition_fix:
        creator_source["process"] = "/s/process/preposition_fix"
    creator_source["weight"] = 1.0
    sources = [creator_source]

    for vote in parts_dict["votes"]:
        username = vote[0]
        if username == parts_dict["creator"]:
            continue

        vote_int = vote[1]
        vote_source = {
            "contributor": join_uri("/s/contributor/omcs", standardize_username(username)),
            "activity": "/s/activity/omcs/vote",
            "weight": float(vote_int),
        }
        sources.append(vote_source)
    return sources
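For contrast with the tuple-based variant above, this is roughly what the dict-based version returns for the same kind of hypothetical input; note that the creator's own vote is skipped and weights become floats. The values shown are illustrative only.

# Hypothetical result of build_sources({'creator': 'rspeer',
#                                       'activity': 'omcs1_possibly_free_text',
#                                       'votes': [('rspeer', 1), ('dev', 1)]}):
expected = [
    {'contributor': '/s/contributor/omcs/rspeer',
     'activity': '/s/activity/omcs/omcs1_possibly_free_text',
     'weight': 1.0},
    # ('rspeer', 1) is skipped because it matches the creator
    {'contributor': '/s/contributor/omcs/dev',
     'activity': '/s/activity/omcs/vote',
     'weight': 1.0},
]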
Example #4
def build_sources(parts_dict, preposition_fix=False):
    """
    Create the 'source' information for an assertion.

    The output is a list of (conjunction, weight) tuples, where 'conjunction'
    is a list of sources that combined to produce this assertion. Later,
    inside the 'make_edge' function, these will be combined into an '/and'
    node.
    """
    activity = parts_dict["activity"]

    creator_node = join_uri('/s/contributor/omcs',
                            standardize_text(parts_dict["creator"]))
    activity_node = join_uri('/s/activity/omcs', standardize_text(activity))
    if preposition_fix:
        conjunction = [creator_node, activity_node, '/s/rule/preposition_fix']
    else:
        conjunction = [creator_node, activity_node]
    weighted_sources = [(conjunction, 1)]

    for vote in parts_dict["votes"]:
        username = vote[0]
        if username == parts_dict["creator"]:
            continue

        vote_int = vote[1]
        conjunction = [
            join_uri('/s/contributor/omcs', standardize_text(username)),
            '/s/activity/omcs/vote'
        ]
        weighted_sources.append((conjunction, vote_int))
    return weighted_sources
Example #5
def build_relation(parts_dict):
    """
    Update relation names to ConceptNet 5's names. Mostly we preserve the same
    names, but any instance of "ConceptuallyRelatedTo" becomes "RelatedTo".
    Statements with negative polarity get new negative relations.
    """
    relname = parts_dict["relname"]
    polarity = parts_dict["polarity"]
    if relname == 'ConceptuallyRelatedTo':
        relname = 'RelatedTo'

    if polarity > 0:
        relation = join_uri('/r', relname)
    else:
        relation = join_uri('/r', 'Not' + relname)

    return relation
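A self-contained restatement of the mapping above, with plain string concatenation standing in for join_uri, to show how polarity and the ConceptuallyRelatedTo rename interact:

def relation_uri(relname, polarity):
    # Mirror of build_relation, without the conceptnet5 helpers.
    if relname == 'ConceptuallyRelatedTo':
        relname = 'RelatedTo'
    return '/r/' + relname if polarity > 0 else '/r/Not' + relname

print(relation_uri('IsA', 1))                     # /r/IsA
print(relation_uri('IsA', -1))                    # /r/NotIsA
print(relation_uri('ConceptuallyRelatedTo', -1))  # /r/NotRelatedTo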
Example #6
def build_relation(parts_dict):
    """
    Update relation names to ConceptNet 5's names. Mostly we preserve the same
    names, but any instance of "ConceptuallyRelatedTo" becomes "RelatedTo".
    Statements with negative polarity get new negative relations.
    """
    relname = parts_dict["relname"]
    polarity = parts_dict["polarity"]
    if relname == 'ConceptuallyRelatedTo':
        relname = 'RelatedTo'

    if polarity > 0:
        relation = join_uri('/r', relname)
    else:
        relation = join_uri('/r', 'Not' + relname)

    return relation
Example #7
def build_sources(parts_dict, preposition_fix=False):
    """
    Create the 'source' information for an assertion.

    The output is a list of dicts, each describing one source that helped
    produce this assertion, with 'contributor' and 'activity' fields, an
    optional 'process' field, and a 'weight'. Later, inside the
    'make_edge' function, these become the edge's sources.
    """
    creator_source = {}
    creator_node = join_uri('/s/contributor/omcs',
                            standardize_username(parts_dict["creator"]))
    creator_source['contributor'] = creator_node

    activity = parts_dict["activity"]
    activity = '_'.join(simple_tokenize(activity.replace('_', ' ')))
    activity_node = join_uri('/s/activity/omcs', activity)
    creator_source['activity'] = activity_node

    if preposition_fix:
        creator_source['process'] = '/s/process/preposition_fix'
    creator_source['weight'] = 1.
    sources = [creator_source]

    for vote in parts_dict["votes"]:
        username = vote[0]
        if username == parts_dict["creator"]:
            continue

        vote_int = vote[1]
        vote_source = {
            'contributor': join_uri('/s/contributor/omcs',
                                    standardize_username(username)),
            'activity': '/s/activity/omcs/vote',
            'weight': float(vote_int),
        }
        sources.append(vote_source)
    return sources
Example #8
def build_sources(parts_dict, preposition_fix=False):
    """
    Create the 'source' information for an assertion.

    The output is a list of dicts, each describing one source that helped
    produce this assertion, with 'contributor' and 'activity' fields, an
    optional 'process' field, and a 'weight'. Later, inside the
    'make_edge' function, these become the edge's sources.
    """
    creator_source = {}
    creator_node = join_uri(
        '/s/contributor/omcs', standardize_username(parts_dict["creator"])
    )
    creator_source['contributor'] = creator_node

    activity = parts_dict["activity"]
    activity = '_'.join(simple_tokenize(activity.replace('_', ' ')))
    activity_node = join_uri('/s/activity/omcs', activity)
    creator_source['activity'] = activity_node

    if preposition_fix:
        creator_source['process'] = '/s/process/preposition_fix'
    creator_source['weight'] = 1.
    sources = [creator_source]

    for vote in parts_dict["votes"]:
        username = vote[0]
        if username == parts_dict["creator"]:
            continue

        vote_int = vote[1]
        vote_source = {
            'contributor': join_uri(
                '/s/contributor/omcs', standardize_username(username)
            ),
            'activity': '/s/activity/omcs/vote',
            'weight': float(vote_int),
        }
        sources.append(vote_source)
    return sources
Example #9
    def lemmatize_uri(self, uri):
        pieces = split_uri(uri)
        if len(pieces) < 3:
            return uri
        language = pieces[1]
        text = pieces[2]
        rest = pieces[3:]
        if rest:
            pos = rest[0]
        else:
            pos = None

        root, _form = self.lookup(language, text, pos)
        return join_uri('c', language, root, *rest)
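A sketch of the URI handling above, with a hypothetical lookup table standing in for self.lookup and plain string splitting standing in for split_uri/join_uri; it assumes the usual '/c/<lang>/<text>[/<pos>/...]' URI shape.

LEMMAS = {('en', 'dogs', 'n'): ('dog', 's')}   # hypothetical lookup data

def lemmatize_uri_standin(uri):
    pieces = uri.lstrip('/').split('/')        # e.g. ['c', 'en', 'dogs', 'n']
    if len(pieces) < 3:
        return uri
    language, text, rest = pieces[1], pieces[2], pieces[3:]
    pos = rest[0] if rest else None
    root, _form = LEMMAS.get((language, text, pos), (text, ''))
    return '/' + '/'.join(['c', language, root] + rest)

print(lemmatize_uri_standin('/c/en/dogs/n'))   # /c/en/dog/n
print(lemmatize_uri_standin('/c/en/cat'))      # /c/en/cat (no lemma entry)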
Example #10
    def lemmatize_uri(self, uri):
        pieces = split_uri(uri)
        if len(pieces) < 3:
            return uri
        language = pieces[1]
        text = pieces[2]
        rest = pieces[3:]
        if rest:
            pos = rest[0]
        else:
            pos = None

        root, _form = self.lookup(language, text, pos)
        return join_uri('c', language, root, *rest)
Example #11
def reduce_concept(concept):
    """
    Remove the part of speech and disambiguation (if present) from a concept,
    leaving a potentially ambiguous concept that can be matched against surface
    text.

    Additionally, remove the region tag from Chinese assertions, so they are
    considered simply as assertions about Chinese regardless of whether it is
    Traditional or Simplified Chinese. In the cases where they overlap, this
    helps to make the information more complete.

    >>> reduce_concept('/c/en/cat/n/feline')
    '/c/en/cat'
    >>> reduce_concept('/c/zh_TW/良好')
    '/c/zh/良好'
    """
    parts = split_uri(concept)
    # Unify simplified and traditional Chinese in associations.
    if parts[1] == 'zh_CN' or parts[1] == 'zh_TW':
        parts[1] = 'zh'
    return join_uri(*parts[:3])
Example #12
def reduce_concept(concept):
    """
    Remove the part of speech and disambiguation (if present) from a concept,
    leaving a potentially ambiguous concept that can be matched against surface
    text.

    Additionally, remove the region tag from Chinese assertions, so they are
    considered simply as assertions about Chinese regardless of whether it is
    Traditional or Simplified Chinese. In the cases where they overlap, this
    helps to make the information more complete.

    >>> reduce_concept('/c/en/cat/n/feline')
    '/c/en/cat'
    >>> reduce_concept('/c/zh_TW/良好')
    '/c/zh/良好'
    """
    parts = split_uri(concept)
    # Unify simplified and traditional Chinese in associations.
    if parts[1] == 'zh_CN' or parts[1] == 'zh_TW':
        parts[1] = 'zh'
    return join_uri(*parts[:3])
Example #13
def reduce_concept(concept):
    """
    Remove the part of speech and disambiguation (if present) from a concept,
    leaving a potentially ambiguous concept that can be matched against surface
    text.

    Additionally, simplify language tags to a bare language. The main purpose
    is to remove the region tag from Chinese assertions, so they are considered
    simply as assertions about Chinese regardless of whether it is Traditional
    or Simplified Chinese. In the cases where they overlap, this helps to make
    the information more complete.

    >>> reduce_concept('/c/en/cat/n/feline')
    '/c/en/cat'
    >>> reduce_concept('/c/zh_TW/良好')
    '/c/zh/良好'
    """
    parts = split_uri(concept)
    langtag = parts[1]
    if langtag != '[':
        langcode = langcodes.get(langtag).language
        if langcode:
            parts[1] = langcode
    return join_uri(*parts[:3])
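The langcodes call above reduces a full language tag to its bare language subtag. A quick illustration, assuming the `langcodes` package is installed and accepts the underscore form just as the code above passes it:

import langcodes

print(langcodes.get('zh_TW').language)   # 'zh'
print(langcodes.get('en').language)      # 'en'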
Example #14
def reduce_concept(concept):
    """
    Remove the part of speech and disambiguation (if present) from a concept,
    leaving a potentially ambiguous concept that can be matched against surface
    text.

    Additionally, simplify language tags to a bare language. The main purpose
    is to remove the region tag from Chinese assertions, so they are considered
    simply as assertions about Chinese regardless of whether it is Traditional
    or Simplified Chinese. In the cases where they overlap, this helps to make
    the information more complete.

    >>> reduce_concept('/c/en/cat/n/feline')
    '/c/en/cat'
    >>> reduce_concept('/c/zh_TW/良好')
    '/c/zh/良好'
    """
    parts = split_uri(concept)
    langtag = parts[1]
    if langtag != '[':
        langcode = langcodes.get(langtag).language
        if langcode:
            parts[1] = langcode
    return join_uri(*parts[:3])
Example #15
def build_data_set(parts_dict):
    lang = parts_dict['lang']
    dataset = join_uri('/d/conceptnet/4', lang)
    return dataset
Example #16
def msgpack_to_assoc(input_filename, output_filename):
    """
    Convert a msgpack stream to a tab-separated "CSV" of concept-to-concept
    associations.

    As a special case, we convert some "Desires" and "NotDesires" relations
    to "HasProperty" relations, so that:

    - An assertion that means "People want X" in English or Chinese is converted
      to an association meaning "X is good"
    - An assertion that "People don't want X" is converted to an association
      meaning "X is bad"

    The result is used to build machine-learning models that recognize
    semantic similarities between words, and particularly the ConceptNet
    Numberbatch embedding space.
    """
    with open(output_filename, 'w', encoding='utf-8') as out_stream:
        weight_by_dataset = defaultdict(float)
        count_by_dataset = defaultdict(int)
        prefixed = set()
        for info in read_msgpack_stream(input_filename):
            start_uri = info['start']
            end_uri = info['end']
            if not (get_uri_language(start_uri) in COMMON_LANGUAGES
                    and get_uri_language(end_uri) in COMMON_LANGUAGES):
                continue
            rel = info['rel']
            weight = info['weight']
            dataset = info['dataset']

            for uri in (start_uri, end_uri):
                pieces = split_uri(uri)
                if len(pieces) > 3 and (uri, dataset) not in prefixed:
                    prefix = join_uri(*pieces[:3])
                    prefixed.add((uri, dataset))
                    line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                        start=uri,
                        end=prefix,
                        weight=1.,
                        dataset=dataset,
                        rel='/r/SenseOf')
                    weight_by_dataset[dataset] += 1.
                    count_by_dataset[dataset] += 1
                    print(line, file=out_stream)

            if start_uri == '/c/en/person' or start_uri == '/c/en/people':
                if rel == '/r/Desires':
                    pairs = [('/c/en/good', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/en/bad', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            elif start_uri == '/c/zh/人':
                if rel == '/r/Desires':
                    pairs = [('/c/zh/良好', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/zh/不良', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            else:
                pairs = [(start_uri, end_uri)]

            for (start, end) in pairs:
                line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                    start=start,
                    end=end,
                    weight=weight,
                    dataset=dataset,
                    rel=rel)
                weight_by_dataset[dataset] += weight
                count_by_dataset[dataset] += 1
                print(line, file=out_stream)

        avg_weight_by_dataset = {
            dataset: weight_by_dataset[dataset] / count_by_dataset[dataset]
            for dataset in count_by_dataset
        }
        print("Average weights:")
        print(avg_weight_by_dataset)
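The rows written to out_stream are plain tab-separated text. A sketch with two hypothetical lines, one '/r/SenseOf' row linking a disambiguated term to its prefix and one ordinary association row, parsed back apart for illustration:

rows = [
    "/c/en/cat/n/feline\t/c/en/cat\t1.0\t/d/wordnet/3.0\t/r/SenseOf",
    "/c/en/cat\t/c/en/animal\t2.0\t/d/wordnet/3.0\t/r/IsA",
]
for row in rows:
    start, end, weight, dataset, rel = row.split('\t')
    print(start, end, float(weight), dataset, rel)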
Example #17
def run_wordnet(input_dir, output_file, sw_map_file):
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    synset_senses = defaultdict(list)
    sense_synsets = {}

    labels = {}
    glossary = {}
    concept_map = {}
    sense_to_synset = {}

    # Parse lines such as:
    #   wn30:synset-Aeolian-noun-2 rdfs:label "Aeolian"@en-us .
    for subj, rel, obj, objtag in reader.parse_file(os.path.join(input_dir, 'wordnet-synset.ttl')):
        if resource_name(rel) == 'label':
            # Everything in WordNet is in English
            assert objtag == 'en'
            labels[subj] = obj

    for subj, rel, obj, objtag in reader.parse_file(os.path.join(input_dir, 'wordnet-glossary.ttl')):
        if resource_name(rel) == 'gloss':
            assert objtag == 'en'

            # Take the definition up to the first semicolon
            text = obj.split(';')[0]

            # Remove introductory phrases with a colon
            text = text.split(': ', 1)[-1]

            # Remove parenthesized expressions
            while True:
                newtext = re.sub(r'\(.+?\) ?', '', text).strip()
                if newtext == text or newtext == '':
                    break
                else:
                    text = newtext

            glossary[subj] = text.replace('/', '_')

    # Get the list of word senses in each synset, and make a bidirectional mapping.
    #
    # Example line:
    #   wn30:synset-Aeolian-noun-2 wn20schema:containsWordSense wn30:wordsense-Aeolian-noun-2 .
    for subj, rel, obj, objtag in reader.parse_file(os.path.join(input_dir, 'full/wordnet-wordsense-synset-relations.ttl')):
        if resource_name(rel) == 'containsWordSense':
            synset_senses[subj].append(obj)
            sense_synsets[obj] = subj

    # Assign every synset to a disambiguated concept.
    for synset in synset_senses:
        synset_name = labels[synset]
        synset_pos = synset.split('-')[-2]
        pos = PARTS_OF_SPEECH[synset_pos]
        disambig = glossary[synset]

        concept = standardized_concept_uri('en', synset_name, pos, disambig)
        concept_map[synset] = concept

    # Map senses to their synsets.
    for sense, synset in sense_synsets.items():
        sense_to_synset[sense] = synset

    for filename in (
        'wordnet-attribute.ttl', 'wordnet-causes.ttl',
        'wordnet-classifiedby.ttl', 'wordnet-entailment.ttl',
        'wordnet-hyponym.ttl', 'wordnet-instances.ttl',
        'wordnet-membermeronym.ttl', 'wordnet-partmeronym.ttl',
        'wordnet-sameverbgroupas.ttl', 'wordnet-similarity.ttl',
        'wordnet-substancemeronym.ttl', 'full/wordnet-antonym.ttl',
        'full/wordnet-derivationallyrelated.ttl',
        'full/wordnet-participleof.ttl',
        'full/wordnet-pertainsto.ttl',
        'full/wordnet-seealso.ttl'
    ):
        filepath = os.path.join(input_dir, filename)
        if os.path.exists(filepath):
            for web_subj, web_rel, web_obj, objtag in reader.parse_file(filepath):
                # If this relation involves word senses, map them to their synsets
                # first.
                if web_subj in sense_to_synset:
                    web_subj = sense_to_synset[web_subj]
                if web_obj in sense_to_synset:
                    web_obj = sense_to_synset[web_obj]
                subj = concept_map[web_subj]
                obj = concept_map[web_obj]
                pred_label = resource_name(web_rel)
                if pred_label in REL_MAPPING:
                    mapped_rel = REL_MAPPING[pred_label]

                    # Handle WordNet relations that are the reverse of ConceptNet
                    # relations. Change the word 'meronym' to 'holonym' if
                    # necessary.
                    if mapped_rel.startswith('~'):
                        subj, obj = obj, subj
                        web_subj, web_obj = web_obj, web_subj
                        web_rel = web_rel.replace('meronym', 'holonym')
                        mapped_rel = mapped_rel[1:]
                    rel = join_uri('r', mapped_rel)
                else:
                    rel = join_uri('r', 'wordnet', pred_label)

                map_out.write_link(web_rel, full_conceptnet_url(rel))
                map_out.write_link(web_subj, full_conceptnet_url(subj))
                map_out.write_link(web_obj, full_conceptnet_url(obj))
                edge = make_edge(
                    rel, subj, obj, dataset='/d/wordnet/3.0',
                    license='/l/CC/By', sources=SOURCE, weight=2.0
                )
                out.write(edge)
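A small sketch of the '~' reversal convention handled above, using a hypothetical two-entry mapping in place of the real REL_MAPPING (which is defined elsewhere in the module) and plain string concatenation in place of join_uri:

REL_SKETCH = {'forwardRel': 'IsA', 'reversedRel': '~PartOf'}   # made-up entries

def map_relation(pred_label, subj, obj):
    mapped_rel = REL_SKETCH[pred_label]
    if mapped_rel.startswith('~'):     # reversed relation: swap the endpoints
        subj, obj = obj, subj
        mapped_rel = mapped_rel[1:]
    return '/r/' + mapped_rel, subj, obj

print(map_relation('forwardRel', '/c/en/cat', '/c/en/animal'))
# ('/r/IsA', '/c/en/cat', '/c/en/animal')
print(map_relation('reversedRel', '/c/en/forest', '/c/en/tree'))
# ('/r/PartOf', '/c/en/tree', '/c/en/forest')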
Example #18
def build_data_set(parts_dict):
    lang = parts_dict["lang"]
    dataset = join_uri("/d/conceptnet/4", lang)
    return dataset
Example #19
def msgpack_to_assoc(input_filename, output_filename):
    """
    Convert a msgpack stream to a tab-separated "CSV" of concept-to-concept
    associations.

    The relation is mostly ignored, except:

    - An assertion that means "People want X" in English or Chinese is converted to
      an association between X and "good"
    - An assertion that "People don't want X" is converted to an association
      between X and "bad"

    The result can be used to predict word associations from ConceptNet via
    dimensionality reduction, as in the `assoc_space` package.

    FIXME: the above is out of date; we now use conceptnet5.vectors.

    The relation is mostly ignored because we have not yet found a good way to
    take the relation into account in dimensionality reduction.
    """
    with open(output_filename, 'w', encoding='utf-8') as out_stream:
        weight_by_dataset = defaultdict(float)
        count_by_dataset = defaultdict(int)
        prefixed = set()
        for info in read_msgpack_stream(input_filename):
            start_uri = info['start']
            end_uri = info['end']
            if not (
                get_uri_language(start_uri) in COMMON_LANGUAGES and
                get_uri_language(end_uri) in COMMON_LANGUAGES
            ):
                continue
            rel = info['rel']
            weight = info['weight']
            dataset = info['dataset']

            pairs = []
            for uri in (start_uri, end_uri):
                pieces = split_uri(uri)
                if len(pieces) > 3 and (uri, dataset) not in prefixed:
                    prefix = join_uri(*pieces[:3])
                    prefixed.add((uri, dataset))
                    line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                        start=uri,
                        end=prefix,
                        weight=1.,
                        dataset=dataset,
                        rel='/r/SenseOf'
                    )
                    weight_by_dataset[dataset] += 1.
                    count_by_dataset[dataset] += 1
                    print(line, file=out_stream)

            if start_uri == '/c/en/person':
                if rel == '/r/Desires':
                    pairs = [('/c/en/good', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/en/bad', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            elif start_uri == '/c/zh/人':
                if rel == '/r/Desires':
                    pairs = [('/c/zh/良好', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/zh/不良', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            else:
                pairs = [(start_uri, end_uri)]

            for (start, end) in pairs:
                line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                    start=start, end=end, weight=weight, dataset=dataset,
                    rel=rel
                )
                weight_by_dataset[dataset] += weight
                count_by_dataset[dataset] += 1
                print(line, file=out_stream)

        avg_weight_by_dataset = {
            dataset: weight_by_dataset[dataset] / count_by_dataset[dataset]
            for dataset in count_by_dataset
        }
        print("Average weights:")
        print(avg_weight_by_dataset)
Example #20
def msgpack_to_assoc(input_filename, output_filename):
    """
    Convert a msgpack stream to a tab-separated "CSV" of concept-to-concept
    associations.

    The relation is mostly ignored, except:

    - An assertion that means "People want X" in English or Chinese is converted to
      an association between X and "good"
    - An assertion that "People don't want X" is converted to an association
      between X and "bad"

    The result can be used to predict word associations from ConceptNet via
    dimensionality reduction, as in the `assoc_space` package.

    The relation is mostly ignored because we have not yet found a good way to
    take the relation into account in dimensionality reduction.
    """
    with open(output_filename, 'w', encoding='utf-8') as out_stream:
        weight_by_dataset = defaultdict(float)
        count_by_dataset = defaultdict(int)
        prefixed = set()
        for info in read_msgpack_stream(input_filename):
            start_uri = info['start']
            end_uri = info['end']
            if not (
                start_uri.startswith('/c/') and end_uri.startswith('/c/') and
                get_uri_language(start_uri) in COMMON_LANGUAGES and
                get_uri_language(end_uri) in COMMON_LANGUAGES
            ):
                continue
            rel = info['rel']
            weight = info['weight']
            dataset = info['dataset']

            pairs = []
            for uri in (start_uri, end_uri):
                pieces = split_uri(uri)
                if len(pieces) > 3 and (uri, dataset) not in prefixed:
                    prefix = join_uri(*pieces[:3])
                    prefixed.add((uri, dataset))
                    line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                        start=uri,
                        end=prefix,
                        weight=1.,
                        dataset=dataset,
                        rel='/r/SenseOf'
                    )
                    weight_by_dataset[dataset] += 1.
                    count_by_dataset[dataset] += 1
                    print(line, file=out_stream)

            if start_uri == '/c/en/person':
                if rel == '/r/Desires':
                    pairs = [('/c/en/good', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/en/bad', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            elif start_uri == '/c/zh/人':
                if rel == '/r/Desires':
                    pairs = [('/c/zh/良好', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/zh/不良', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            else:
                pairs = [(start_uri, end_uri)]

            for (start, end) in pairs:
                line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                    start=start, end=end, weight=weight, dataset=dataset,
                    rel=rel
                )
                weight_by_dataset[dataset] += weight
                count_by_dataset[dataset] += 1
                print(line, file=out_stream)

        avg_weight_by_dataset = {
            dataset: weight_by_dataset[dataset] / count_by_dataset[dataset]
            for dataset in count_by_dataset
        }
        print("Average weights:")
        print(avg_weight_by_dataset)
Example #21
def run_wordnet(input_dir, output_file, sw_map_file):
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    synset_senses = defaultdict(list)
    sense_synsets = {}

    labels = {}
    glossary = {}
    concept_map = {}
    sense_to_synset = {}

    # Parse lines such as:
    #   wn30:synset-Aeolian-noun-2 rdfs:label "Aeolian"@en-us .
    for subj, rel, obj, objtag in reader.parse_file(
            os.path.join(input_dir, 'wordnet-synset.ttl')):
        if resource_name(rel) == 'label':
            # Everything in WordNet is in English
            assert objtag == 'en'
            labels[subj] = obj

    for subj, rel, obj, objtag in reader.parse_file(
            os.path.join(input_dir, 'wordnet-glossary.ttl')):
        if resource_name(rel) == 'gloss':
            assert objtag == 'en'

            # Take the definition up to the first semicolon
            text = obj.split(';')[0]

            # Remove introductory phrases with a colon
            text = text.split(': ', 1)[-1]

            # Remove parenthesized expressions
            while True:
                newtext = re.sub(r'\(.+?\) ?', '', text).strip()
                if newtext == text or newtext == '':
                    break
                else:
                    text = newtext

            glossary[subj] = text.replace('/', '_')

    # Get the list of word senses in each synset, and make a bidirectional mapping.
    #
    # Example line:
    #   wn30:synset-Aeolian-noun-2 wn20schema:containsWordSense wn30:wordsense-Aeolian-noun-2 .
    for subj, rel, obj, objtag in reader.parse_file(
            os.path.join(input_dir,
                         'full/wordnet-wordsense-synset-relations.ttl')):
        if resource_name(rel) == 'containsWordSense':
            synset_senses[subj].append(obj)
            sense_synsets[obj] = subj

    # Assign every synset to a disambiguated concept.
    for synset in synset_senses:
        synset_name = labels[synset]
        synset_pos = synset.split('-')[-2]
        pos = PARTS_OF_SPEECH[synset_pos]
        disambig = glossary[synset]

        concept = standardized_concept_uri('en', synset_name, pos, disambig)
        concept_map[synset] = concept

    # Map senses to their synsets.
    for sense, synset in sense_synsets.items():
        sense_to_synset[sense] = synset

    for filename in ('wordnet-attribute.ttl', 'wordnet-causes.ttl',
                     'wordnet-classifiedby.ttl', 'wordnet-entailment.ttl',
                     'wordnet-hyponym.ttl', 'wordnet-instances.ttl',
                     'wordnet-membermeronym.ttl', 'wordnet-partmeronym.ttl',
                     'wordnet-sameverbgroupas.ttl', 'wordnet-similarity.ttl',
                     'wordnet-substancemeronym.ttl',
                     'full/wordnet-antonym.ttl',
                     'full/wordnet-derivationallyrelated.ttl',
                     'full/wordnet-participleof.ttl',
                     'full/wordnet-pertainsto.ttl',
                     'full/wordnet-seealso.ttl'):
        filepath = os.path.join(input_dir, filename)
        if os.path.exists(filepath):
            for web_subj, web_rel, web_obj, objtag in reader.parse_file(
                    filepath):
                # If this relation involves word senses, map them to their synsets
                # first.
                if web_subj in sense_to_synset:
                    web_subj = sense_to_synset[web_subj]
                if web_obj in sense_to_synset:
                    web_obj = sense_to_synset[web_obj]
                subj = concept_map[web_subj]
                obj = concept_map[web_obj]
                pred_label = resource_name(web_rel)
                if pred_label in REL_MAPPING:
                    mapped_rel = REL_MAPPING[pred_label]

                    # Handle WordNet relations that are the reverse of ConceptNet
                    # relations. Change the word 'meronym' to 'holonym' if
                    # necessary.
                    if mapped_rel.startswith('~'):
                        subj, obj = obj, subj
                        web_subj, web_obj = web_obj, web_subj
                        web_rel = web_rel.replace('meronym', 'holonym')
                        mapped_rel = mapped_rel[1:]
                    rel = join_uri('r', mapped_rel)
                else:
                    rel = join_uri('r', 'wordnet', pred_label)

                map_out.write_link(web_rel, full_conceptnet_url(rel))
                map_out.write_link(web_subj, full_conceptnet_url(subj))
                map_out.write_link(web_obj, full_conceptnet_url(obj))
                edge = make_edge(rel,
                                 subj,
                                 obj,
                                 dataset='/d/wordnet/3.0',
                                 license='/l/CC/By',
                                 sources=SOURCE,
                                 weight=2.0)
                out.write(edge)
Example #22
def build_data_set(parts_dict):
    lang = parts_dict['lang']
    dataset = join_uri('/d/conceptnet/4', lang)
    return dataset
Example #23
def msgpack_to_assoc(input_filename, output_filename):
    """
    Convert a msgpack stream to a tab-separated "CSV" of concept-to-concept
    associations.

    As a special case, we convert some "Desires" and "NotDesires" relations
    to "HasProperty" relations, so that:

    - An assertion that means "People want X" in English or Chinese is converted
      to an association meaning "X is good"
    - An assertion that "People don't want X" is converted to an association
      meaning "X is bad"

    The result is used to build machine-learning models that recognize
    semantic similarities between words, and particularly the ConceptNet
    Numberbatch embedding space.
    """
    with open(output_filename, 'w', encoding='utf-8') as out_stream:
        weight_by_dataset = defaultdict(float)
        count_by_dataset = defaultdict(int)
        prefixed = set()
        for info in read_msgpack_stream(input_filename):
            start_uri = info['start']
            end_uri = info['end']
            if not (
                get_uri_language(start_uri) in COMMON_LANGUAGES
                and get_uri_language(end_uri) in COMMON_LANGUAGES
            ):
                continue
            rel = info['rel']
            weight = info['weight']
            dataset = info['dataset']

            for uri in (start_uri, end_uri):
                pieces = split_uri(uri)
                if len(pieces) > 3 and (uri, dataset) not in prefixed:
                    prefix = join_uri(*pieces[:3])
                    prefixed.add((uri, dataset))
                    line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                        start=uri,
                        end=prefix,
                        weight=1.,
                        dataset=dataset,
                        rel='/r/SenseOf',
                    )
                    weight_by_dataset[dataset] += 1.
                    count_by_dataset[dataset] += 1
                    print(line, file=out_stream)

            if start_uri == '/c/en/person' or start_uri == '/c/en/people':
                if rel == '/r/Desires':
                    pairs = [('/c/en/good', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/en/bad', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            elif start_uri == '/c/zh/人':
                if rel == '/r/Desires':
                    pairs = [('/c/zh/良好', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/zh/不良', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            else:
                pairs = [(start_uri, end_uri)]

            for (start, end) in pairs:
                line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                    start=start, end=end, weight=weight, dataset=dataset, rel=rel
                )
                weight_by_dataset[dataset] += weight
                count_by_dataset[dataset] += 1
                print(line, file=out_stream)

        avg_weight_by_dataset = {
            dataset: weight_by_dataset[dataset] / count_by_dataset[dataset]
            for dataset in count_by_dataset
        }
        print("Average weights:")
        print(avg_weight_by_dataset)