Example #1
def read_edges(root):
    """
    Read all csv files (assumed to be ConceptNet edge files) under the given 
    root (e.g. $CONCEPTNET_DATA/edges or a single edge file) and generate one 
    five-tuple for every edge they contain, consisting of the relation, the 
    (uri-prefixes of the) left and right endpoints, the dataset, and the data 
    sources (as a string).
    """
    if not os.path.isdir(root):
        files = [root]
    else:
        files = []
        for root_dir, _, filenames in os.walk(root):
            for filename in filenames:
                if filename.lower().endswith('.csv'):
                    path = os.path.join(root_dir, filename)
                    files.append(path)
    for path in files:
        with open(path, 'rt', encoding='utf-8') as fp:
            for line in fp:
                _, rel, left, right, json_data = line.split('\t')
                left = uri_prefix(left)
                right = uri_prefix(right)
                data = json.loads(json_data)
                dataset = data['dataset']
                source = data['sources']
                yield (rel, left, right, dataset, source)
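A minimal usage sketch, assuming `read_edges` (and the helpers it calls) are importable from the surrounding ConceptNet build code; the edge directory path is hypothetical:

# Hypothetical usage: tally how many edges each relation contributes.
from collections import Counter

def count_relations(edge_root):
    relation_counts = Counter()
    for rel, left, right, dataset, source in read_edges(edge_root):
        relation_counts[rel] += 1
    return relation_counts

# Example: count_relations('/path/to/conceptnet/edges')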
Example #2
def make_conceptnet_association_graph(filename,
                                      save_edge_list=True,
                                      concept_filter=None,
                                      bad_concept=concept_is_bad,
                                      bad_relation=is_negative_relation):
    """
    Reads an association file and builds an (undirected) graph from it.
    """
    graph = ConceptNetAssociationGraph(save_edge_list)
    if concept_filter is None:
        concept_filter = lambda concept: True
    if bad_concept is None:
        bad_concept = lambda concept: False
    if bad_relation is None:
        bad_relation = lambda rel: False

    with open(filename, encoding='utf-8') as file:
        for line in file:
            left, right, value, dataset, rel = line.rstrip().split('\t', 4)
            if bad_concept(left) or bad_concept(right) or bad_relation(rel):
                continue
            fvalue = float(value)
            gleft = uri_prefix(left)
            gright = uri_prefix(right)
            if concept_filter(gleft) and concept_filter(gright) \
               and fvalue != 0 and gleft != gright:
                graph.add_edge(gleft, gright, value, dataset, rel)
    return graph
def make_filtered_concepts(filename, cutoff=3, en_cutoff=3):
    """
    Takes in a file of tab-separated associations, and returns a set of
    concepts from which those which are unlikely to be useful have been
    removed.

    All concepts that occur fewer than `cutoff` times will be removed.
    All English concepts that occur fewer than `en_cutoff` times will be removed.
    """
    counts = defaultdict(int)
    with open(filename, encoding='utf-8') as file:
        for line in file:
            left, right, _value, _dataset, rel = line.rstrip().split('\t')
            if rel == '/r/SenseOf':
                pass
            else:
                gleft = uri_prefix(left)
                gright = uri_prefix(right)
                if is_concept(gright):
                    counts[gleft] += 1
                if is_concept(gleft):
                    counts[gright] += 1

    filtered_concepts = {
        concept
        for (concept, count) in counts.items()
        if (count >= en_cutoff or (not is_concept(concept) and count >= cutoff))
    }
    return filtered_concepts
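The filtering rule can be exercised in isolation. A small sketch with made-up counts and a stand-in `is_concept` that only checks for the '/c/' prefix (an assumption, not the library's implementation):

def demo_filter(counts, cutoff=3, en_cutoff=3):
    def is_concept(uri):
        # Stand-in for conceptnet5's is_concept(): treat '/c/...' URIs as concepts.
        return uri.startswith('/c/')
    return {
        concept
        for concept, count in counts.items()
        if count >= en_cutoff or (not is_concept(concept) and count >= cutoff)
    }

toy_counts = {'/c/en/dog': 5, '/c/en/rare_term': 1, '/x/other': 4}
# '/c/en/dog' survives (5 >= en_cutoff); '/c/en/rare_term' is dropped;
# '/x/other' survives because it is not a concept and 4 >= cutoff.
print(demo_filter(toy_counts))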
Example #4
def make_filtered_concepts(filename, cutoff=3, en_cutoff=3):
    """
    Takes in a file of tab-separated associations, and returns a set of 
    concepts from which those which are unlikely to be useful have been 
    removed. 
    
    All concepts that occur fewer than `cutoff` times will be removed.
    All English concepts that occur fewer than `en_cutoff` times will be removed.
    """
    counts = defaultdict(int)
    with open(filename, encoding='utf-8') as file:
        for line in file:
            left, right, _value, _dataset, rel = line.rstrip().split('\t')
            if rel == '/r/SenseOf':
                pass
            else:
                gleft = uri_prefix(left)
                gright = uri_prefix(right)
                if is_concept(gright):
                    counts[gleft] += 1
                if is_concept(gleft):
                    counts[gright] += 1

    filtered_concepts = {
        concept
        for (concept, count) in counts.items()
        if (count >= en_cutoff or (not is_concept(concept) and count >= cutoff))
    }
    return filtered_concepts
Example #5
def standardize_row_labels(frame, language='en', forms=True):
    """
    Convert a frame whose row labels are bare English terms (e.g. of the
    form 'en/term') to one whose row labels are standardized ConceptNet URIs
    (e.g. of the form '/c/en/term'; and with some extra word2vec-style
    normalization of digits). Rows whose labels get the same standardized
    URI get combined, with earlier rows given more weight.

    Args:
        frame (DataFrame): DataFrame of term vectors, indexed by terms.
        language (str, default='en'): Use this language for labels that aren't
            already standardized.
        forms (bool, default=True): Combine terms with the same lemma.
    """
    # Re-label the DataFrame with standardized, non-unique row labels.
    # (The check below used to be `'/' in label`, which was a bug: it matched
    # labels like '/c/en/term' and partitioned them into ('', '/', 'c/en/term'),
    # yielding broken labels like '/c//en_term'. The current check matches only
    # labels like 'en/term', which partition into ('en', '/', 'term') and
    # become '/c/en/term'.)
    if all(label.count('/') == 1 for label in frame.index[10:20]):
        tuples = [label.partition('/') for label in frame.index]
        frame.index = [
            uri_prefix(standardized_uri(language, text))
            for language, _slash, text in tuples
        ]

    # `language` argument is only used here for labels that aren't already standardized
    frame.index = [
        uri_prefix(standardized_uri(language, label)) for label in frame.index
    ]

    # Assign row n a weight of 1/(n+1) for weighted averaging
    nrows = frame.shape[0]
    weights = 1.0 / np.arange(
        1, nrows + 1)  # "with earlier rows given more weight"
    label_weights = pd.Series(weights, index=frame.index)

    # groupby(level=0).sum() means to add rows that have the same label
    relabeled = frame.mul(weights,
                          axis='rows').sort_index().groupby(level=0).sum()
    combined_weights = label_weights.sort_index().groupby(level=0).sum()

    # Optionally adjust words to be more like their word forms
    if forms:
        for label in relabeled.index:
            lemmatized = lemmatize_uri(label)
            if lemmatized != label and lemmatized in relabeled.index:
                relabeled.loc[lemmatized] += relabeled.loc[label] / 2
                combined_weights.loc[
                    lemmatized] += combined_weights.loc[label] / 2

    scaled = relabeled.div(combined_weights, axis='rows')

    # Rearrange the items in descending order of weight, similar to the order
    # we get them in from word2vec and GloVe
    combined_weights.sort_values(inplace=True, ascending=False)
    result = scaled.loc[combined_weights.index]
    return result
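Why the 1/(n+1) weights and the groupby sums produce a weighted average can be seen on a toy frame. This is a self-contained sketch with made-up labels rather than ConceptNet URIs:

import numpy as np
import pandas as pd

frame = pd.DataFrame([[1.0, 0.0], [0.0, 1.0], [2.0, 2.0]],
                     index=['cat', 'dog', 'cat'])
weights = 1.0 / np.arange(1, len(frame) + 1)          # [1.0, 0.5, 0.333...]
label_weights = pd.Series(weights, index=frame.index)

relabeled = frame.mul(weights, axis='rows').sort_index().groupby(level=0).sum()
combined_weights = label_weights.sort_index().groupby(level=0).sum()
averaged = relabeled.div(combined_weights, axis='rows')

# 'cat' is the weighted average of rows 0 and 2, with row 0 counting three
# times as much as row 2 (weights 1.0 vs. 1/3).
print(averaged)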
def combine_assertions(input_filename, core_filename, output_filename):
    """
    Take in a tab-separated, sorted "CSV" file, indicated by
    `input_filename`, whose rows should be grouped together into assertions.
    Output a msgpack stream of assertions to the file indicated by
    `output_filename`.

    The input file should be made from multiple sources of assertions by
    concatenating and sorting them.

    The combined assertions will all have the dataset of the first edge that
    produces them, and the license of the strongest license being combined.

    This process requires its input to be a sorted CSV so that all edges for
    the same assertion will appear consecutively.
    """

    def group_func(line):
        "Group lines by their URI (their first column)."
        return line.split('\t', 1)[0]

    out = MsgpackStreamWriter(output_filename)
    out_bad = MsgpackStreamWriter(output_filename + '.reject')

    core_prefixes = set()
    for line in open(core_filename, encoding='utf-8'):
        core_prefixes.add(uri_prefix(line.strip(), 3))

    # Scan through the assertions twice to add derived words to the blocklist
    blocklist = Blocklist.load(get_support_data_filename(BLOCK_FILENAME))
    for _ in range(2):
        with open(input_filename, encoding='utf-8') as stream:
            for line in stream:
                tmp_assertion = _make_assertion([line.strip()])
                if tmp_assertion is None:
                    continue
                blocklist.propagate_blocks(tmp_assertion)

    with open(input_filename, encoding='utf-8') as stream:
        for key, line_group in itertools.groupby(stream, group_func):
            assertion = _make_assertion(line_group)
            destination = out
            if assertion is None:
                continue
            if assertion['weight'] <= 0:
                destination = out_bad
            if blocklist.is_blocked(assertion):
                destination = out_bad
            if assertion['rel'] == 'ExternalURL':
                # discard ExternalURL edges for things that aren't otherwise
                # in ConceptNet
                prefix = uri_prefix(assertion['start'], 3)
                if prefix not in core_prefixes:
                    destination = out_bad
            destination.write(assertion)

    out.close()
    out_bad.close()
def make_assertion(line_group):
    lines = [line.rstrip() for line in line_group]
    lines = [line for line in lines if line]
    if not lines:
        return None

    # FIXME: the steps leading up to this produce URIs that can differ based
    # on word senses. These don't get merged together, but they should.
    uri, rel, start, end, _ = lines[0].split('\t')

    # We can't distinguish word senses well enough yet, so only keep them
    # up to the part of speech
    start = uri_prefix(start, 4)
    end = uri_prefix(end, 4)

    if not (keep_concept(start) and keep_concept(end)):
        return None

    info_dicts = [json.loads(line.split('\t')[4]) for line in lines]
    unscaled_weight = sum(info['weight'] for info in info_dicts)
    licenses = {info['license'] for info in info_dicts}
    dataset = info_dicts[0]['dataset']
    surface_text = None
    sources = []
    seen_sources = set()
    for info in info_dicts:
        if surface_text is None and 'surfaceText' in info:
            surface_text = info['surfaceText']
        for subsource in info['sources']:
            conjunction = conjunction_uri(*sorted(subsource.values()))
            if conjunction not in seen_sources:
                sources.append(subsource)
                seen_sources.add(conjunction)

    weight = weight_scale(unscaled_weight)
    if Licenses.cc_sharealike in licenses:
        license = Licenses.cc_sharealike
    else:
        license = Licenses.cc_attribution

    return make_edge(rel=rel,
                     start=start,
                     end=end,
                     weight=weight,
                     dataset=dataset,
                     license=license,
                     sources=sources,
                     surfaceText=surface_text)
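The source-deduplication step is independent of ConceptNet's helpers and can be sketched with plain dictionaries, using a sorted tuple of sub-source values as a stand-in for `conjunction_uri` (an assumption):

def combine_sources(info_dicts):
    # Keep each distinct conjunction of sub-sources only once, in order.
    sources, seen = [], set()
    for info in info_dicts:
        for subsource in info['sources']:
            key = tuple(sorted(subsource.values()))   # stand-in for conjunction_uri
            if key not in seen:
                sources.append(subsource)
                seen.add(key)
    return sources

infos = [
    {'sources': [{'contributor': '/s/contributor/omcs/dev'}]},
    {'sources': [{'contributor': '/s/contributor/omcs/dev'},
                 {'contributor': '/s/resource/wiktionary/en'}]},
]
print(combine_sources(infos))   # the duplicate contributor appears only once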
Example #8
def interlanguage_mapping(interlang_path, ok_concepts):
    quads = parse_nquads(bz2.open(str(interlang_path), 'rt'))
    mapping = {}
    for subj, values in itertools.groupby(quads, itemgetter(0)):
        subj_url = subj['url']
        subj_concept = translate_dbpedia_url(subj_url)
        pieces = split_uri(subj_concept)
        if len(pieces) >= 6:
            sense = pieces[5]
            if any(word in sense for word in
                   ('album', 'film', 'series', 'disambiguation', 'song', 'band')):
                continue
        if uri_prefix(subj_concept) in ok_concepts:
            targets = [subj_url]

            for _subj, _pred, obj, _graph in values:
                url = obj['url']
                if 'www.wikidata.org' in url:
                    continue
                if url.startswith('http://wikidata.dbpedia.org/'):
                    wikidata_id = resource_name(url)

                    # Return early when we see a high-numbered Wikidata ID
                    if int(wikidata_id[1:]) >= 1000000:
                        return mapping
                targets.append(url)

            mapping[subj_url] = targets
    return mapping
Example #9
def read_concept_file(concept_file):
    """
    Read a file containing one concept URI per line and return the set of
    their URI prefixes (as computed by uri_prefix).
    """
    concepts = set()
    for line in open(concept_file, encoding='utf-8'):
        concept = uri_prefix(line.strip())
        concepts.add(concept)
    return concepts
Example #10
def standardize_row_labels(frame, language='en', forms=True):
    """
    Convert a frame whose row labels are bare English terms to one whose row
    labels are standardized ConceptNet URIs (with some extra word2vec-style
    normalization of digits). Rows whose labels get the same
    standardized URI get combined, with earlier rows given more weight.
    """
    # Re-label the DataFrame with standardized, non-unique row labels
    frame.index = [uri_prefix(standardized_uri(language, label)) for label in frame.index]

    # Assign row n a weight of 1/(n+1) for weighted averaging
    nrows = frame.shape[0]
    weights = 1.0 / np.arange(1, nrows + 1)
    label_weights = pd.Series(weights, index=frame.index)

    # groupby(level=0).sum() means to add rows that have the same label
    relabeled = frame.mul(weights, axis='rows').sort_index().groupby(level=0).sum()
    combined_weights = label_weights.sort_index().groupby(level=0).sum()

    # Optionally adjust words to be more like their word forms
    if forms:
        for label in relabeled.index:
            lemmatized = lemmatize_uri(label)
            if lemmatized != label and lemmatized in relabeled.index:
                relabeled.loc[lemmatized] += relabeled.loc[label] / 2
                combined_weights.loc[lemmatized] += combined_weights.loc[label] / 2

    scaled = relabeled.div(combined_weights, axis='rows')

    # Rearrange the items in descending order of weight, similar to the order
    # we get them in from word2vec and GloVe
    combined_weights.sort_values(inplace=True, ascending=False)
    result = scaled.loc[combined_weights.index]
    return result
Example #11
    def expand_terms(self, terms, limit_per_term=10, include_neighbors=True):
        """
        Given a list of weighted terms as (term, weight) tuples, add terms that
        are one step away in ConceptNet at a lower weight.

        This helps increase the recall power of the vector space, because it
        means you can find terms that are too infrequent to have their own
        vector by looking up their neighbors. This forms a reasonable
        approximation of the vector an infrequent term would have anyway.
        """
        self.load()
        expanded = terms[:]
        for term, weight in terms:
            expanded.append((term, weight / 10))
            if include_neighbors and term not in self.frame.index and self.finder is not None:
                for edge in self.finder.lookup(term, limit=limit_per_term):
                    if field_match(edge['start']['term'],
                                   term) and not field_match(
                                       edge['end']['term'], term):
                        neighbor = edge['end']['term']
                    elif field_match(edge['end']['term'],
                                     term) and not field_match(
                                         edge['start']['term'], term):
                        neighbor = edge['start']['term']
                    else:
                        continue
                    neighbor_weight = weight * min(10, edge['weight']) * 0.001
                    expanded.append((neighbor, neighbor_weight))

        total_weight = sum(abs(weight) for term, weight in expanded)
        if total_weight == 0:
            return []
        else:
            return [(uri_prefix(term), weight / total_weight)
                    for (term, weight) in expanded]
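The final step in each of these `expand_terms` variants simply rescales the collected weights so their absolute values sum to 1. A tiny self-contained illustration (the term URIs are made up, and the uri_prefix truncation is ignored):

expanded = [('/c/en/dog', 1.0), ('/c/en/dog', 0.1), ('/c/en/puppy', 0.02)]
total_weight = sum(abs(weight) for _term, weight in expanded)
normalized = [(term, weight / total_weight) for term, weight in expanded]
print(normalized)   # weights now sum to 1.0 (from a total of 1.12)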
Example #12
    def expand_terms(self, terms, limit_per_term=10, include_neighbors=True):
        """
        Given a list of weighted terms as (term, weight) tuples, add terms that
        are one step away in ConceptNet at a lower weight, terms in English that share the
        surface form with these terms, and the terms which share prefix with these terms,
        if the terms are OOV.

        This helps increase the recall power of the vector space, because it
        means you can find terms that are too infrequent to have their own
        vector by looking up their neighbors, etc.

        This forms a reasonable approximation of the vector an infrequent term would have anyway.
        """
        self.load()
        expanded = terms[:]
        for term, weight in terms:
            if include_neighbors and term not in self.frame.index and self.finder is not None:
                for edge in self.finder.lookup(term, limit=limit_per_term):
                    if field_match(edge['start']['term'],
                                   term) and not field_match(
                                       edge['end']['term'], term):
                        neighbor = edge['end']['term']
                    elif field_match(edge['end']['term'],
                                     term) and not field_match(
                                         edge['start']['term'], term):
                        neighbor = edge['start']['term']
                    else:
                        continue
                    # TODO: explain this formula
                    neighbor_weight = weight * min(10, edge['weight']) * 0.01
                    expanded.append((neighbor, neighbor_weight))

                prefix_weight = 0.01
                if not term.startswith('/c/en/'):
                    # FIXME: better language code handling
                    englishified = '/c/en/' + term[6:]
                    expanded.append((englishified, prefix_weight))

                while term:
                    # Skip excessively general lookups, for either an entire
                    # language, or all terms starting with a single
                    # non-ideographic letter
                    if term.endswith('/') or (term[-2] == '/'
                                              and term[-1] < chr(0x3000)):
                        break
                    prefixed = self.terms_with_prefix(term)
                    if prefixed:
                        n_prefixed = len(prefixed)
                        for prefixed_term in prefixed:
                            expanded.append(
                                (prefixed_term, prefix_weight / n_prefixed))
                        break
                    term = term[:-1]

        total_weight = sum(abs(weight) for term, weight in expanded)
        if total_weight == 0:
            return []
        else:
            return [(uri_prefix(term), weight / total_weight)
                    for (term, weight) in expanded]
Example #13
    def expand_terms(self, terms, limit_per_term=10, include_neighbors=True):
        """
        Given a list of weighted terms as (term, weight) tuples, add terms that
        are one step away in ConceptNet at a lower weight.

        This helps increase the recall power of the vector space, because it
        means you can find terms that are too infrequent to have their own
        vector by looking up their neighbors. This forms a reasonable
        approximation of the vector an infrequent term would have anyway.
        """
        self.load()
        expanded = terms[:]
        for term, weight in terms:
            expanded.append((term, weight / 10))
            if include_neighbors and term not in self.frame.index and self.finder is not None:
                for edge in self.finder.lookup(term, limit=limit_per_term):
                    if field_match(edge["start"]["term"], term) and not field_match(edge["end"]["term"], term):
                        neighbor = edge["end"]["term"]
                    elif field_match(edge["end"]["term"], term) and not field_match(edge["start"]["term"], term):
                        neighbor = edge["start"]["term"]
                    else:
                        continue
                    neighbor_weight = weight * min(10, edge["weight"]) * 0.001
                    expanded.append((neighbor, neighbor_weight))

        total_weight = sum(abs(weight) for term, weight in expanded)
        if total_weight == 0:
            return []
        else:
            return [(uri_prefix(term), weight / total_weight) for (term, weight) in expanded]
Example #14
    def expand_terms(self, terms, limit_per_term=10, oov_vector=True):
        """
        Given a list of weighted terms as (term, weight) tuples, add terms that
        are one step away in ConceptNet at a lower weight, terms in English that share the
        surface form with these terms, and the terms which share prefix with these terms,
        if the terms are OOV.

        This helps increase the recall power of the vector space, because it
        means you can find terms that are too infrequent to have their own
        vector by looking up their neighbors, etc.

        This forms a reasonable approximation of the vector an infrequent term would have anyway.
        """
        self.load()
        expanded = terms[:]
        for term, weight in terms:
            if oov_vector and term not in self.frame.index and self.finder is not None:
                neighbors = self._find_neighbors(term, limit_per_term, weight)
                expanded.extend(neighbors)

                prefix_weight = 0.01
                if get_uri_language(term) != 'en':
                    englishified = self._englishify(term)
                    expanded.append((englishified, prefix_weight))

                prefix_matches = self._match_prefix(term, prefix_weight)
                expanded.extend(prefix_matches)

        total_weight = sum(abs(weight) for term, weight in expanded)
        if total_weight == 0:
            return []
        else:
            return [
                (uri_prefix(term), weight / total_weight) for (term, weight) in expanded
            ]
Example #15
    def expand_terms(self, terms, oov_vector=True):
        """
        Given a list of weighted terms as (term, weight) tuples, if any of the terms
        are OOV, find approximations to those terms: the same term in English, or terms
        that share a prefix that's as long as possible with the given term.

        This helps increase the recall power of the vector space, because it means
        you can find terms that are too infrequent to have their own vector, getting
        a reasonable guess at the vector they might have.
        """
        expanded = terms[:]
        for term, weight in terms:
            if oov_vector and term not in self.frame.index:
                prefix_weight = 0.01
                if get_uri_language(term) != 'en':
                    englishified = self._englishify(term)
                    if englishified is not None:
                        expanded.append((englishified, prefix_weight))

                prefix_matches = self._match_prefix(term, prefix_weight)
                expanded.extend(prefix_matches)

        total_weight = sum(abs(weight) for term, weight in expanded)
        if total_weight == 0:
            return []
        else:
            return [(uri_prefix(term), weight / total_weight)
                    for (term, weight) in expanded]
Example #16
    def expand_terms(self, terms, limit_per_term=10, oov_vector=True):
        """
        Given a list of weighted terms as (term, weight) tuples, add terms that
        are one step away in ConceptNet at a lower weight, terms in English that share the
        surface form with these terms, and the terms which share prefix with these terms,
        if the terms are OOV.

        This helps increase the recall power of the vector space, because it
        means you can find terms that are too infrequent to have their own
        vector by looking up their neighbors, etc.

        This forms a reasonable approximation of the vector an infrequent term would have anyway.
        """
        self.load()
        expanded = terms[:]
        for term, weight in terms:
            if oov_vector and term not in self.frame.index and self.finder is not None:
                neighbors = self._find_neighbors(term, limit_per_term, weight)
                expanded.extend(neighbors)

                prefix_weight = 0.01
                if get_uri_language(term) != 'en':
                    englishified = self._englishify(term)
                    expanded.append((englishified, prefix_weight))

                prefix_matches = self._match_prefix(term, prefix_weight)
                expanded.extend(prefix_matches)

        total_weight = sum(abs(weight) for term, weight in expanded)
        if total_weight == 0:
            return []
        else:
            return [
                (uri_prefix(term), weight / total_weight) for (term, weight) in expanded
            ]
Example #17
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {'@id': uri, 'label': label}
    if is_term(uri):
        pieces = split_uri(uri)
        ld['language'] = get_uri_language(uri)
        if len(pieces) > 3:
            ld['sense_label'] = '/'.join(pieces[3:])
        ld['term'] = uri_prefix(uri)
        ld['@type'] = 'Node'
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri

        # OpenCyc is down and UMBEL doesn't host their vocabulary on the
        # Web. This property indicates whether you can follow a link
        # via HTTP and retrieve more information.
        ld['site_available'] = domain not in {'sw.opencyc.org', 'umbel.org'}
        ld['@type'] = 'Node'
    elif uri.startswith('/r/'):
        ld['@type'] = 'Relation'
    return ld
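For a term URI such as '/c/en/example/n', the resulting dictionary combines the language, a sense label, and the three-piece prefix. A simplified stand-in that mimics only the term branch above, using plain string splitting instead of the library's split_uri/get_uri_language helpers:

def ld_node_sketch(uri):
    # Simplified: only handles '/c/<lang>/<term>[/<sense>...]' URIs.
    pieces = uri.strip('/').split('/')          # e.g. ['c', 'en', 'example', 'n']
    ld = {'@id': uri, 'label': pieces[2].replace('_', ' ')}
    ld['language'] = pieces[1]
    if len(pieces) > 3:
        ld['sense_label'] = '/'.join(pieces[3:])
    ld['term'] = '/' + '/'.join(pieces[:3])     # 3-piece prefix, like uri_prefix(uri)
    ld['@type'] = 'Node'
    return ld

print(ld_node_sketch('/c/en/example/n'))
# {'@id': '/c/en/example/n', 'label': 'example', 'language': 'en',
#  'sense_label': 'n', 'term': '/c/en/example', '@type': 'Node'}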
Example #18
def interlanguage_mapping(interlang_path, ok_concepts):
    quads = parse_nquads(bz2.open(str(interlang_path), 'rt'))
    mapping = {}
    for subj, values in itertools.groupby(quads, itemgetter(0)):
        subj_url = subj['url']
        subj_concept = translate_dbpedia_url(subj_url)
        pieces = split_uri(subj_concept)
        if len(pieces) >= 6:
            sense = pieces[5]
            if any(word in sense for word in
                   ('album', 'film', 'series', 'disambiguation', 'song', 'band')):
                continue
        if uri_prefix(subj_concept) in ok_concepts:
            targets = [subj_url]

            for _subj, _pred, obj, _graph in values:
                url = obj['url']
                if 'www.wikidata.org' in url:
                    continue
                if url.startswith('http://wikidata.dbpedia.org/'):
                    wikidata_id = resource_name(url)

                    # Return early when we see a high-numbered Wikidata ID
                    if int(wikidata_id[1:]) >= 1000000:
                        return mapping
                targets.append(url)

            mapping[subj_url] = targets
    return mapping
Example #19
def read_concept_file(concept_file):
    """
    Read a file containing one concept URI per line and return the set of
    their URI prefixes (as computed by uri_prefix).
    """
    concepts = set()
    for line in open(concept_file, encoding='utf-8'):
        concept = uri_prefix(line.strip())
        concepts.add(concept)
    return concepts
Example #20
def reduce_assoc(filename, output_filename, cutoff=3, en_cutoff=3):
    """
    Takes in a file of tab-separated simple associations, and removes
    uncommon associations and associations unlikely to be useful.

    All concepts that occur fewer than `cutoff` times will be removed.
    All English concepts that occur fewer than `en_cutoff` times will be removed.
    """
    counts = defaultdict(int)
    with open(filename, encoding='utf-8') as file:
        for line in file:
            left, right, _value, _dataset, rel = line.rstrip().split('\t')
            if rel == '/r/SenseOf':
                pass
            else:
                gleft = uri_prefix(left)
                gright = uri_prefix(right)
                if gright.startswith('/c/'):
                    counts[gleft] += 1
                if gleft.startswith('/c/'):
                    counts[gright] += 1

    filtered_concepts = {
        concept for (concept, count) in counts.items()
        if (
            count >= en_cutoff or
            (not concept.startswith('/c/en/') and count >= cutoff)
        )
    }

    with open(output_filename, 'w', encoding='utf-8') as out:
        with open(filename, encoding='utf-8') as file:
            for line in file:
                left, right, value, dataset, rel = line.rstrip().split('\t', 4)
                if concept_is_bad(left) or concept_is_bad(right) or is_negative_relation(rel):
                    continue
                fvalue = float(value)
                gleft = uri_prefix(left)
                gright = uri_prefix(right)
                if (
                    gleft in filtered_concepts and
                    gright in filtered_concepts and
                    fvalue != 0
                ):
                    if gleft != gright:
                        line = '\t'.join([gleft, gright, value, dataset, rel])
                        print(line, file=out)
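A toy walk-through of the second pass: with `filtered_concepts` already computed, a line is kept only if both prefixed endpoints survived the cutoff, the value is nonzero, and the endpoints differ (the URIs and values below are made up, and uri_prefix is assumed to have been applied already):

filtered_concepts = {'/c/en/dog', '/c/en/cat'}
rows = [
    ('/c/en/dog', '/c/en/cat', '1.5'),
    ('/c/en/dog', '/c/en/unicorn_3000', '2.0'),   # right side failed the cutoff
    ('/c/en/dog', '/c/en/dog', '1.0'),            # self-loop, dropped
]
kept = [
    (left, right, value)
    for left, right, value in rows
    if left in filtered_concepts and right in filtered_concepts
    and float(value) != 0 and left != right
]
print(kept)   # only the dog/cat association survives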
Example #21
def standardize_row_labels(frame, language='en', forms=True):
    """
    Convert a frame whose row labels are bare English terms (e.g. of the
    form 'en/term') to one whose row labels are standardized ConceptNet URIs
    (e.g. of the form '/c/en/term'; and with some extra word2vec-style
    normalization of digits). Rows whose labels get the same standardized
    URI get combined, with earlier rows given more weight.
    """
    # Check for en/term format we use to train fastText on OpenSubtitles data
    if all(label.count('/') == 1 for label in frame.index[0:5]):
        tuples = [label.partition('/') for label in frame.index]
        frame.index = [
            uri_prefix(standardized_uri(language, text))
            for language, _slash, text in tuples
        ]

    # Re-label the DataFrame with standardized, non-unique row labels
    frame.index = [
        uri_prefix(standardized_uri(language, label)) for label in frame.index
    ]

    # Assign row n a weight of 1/(n+1) for weighted averaging
    nrows = frame.shape[0]
    weights = 1.0 / np.arange(1, nrows + 1)
    label_weights = pd.Series(weights, index=frame.index)

    # groupby(level=0).sum() means to add rows that have the same label
    relabeled = frame.mul(weights,
                          axis='rows').sort_index().groupby(level=0).sum()
    combined_weights = label_weights.sort_index().groupby(level=0).sum()

    # Optionally adjust words to be more like their word forms
    if forms:
        for label in relabeled.index:
            lemmatized = lemmatize_uri(label)
            if lemmatized != label and lemmatized in relabeled.index:
                relabeled.loc[lemmatized] += relabeled.loc[label] / 2
                combined_weights.loc[
                    lemmatized] += combined_weights.loc[label] / 2

    scaled = relabeled.div(combined_weights, axis='rows')

    # Rearrange the items in descending order of weight, similar to the order
    # we get them in from word2vec and GloVe
    combined_weights.sort_values(inplace=True, ascending=False)
    result = scaled.loc[combined_weights.index]
    return result
Example #22
    def text_to_vector(self, language, text):
        """
        Used in Story Cloze Test to create a vector for text.
        """
        tokens = wordfreq.simple_tokenize(text)
        weighted_terms = [(uri_prefix(standardized_uri(language, token)), 1.)
                          for token in tokens]
        return self.get_vector(weighted_terms, oov_vector=False)
Example #23
def reduce_assoc(filename, output_filename, cutoff=3, en_cutoff=3):
    """
    Takes in a file of tab-separated simple associations, and removes
    uncommon associations and associations unlikely to be useful.

    All concepts that occur fewer than `cutoff` times will be removed.
    All English concepts that occur fewer than `en_cutoff` times will be removed.
    """
    counts = defaultdict(int)
    with open(filename, encoding='utf-8') as file:
        for line in file:
            left, right, _value, _dataset, rel = line.rstrip().split('\t')
            if rel == '/r/SenseOf':
                pass
            else:
                gleft = uri_prefix(left)
                gright = uri_prefix(right)
                counts[gleft] += 1
                counts[gright] += 1

    filtered_concepts = {
        concept for (concept, count) in counts.items()
        if (
            count >= en_cutoff or
            (not concept.startswith('/c/en/') and count >= cutoff)
        )
    }

    with open(output_filename, 'w', encoding='utf-8') as out:
        with open(filename, encoding='utf-8') as file:
            for line in file:
                left, right, value, dataset, rel = line.rstrip().split('\t', 4)
                if concept_is_bad(left) or concept_is_bad(right) or is_negative_relation(rel):
                    continue
                fvalue = float(value)
                gleft = uri_prefix(left)
                gright = uri_prefix(right)
                if (
                    gleft in filtered_concepts and
                    gright in filtered_concepts and
                    fvalue != 0
                ):
                    if gleft != gright:
                        line = '\t'.join([gleft, gright, value, dataset, rel])
                        print(line, file=out)
Example #24
    def from_csv(cls,
                 filename,
                 filtered_concepts=None,
                 reject_negative_relations=True):
        """
        Reads an association file and builds an (undirected) graph from it.

        If filtered_concepts isn't None, it should be a collection of concepts,
        and only vertices from this collection and edges that link two such
        vertices will be added to the graph.  If it _is_ None (the default),
        however, please note that no such filtering will be done (i.e. the
        effective filter collection is then the universal set of concepts, not
        the empty set).

        If reject_negative_relations is True (the default), only edges not
        corresponding to negative relations will be added to the graph.
        """
        graph = cls()

        if filtered_concepts is None:
            filter_concepts = False
        else:
            filter_concepts = True

        with open(filename, encoding='utf-8') as file:
            for line in file:
                left, right, value, dataset, rel = line.rstrip().split('\t', 4)
                if concept_is_bad(left) or concept_is_bad(right):
                    continue
                if reject_negative_relations and is_negative_relation(rel):
                    continue
                fvalue = float(value)
                gleft = uri_prefix(left)
                gright = uri_prefix(right)
                if fvalue == 0:
                    continue
                if gleft == gright:
                    continue
                if filter_concepts and gleft not in filtered_concepts:
                    continue
                if filter_concepts and gright not in filtered_concepts:
                    continue
                graph.add_edge(gleft, gright, value, dataset, rel)

        return graph
Example #25
def standardize_row_labels(frame, language='en', forms=True):
    """
    Convert a frame whose row labels are bare English terms (e.g. of the
    form 'en/term') to one whose row labels are standardized ConceptNet URIs
    (e.g. of the form '/c/en/term'; and with some extra word2vec-style
    normalization of digits). Rows whose labels get the same standardized
    URI get combined, with earlier rows given more weight.
    """
    # Check for en/term format we use to train fastText on OpenSubtitles data
    if all(label.count('/') == 1 for label in frame.index[0:5]):
        tuples = [label.partition('/') for label in frame.index]
        frame.index = [
            uri_prefix(standardized_uri(language, text))
            for language, _slash, text in tuples
        ]

    # Re-label the DataFrame with standardized, non-unique row labels
    frame.index = [
        uri_prefix(standardized_uri(language, label)) for label in frame.index
    ]

    # Assign row n a weight of 1/(n+1) for weighted averaging
    nrows = frame.shape[0]
    weights = 1.0 / np.arange(1, nrows + 1)
    label_weights = pd.Series(weights, index=frame.index)

    # groupby(level=0).sum() means to add rows that have the same label
    relabeled = frame.mul(weights, axis='rows').sort_index().groupby(level=0).sum()
    combined_weights = label_weights.sort_index().groupby(level=0).sum()

    # Optionally adjust words to be more like their word forms
    if forms:
        for label in relabeled.index:
            lemmatized = lemmatize_uri(label)
            if lemmatized != label and lemmatized in relabeled.index:
                relabeled.loc[lemmatized] += relabeled.loc[label] / 2
                combined_weights.loc[lemmatized] += combined_weights.loc[label] / 2

    scaled = relabeled.div(combined_weights, axis='rows')

    # Rearrange the items in descending order of weight, similar to the order
    # we get them in from word2vec and GloVe
    combined_weights.sort_values(inplace=True, ascending=False)
    result = scaled.loc[combined_weights.index]
    return result
Example #26
def uri_to_label(uri):
    """
    Convert a ConceptNet URI into a label to be used in nodes. Underscores
    are replaced with spaces, so '/c/en/example' becomes 'example' and
    '/c/en/canary_islands' becomes 'canary islands'.
    """
    if uri.startswith('/c/'):
        uri = uri_prefix(uri)
    return uri.split('/')[-1].replace('_', ' ')
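A stand-in with the documented behavior (not the library implementation; uri_prefix is approximated by slicing the URI to its first three pieces):

def uri_to_label_sketch(uri):
    if uri.startswith('/c/'):
        uri = '/' + '/'.join(uri.strip('/').split('/')[:3])   # like uri_prefix(uri)
    return uri.split('/')[-1].replace('_', ' ')

assert uri_to_label_sketch('/c/en/canary_islands/n') == 'canary islands'
assert uri_to_label_sketch('/r/RelatedTo') == 'RelatedTo'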
Example #27
    def text_to_vector(self, language, text):
        """
        Used in Story Cloze Test to create a vector for text.
        """
        tokens = wordfreq.tokenize(text, language)
        weighted_terms = [
            (uri_prefix(standardized_uri(language, token)), 1.) for token in tokens
        ]
        return self.get_vector(weighted_terms, oov_vector=False)
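The tokenization step can be tried on its own with the wordfreq package used above; a simplified stand-in replaces `standardized_uri`/`uri_prefix` here (the real helpers also normalize digits and apply ConceptNet's URI rules):

import wordfreq

def text_to_weighted_terms(language, text):
    tokens = wordfreq.tokenize(text, language)
    # Stand-in for uri_prefix(standardized_uri(language, token)):
    # lowercase and join multi-word tokens with underscores.
    return [('/c/{}/{}'.format(language, token.lower().replace(' ', '_')), 1.)
            for token in tokens]

print(text_to_weighted_terms('en', 'The Canary Islands are beautiful'))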
Example #28
def make_assertion(line_group):
    lines = [line.rstrip() for line in line_group]
    lines = [line for line in lines if line]
    if not lines:
        return None

    uri, rel, start, end, _ = lines[0].split('\t')

    # We can't distinguish word senses well enough yet, so only keep them
    # up to the part of speech
    start = uri_prefix(start, 4)
    end = uri_prefix(end, 4)

    if not (keep_concept(start) and keep_concept(end)):
        return None

    info_dicts = [json.loads(line.split('\t')[4]) for line in lines]
    unscaled_weight = sum(info['weight'] for info in info_dicts)
    licenses = {info['license'] for info in info_dicts}
    dataset = info_dicts[0]['dataset']
    surface_text = None
    sources = []
    seen_sources = set()
    for info in info_dicts:
        if surface_text is None and 'surfaceText' in info:
            surface_text = info['surfaceText']
        for subsource in info['sources']:
            conjunction = conjunction_uri(*sorted(subsource.values()))
            if conjunction not in seen_sources:
                sources.append(subsource)
                seen_sources.add(conjunction)

    weight = weight_scale(unscaled_weight)
    if Licenses.cc_sharealike in licenses:
        license = Licenses.cc_sharealike
    else:
        license = Licenses.cc_attribution

    return make_edge(
        rel=rel, start=start, end=end, weight=weight,
        dataset=dataset, license=license, sources=sources,
        surfaceText=surface_text
    )
Example #29
    def from_csv(cls, filename, filtered_concepts=None, reject_negative_relations=True):
        """
        Reads an association file and builds an (undirected) graph from it.

        If filtered_concepts isn't None, it should be a collection of concepts,
        and only vertices from this collection and edges that link two such
        vertices will be added to the graph.  If it _is_ None (the default),
        however, please note that no such filtering will be done (i.e. the
        effective filter collection is then the universal set of concepts, not
        the empty set).

        If reject_negative_relations is True (the default), only edges not
        corresponding to negative relations will be added to the graph.
        """
        graph = cls()

        if filtered_concepts is None:
            filter_concepts = False
        else:
            filter_concepts = True

        with open(filename, encoding='utf-8') as file:
            for line in file:
                left, right, value, dataset, rel = line.rstrip().split('\t', 4)
                if concept_is_bad(left) or concept_is_bad(right):
                    continue
                if reject_negative_relations and is_negative_relation(rel):
                    continue
                fvalue = float(value)
                gleft = uri_prefix(left)
                gright = uri_prefix(right)
                if fvalue == 0:
                    continue
                if gleft == gright:
                    continue
                if filter_concepts and gleft not in filtered_concepts:
                    continue
                if filter_concepts and gright not in filtered_concepts:
                    continue
                graph.add_edge(gleft, gright, value, dataset, rel)

        return graph
    def propagate_blocks(self, edge, verbose=False):
        """
        Scan an edge and see if it is a DerivedFrom or FormOf edge whose right
        side matches a derivation block. If so, add its left side as a simple
        block and a derivation block.
        """
        if edge['rel'].endswith('DerivedFrom') or edge['rel'].endswith('FormOf'):
            if set(uri_prefixes(edge['end'])) & self.derivation_blocks:
                prefix = uri_prefix(edge['start'], 3)
                self.simple_blocks.add(prefix)
                self.derivation_blocks.add(prefix)
                if verbose:
                    print(f"Added derivation block: {prefix}")
Example #31
    def expand_terms(self, terms, limit_per_term=10, include_neighbors=True):
        """
        Given a list of weighted terms as (term, weight) tuples, add terms that
        are one step away in ConceptNet at a lower weight.

        This helps increase the recall power of the vector space, because it
        means you can find terms that are too infrequent to have their own
        vector by looking up their neighbors. This forms a reasonable
        approximation of the vector an infrequent term would have anyway.
        """
        self.load()
        expanded = terms[:]
        for term, weight in terms:
            # TODO: this disagrees with the docstring about whether neighbors
            # are added to non-OOV terms
            if include_neighbors and term not in self.frame.index and self.finder is not None:
                for edge in self.finder.lookup(term, limit=limit_per_term):
                    if field_match(edge['start']['term'], term) and not field_match(edge['end']['term'], term):
                        neighbor = edge['end']['term']
                    elif field_match(edge['end']['term'], term) and not field_match(edge['start']['term'], term):
                        neighbor = edge['start']['term']
                    else:
                        continue
                    # TODO: explain this formula
                    neighbor_weight = weight * min(10, edge['weight']) * 0.01
                    expanded.append((neighbor, neighbor_weight))

                prefix_weight = 0.01
                if not term.startswith('/c/en/'):
                    # FIXME: better language code handling
                    englishified = '/c/en/' + term[6:]
                    expanded.append((englishified, prefix_weight))

                while term:
                    if term.endswith('/'):
                        break
                    start_idx, end_idx = index_prefix_range(self.frame, term)
                    if end_idx > start_idx:
                        n_prefixed = end_idx - start_idx
                        for prefixed_term in self.frame.index[start_idx:end_idx]:
                            expanded.append((prefixed_term, prefix_weight / n_prefixed))
                        break
                    term = term[:-1]

        total_weight = sum(abs(weight) for term, weight in expanded)
        if total_weight == 0:
            return []
        else:
            return [(uri_prefix(term), weight / total_weight) for (term, weight) in expanded]
Example #32
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {'@id': uri, 'label': label}
    if uri.startswith('/c/'):
        pieces = split_uri(uri)
        ld['language'] = get_language(uri)
        if len(pieces) > 3:
            ld['sense_label'] = '/'.join(pieces[3:])
        ld['term'] = uri_prefix(uri)
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri
    return ld
Example #33
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {
        '@id': uri,
        'label': label
    }
    if uri.startswith('/c/'):
        pieces = split_uri(uri)
        ld['language'] = pieces[1]
        if len(pieces) > 3:
            ld['sense_label'] = '/'.join(pieces[3:])
        ld['term'] = uri_prefix(uri)
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri
    return ld
Example #34
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {'@id': uri, 'label': label}
    if is_term(uri):
        pieces = split_uri(uri)
        ld['language'] = get_uri_language(uri)

        # Get a reasonably-distinct sense label for the term.
        # Usually it will be the part of speech, but when we have fine-grained
        # information from Wikipedia or WordNet, it'll include the last
        # component as well.
        if len(pieces) > 3:
            ld['sense_label'] = pieces[3]

        if len(pieces) > 4 and pieces[4] in ('wp', 'wn'):
            ld['sense_label'] += ', ' + pieces[-1]

        ld['term'] = uri_prefix(uri)
        ld['@type'] = 'Node'
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri

        # OpenCyc is down and UMBEL doesn't host their vocabulary on the
        # Web. This property indicates whether you can follow a link
        # via HTTP and retrieve more information.
        ld['site_available'] = True
        if domain in {'sw.opencyc.org', 'umbel.org', 'wikidata.dbpedia.org'}:
            ld['site_available'] = False
        ld['path'] = urlparse(uri).path
        ld['@type'] = 'Node'
    elif uri.startswith('/r/'):
        ld['@type'] = 'Relation'
    return ld
Example #35
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {'@id': uri, 'label': label}
    if is_term(uri):
        pieces = split_uri(uri)
        ld['language'] = get_uri_language(uri)

        # Get a reasonably-distinct sense label for the term.
        # Usually it will be the part of speech, but when we have fine-grained
        # information from Wikipedia or WordNet, it'll include the last
        # component as well.
        if len(pieces) > 3:
            ld['sense_label'] = pieces[3]

        if len(pieces) > 4 and pieces[4] in ('wp', 'wn'):
            ld['sense_label'] += ', ' + pieces[-1]

        ld['term'] = uri_prefix(uri)
        ld['@type'] = 'Node'
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri

        # OpenCyc is down and UMBEL doesn't host their vocabulary on the
        # Web. This property indicates whether you can follow a link
        # via HTTP and retrieve more information.
        ld['site_available'] = domain not in {'sw.opencyc.org', 'umbel.org'}
        ld['@type'] = 'Node'
    elif uri.startswith('/r/'):
        ld['@type'] = 'Relation'
    return ld
Example #36
def describe_sources(sources, specific=True):
    """
    Build a marked-up text phrase describing the sources of our data.

    If `specific` is True, sources with many known individual contributors
    will list up to MAX_INDIVIDUALS of those contributors. If False, only
    the source as a whole will be credited. specific=False is used for the
    credit at the top of a page.
    """
    omcs_contributors = []
    omcs_count = 0
    ptt_count = 0
    nadya_count = 0
    more_sources = set()

    for source in sources:
        if 'activity' in source and source[
                'activity'] == '/s/activity/omcs/nadya.jp':
            nadya_count += 1
        elif 'activity' in source and source[
                'activity'] == '/s/activity/kyoto_yahoo':
            more_sources.add(
                source_link(source['activity'], KYOTO_YAHOO_CREDIT))
        elif 'contributor' in source:
            contributor = source['contributor']
            prefix = uri_prefix(contributor, 3)
            if prefix == '/s/contributor/omcs':
                if len(omcs_contributors) < MAX_INDIVIDUALS:
                    name = split_uri(contributor)[-1]
                    omcs_contributors.append(source_link(contributor, name))
                omcs_count += 1
            elif prefix == '/s/contributor/petgame':
                ptt_count += 1
            elif contributor in CONTRIBUTOR_NAME_MAP:
                more_sources.add(
                    source_link(contributor,
                                CONTRIBUTOR_NAME_MAP[contributor]))
            else:
                more_sources.add(source_link(contributor, contributor))

    source_chunks = []
    if omcs_contributors:
        if specific:
            if omcs_count > MAX_INDIVIDUALS:
                omcs_contributors.append("{} more".format(omcs_count -
                                                          MAX_INDIVIDUALS))

            omcs_str = '<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors {}'.format(
                oxford_comma(omcs_contributors))
            source_chunks.append(omcs_str)
        else:
            source_chunks.append(
                '<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors'
            )
    if ptt_count:
        if specific:
            if ptt_count == 1:
                count_str = "a player"
            else:
                count_str = "{} players".format(ptt_count)
            source_chunks.append(
                '{} of the <a href="/s/contributor/petgame">PTT Pet Game</a>'.
                format(count_str))
        else:
            source_chunks.append(
                'the <a href="/s/contributor/petgame">PTT Pet Game</a>')

    if nadya_count:
        if specific:
            if nadya_count == 1:
                count_str = "a player"
            else:
                count_str = "{} players".format(nadya_count)
            source_chunks.append(
                '{} of <a href="/s/activity/omcs/nadya.jp">nadya.jp</a>'.
                format(count_str))
        else:
            source_chunks.append(
                '<a href="/s/activity/omcs/nadya.jp">nadya.jp</a>')

    source_chunks.extend(sorted(more_sources))
    if len(source_chunks) == 1:
        source_markup = "<strong>Source:</strong> {}".format(source_chunks[0])
    else:
        source_markup = "<strong>Sources:</strong> {}".format(
            oxford_comma(source_chunks))
    return Markup(source_markup)
Example #37
def read_wiktionary(input_file, db_file, output_file):
    """
    Convert a stream of parsed Wiktionary data into ConceptNet edges.

    A `db_file` containing all known words in all languages must have already
    been prepared from the same data.
    """
    db = sqlite3.connect(db_file)
    out = MsgpackStreamWriter(output_file)
    for heading, items in segmented_stream(input_file):
        language = heading['language']
        title = heading['title']
        dataset = '/d/wiktionary/{}'.format(language)
        url_title = heading['title'].replace(' ', '_')
        web_url = 'http://{}.wiktionary.org/wiki/{}'.format(language, url_title)
        web_source = '/s/resource/wiktionary/{}'.format(language)

        source = {
            'contributor': web_source,
            'process': PARSER_RULE
        }

        # Scan through the 'from' items, such as the start nodes of
        # translations, looking for distinct etymologies. If we get more than
        # one etymology for a language, we need to distinguish them as
        # different senses in that language.
        all_etyms = {
            (item['from']['language'], etym_label(language, item['from']))
            for item in items
            if 'language' in item['from'] and item['from']['text'] == title
            and etym_label(language, item['from']) is not None
        }
        word_languages = {wlang for (wlang, _) in all_etyms}
        for wlang in sorted(word_languages):
            cpage = standardized_concept_uri(wlang, title)
            ld_edge = make_edge(
                '/r/ExternalURL', cpage, web_url,
                dataset=dataset, weight=0.25, sources=[source],
                license=Licenses.cc_sharealike
            )
            out.write(ld_edge)
        etym_to_translation_sense = {}
        language_etym_counts = Counter(lang for (lang, etym) in all_etyms)
        polysemous_languages = {
            lang for lang in language_etym_counts
            if language_etym_counts[lang] > 1
        }

        for item in items:
            tfrom = item['from']
            tto = item['to']
            assumed_languages = [language]
            lang1 = tfrom.get('language')
            lang2 = tto.get('language')
            if lang1 and (lang1 not in assumed_languages) and valid_language(lang1):
                assumed_languages.append(lang1)
            if lang2 and (lang2 not in assumed_languages) and valid_language(lang2):
                assumed_languages.append(lang2)

            cfrom = transform_term(
                language, tfrom, assumed_languages, db,
                use_etyms=(lang1 in polysemous_languages)
            )
            cpage = cfrom
            cto = transform_term(
                language, tto, assumed_languages, db,
                use_etyms=(lang2 in polysemous_languages)
            )

            if cfrom is None or cto is None:
                continue
            if uri_prefix(cfrom, 3) == uri_prefix(cto, 3):
                continue

            rel, switch = transform_relation(item['rel'])
            if rel is None:
                continue
            if switch:
                cfrom, cto = cto, cfrom

            # When translations are separated by sense, use only the first
            # sense we see for each etymology. That will have the most
            # representative translations.
            if item['rel'] == 'translation':
                etym_key = (tfrom['language'], etym_label(language, tfrom))
                sense = tfrom.get('sense', '')
                if etym_key in etym_to_translation_sense:
                    if etym_to_translation_sense[etym_key] != sense:
                        continue
                else:
                    etym_to_translation_sense[etym_key] = sense

            weight = 1.
            if rel == '/r/EtymologicallyRelatedTo':
                weight = 0.25
            edge = make_edge(rel, cfrom, cto, dataset=dataset, weight=weight,
                             sources=[source],
                             surfaceStart=tfrom['text'],
                             surfaceEnd=tto['text'],
                             license=Licenses.cc_sharealike)
            out.write(edge)

    out.close()
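A minimal sketch of driving this reader by hand; the file paths are hypothetical (the real build supplies them through its pipeline), and the import path is an assumption:

from conceptnet5.readers.wiktionary import read_wiktionary  # assumed import path

read_wiktionary(
    input_file='data/parsed/wiktionary/en.jsons',    # hypothetical path
    db_file='data/db/wiktionary.db',                 # hypothetical path
    output_file='data/edges/wiktionary/en.msgpack',  # hypothetical path
)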
Example #38
0
def make_edge(rel,
              start,
              end,
              dataset,
              license,
              sources,
              surfaceText=None,
              surfaceStart=None,
              surfaceEnd=None,
              weight=1.0):
    """
    Take in the information representing an edge (a justified assertion),
    and output that edge in dictionary form.

        >>> from pprint import pprint
        >>> from conceptnet5.uri import Licenses
        >>> e = make_edge(rel='/r/HasProperty',
        ...               start='/c/en/fire',
        ...               end='/c/en/hot',
        ...               dataset='/d/conceptnet/4/en',
        ...               license=Licenses.cc_attribution,
        ...               sources=[{'contributor': '/s/contributor/omcs/dev'}],
        ...               surfaceText='[[Fire]] is [[hot]]',
        ...               weight=1.0)
        >>> pprint(e)
        {'dataset': '/d/conceptnet/4/en',
         'end': '/c/en/hot',
         'features': ['/c/en/fire /r/HasProperty -',
                      '/c/en/fire - /c/en/hot',
                      '- /r/HasProperty /c/en/hot'],
         'license': 'cc:by/4.0',
         'rel': '/r/HasProperty',
         'sources': [{'contributor': '/s/contributor/omcs/dev'}],
         'start': '/c/en/fire',
         'surfaceEnd': 'hot',
         'surfaceStart': 'Fire',
         'surfaceText': '[[Fire]] is [[hot]]',
         'uri': '/a/[/r/HasProperty/,/c/en/fire/,/c/en/hot/]',
         'weight': 1.0}
    """
    pstart = uri_prefix(start)
    pend = uri_prefix(end)
    if is_concept(pstart) and is_concept(pend):
        features = [
            "%s %s -" % (pstart, rel),
            "%s - %s" % (pstart, pend),
            "- %s %s" % (rel, pend)
        ]
    else:
        features = []
    uri = assertion_uri(rel, start, end)

    assert isinstance(sources, list), sources
    assert all([isinstance(source, dict) for source in sources]), sources

    if surfaceStart is None or surfaceEnd is None:
        surfaceStart, surfaceEnd = extract_surface_terms(surfaceText)
    obj = {
        'uri': uri,
        'rel': rel,
        'start': start,
        'end': end,
        'dataset': dataset,
        'sources': sources,
        'features': features,
        'license': license,
        'weight': weight,
        'surfaceText': surfaceText,
        'surfaceStart': surfaceStart,
        'surfaceEnd': surfaceEnd
    }
    return obj
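The doctest above covers the concept-to-concept case. As a hedged complement, mirroring the '/r/ExternalURL' call in the Wiktionary reader earlier on this page, the sketch below builds an edge whose end node is a URL, so is_concept() is False for it and no features are generated (the import path for make_edge is an assumption):

from conceptnet5.edges import make_edge  # assumed import path
from conceptnet5.uri import Licenses

ld_edge = make_edge(
    rel='/r/ExternalURL',
    start='/c/en/fire',
    end='http://en.wiktionary.org/wiki/fire',
    dataset='/d/wiktionary/en',
    license=Licenses.cc_sharealike,
    sources=[{'contributor': '/s/resource/wiktionary/en'}],
    weight=0.25
)
assert ld_edge['features'] == []  # non-concept endpoint, so no features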
Example #39
0
def describe_sources(sources, specific=True):
    """
    Build a marked-up text phrase describing the sources of our data.

    If `specific` is True, sources with many known individual contributors
    will list up to MAX_INDIVIDUALS of those contributors. If False, only
    the source as a whole will be credited. specific=False is used for the
    credit at the top of a page.
    """
    omcs_contributors = []
    omcs_count = 0
    ptt_count = 0
    nadya_count = 0
    more_sources = set()

    for source in sources:
        if 'activity' in source and source['activity'] == '/s/activity/omcs/nadya.jp':
            nadya_count += 1
        elif 'activity' in source and source['activity'] == '/s/activity/kyoto_yahoo':
            more_sources.add(source_link(source['activity'], KYOTO_YAHOO_CREDIT))
        elif 'contributor' in source:
            contributor = source['contributor']
            prefix = uri_prefix(contributor, 3)
            if prefix == '/s/contributor/omcs':
                if len(omcs_contributors) < MAX_INDIVIDUALS:
                    name = split_uri(contributor)[-1]
                    omcs_contributors.append(source_link(contributor, name))
                omcs_count += 1
            elif prefix == '/s/contributor/petgame':
                ptt_count += 1
            elif contributor in CONTRIBUTOR_NAME_MAP:
                more_sources.add(
                    source_link(contributor, CONTRIBUTOR_NAME_MAP[contributor])
                )
            else:
                more_sources.add(source_link(contributor, contributor))

    source_chunks = []
    if omcs_contributors:
        if specific:
            if omcs_count > MAX_INDIVIDUALS:
                omcs_contributors.append("{} more".format(omcs_count - MAX_INDIVIDUALS))

            omcs_str = '<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors {}'.format(
                oxford_comma(omcs_contributors)
            )
            source_chunks.append(omcs_str)
        else:
            source_chunks.append(
                '<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors'
            )
    if ptt_count:
        if specific:
            if ptt_count == 1:
                count_str = "a player"
            else:
                count_str = "{} players".format(ptt_count)
            source_chunks.append(
                '{} of the <a href="/s/contributor/petgame">PTT Pet Game</a>'.format(
                    count_str
                )
            )
        else:
            source_chunks.append(
                'the <a href="/s/contributor/petgame">PTT Pet Game</a>'
            )

    if nadya_count:
        if specific:
            if nadya_count == 1:
                count_str = "a player"
            else:
                count_str = "{} players".format(nadya_count)
            source_chunks.append(
                '{} of <a href="/s/activity/omcs/nadya.jp">nadya.jp</a>'.format(
                    count_str
                )
            )
        else:
            source_chunks.append('<a href="/s/activity/omcs/nadya.jp">nadya.jp</a>')

    source_chunks.extend(sorted(more_sources))
    if len(source_chunks) == 1:
        source_markup = "<strong>Source:</strong> {}".format(source_chunks[0])
    else:
        source_markup = "<strong>Sources:</strong> {}".format(
            oxford_comma(source_chunks)
        )
    return Markup(source_markup)
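To make the expected input concrete, here is a hedged sketch with hypothetical source dictionaries; it assumes the function above and its helpers (source_link, oxford_comma, MAX_INDIVIDUALS) are in scope, and the markup shown in the comment is only approximate:

sources = [
    {'contributor': '/s/contributor/omcs/dev'},  # hypothetical OMCS contributor
    {'activity': '/s/activity/omcs/nadya.jp'},
    {'activity': '/s/activity/omcs/nadya.jp'},
]
markup = describe_sources(sources)
# Roughly: Sources: <a href="/s/activity/omcs">Open Mind Common Sense</a>
# contributors <a href="/s/contributor/omcs/dev">dev</a> and
# 2 players of <a href="/s/activity/omcs/nadya.jp">nadya.jp</a>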
Example #40
0
def describe_sources(sources, specific=True):
    omcs_contributors = []
    omcs_count = 0
    ptt_count = 0
    nadya_count = 0
    more_sources = set()

    for source in sources:
        if 'activity' in source and source['activity'] == '/s/activity/omcs/nadya.jp':
            nadya_count += 1
        elif 'contributor' in source:
            contributor = source['contributor']
            prefix = uri_prefix(contributor, 3)
            if prefix == '/s/contributor/omcs':
                if len(omcs_contributors) < MAX_INDIVIDUALS:
                    name = split_uri(contributor)[-1]
                    omcs_contributors.append(source_link(contributor, name))
                omcs_count += 1
            elif prefix == '/s/contributor/petgame':
                ptt_count += 1
            elif prefix == '/s/resource/en.wiktionary.org':
                more_sources.add(source_link(prefix, "English Wiktionary"))
            elif prefix == '/s/resource/de.wiktionary.org':
                more_sources.add(source_link(prefix, "German Wiktionary"))
            elif prefix == '/s/resource/fr.wiktionary.org':
                more_sources.add(source_link(prefix, "French Wiktionary"))
            elif contributor in CONTRIBUTOR_NAME_MAP:
                more_sources.add(source_link(contributor, CONTRIBUTOR_NAME_MAP[contributor]))
            else:
                more_sources.add(source_link(contributor, contributor))

    source_chunks = []
    if omcs_contributors:
        if specific:
            if omcs_count > MAX_INDIVIDUALS:
                omcs_contributors.append("{} more".format(omcs_count - MAX_INDIVIDUALS))

            omcs_str = '<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors {}'.format(
                oxford_comma(omcs_contributors)
            )
            source_chunks.append(omcs_str)
        else:
            source_chunks.append('<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors')
    if ptt_count:
        if specific:
            if ptt_count == 1:
                count_str = "a player"
            else:
                count_str = "{} players".format(ptt_count)
            source_chunks.append(
                '{} of the <a href="/s/contributor/petgame">PTT Pet Game</a>'.format(count_str)
            )
        else:
            source_chunks.append('the <a href="/s/contributor/petgame">PTT Pet Game</a>')

    if nadya_count:
        if specific:
            if nadya_count == 1:
                count_str = "a player"
            else:
                count_str = "{} players".format(nadya_count)
            source_chunks.append(
                '{} of <a href="/s/activity/omcs/nadya.jp">nadya.jp</a>'.format(count_str)
            )
        else:
            source_chunks.append('<a href="/s/activity/omcs/nadya.jp">nadya.jp</a>')

    source_chunks.extend(sorted(more_sources))
    if len(source_chunks) == 1:
        source_markup = "<strong>Source:</strong> {}".format(source_chunks[0])
    else:
        source_markup = "<strong>Sources:</strong> {}".format(oxford_comma(source_chunks))
    return Markup(source_markup)
Example #41
0
def uri_to_label(uri):
    """
    Get a human-readable label from a ConceptNet URI: for a concept URI,
    reduce it to its prefix first, then take the last path component and
    replace underscores with spaces.
    """
    if uri.startswith('/c/'):
        uri = uri_prefix(uri)
    return uri.split('/')[-1].replace('_', ' ')
def build_features_from_conceptnet_table(filename):
    mat = SparseMatrixBuilder()

    concept_labels = OrderedSet()
    feature_labels = OrderedSet()

    with open(str(filename), encoding='utf-8') as infile:
        for line in infile:
            concept1, concept2, value_str, dataset, relation = line.strip().split('\t')
            concept1 = replace_numbers(concept1)
            concept2 = replace_numbers(concept2)
            value = float(value_str)
            # Start a fresh (feature, concept) pair list for every line; the
            # else branch below relies on it existing too.
            feature_pairs = []
            if relation in SYMMETRIC_RELATIONS:
                if get_language(concept1) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} ~'.format(uri_prefix(concept1), relation), concept2)
                    )
                if get_language(concept2) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} ~'.format(uri_prefix(concept2), relation), concept1)
                    )
            else:
                if get_language(concept1) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} -'.format(uri_prefix(concept1), relation), concept2)
                    )
                if get_language(concept2) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('- {} {}'.format(uri_prefix(concept2), relation), concept1)
                    )

            feature_counts = defaultdict(int)
            for feature, concept in feature_pairs:
                feature_counts[feature] += 1

            for feature, concept in feature_pairs:
                prefixes = list(uri_prefixes(concept, 3))
                if feature_counts[feature] > 1:
                    for prefix in prefixes:
                        concept_index = concept_labels.add(prefix)
                        feature_index = feature_labels.add(feature)
                        mat[concept_index, feature_index] = value

    # Link nodes to their more general versions
    for concept in concept_labels:
        prefixes = list(uri_prefixes(concept, 3))
        for prefix in prefixes:
            auto_features = [
                '{} {} ~'.format(prefix, 'SimilarTo'),
                '{} {} ~'.format(prefix, 'RelatedTo'),
                '{} {} -'.format(prefix, 'FormOf'),
                '- {} {}'.format(prefix, 'FormOf'),
            ]
            for feature in auto_features:
                concept_index = concept_labels.add(prefix)
                feature_index = feature_labels.add(feature)
                mat[concept_index, feature_index] = value

    shape = (len(concept_labels), len(feature_labels))
    c_index = pd.Index(concept_labels)
    f_index = pd.Index(feature_labels)
    return mat.tocsr(shape), c_index, f_index
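A hedged usage sketch; the path is hypothetical, and the function above (with its imports) is assumed to be in scope. It shows how the returned CSR matrix lines up with the two pandas Indexes:

sparse, concept_index, feature_index = build_features_from_conceptnet_table(
    'data/assoc/reduced.csv'  # hypothetical tab-separated association file
)
# The CSR matrix is concepts x features, aligned with the two indexes.
assert sparse.shape == (len(concept_index), len(feature_index))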
Example #43
0
def uri_to_label(uri):
    if uri.startswith('/c/'):
        uri = uri_prefix(uri)
    return uri.split('/')[-1].replace('_', ' ')
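A minimal usage sketch, assuming uri_prefix is imported from conceptnet5.uri as elsewhere on this page:

print(uri_to_label('/c/en/semantic_network/n'))  # 'semantic network'
print(uri_to_label('/r/RelatedTo'))              # 'RelatedTo'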
Example #44
0
def text_to_vector(self, language, text):
    """Used in the Story Cloze Test to create a vector for a text."""
    tokens = wordfreq.tokenize(text, language)
    weighted_terms = [(uri_prefix(standardized_uri(language, token)), 1.) for token in tokens]
    return self.get_vector(weighted_terms, include_neighbors=False)
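To make the term-weighting step concrete, here is a standalone sketch of what weighted_terms looks like; the import paths for uri_prefix and standardized_uri are assumptions, and the wordfreq package must be installed:

import wordfreq
from conceptnet5.uri import uri_prefix
from conceptnet5.vectors import standardized_uri  # assumed import path

tokens = wordfreq.tokenize('The cat sat on the mat', 'en')
weighted_terms = [(uri_prefix(standardized_uri('en', token)), 1.) for token in tokens]
# e.g. [('/c/en/the', 1.0), ('/c/en/cat', 1.0), ('/c/en/sat', 1.0), ...]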
Example #45
0
def build_features_from_conceptnet_table(filename):
    mat = SparseMatrixBuilder()

    concept_labels = OrderedSet()
    feature_labels = OrderedSet()

    with open(str(filename), encoding='utf-8') as infile:
        for line in infile:
            concept1, concept2, value_str, dataset, relation = line.strip().split('\t')
            concept1 = replace_numbers(concept1)
            concept2 = replace_numbers(concept2)
            value = float(value_str)
            # Start a fresh (feature, concept) pair list for every line; the
            # else branch below relies on it existing too.
            feature_pairs = []
            if relation in SYMMETRIC_RELATIONS:
                if get_language(concept1) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} ~'.format(uri_prefix(concept1), relation), concept2)
                    )
                if get_language(concept2) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} ~'.format(uri_prefix(concept2), relation), concept1)
                    )
            else:
                if get_language(concept1) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} -'.format(uri_prefix(concept1), relation), concept2)
                    )
                if get_language(concept2) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('- {} {}'.format(uri_prefix(concept2), relation), concept1)
                    )

            feature_counts = defaultdict(int)
            for feature, concept in feature_pairs:
                feature_counts[feature] += 1

            for feature, concept in feature_pairs:
                prefixes = list(uri_prefixes(concept, 3))
                if feature_counts[feature] > 1:
                    for prefix in prefixes:
                        concept_index = concept_labels.add(prefix)
                        feature_index = feature_labels.add(feature)
                        mat[concept_index, feature_index] = value

    # Link nodes to their more general versions
    for concept in concept_labels:
        prefixes = list(uri_prefixes(concept, 3))
        for prefix in prefixes:
            auto_features = [
                '{} {} ~'.format(prefix, 'SimilarTo'),
                '{} {} ~'.format(prefix, 'RelatedTo'),
                '{} {} -'.format(prefix, 'FormOf'),
                '- {} {}'.format(prefix, 'FormOf'),
            ]
            for feature in auto_features:
                concept_index = concept_labels.add(prefix)
                feature_index = feature_labels.add(feature)
                mat[concept_index, feature_index] = value

    shape = (len(concept_labels), len(feature_labels))
    c_index = pd.Index(concept_labels)
    f_index = pd.Index(feature_labels)
    return mat.tocsr(shape), c_index, f_index
Example #46
0
def make_edge(rel, start, end, dataset, license, sources,
              surfaceText=None, surfaceStart=None, surfaceEnd=None, weight=1.0):
    """
    Take in the information representing an edge (a justified assertion),
    and output that edge in dictionary form.

        >>> from pprint import pprint
        >>> from conceptnet5.uri import Licenses
        >>> e = make_edge(rel='/r/HasProperty',
        ...               start='/c/en/fire',
        ...               end='/c/en/hot',
        ...               dataset='/d/conceptnet/4/en',
        ...               license=Licenses.cc_attribution,
        ...               sources=[{'contributor': '/s/contributor/omcs/dev'}],
        ...               surfaceText='[[Fire]] is [[hot]]',
        ...               weight=1.0)
        >>> pprint(e)
        {'dataset': '/d/conceptnet/4/en',
         'end': '/c/en/hot',
         'features': ['/c/en/fire /r/HasProperty -',
                      '/c/en/fire - /c/en/hot',
                      '- /r/HasProperty /c/en/hot'],
         'license': 'cc:by/4.0',
         'rel': '/r/HasProperty',
         'sources': [{'contributor': '/s/contributor/omcs/dev'}],
         'start': '/c/en/fire',
         'surfaceEnd': 'hot',
         'surfaceStart': 'Fire',
         'surfaceText': '[[Fire]] is [[hot]]',
         'uri': '/a/[/r/HasProperty/,/c/en/fire/,/c/en/hot/]',
         'weight': 1.0}
    """
    pstart = uri_prefix(start)
    pend = uri_prefix(end)
    if is_concept(pstart) and is_concept(pend):
        features = [
            "%s %s -" % (pstart, rel),
            "%s - %s" % (pstart, pend),
            "- %s %s" % (rel, pend)
        ]
    else:
        features = []
    uri = assertion_uri(rel, start, end)

    assert isinstance(sources, list), sources
    assert all([isinstance(source, dict) for source in sources]), sources

    if surfaceStart is None or surfaceEnd is None:
        surfaceStart, surfaceEnd = extract_surface_terms(surfaceText)
    obj = {
        'uri': uri,
        'rel': rel,
        'start': start,
        'end': end,
        'dataset': dataset,
        'sources': sources,
        'features': features,
        'license': license,
        'weight': weight,
        'surfaceText': surfaceText,
        'surfaceStart': surfaceStart,
        'surfaceEnd': surfaceEnd
    }
    return obj
def read_wiktionary(input_file, db_file, output_file):
    """
    Convert a stream of parsed Wiktionary data into ConceptNet edges.

    A `db_file` containing all known words in all languages must have already
    been prepared from the same data.
    """
    db = sqlite3.connect(db_file)
    out = MsgpackStreamWriter(output_file)
    for heading, items in segmented_stream(input_file):
        language = heading['language']
        title = heading['title']
        dataset = '/d/wiktionary/{}'.format(language)
        url_title = heading['title'].replace(' ', '_')
        web_url = 'http://{}.wiktionary.org/wiki/{}'.format(
            language, url_title)
        web_source = '/s/resource/wiktionary/{}'.format(language)

        source = {'contributor': web_source, 'process': PARSER_RULE}

        # Scan through the 'from' items, such as the start nodes of
        # translations, looking for distinct etymologies. If we get more than
        # one etymology for a language, we need to distinguish them as
        # different senses in that language.
        all_etyms = {
            (item['from']['language'], etym_label(language, item['from']))
            for item in items
            if 'language' in item['from'] and item['from']['text'] == title
            and etym_label(language, item['from']) is not None
        }
        word_languages = {wlang for (wlang, _) in all_etyms}
        for wlang in sorted(word_languages):
            if valid_language(wlang):
                cpage = standardized_concept_uri(wlang, title)
                ld_edge = make_edge('/r/ExternalURL',
                                    cpage,
                                    web_url,
                                    dataset=dataset,
                                    weight=0.25,
                                    sources=[source],
                                    license=Licenses.cc_sharealike)
                out.write(ld_edge)
        etym_to_translation_sense = {}
        language_etym_counts = Counter(lang for (lang, etym) in all_etyms)
        polysemous_languages = {
            lang
            for lang in language_etym_counts if language_etym_counts[lang] > 1
        }

        for item in items:
            tfrom = item['from']
            tto = item['to']
            assumed_languages = [language]
            lang1 = tfrom.get('language')
            lang2 = tto.get('language')
            if lang1 and (lang1 not in assumed_languages) and valid_language(lang1):
                assumed_languages.append(lang1)
            if lang2 and (lang2 not in assumed_languages) and valid_language(lang2):
                assumed_languages.append(lang2)

            cfrom = transform_term(language,
                                   tfrom,
                                   assumed_languages,
                                   db,
                                   use_etyms=(lang1 in polysemous_languages))
            cpage = cfrom
            cto = transform_term(language,
                                 tto,
                                 assumed_languages,
                                 db,
                                 use_etyms=(lang2 in polysemous_languages))

            if cfrom is None or cto is None:
                continue
            if uri_prefix(cfrom, 3) == uri_prefix(cto, 3):
                continue

            rel, switch = transform_relation(item['rel'])
            if rel is None:
                continue
            if switch:
                cfrom, cto = cto, cfrom

            # When translations are separated by sense, use only the first
            # sense we see for each etymology. That will have the most
            # representative translations.
            if item['rel'] == 'translation':
                etym_key = (tfrom['language'], etym_label(language, tfrom))
                sense = tfrom.get('sense', '')
                if etym_key in etym_to_translation_sense:
                    if etym_to_translation_sense[etym_key] != sense:
                        continue
                else:
                    etym_to_translation_sense[etym_key] = sense

            weight = 1.
            if rel == '/r/EtymologicallyRelatedTo':
                weight = 0.25
            edge = make_edge(rel,
                             cfrom,
                             cto,
                             dataset=dataset,
                             weight=weight,
                             sources=[source],
                             surfaceStart=tfrom['text'],
                             surfaceEnd=tto['text'],
                             license=Licenses.cc_sharealike)
            out.write(edge)

    out.close()
Example #48
0
def describe_sources(sources, specific=True):
    omcs_contributors = []
    omcs_count = 0
    ptt_count = 0
    nadya_count = 0
    more_sources = set()

    for source in sources:
        if 'activity' in source and source['activity'] == '/s/activity/omcs/nadya.jp':
            nadya_count += 1
        elif 'contributor' in source:
            contributor = source['contributor']
            prefix = uri_prefix(contributor, 3)
            if prefix == '/s/contributor/omcs':
                if len(omcs_contributors) < MAX_INDIVIDUALS:
                    name = split_uri(contributor)[-1]
                    omcs_contributors.append(source_link(contributor, name))
                omcs_count += 1
            elif prefix == '/s/contributor/petgame':
                ptt_count += 1
            elif prefix == '/s/resource/en.wiktionary.org':
                more_sources.add(source_link(prefix, "English Wiktionary"))
            elif prefix == '/s/resource/de.wiktionary.org':
                more_sources.add(source_link(prefix, "German Wiktionary"))
            elif prefix == '/s/resource/fr.wiktionary.org':
                more_sources.add(source_link(prefix, "French Wiktionary"))
            elif contributor in CONTRIBUTOR_NAME_MAP:
                more_sources.add(
                    source_link(contributor,
                                CONTRIBUTOR_NAME_MAP[contributor]))
            else:
                more_sources.add(source_link(contributor, contributor))

    source_chunks = []
    if omcs_contributors:
        if specific:
            if omcs_count > MAX_INDIVIDUALS:
                omcs_contributors.append("{} more".format(omcs_count -
                                                          MAX_INDIVIDUALS))

            omcs_str = '<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors {}'.format(
                oxford_comma(omcs_contributors))
            source_chunks.append(omcs_str)
        else:
            source_chunks.append(
                '<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors'
            )
    if ptt_count:
        if specific:
            if ptt_count == 1:
                count_str = "a player"
            else:
                count_str = "{} players".format(ptt_count)
            source_chunks.append(
                '{} of the <a href="/s/contributor/petgame">PTT Pet Game</a>'.format(count_str))
        else:
            source_chunks.append(
                'the <a href="/s/contributor/petgame">PTT Pet Game</a>')

    if nadya_count:
        if specific:
            if nadya_count == 1:
                count_str = "a player"
            else:
                count_str = "{} players".format(nadya_count)
            source_chunks.append(
                '{} of <a href="/s/activity/omcs/nadya.jp">nadya.jp</a>'.format(count_str))
        else:
            source_chunks.append(
                '<a href="/s/activity/omcs/nadya.jp">nadya.jp</a>')

    source_chunks.extend(sorted(more_sources))
    if len(source_chunks) == 1:
        source_markup = "<strong>Source:</strong> {}".format(source_chunks[0])
    else:
        source_markup = "<strong>Sources:</strong> {}".format(
            oxford_comma(source_chunks))
    return Markup(source_markup)