コード例 #1
0
def reduce_assoc(filename, output_filename, cutoff=3, en_cutoff=3):
    """
    Takes in a file of tab-separated simple associations, and removes
    uncommon associations and associations unlikely to be useful.

    Every input line is expected to hold five tab-separated fields: left
    concept, right concept, value, dataset and relation.  The input file is
    read twice: once to count how often each concept prefix (the result of
    `uri_prefix`) occurs, and once to write the surviving associations to
    `output_filename`.

    Counting: lines whose relation is '/r/SenseOf' are not counted.  For
    every other line, the left prefix is counted when the right prefix
    starts with '/c/', and the right prefix is counted when the left prefix
    starts with '/c/'.

    A concept prefix is kept when its count is at least `en_cutoff`, or when
    it does not start with '/c/en/' and its count is at least `cutoff`.  With
    `en_cutoff >= cutoff` this means that English concepts occurring fewer
    than `en_cutoff` times, and other concepts occurring fewer than `cutoff`
    times, are removed.  (If `en_cutoff < cutoff`, `en_cutoff` is effectively
    the threshold for every concept.)

    A line is written to the output only if neither concept is rejected by
    `concept_is_bad`, the relation is not rejected by `is_negative_relation`,
    the value is nonzero, both prefixes are kept, and the two prefixes
    differ.  The written line carries the two prefixes followed by the
    original value, dataset and relation text.

    Raises ValueError if a line has fewer than five fields, or if the value
    of a line that reaches the numeric check cannot be parsed by `float`.
    """
    counts = defaultdict(int)
    with open(filename, encoding='utf-8') as file:
        for line in file:
            # Split into at most five fields, exactly as the writing pass
            # below does, so both passes accept the same set of lines.
            left, right, _value, _dataset, rel = line.rstrip().split('\t', 4)
            if rel == '/r/SenseOf':
                continue
            gleft = uri_prefix(left)
            gright = uri_prefix(right)
            # An endpoint is only credited when the *other* endpoint is
            # under '/c/'.
            if gright.startswith('/c/'):
                counts[gleft] += 1
            if gleft.startswith('/c/'):
                counts[gright] += 1

    filtered_concepts = {
        concept for (concept, count) in counts.items()
        if (
            count >= en_cutoff or
            (not concept.startswith('/c/en/') and count >= cutoff)
        )
    }

    with open(output_filename, 'w', encoding='utf-8') as out:
        with open(filename, encoding='utf-8') as file:
            for line in file:
                left, right, value, dataset, rel = line.rstrip().split('\t', 4)
                if concept_is_bad(left) or concept_is_bad(right) or is_negative_relation(rel):
                    continue
                fvalue = float(value)
                gleft = uri_prefix(left)
                gright = uri_prefix(right)
                if fvalue == 0 or gleft == gright:
                    continue
                if gleft not in filtered_concepts or gright not in filtered_concepts:
                    continue
                # The value is written back as its original text, not as the
                # parsed float.
                print('\t'.join([gleft, gright, value, dataset, rel]), file=out)
コード例 #2
0
    def from_csv(cls,
                 filename,
                 filtered_concepts=None,
                 reject_negative_relations=True):
        """
        Builds an (undirected) graph from a tab-separated association file.

        Each line is split into five fields: left concept, right concept,
        value, dataset and relation.  A line contributes an edge between the
        `uri_prefix` of its two concepts unless one of the following holds,
        in which case the line is skipped:

        * either concept is rejected by `concept_is_bad`;
        * `reject_negative_relations` is True (the default) and the relation
          is rejected by `is_negative_relation`;
        * the value, parsed as a float, is zero;
        * both concepts have the same prefix;
        * `filtered_concepts` is not None and at least one of the two
          prefixes is not a member of it.

        Passing `filtered_concepts=None` (the default) disables the
        membership check altogether: every concept is allowed, as if the
        filter were the universal set rather than the empty set.

        Returns the graph created by calling `cls()` with the edges added.
        """
        result = cls()
        restrict = filtered_concepts is not None

        with open(filename, encoding='utf-8') as infile:
            for row in infile:
                left, right, value, dataset, rel = row.rstrip().split('\t', 4)

                if concept_is_bad(left) or concept_is_bad(right):
                    continue
                if reject_negative_relations and is_negative_relation(rel):
                    continue

                weight = float(value)
                src = uri_prefix(left)
                dst = uri_prefix(right)

                # Drop zero-weight associations and self-loops.
                if weight == 0 or src == dst:
                    continue
                if restrict and (
                    src not in filtered_concepts or
                    dst not in filtered_concepts
                ):
                    continue

                # The edge keeps the value as the text read from the file.
                result.add_edge(src, dst, value, dataset, rel)

        return result
コード例 #3
0
def reduce_assoc(filename, output_filename, cutoff=3, en_cutoff=3):
    """
    Takes in a file of tab-separated simple associations, and removes
    uncommon associations and associations unlikely to be useful.

    Every input line is expected to hold five tab-separated fields: left
    concept, right concept, value, dataset and relation.  The input file is
    read twice: once to count how often each concept prefix (the result of
    `uri_prefix`) occurs, and once to write the surviving associations to
    `output_filename`.

    Counting: lines whose relation is '/r/SenseOf' are not counted.  Every
    other line adds one to the count of its left prefix and one to the
    count of its right prefix.

    A concept prefix is kept when its count is at least `en_cutoff`, or when
    it does not start with '/c/en/' and its count is at least `cutoff`.  With
    `en_cutoff >= cutoff` this means that English concepts occurring fewer
    than `en_cutoff` times, and other concepts occurring fewer than `cutoff`
    times, are removed.  (If `en_cutoff < cutoff`, `en_cutoff` is effectively
    the threshold for every concept.)

    A line is written to the output only if neither concept is rejected by
    `concept_is_bad`, the relation is not rejected by `is_negative_relation`,
    the value is nonzero, both prefixes are kept, and the two prefixes
    differ.  The written line carries the two prefixes followed by the
    original value, dataset and relation text.

    Raises ValueError if a line has fewer than five fields, or if the value
    of a line that reaches the numeric check cannot be parsed by `float`.
    """
    counts = defaultdict(int)
    with open(filename, encoding='utf-8') as file:
        for line in file:
            # Split into at most five fields, exactly as the writing pass
            # below does, so both passes accept the same set of lines.
            left, right, _value, _dataset, rel = line.rstrip().split('\t', 4)
            if rel == '/r/SenseOf':
                continue
            counts[uri_prefix(left)] += 1
            counts[uri_prefix(right)] += 1

    filtered_concepts = {
        concept for (concept, count) in counts.items()
        if (
            count >= en_cutoff or
            (not concept.startswith('/c/en/') and count >= cutoff)
        )
    }

    with open(output_filename, 'w', encoding='utf-8') as out:
        with open(filename, encoding='utf-8') as file:
            for line in file:
                left, right, value, dataset, rel = line.rstrip().split('\t', 4)
                if concept_is_bad(left) or concept_is_bad(right) or is_negative_relation(rel):
                    continue
                fvalue = float(value)
                gleft = uri_prefix(left)
                gright = uri_prefix(right)
                if fvalue == 0 or gleft == gright:
                    continue
                if gleft not in filtered_concepts or gright not in filtered_concepts:
                    continue
                # The value is written back as its original text, not as the
                # parsed float.
                print('\t'.join([gleft, gright, value, dataset, rel]), file=out)
コード例 #4
0
    def from_csv(cls, filename, filtered_concepts=None, reject_negative_relations=True):
        """
        Builds an (undirected) graph from a tab-separated association file.

        Each line is split into five fields: left concept, right concept,
        value, dataset and relation.  A line contributes an edge between the
        `uri_prefix` of its two concepts unless one of the following holds,
        in which case the line is skipped:

        * either concept is rejected by `concept_is_bad`;
        * `reject_negative_relations` is True (the default) and the relation
          is rejected by `is_negative_relation`;
        * the value, parsed as a float, is zero;
        * both concepts have the same prefix;
        * `filtered_concepts` is not None and at least one of the two
          prefixes is not a member of it.

        Passing `filtered_concepts=None` (the default) disables the
        membership check altogether: every concept is allowed, as if the
        filter were the universal set rather than the empty set.

        Returns the graph created by calling `cls()` with the edges added.
        """
        result = cls()
        restrict = filtered_concepts is not None

        with open(filename, encoding='utf-8') as infile:
            for row in infile:
                left, right, value, dataset, rel = row.rstrip().split('\t', 4)

                if concept_is_bad(left) or concept_is_bad(right):
                    continue
                if reject_negative_relations and is_negative_relation(rel):
                    continue

                weight = float(value)
                src = uri_prefix(left)
                dst = uri_prefix(right)

                # Drop zero-weight associations and self-loops.
                if weight == 0 or src == dst:
                    continue
                if restrict and (
                    src not in filtered_concepts or
                    dst not in filtered_concepts
                ):
                    continue

                # The edge keeps the value as the text read from the file.
                result.add_edge(src, dst, value, dataset, rel)

        return result