Example #1
def gin_indexable_edge(edge):
    """
    Convert an edge into a dictionary that can be matched with the JSONB @>
    operator, which tests if one dictionary includes all the information in
    another. This operator can be indexed by GIN.

    We replace the 'start', 'end', 'rel', and 'dataset' URIs with lists
    of their URI prefixes. We query those slots with a single-element list,
    which will be a sub-list of the prefix list if it's a match.

    As an example, a query for {'start': '/c/en'} will become the GIN
    query {'start': ['/c/en']}, which will match indexed edges such as
    {
        'start': ['/c/en', '/c/en/dog'],
        'end': ['/c/en', '/c/en/bark'],
        'rel': ['/r/CapableOf'],
        ...
    }
    """
    gin_edge = {}
    gin_edge['uri'] = edge['uri']
    gin_edge['start'] = uri_prefixes(edge['start'])
    gin_edge['end'] = uri_prefixes(edge['end'])
    gin_edge['rel'] = uri_prefixes(edge['rel'])
    gin_edge['dataset'] = uri_prefixes(edge['dataset'])
    flat_sources = set()
    for source in edge['sources']:
        for value in source.values():
            flat_sources.update(uri_prefixes(value, min_pieces=3))
    gin_edge['sources'] = sorted(flat_sources)
    return gin_edge
Example #2
def gin_indexable_edge(edge):
    """
    Convert an edge into a dictionary that can be matched with the JSONB @>
    operator, which tests if one dictionary includes all the information in
    another. This operator can be indexed by GIN.

    We replace the 'start', 'end', 'rel', and 'dataset' URIs with lists
    of their URI prefixes. We query those slots with a single-element list,
    which will be a sub-list of the prefix list if it's a match.

    As an example, a query for {'start': '/c/en'} will become the GIN
    query {'start': ['/c/en']}, which will match indexed edges such as
    {
        'start': ['/c/en', '/c/en/dog'],
        'end': ['/c/en', '/c/en/bark'],
        'rel': ['/r/CapableOf'],
        ...
    }
    """
    gin_edge = {}
    gin_edge['uri'] = edge['uri']
    gin_edge['start'] = uri_prefixes(edge['start'])
    gin_edge['end'] = uri_prefixes(edge['end'])
    gin_edge['rel'] = uri_prefixes(edge['rel'])
    gin_edge['dataset'] = uri_prefixes(edge['dataset'])
    flat_sources = set()
    for source in edge['sources']:
        for value in source.values():
            flat_sources.update(uri_prefixes(value, min_pieces=3))
    gin_edge['sources'] = sorted(flat_sources)
    return gin_edge
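
Both versions of gin_indexable_edge above rely on uri_prefixes to expand a URI into the list of its ancestors. A minimal sketch of that behaviour, inferred from the docstring's example rather than taken from the ConceptNet source:

def uri_prefixes_sketch(uri, min_pieces=2):
    # Hypothetical stand-in for uri_prefixes: yield every prefix of the URI
    # that has at least `min_pieces` slash-separated pieces.
    pieces = uri.strip('/').split('/')
    for n in range(min_pieces, len(pieces) + 1):
        yield '/' + '/'.join(pieces[:n])

print(list(uri_prefixes_sketch('/c/en/dog')))
# ['/c/en', '/c/en/dog']
# With that assumption, the GIN query {'start': ['/c/en']} is contained in the
# indexed edge's {'start': ['/c/en', '/c/en/dog']}, which is what @> tests.
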
Example #3
def write_prefixes(prefix_file, seen_prefixes, node_list, node):
    for prefix in uri_prefixes(node):
        if (node, prefix) not in seen_prefixes:
            seen_prefixes.add((node, prefix))
            node_idx = node_list.add(node)
            prefix_idx = node_list.add(prefix)
            write_row(prefix_file, [node_idx, prefix_idx])
Example #4
def expand_terms(terms, limit_per_term=20):
	start = itemgetter('start')
	end = itemgetter('end')
	results = []
	uris = set()
	expanded = terms[:]
	for term in expanded:
		for edge in FINDER.lookup(term, limit=limit_per_term):

			if field_match(start(edge), term) and split_uri(end(edge))[1] == 'en':
				neighbor = edge['end']
			elif field_match(end(edge), term) and split_uri(start(edge))[1] == 'en':
				neighbor = edge['start']
			else:
				continue
			neighbor_weight = 1.0 * min(10, edge['weight'])
			if edge['rel'].startswith('/r/Not'):
				neighbor_weight *= -1
			for prefix in uri_prefixes(neighbor):
				uris.add(prefix)
			results.append((neighbor, neighbor_weight))
	total_weight = sum(abs(weight) for (term, weight) in results)
	if total_weight == 0:
		return []
	return [(term, weight, weight / total_weight) for (term, weight) in results]
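
The final list comprehension normalizes the collected weights so that their absolute values sum to 1. A small worked illustration of that arithmetic, using made-up neighbors and weights:

# Hypothetical (neighbor, weight) results; a /r/Not* relation has already
# flipped the middle weight to be negative.
results = [('/c/en/bark', 4.0), ('/c/en/quiet', -2.0), ('/c/en/pet', 2.0)]
total_weight = sum(abs(weight) for (term, weight) in results)   # 8.0
print([(term, weight, weight / total_weight) for (term, weight) in results])
# [('/c/en/bark', 4.0, 0.5), ('/c/en/quiet', -2.0, -0.25), ('/c/en/pet', 2.0, 0.25)]
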
Example #5
def write_prefixes(prefix_file, seen_prefixes, node_list, node):
    for prefix in uri_prefixes(node):
        if (node, prefix) not in seen_prefixes:
            seen_prefixes.add((node, prefix))
            node_idx = node_list.add(node)
            prefix_idx = node_list.add(prefix)
            write_row(prefix_file, [node_idx, prefix_idx])
Example #6
 def add_prefixes(self, filenum, offset, path, weight):
     for prefix in uri_prefixes(path):
         complete = (prefix == path)
         queryhash = minihash(prefix)
         shard = queryhash % self.nshards
         if shard == self.shard_num:
             c = self.db.cursor()
             c.execute(
                 "INSERT OR IGNORE INTO text_index "
                 "(queryhash, filenum, offset, weight, complete) "
                 "VALUES (?, ?, ?, ?, ?)",
                 (queryhash, filenum, offset, weight, complete))
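
Each prefix is routed to exactly one shard by minihash(prefix) % self.nshards, so a lookup for a given prefix only has to touch one shard. A rough sketch of the matching computation on the query side; minihash itself is not shown above, so the stand-in hash here is purely an assumption:

def shard_for_prefix(prefix, nshards, minihash):
    # Mirror of the routing above: the shard whose text_index table would
    # contain the rows for this prefix.
    return minihash(prefix) % nshards

# Stand-in hash for illustration only; the real minihash is not shown here.
print(shard_for_prefix('/c/en/dog', 8, minihash=lambda s: hash(s) & 0xffffffff))
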
Example #7
def build_from_conceptnet_table(filename, orig_index=(), self_loops=True):
    """
    Read a file of tab-separated association data from ConceptNet, such as
    `data/assoc/reduced.csv`. Return a SciPy sparse matrix of the associations,
    and a pandas Index of labels.

    If you specify `orig_index`, then the index of labels will be pre-populated
    with existing labels, and any new labels will get index numbers that are
    higher than the index numbers the existing labels use. This is important
    for producing a sparse matrix that can be used for retrofitting onto an
    existing dense labeled matrix (see retrofit.py).
    """
    mat = SparseMatrixBuilder()

    labels = OrderedSet(orig_index)

    totals = defaultdict(float)
    with open(str(filename), encoding='utf-8') as infile:
        for line in infile:
            concept1, concept2, value_str, dataset, relation = line.strip().split('\t')

            index1 = labels.add(replace_numbers(concept1))
            index2 = labels.add(replace_numbers(concept2))
            value = float(value_str)

            mat[index1, index2] = value
            mat[index2, index1] = value
            totals[index1] += value
            totals[index2] += value

    # Link nodes to their more general versions
    for label in labels:
        prefixes = list(uri_prefixes(label, 3))
        if len(prefixes) >= 2:
            parent_uri = prefixes[-2]
            if parent_uri in labels:
                index1 = labels.index(label)
                index2 = labels.index(parent_uri)
                mat[index1, index2] = 1
                mat[index2, index1] = 1
                totals[index1] += 1
                totals[index2] += 1

    # add self-loops on the diagonal with equal weight to the rest of the row
    if self_loops:
        for key, value in totals.items():
            mat[key, key] = value

    shape = (len(labels), len(labels))
    index = pd.Index(labels)
    return normalize(mat.tocsr(shape), norm='l1', axis=1), index
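
The matrix returned here is row-normalized with the L1 norm (presumably sklearn's normalize, though the import is not shown), so each concept's associations sum to 1, and the pandas Index maps concept URIs to row numbers. A self-contained sketch of that last step:

import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize   # assumed source of `normalize`

# Tiny stand-in for the association matrix: two concepts linked with weight 2.0,
# plus self-loops equal to each row's total (what self_loops=True adds).
labels = pd.Index(['/c/en/dog', '/c/en/bark'])
mat = csr_matrix([[2.0, 2.0],
                  [2.0, 2.0]])
row_normalized = normalize(mat, norm='l1', axis=1)
print(row_normalized.toarray())      # each row now sums to 1.0
print(labels.get_loc('/c/en/dog'))   # 0 -- the row number for a concept URI
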
Example #8
 def add_prefixes(self, filenum, offset, path, weight):
     for prefix in uri_prefixes(path):
         complete = (prefix == path)
         queryhash = minihash(prefix)
         shard = queryhash % self.nshards
         if shard == self.shard_num:
             c = self.db.cursor()
             c.execute(
                 "INSERT OR IGNORE INTO text_index "
                 "(queryhash, filenum, offset, weight, complete) "
                 "VALUES (?, ?, ?, ?, ?)",
                 (queryhash, filenum, offset, weight, complete)
             )
Example #9
 def propagate_blocks(self, edge, verbose=False):
     """
     Scan an edge and see if it is a DerivedFrom or FormOf edge whose right
     side matches a derivation block. If so, add its left side as a simple
     block and a derivation block.
     """
     if edge['rel'].endswith('DerivedFrom') or edge['rel'].endswith('FormOf'):
         if set(uri_prefixes(edge['end'])) & self.derivation_blocks:
             prefix = uri_prefix(edge['start'], 3)
             self.simple_blocks.add(prefix)
             self.derivation_blocks.add(prefix)
             if verbose:
                 print(f"Added derivation block: {prefix}")
Example #10
def build_from_conceptnet_table(filename, orig_index=(), self_loops=True):
    """
    Read a file of tab-separated association data from ConceptNet, such as
    `data/assoc/reduced.csv`. Return a SciPy sparse matrix of the associations,
    and a pandas Index of labels.

    If you specify `orig_index`, then the index of labels will be pre-populated
    with existing labels, and any new labels will get index numbers that are
    higher than the index numbers the existing labels use. This is important
    for producing a sparse matrix that can be used for retrofitting onto an
    existing dense labeled matrix (see retrofit.py).
    """
    mat = SparseMatrixBuilder()

    labels = OrderedSet(orig_index)

    totals = defaultdict(float)
    with open(str(filename), encoding='utf-8') as infile:
        for line in infile:
            concept1, concept2, value_str, dataset, relation = line.strip().split('\t')

            index1 = labels.add(replace_numbers(concept1))
            index2 = labels.add(replace_numbers(concept2))
            value = float(value_str)
            mat[index1, index2] = value
            mat[index2, index1] = value
            totals[index1] += value
            totals[index2] += value

    # Link nodes to their more general versions
    for label in labels:
        prefixes = list(uri_prefixes(label, 3))
        if len(prefixes) >= 2:
            parent_uri = prefixes[-2]
            if parent_uri in labels:
                index1 = labels.index(label)
                index2 = labels.index(parent_uri)
                mat[index1, index2] = 1
                mat[index2, index1] = 1
                totals[index1] += 1
                totals[index2] += 1

    # add self-loops on the diagonal with equal weight to the rest of the row
    if self_loops:
        for key, value in totals.items():
            mat[key, key] = value

    shape = (len(labels), len(labels))
    index = pd.Index(labels)
    return mat.tocsr(shape), index
Example #11
 def is_blocked(self, edge):
     """
     Test whether an edge should be blocked (whether any of its string values
     match a simple block).
     """
     edge_values = set(
         [
             prefix
             for value in edge.values()
             if isinstance(value, str)
             for prefix in uri_prefixes(value)
         ]
     )
     return bool(edge_values & self.simple_blocks)
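
The check in is_blocked boils down to a set intersection between every prefix of every string field of the edge and the set of simple blocks. A small illustration with placeholder values:

simple_blocks = {'/c/en/badword'}

# Prefixes gathered from an edge's string values (start, end, rel, dataset, ...)
edge_values = {'/c/en', '/c/en/badword', '/c/en/badword/n',
               '/r/RelatedTo', '/c/en/dog'}

print(bool(edge_values & simple_blocks))   # True -> this edge would be blocked
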
Example #12
def build_features_from_conceptnet_table(filename):
    mat = SparseMatrixBuilder()

    concept_labels = OrderedSet()
    feature_labels = OrderedSet()

    with open(str(filename), encoding='utf-8') as infile:
        for line in infile:
            concept1, concept2, value_str, dataset, relation = line.strip().split('\t')
            concept1 = replace_numbers(concept1)
            concept2 = replace_numbers(concept2)
            value = float(value_str)
            # Start with a fresh feature list for every input line
            feature_pairs = []
            if relation in SYMMETRIC_RELATIONS:
                if get_language(concept1) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} ~'.format(uri_prefix(concept1),
                                          relation), concept2))
                if get_language(concept2) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} ~'.format(uri_prefix(concept2),
                                          relation), concept1))
            else:
                if get_language(concept1) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} -'.format(uri_prefix(concept1),
                                          relation), concept2))
                if get_language(concept2) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('- {} {}'.format(uri_prefix(concept2),
                                          relation), concept1))

            feature_counts = defaultdict(int)
            for feature, concept in feature_pairs:
                feature_counts[feature] += 1

            for feature, concept in feature_pairs:
                prefixes = list(uri_prefixes(concept, 3))
                if feature_counts[feature] > 1:
                    for prefix in prefixes:
                        concept_index = concept_labels.add(prefix)
                        feature_index = feature_labels.add(feature)
                        mat[concept_index, feature_index] = value

    # Link nodes to their more general versions
    for concept in concept_labels:
        prefixes = list(uri_prefixes(concept, 3))
        for prefix in prefixes:
            auto_features = [
                '{} {} ~'.format(prefix, 'SimilarTo'),
                '{} {} ~'.format(prefix, 'RelatedTo'),
                '{} {} -'.format(prefix, 'FormOf'),
                '- {} {}'.format(prefix, 'FormOf'),
            ]
            for feature in auto_features:
                concept_index = concept_labels.add(prefix)
                feature_index = feature_labels.add(feature)
                mat[concept_index, feature_index] = value

    shape = (len(concept_labels), len(feature_labels))
    c_index = pd.Index(concept_labels)
    f_index = pd.Index(feature_labels)
    return mat.tocsr(shape), c_index, f_index
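
The feature labels built here are plain strings combining a concept prefix, a relation name, and a direction marker: '~' for symmetric features and '-' marking the open slot of a directed feature. For example:

prefix = '/c/en/dog'
print('{} {} ~'.format(prefix, 'RelatedTo'))   # '/c/en/dog RelatedTo ~'
print('{} {} -'.format(prefix, 'FormOf'))      # '/c/en/dog FormOf -'
print('- {} {}'.format(prefix, 'FormOf'))      # '- /c/en/dog FormOf'
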
Example #13
def assertions_to_sql_csv(msgpack_filename, output_dir):
    """
    Scan through the list of assertions (edges that are unique in their
    start, end, and relation) and produce CSV files that can be loaded
    into PostgreSQL tables.

    The columns of these CSV files are unlabeled, but they correspond
    to the order of the table columns defined in schema.py.
    """
    # Construct the filenames of the CSV files, one per table
    output_nodes = output_dir + '/nodes.csv'
    output_edges = output_dir + '/edges.csv'
    output_relations = output_dir + '/relations.csv'
    output_sources = output_dir + '/sources.csv'
    output_features = output_dir + '/edge_features.csv'
    output_edges_gin = output_dir + '/edges_gin.csv'

    # We can't rely on Postgres to assign IDs, because we need to know the
    # IDs to refer to them _before_ they're in Postgres. So we track our own
    # unique IDs using OrderedSet.
    node_list = OrderedSet()
    source_list = OrderedSet()
    assertion_list = OrderedSet()
    relation_list = OrderedSet()

    # These are three files that we will write incrementally as we iterate
    # through the edges. The syntax restrictions on 'with' leave me with no
    # way to format this that satisfies my style checker and auto-formatter.
    with open(output_edges, 'w', encoding='utf-8') as edge_file,\
         open(output_edges_gin, 'w', encoding='utf-8') as edge_gin_file,\
         open(output_features, 'w', encoding='utf-8') as feature_file:
        for assertion in read_msgpack_stream(msgpack_filename):
            # Assertions are supposed to be unique. If they're not, we should
            # find out and the build should fail.
            if assertion['uri'] in assertion_list:
                raise ValueError("Duplicate assertion: {!r}".format(assertion))

            # Get unique IDs for the relation, start, and end, and the assertion
            # itself. The relation, start, and end IDs may already exist; this is
            # handled by OrderedSet.
            assertion_idx = assertion_list.add(assertion['uri'])
            rel_idx = relation_list.add(assertion['rel'])
            start_idx = node_list.add(assertion['start'])
            end_idx = node_list.add(assertion['end'])

            # Also get unique IDs for each of the sources listed as contributing
            # to this assertion.
            source_indices = []
            sources = assertion['sources']
            for source in sources:
                for sourceval in sorted(source.values()):
                    source_idx = source_list.add(sourceval)
                    source_indices.append(source_idx)

            # Write the edge data to the `edge_file`.
            jsondata = json.dumps(assertion,
                                  ensure_ascii=False,
                                  sort_keys=True)
            weight = assertion['weight']
            write_row(
                edge_file,
                [
                    assertion_idx,
                    assertion['uri'],
                    rel_idx,
                    start_idx,
                    end_idx,
                    weight,
                    jsondata,
                ],
            )

            # Convert the edge to the form that we can easily filter using GIN
            # indexing, and write that to the `edge_gin_file`.
            write_row(
                edge_gin_file,
                [
                    assertion_idx,
                    weight,
                    json.dumps(
                        gin_indexable_edge(assertion),
                        ensure_ascii=False,
                        sort_keys=True,
                    ),
                ],
            )

            # Extract the 'features' (combinations of the relation and one node)
            # that are present in the edge. We may need to match the node using
            # a prefix of that node, so store the feature separately for each
            # prefix.
            features = []

            # Get the IDs in the node table for each prefix of the nodes
            start_p_indices = [
                node_list.add(prefix)
                for prefix in uri_prefixes(assertion['start'], 3)
            ]
            end_p_indices = [
                node_list.add(prefix)
                for prefix in uri_prefixes(assertion['end'], 3)
            ]

            # Write the feature data, the 'direction' (forward, backward, or
            # symmetric), and the edge ID to the feature table.
            if assertion['rel'] in SYMMETRIC_RELATIONS:
                for start_p_idx in start_p_indices:
                    features.append((0, start_p_idx))
                for end_p_idx in end_p_indices:
                    features.append((0, end_p_idx))
            else:
                for start_p_idx in start_p_indices:
                    features.append((1, start_p_idx))
                for end_p_idx in end_p_indices:
                    features.append((-1, end_p_idx))

            for direction, node_idx in features:
                write_row(feature_file,
                          [rel_idx, direction, node_idx, assertion_idx])

    # Write our tables of unique IDs
    write_ordered_set(output_nodes, node_list)
    write_ordered_set(output_sources, source_list)
    write_relations(output_relations, relation_list)
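
The CSV files written here are meant to be bulk-loaded into PostgreSQL; the actual table definitions live in schema.py, which is not shown. A sketch of how one of them might be loaded with psycopg2's COPY support, with the connection string, table name, and file path as placeholders:

import psycopg2

conn = psycopg2.connect('dbname=conceptnet5')   # placeholder connection string
with conn, conn.cursor() as cur, open('nodes.csv', encoding='utf-8') as f:
    # Hypothetical load step; the real schema and load commands live elsewhere.
    cur.copy_expert('COPY nodes FROM STDIN WITH CSV', f)
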
Example #14
def build_features_from_conceptnet_table(filename):
    mat = SparseMatrixBuilder()

    concept_labels = OrderedSet()
    feature_labels = OrderedSet()

    with open(str(filename), encoding='utf-8') as infile:
        for line in infile:
            concept1, concept2, value_str, dataset, relation = line.strip().split('\t')
            concept1 = replace_numbers(concept1)
            concept2 = replace_numbers(concept2)
            value = float(value_str)
            # Start with a fresh feature list for every input line
            feature_pairs = []
            if relation in SYMMETRIC_RELATIONS:
                if get_language(concept1) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} ~'.format(uri_prefix(concept1), relation), concept2)
                    )
                if get_language(concept2) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} ~'.format(uri_prefix(concept2), relation), concept1)
                    )
            else:
                if get_language(concept1) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} -'.format(uri_prefix(concept1), relation), concept2)
                    )
                if get_language(concept2) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('- {} {}'.format(uri_prefix(concept2), relation), concept1)
                    )

            feature_counts = defaultdict(int)
            for feature, concept in feature_pairs:
                feature_counts[feature] += 1

            for feature, concept in feature_pairs:
                prefixes = list(uri_prefixes(concept, 3))
                if feature_counts[feature] > 1:
                    for prefix in prefixes:
                        concept_index = concept_labels.add(prefix)
                        feature_index = feature_labels.add(feature)
                        mat[concept_index, feature_index] = value

    # Link nodes to their more general versions
    for concept in concept_labels:
        prefixes = list(uri_prefixes(concept, 3))
        for prefix in prefixes:
            auto_features = [
                '{} {} ~'.format(prefix, 'SimilarTo'),
                '{} {} ~'.format(prefix, 'RelatedTo'),
                '{} {} -'.format(prefix, 'FormOf'),
                '- {} {}'.format(prefix, 'FormOf'),
            ]
            for feature in auto_features:
                concept_index = concept_labels.add(prefix)
                feature_index = feature_labels.add(feature)
                mat[concept_index, feature_index] = value

    shape = (len(concept_labels), len(feature_labels))
    c_index = pd.Index(concept_labels)
    f_index = pd.Index(feature_labels)
    return mat.tocsr(shape), c_index, f_index
Example #15
def assertions_to_sql_csv(msgpack_filename, output_dir):
    """
    Scan through the list of assertions (edges that are unique in their
    start, end, and relation) and produce CSV files that can be loaded
    into PostgreSQL tables.

    The columns of these CSV files are unlabeled, but they correspond
    to the order of the table columns defined in schema.py.
    """
    # Construct the filenames of the CSV files, one per table
    output_nodes = output_dir + '/nodes.csv'
    output_edges = output_dir + '/edges.csv'
    output_relations = output_dir + '/relations.csv'
    output_sources = output_dir + '/sources.csv'
    output_features = output_dir + '/edge_features.csv'
    output_edges_gin = output_dir + '/edges_gin.csv'

    # We can't rely on Postgres to assign IDs, because we need to know the
    # IDs to refer to them _before_ they're in Postgres. So we track our own
    # unique IDs using OrderedSet.
    node_list = OrderedSet()
    source_list = OrderedSet()
    assertion_list = OrderedSet()
    relation_list = OrderedSet()

    # These are three files that we will write incrementally as we iterate
    # through the edges. The syntax restrictions on 'with' leave me with no
    # way to format this that satisfies my style checker and auto-formatter.
    with open(output_edges, 'w', encoding='utf-8') as edge_file,\
         open(output_edges_gin, 'w', encoding='utf-8') as edge_gin_file,\
         open(output_features, 'w', encoding='utf-8') as feature_file:
        for assertion in read_msgpack_stream(msgpack_filename):
            # Assertions are supposed to be unique. If they're not, we should
            # find out and the build should fail.
            if assertion['uri'] in assertion_list:
                raise ValueError("Duplicate assertion: {!r}".format(assertion))

            # Get unique IDs for the relation, start, and end, and the assertion
            # itself. The relation, start, and end IDs may already exist; this is
            # handled by OrderedSet.
            assertion_idx = assertion_list.add(assertion['uri'])
            rel_idx = relation_list.add(assertion['rel'])
            start_idx = node_list.add(assertion['start'])
            end_idx = node_list.add(assertion['end'])

            # Also get unique IDs for each of the sources listed as contributing
            # to this assertion.
            source_indices = []
            sources = assertion['sources']
            for source in sources:
                for sourceval in sorted(source.values()):
                    source_idx = source_list.add(sourceval)
                    source_indices.append(source_idx)

            # Write the edge data to the `edge_file`.
            jsondata = json.dumps(assertion, ensure_ascii=False, sort_keys=True)
            weight = assertion['weight']
            write_row(
                edge_file,
                [
                    assertion_idx,
                    assertion['uri'],
                    rel_idx,
                    start_idx,
                    end_idx,
                    weight,
                    jsondata,
                ],
            )

            # Convert the edge to the form that we can easily filter using GIN
            # indexing, and write that to the `edge_gin_file`.
            write_row(
                edge_gin_file,
                [
                    assertion_idx,
                    weight,
                    json.dumps(
                        gin_indexable_edge(assertion),
                        ensure_ascii=False,
                        sort_keys=True,
                    ),
                ],
            )

            # Extract the 'features' (combinations of the relation and one node)
            # that are present in the edge. We may need to match the node using
            # a prefix of that node, so store the feature separately for each
            # prefix.
            features = []

            # Get the IDs in the node table for each prefix of the nodes
            start_p_indices = [
                node_list.add(prefix) for prefix in uri_prefixes(assertion['start'], 3)
            ]
            end_p_indices = [
                node_list.add(prefix) for prefix in uri_prefixes(assertion['end'], 3)
            ]

            # Write the feature data, the 'direction' (forward, backward, or
            # symmetric), and the edge ID to the feature table.
            if assertion['rel'] in SYMMETRIC_RELATIONS:
                for start_p_idx in start_p_indices:
                    features.append((0, start_p_idx))
                for end_p_idx in end_p_indices:
                    features.append((0, end_p_idx))
            else:
                for start_p_idx in start_p_indices:
                    features.append((1, start_p_idx))
                for end_p_idx in end_p_indices:
                    features.append((-1, end_p_idx))

            for direction, node_idx in features:
                write_row(feature_file, [rel_idx, direction, node_idx, assertion_idx])

    # Write our tables of unique IDs
    write_ordered_set(output_nodes, node_list)
    write_ordered_set(output_sources, source_list)
    write_relations(output_relations, relation_list)
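
Tying this back to Example #1: once edges_gin.csv has been loaded and a GIN index exists on its JSONB column, the prefix lists produced by gin_indexable_edge can be matched with the @> containment operator. A sketch of such a query; the table layout (column names data and weight) is an assumption, not taken from schema.py:

import json
import psycopg2

conn = psycopg2.connect('dbname=conceptnet5')   # placeholder connection string
with conn, conn.cursor() as cur:
    # Find edges whose start is /c/en or any concept beneath it, as described
    # in the gin_indexable_edge docstring.
    query = {'start': ['/c/en']}
    cur.execute('SELECT weight, data FROM edges_gin WHERE data @> %s::jsonb',
                (json.dumps(query),))
    print(cur.fetchmany(5))
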