def reduce_assoc(filename, output_filename, cutoff=3, en_cutoff=3):
    """
    Takes in a file of tab-separated simple associations, and removes
    uncommon associations and associations unlikely to be useful.

    All concepts that occur fewer than `cutoff` times will be removed.
    All English concepts that occur fewer than `en_cutoff` times will be
    removed.

    The input file has five tab-separated fields per line:
    left URI, right URI, value, dataset, relation.
    """
    # First pass: count how often each (prefixed) concept appears as an
    # endpoint of a usable association.
    counts = defaultdict(int)
    with open(filename, encoding='utf-8') as file:
        for line in file:
            # maxsplit=4 matches the writing pass below, so a line whose
            # final field happens to contain a tab parses identically in
            # both passes instead of raising ValueError here.
            left, right, _value, _dataset, rel = line.rstrip().split('\t', 4)
            # /r/SenseOf links don't contribute to concept usefulness counts.
            if rel == '/r/SenseOf':
                continue
            gleft = uri_prefix(left)
            gright = uri_prefix(right)
            # Count an endpoint only when the *other* endpoint is a concept
            # URI, so links to non-concept nodes don't inflate its count.
            if gright.startswith('/c/'):
                counts[gleft] += 1
            if gleft.startswith('/c/'):
                counts[gright] += 1

    # Keep concepts that pass the English threshold, or the general
    # threshold if they are not English concepts.
    filtered_concepts = {
        concept
        for (concept, count) in counts.items()
        if (
            count >= en_cutoff
            or (not concept.startswith('/c/en/') and count >= cutoff)
        )
    }

    # Second pass: write out only associations between two surviving
    # concepts, dropping bad concepts, negative relations, zero weights,
    # and self-loops.
    with open(output_filename, 'w', encoding='utf-8') as out:
        with open(filename, encoding='utf-8') as file:
            for line in file:
                left, right, value, dataset, rel = line.rstrip().split('\t', 4)
                if (
                    concept_is_bad(left)
                    or concept_is_bad(right)
                    or is_negative_relation(rel)
                ):
                    continue
                fvalue = float(value)
                gleft = uri_prefix(left)
                gright = uri_prefix(right)
                if (
                    gleft in filtered_concepts
                    and gright in filtered_concepts
                    and fvalue != 0
                    and gleft != gright
                ):
                    line = '\t'.join([gleft, gright, value, dataset, rel])
                    print(line, file=out)
def from_csv(cls, filename, filtered_concepts=None, reject_negative_relations=True):
    """
    Reads an association file and builds an (undirected) graph from it.

    If filtered_concepts isn't None, it should be a collection of concepts,
    and only vertices from this collection and edges that link two such
    vertices will be added to the graph.  If it _is_ None (the default),
    however, please note that no such filtering will be done (i.e. the
    effective filter collection is then the universal set of concepts,
    not the empty set).

    If reject_negative_relations is True (the default), only edges not
    corresponding to negative relations will be added to the graph.
    """
    graph = cls()
    # An explicit (possibly empty) collection means we filter; None means
    # every concept is admitted.
    apply_concept_filter = filtered_concepts is not None

    with open(filename, encoding='utf-8') as infile:
        for row in infile:
            left, right, value, dataset, rel = row.rstrip().split('\t', 4)

            # Guard clauses: skip bad concepts, rejected relations,
            # zero-weight edges, and self-loops.
            if concept_is_bad(left) or concept_is_bad(right):
                continue
            if reject_negative_relations and is_negative_relation(rel):
                continue
            if float(value) == 0:
                continue

            gleft = uri_prefix(left)
            gright = uri_prefix(right)
            if gleft == gright:
                continue
            if apply_concept_filter and (
                gleft not in filtered_concepts
                or gright not in filtered_concepts
            ):
                continue

            graph.add_edge(gleft, gright, value, dataset, rel)

    return graph
def reduce_assoc(filename, output_filename, cutoff=3, en_cutoff=3):
    """
    Takes in a file of tab-separated simple associations, and removes
    uncommon associations and associations unlikely to be useful.

    All concepts that occur fewer than `cutoff` times will be removed.
    All English concepts that occur fewer than `en_cutoff` times will be
    removed.

    The input file has five tab-separated fields per line:
    left URI, right URI, value, dataset, relation.
    """
    # First pass: count how often each (prefixed) concept appears as an
    # endpoint of a usable association.
    counts = defaultdict(int)
    with open(filename, encoding='utf-8') as file:
        for line in file:
            # maxsplit=4 matches the writing pass below, so a line whose
            # final field happens to contain a tab parses identically in
            # both passes instead of raising ValueError here.
            left, right, _value, _dataset, rel = line.rstrip().split('\t', 4)
            # /r/SenseOf links don't contribute to concept usefulness counts.
            if rel == '/r/SenseOf':
                continue
            # Both endpoints of every other association count toward
            # concept frequency.
            counts[uri_prefix(left)] += 1
            counts[uri_prefix(right)] += 1

    # Keep concepts that pass the English threshold, or the general
    # threshold if they are not English concepts.
    filtered_concepts = {
        concept
        for (concept, count) in counts.items()
        if (
            count >= en_cutoff
            or (not concept.startswith('/c/en/') and count >= cutoff)
        )
    }

    # Second pass: write out only associations between two surviving
    # concepts, dropping bad concepts, negative relations, zero weights,
    # and self-loops.
    with open(output_filename, 'w', encoding='utf-8') as out:
        with open(filename, encoding='utf-8') as file:
            for line in file:
                left, right, value, dataset, rel = line.rstrip().split('\t', 4)
                if (
                    concept_is_bad(left)
                    or concept_is_bad(right)
                    or is_negative_relation(rel)
                ):
                    continue
                fvalue = float(value)
                gleft = uri_prefix(left)
                gright = uri_prefix(right)
                if (
                    gleft in filtered_concepts
                    and gright in filtered_concepts
                    and fvalue != 0
                    and gleft != gright
                ):
                    line = '\t'.join([gleft, gright, value, dataset, rel])
                    print(line, file=out)