Example 1
def tile_stats(orfs, tiles):
    """compute tile stats

    orfs and tiles are name->seq dicts

    NOTE: for prefix trie stats (e.g., num of tiles per orf), it is assumed the
    orf name is a prefix to the name of a tile from that orf
    """
    import numpy as np
    from pygtrie import CharTrie

    tile_lens = np.asarray([len(t) for t in tiles.values()])
    orf_lens = np.asarray([len(o) for o in orfs.values()])
    tile_size = int(round(float(np.median(tile_lens))))

    # compute tile counts for each orf
    orf_prefixes = CharTrie()
    for name in orfs:
        orf_prefixes[name] = True
    # ensure that no ORF name is a prefix for a different valid ORF
    for name in orfs:
        if len(orf_prefixes.keys(name)) != 1:
            print(orf_prefixes.keys(name))
            raise ValueError(
                "some ORF name is a prefix for a different valid ORF")
    tile_prefixes = CharTrie()
    for name in tiles:
        tile_prefixes[name] = True
    # compute orf coverages
    orf_coverages = {}
    for (orf, seq) in orfs.items():
        orf_residues = len(seq)
        tile_residues = 0.0
        if tile_prefixes.has_subtrie(orf) or (orf in tile_prefixes):
            for tile in tile_prefixes.keys(orf):
                tile_residues += len(tiles[tile])
        orf_coverages[orf] = tile_residues / orf_residues

    stats = {}
    stats["tile_size"] = tile_size
    stats["num_tiles"] = len(tiles)
    stats["total_tile_residues"] = tile_lens.sum().tolist()
    stats["avg_orf_coverage"] = tile_lens.sum().tolist() / orf_lens.sum(
    ).tolist()
    stats["num_orfs_smaller_than_tile_size"] = (orf_lens <
                                                tile_size).sum().tolist()
    stats["approx_num_tiles_naive_1x_tiling"] = (np.ceil(
        orf_lens / tile_size).sum().tolist())
    stats["avg_orf_coverage"] = sum(
        orf_coverages.values()) / len(orf_coverages)
    stats["max_tiles_per_len_normed_orf"] = max(orf_coverages.values())
    stats["tile_len_hist"] = compute_int_hist(tile_lens)
    # what is the tile coverage of each ORF (tot tile residues / orf residues)
    # tiles are assigned to ORFs if they share a name
    stats["orf_coverage_hist"] = compute_float_hist(
        list(orf_coverages.values()))
    stats["top_5_orf_cov"] = list(
        map(
            list,
            sorted(orf_coverages.items(), key=lambda tup: tup[1],
                   reverse=True)[:5],
        ))
    stats["bot_5_orf_cov"] = list(
        map(list,
            sorted(orf_coverages.items(), key=lambda tup: tup[1])[:5]))

    return stats
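
The prefix-trie logic above relies on two pygtrie operations: CharTrie.keys(prefix) lists every stored key that starts with a given string (and raises KeyError when nothing matches, hence the has_subtrie guard), while has_subtrie reports whether longer keys exist below a prefix. A minimal, self-contained sketch of the ORF-to-tile matching, using hypothetical toy names and sequences that are not from the original module:

from pygtrie import CharTrie

# hypothetical toy data: each tile name is prefixed by its ORF name
tiles = {"orfA|1-30": "M" * 30, "orfA|20-50": "M" * 30, "orfB|1-30": "M" * 30}
orfs = {"orfA": "M" * 50, "orfB": "M" * 40}

tile_prefixes = CharTrie()
for name in tiles:
    tile_prefixes[name] = True

for orf, seq in orfs.items():
    # keys(prefix) raises KeyError when no key matches, hence the guard
    if tile_prefixes.has_subtrie(orf) or orf in tile_prefixes:
        matched = list(tile_prefixes.keys(orf))
    else:
        matched = []
    coverage = sum(len(tiles[t]) for t in matched) / len(seq)
    print(orf, matched, round(coverage, 2))
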
Example 2
            print('route computed:', route[:-3])

        ''' Command processing '''
        fragments = action.split()
        command_name = fragments[0]

        command_trie = CharTrie()
        command_trie['?'] = question_command
        command_trie['back'] = back_command  #Can change location
        command_trie['bearings'] = bearings_command
        command_trie['history'] = history_command
        command_trie['moveto'] = move_command  #Can change location
        command_trie['route'] = route_command
        command_trie['rope'] = rope_command
        command_trie['shop'] = shop_command

        if command_trie.has_subtrie(command_name) or command_trie.has_key(
                command_name):
            list(command_trie[command_name:])[0]()
        elif command_name == 'q':
            continue_loop = False
        elif command_name == '':
            print('You think now might be the time for action.')
        else:
            print("You're not really sure what that means.")

    print('')

print('Quitting...')
''' End runstrip '''
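
The command dispatch above leans on has_subtrie/has_key to accept any abbreviation of a registered command, and on the slice form command_trie[name:] to iterate the handlers stored under that prefix. A minimal sketch with hypothetical stand-in handlers (the real ones are defined elsewhere in the game loop):

from pygtrie import CharTrie

def route_command():
    print('route computed')

def rope_command():
    print('you check the rope')

command_trie = CharTrie()
command_trie['route'] = route_command
command_trie['rope'] = rope_command

for typed in ('rou', 'rope', 'ro', 'xyz'):
    if command_trie.has_subtrie(typed) or command_trie.has_key(typed):
        # trie[prefix:] yields the values of all keys starting with the prefix;
        # an ambiguous abbreviation such as 'ro' simply takes the first match
        list(command_trie[typed:])[0]()
    else:
        print("You're not really sure what that means.")
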
Example 3
class NERLinker:
    """
    Disambiguates named entities and stores new, unknown ones if they satisfy type restrictions.
    """
    def __init__(self,
                 outer_graph=gall,
                 ner_type_resolver=NERTypeResolver(),
                 metric_threshold=0.8,
                 strict_type_match=True):
        self.ntr = ner_type_resolver

        # Init storage
        self._trie = CharTrie()
        self._metric_threshold = metric_threshold
        self._strict_type_match = strict_type_match
        self._allowed_types = ENT_CLASSES

        self.predicate_namespace = dbo  # todo: move to constructor args
        self.outer_graph = outer_graph
        self.cache = dict()

    def update(self, uri_sf_pairs):
        """
        :param uri_sf_pairs: List[Tuple[URIRef, Optional[str]]]; surface forms may be None
        :return:
        """
        uri2sf = groupby(first, uri_sf_pairs)  # group by the same uri
        uris = list(uri2sf.keys())

        with Pool() as pool:

            def mmap(f, it):
                return list(
                    map(f, it)
                )  # todo: pool.map doesn't work: pickle issues with decorators

            ent_types = mmap(self.ntr.get_by_uri, uris)
            labels = mmap(get_label, uris)
            all_redirects = mmap(get_fellow_redirects,
                                 uris)  # lists of synonyms for each uri
            all_disambigs = mmap(get_fellow_disambiguations,
                                 uris)  # lists of homonyms for each uri

            for i, (ent_uri, ent_type, base_label, redirects,
                    disambigs) in enumerate(
                        zip(uris, ent_types, labels, all_redirects,
                            all_disambigs), 1):
                if ent_type:
                    entries = {TrieEntry(ent_uri, base_label,
                                         ent_type)}  # basic entry
                    entries.update(
                        TrieEntry(ent_uri, sf, ent_type)
                        for sfs in uri2sf[ent_uri]
                        for sf in sfs)  # entries from provided surface forms

                    redirects_labels = mmap(get_label, redirects)
                    entries.update(
                        TrieEntry(ent_uri, sf, ent_type)
                        for sf in redirects_labels)

                    disambigs_labels = mmap(get_label, disambigs)
                    disambigs_types = mmap(self.ntr.get_by_uri, disambigs)
                    entries.update(
                        TrieEntry(duri, dsf, dtype) for duri, dsf, dtype in
                        zip(disambigs, disambigs_labels, disambigs_types))

                    entries = filter(
                        all,
                        entries)  # all fields of entry should evaluate to True
                    sfgroups = groupby(lambda entry: entry.sf.lower(),
                                       entries)  # build 'index' of trie
                    _new = _upd = 0
                    for sfkey, group in sfgroups.items():
                        if not self._trie.has_key(sfkey):
                            self._trie[sfkey] = set(group)
                            _new += 1
                        else:
                            self._trie[sfkey].update(group)
                            _upd += 1
                    log.info(
                        'NERLinker: ent #{}: added {:3d}, updated {:3d} sfgroups; "{}"'
                        .format(i, _new, _upd, str(ent_uri)))

    def _resolve_edges(self, target_nodes, source_nodes):
        ont_graph = self.outer_graph
        ns = self.predicate_namespace
        new_edges = set()
        targets = set(target_nodes)

        for s in source_nodes:
            for rel, obj in ont_graph.predicate_objects(subject=s):
                if obj in targets and rel.startswith(ns) and obj != s:
                    new_edges.add((s, obj))
        return new_edges

    def _resolve_nodes(self, uri):
        ont_graph = self.outer_graph
        ns = self.predicate_namespace

        objs = {
            obj
            for rel, obj in ont_graph.predicate_objects(subject=uri)
            if rel.startswith(ns) and not isinstance(obj, Literal)
        }
        subjs = {
            subj
            for subj, rel in ont_graph.subject_predicates(object=uri)
            if rel.startswith(ns)
        }

        new_edges = {(uri, obj)
                     for obj in objs}.union({(subj, uri)
                                             for subj in subjs})
        new_nodes = objs.union(subjs)
        return new_nodes, new_edges

    def get_path_graph(self, uris, depth=2):
        """
        Based on the paper: https://arxiv.org/pdf/1707.05288.pdf
        :param uris: uris to build graph for
        :param depth: depth of the paths to search
        :return:
        """
        edges = set()
        nodes = set(uris)
        log.info('linker: started building subgraph on {} nodes with depth {}'.
                 format(len(nodes), depth))

        mmap = map  # todo: make parallel queries
        for i in range(depth - 1):
            new_nodes = set()
            for uri_nodes, uri_edges in mmap(self._resolve_nodes, nodes):
                new_nodes.update(uri_nodes)
                edges.update(uri_edges)
            nodes = new_nodes
            log.info('linker: finished iter {}/{} with {} new nodes, {} edges'.
                     format(i + 1, depth, len(new_nodes), len(edges)))

        # Last step can be done easier
        edges.update(self._resolve_edges(uris, nodes))
        log.info('linker: finished building subgraph: {} edges'.format(
            len(edges)))

        graph = nx.DiGraph()
        graph.add_nodes_from(uris)  # need only original entities
        graph.add_edges_from(edges)

        subgraph = nx.transitive_closure(graph).subgraph(nbunch=uris)
        log.info('linker: ended extracting subgraph: {} edges'.format(
            len(subgraph.edges())))

        return subgraph

    def link(self, ents, depth=2):
        """

        :param ents:
        :return: Dict[spacy.token.Span, rdflib.URIRef]
        """
        answers = {
            ent: None
            for ent in ents if ent.label_ in self._allowed_types
        }
        # Get candidate sets for all ents
        all_candidates = [(cand.uri, ent) for ent in ents
                          for cand in self.get_candidates(ent)]
        # Each candidate can resolve multiple entities
        candidates = defaultdict(list)
        for cand_uri, ent in all_candidates:
            candidates[cand_uri].append(ent)

        # Build subgraph for these candidates
        graph = self.get_path_graph(candidates, depth=depth)
        # Apply HITS or PageRank algorithm
        hubs, authorities = nx.hits(graph, max_iter=20)
        # Sort according to authority value
        authorities = sorted(authorities.items(), key=second, reverse=True)

        # todo: what to do with equally probable authorities? or with 'zero' authorities?
        #   maybe somehow preserve initial sort by get_candidates()? or returned weights (if any)
        for uri, auth_value in authorities:
            ents = candidates.get(uri, list())
            for ent in ents:
                if not answers[ent]:
                    answers[ent] = uri
        return answers

    def __call__(self, doc):
        answer_lists = {ent: self.get_candidates(ent) for ent in doc.ents}
        self.cache.update({
            ent: [str(entry.uri) for entry in answers]
            for ent, answers in answer_lists.items()
        })

        return doc

    def get(self, span, default=None):
        return self.cache.get(
            span,
            None) or default  # the cached value may be falsy, so fall back to default

    # todo: return some kind of weight or probability with matches
    def get_candidates(self, span):
        """
        :param span: spacy.token.Span
        :return: List[TrieEntry]
        """
        _trie = self._trie
        text = span.text.lower()
        candidates_filtered = []
        if span.label_ in self._allowed_types:
            # Decide how best to search
            if span.label_ == 'PERSON':
                # If ner type is Person: try all permutations of tokens
                tokens = filter(bool, text.split(' '))
                lprefixes = [
                    self._longest_prefix(' '.join(p))
                    for p in permutations(tokens)
                ]
                lprefixes = filter(bool, lprefixes)
                lprefix = max(lprefixes, key=len, default=None)
            else:
                lprefix = self._longest_prefix(text)

            if lprefix is not None:
                # log.info('span: "{}"; found prefix: "{}"'.format(span, lprefix))
                candidate_sets = _trie.itervalues(prefix=lprefix)
                candidates = list(chain.from_iterable(candidate_sets))

                # todo: temporary, to keep consistency with the old entity-type schema saved in the trie
                tmap = ENT_MAPPING
                typed = groupby(
                    lambda e:
                    (tmap.get(e.ent_type) or e.ent_type) == span.label_,
                    candidates)
                # typed = groupby(lambda entry: entry.ent_type == span.label_, candidates)

                search_in = [True]
                if not self._strict_type_match:
                    search_in.append(False)

                # Search with the same type first
                for is_same_type in search_in:
                    typed_candidates = typed.get(is_same_type)
                    if typed_candidates:
                        candidates_filtered.extend(
                            self._fuzzy_filter(span.text, typed_candidates))
        return candidates_filtered  # empty list if nothing was found

    def _fuzzy_filter(self, text, candidates, metric=fuzz.ratio):
        """
        :param text: str
        :param candidates: List[TrieEntry]
        :param metric: (str, str) -> Numeric
        :return: List[TrieEntry]
        """
        # similar = groupby(lambda entry: metric(entry.sf, text), candidates)  # group by val of metric
        # Calculate a metric
        measured = [(metric(entry.sf, text), entry) for entry in candidates]
        # Group by the same uri
        similar = groupby(lambda entry: entry[1].uri,
                          measured)  # uri: (m, entry)
        # Within each group keep only the entry with the highest metric value
        similar = [max(sames, key=first) for sames in similar.values()]
        # Sort by the metric
        best_matches = sorted(similar, key=first, reverse=True)
        # Filter bad matches
        best_matches = [
            entry for m, entry in best_matches
            if m >= self._metric_threshold * 100
        ]

        # Additional checks when there are several best matches
        if len(best_matches) > 1:
            # best_matches = [max(best_matches, key=lambda entry: metric(raw_d(raw(entry.uri)), text))]
            best_matches = groupby(
                lambda entry: metric(raw_d(raw(entry.uri)), text),
                best_matches)
            best_matches = best_matches[max(best_matches)]
        return best_matches

    def _is_acronym(self, text, len_threshold=5):
        return len(text) < len_threshold and text.isupper()

    def _longest_prefix(self, text):
        l = len(text)
        left = max(1, floor(l * self._metric_threshold))
        for end in range(l, left - 1, -1):
            if self._trie.has_subtrie(text[:end]):
                return text[:end]

    def save(self, model_dir):
        model_name = type(self).__name__.lower() + '.pck'
        with open(os.path.join(model_dir, model_name), 'wb') as f:
            pickle.dump(self._trie, f)

    def load(self, model_dir):
        model_name = type(self).__name__.lower() + '.pck'
        with open(os.path.join(model_dir, model_name), 'rb') as f:
            self._trie = pickle.load(f)
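
get_candidates searches the trie by the longest prefix of the lowercased surface form that still has a subtrie, then gathers every candidate set under that prefix with itervalues(prefix=...). A stripped-down sketch of that lookup, with a hypothetical namedtuple standing in for TrieEntry and made-up URIs:

from collections import namedtuple
from itertools import chain
from math import floor

from pygtrie import CharTrie

Entry = namedtuple('Entry', 'uri sf ent_type')  # hypothetical stand-in for TrieEntry

trie = CharTrie()
trie['barack obama'] = {Entry('dbr:Barack_Obama', 'Barack Obama', 'PERSON')}
trie['barack obama sr.'] = {Entry('dbr:Barack_Obama_Sr.', 'Barack Obama Sr.', 'PERSON')}

def longest_prefix(trie, text, threshold=0.8):
    # try progressively shorter prefixes, but not shorter than threshold * len(text)
    shortest = max(1, floor(len(text) * threshold))
    for end in range(len(text), shortest - 1, -1):
        if trie.has_subtrie(text[:end]):
            return text[:end]

query = 'Barack Obamas'.lower()
prefix = longest_prefix(trie, query)
if prefix is not None:
    candidates = list(chain.from_iterable(trie.itervalues(prefix=prefix)))
    print(prefix, [e.uri for e in candidates])

In the full class the candidates gathered this way are then narrowed by entity type and by the fuzz.ratio similarity threshold in _fuzzy_filter.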