Example #1
def update_automaton(dataframe,
                     automaton_filename=path.join(PROCESSED_DATA_PATH,
                                                  "vocabulary_automaton.pkl")):
    # Assert we have the same amount of concept names and ids.
    assert len(dataframe["concept_name"]) == len(dataframe["concept_id"])

    try:
        with open(automaton_filename, "rb") as automaton_file:
            automaton = pickle.load(automaton_file)

        logging.info("Loaded previous automaton from path '{}'.".format(
            automaton_filename))
    except FileNotFoundError:
        logging.info("Created new automaton.")
        automaton = Automaton()

    automaton = add_concepts(
        automaton, zip(dataframe["concept_name"], dataframe["concept_id"]))

    automaton.make_automaton()

    with open(automaton_filename, "wb") as automaton_file:
        pickle.dump(automaton, automaton_file)

    logging.info(
        "Updated automaton under path '{}'.".format(automaton_filename))
    return automaton
Example #2
class AutomatonCache(object):
    def __init__(self):
        self.latest = None
        self.matches = {}
        self.regexes = []

    def generate(self):
        with lock:
            self._generate()

    def _generate(self):
        latest = Entity.latest()
        if self.latest is not None and self.latest >= latest:
            return
        self.latest = latest

        matches = defaultdict(set)
        q = Entity.all()
        q = q.options(joinedload('other_names'))
        q = q.filter(Entity.state == Entity.STATE_ACTIVE)
        for entity in q:
            for term in entity.regex_terms:
                matches[term].add(entity.id)

        if not len(matches):
            self.automaton = None
            return

        self.automaton = Automaton()
        for term, entities in matches.items():
            self.automaton.add_word(term.encode('utf-8'), entities)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(matches))
Example #3
    def _generate(self):
        latest = Entity.latest()
        if latest is None:
            return
        if self.latest is not None and self.latest >= latest:
            return
        self.latest = latest

        matches = {}
        q = Entity.all()
        q = q.filter(Entity.state == Entity.STATE_ACTIVE)
        for entity in q:
            for term in entity.regex_terms:
                if term in matches:
                    matches[term].append(entity.id)
                else:
                    matches[term] = [entity.id]

        if not len(matches):
            self.automaton = None
            return

        self.automaton = Automaton()
        for term, entities in matches.items():
            self.automaton.add_word(term.encode('utf-8'), entities)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(matches))
Example #4
def kmer_info(A: ahocorasick.Automaton, fastq: str) -> pd.DataFrame:
    """
    Finds k-mers in the input fastq files
    :param A: Aho-Corasick automaton with all the k-mers loaded in it
    :param fastq: filepath for the input fastq file

    :return: k-mer frequency at SNP positions found in test fastq
    """
    kmer_seq_counts = defaultdict(int)
    for _, sequence in fp.parse_fastq(fastq):
        for idx, (_, kmer_seq, _) in A.iter(sequence):
            kmer_seq_counts[kmer_seq] += 1
    res = []
    for kmer_seq, freq in kmer_seq_counts.items():
        kmername, sequence, _ = A.get(kmer_seq)
        res.append((kmername, kmer_seq, freq))
    # Build the result frame in one call; row-wise DataFrame.append is
    # deprecated (removed in pandas 2.0) and copies the frame on every row.
    kmer_df = pd.DataFrame(res, columns=['POS', 'kmer_seq', 'freq'])
    return kmer_df
Example #5
class AutomatonCache(object):

    def __init__(self):
        self.latest = None
        self.matches = {}

    def generate(self):
        with lock:
            self._generate()

    def _generate(self):
        latest = Entity.latest()
        if self.latest is not None and self.latest >= latest:
            return
        self.latest = latest

        matches = defaultdict(set)
        q = Entity.all()
        q = q.options(joinedload('other_names'))
        q = q.filter(Entity.state == Entity.STATE_ACTIVE)
        for entity in q:
            for term in entity.regex_terms:
                matches[term].add(entity.id)

        if not len(matches):
            self.automaton = None
            return

        self.automaton = Automaton()
        for term, entities in matches.items():
            self.automaton.add_word(term.encode('utf-8'), entities)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(matches))
Example #6
	def __init__(
		self,
		bc_to_id: Dict[str, str],
		len_linker: int,
		len_primer: int,
		*,
		max_mm: int = 1,
		use_stats: bool = True
	):
		self.bc_to_id = bc_to_id
		self.len_linker = len_linker
		self.len_primer = len_primer
		self.stats = None if not use_stats else dict(
			n_only_primer=0,
			n_multiple_bcs=0,
			n_no_barcode=0,
			n_regular=0,
			n_barcode_mismatch=0,
			n_junk=0,
		)
		
		self.automaton = Automaton()
		all_barcodes, self.blacklist = get_all_barcodes(bc_to_id.keys(), max_mm=max_mm)
		for pattern, barcode in all_barcodes.items():
			self.automaton.add_word(pattern, barcode)
		self.automaton.make_automaton()
Example #7
    def load_vocab(self, paths, add=False, separator='\t'):
        if not (add and hasattr(self, 'automaton')):
            self.automaton = Automaton()

        vocab = defaultdict(list)
        tags = set()
        if isinstance(paths, str):
            paths = [paths]
        for path in paths:
            name_space = os.path.split(path)[-1]
            print('read %s' % path)
            output = os.popen('wc -l ' + path)
            total = int(output.readline().split()[0])
            with open(path, 'r') as r_f:
                print('vocab file Examples:')
                for n, line in enumerate(r_f):
                    print(line.strip())
                    if n >= 10:
                        break
                r_f.seek(0)
                for line in tqdm(r_f, desc='read file', total=total):
                    if random.random() > self.keep_vocab_pro:
                        continue
                    splits = line.strip().split(separator)
                    try:
                        if len(splits) == 2:
                            word, tag = splits
                            value = True
                        elif len(splits) == 3:
                            word, tag, value = splits
                            value = char2num(value)

                        elif len(splits) == 1:
                            word = splits[0]
                            value = True
                            tag = 'WORD'

                        else:
                            continue

                        if self.vocab_name_space:
                            tag = name_space + '_' + tag
                        vocab[word].append((tag, value))
                        if tag not in tags:
                            tags.add(tag)

                    except Exception as e:
                        print('vocab error: path-%s, line %s' % (path, line),
                              e)
                        continue

        self.tags = tags if not hasattr(self, 'tags') else self.tags | tags

        for word, value in tqdm(vocab.items(), desc='add words'):
            self.automaton.add_word(word, (len(word), word, value))

        print('%s words in total' % len(vocab))
        self.automaton.make_automaton()
Example #8
 def __init__(self, identifier_mapper, identifiers):
     self.identifier_mapper = identifier_mapper
     self.identifiers = identifiers
     self.automaton = Automaton()
     for identifier in identifiers:
         mapped = identifier_mapper(identifier)
         self.automaton.add_word(identifier, (len(identifier), mapped))
     self.automaton.make_automaton()
     self.dest_dirs = set()
Example #9
    def __init__(self, gaze_file=data_path):
        self.locations = {}
        self.vocab_to_location = {}
        self.automaton = Automaton()

        with open(gaze_file) as cin:
            self.load_gazes(cin)

        self.automaton.make_automaton()
Example #10
class AutomatonCache(object):

    TYPES = {
        'Person': DocumentTag.TYPE_PERSON,
        'Organization': DocumentTag.TYPE_ORGANIZATION,
        'Company': DocumentTag.TYPE_ORGANIZATION,
        'LegalEntity': DocumentTag.TYPE_PERSON,
    }

    def __init__(self):
        self.latest = None
        self.automaton = Automaton()
        self.matches = {}

    def generate(self):
        with lock:
            self._generate()

    def _generate(self):
        latest = Entity.latest()
        if latest is None:
            return
        if self.latest is not None and self.latest >= latest:
            return
        self.latest = latest

        matches = {}
        q = Entity.all()
        for entity in q:
            tag = self.TYPES.get(entity.schema)
            if tag is None:
                continue
            for name in entity.names:
                if name is None or len(name) > 120:
                    continue
                match = match_form(name)
                # TODO: this is a weird heuristic, but to avoid overly
                # aggressive matching it may make sense:
                if match is None or ' ' not in match:
                    continue
                if match in matches:
                    matches[match].append((name, tag))
                else:
                    matches[match] = [(name, tag)]

        if not len(matches):
            return

        for term, entities in matches.items():
            self.automaton.add_word(term.encode('utf-8'), entities)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(matches))
Example #11
def match(AC: ahocorasick.Automaton,
          tokens: List[str]) -> List[Tuple[str, int, int, Set[str]]]:
    """
    :param AC: the finalized Aho-Corasick automaton.
    :param tokens: the list of input tokens.
    :return: a list of tuples where each tuple consists of
             - span: str,
             - start token index (inclusive): int
             - end token index (exclusive): int
             - a set of values for the span: Set[str]
    """
    smap, emap, idx = dict(), dict(), 0
    for i, token in enumerate(tokens):
        smap[idx] = i
        idx += len(token)
        emap[idx] = i
        idx += 1

    # find matches
    text = ' '.join(tokens)
    spans = []
    for eidx, t in AC.iter(text):
        eidx += 1
        sidx = eidx - len(t.span)
        sidx = smap.get(sidx, None)
        eidx = emap.get(eidx, None)
        if sidx is None or eidx is None: continue
        spans.append((t.span, sidx, eidx + 1, t.values))

    return spans
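
A short usage sketch for match() above (the entries, tag sets, and the namedtuple value type exposing .span and .values are illustrative assumptions, not part of the original code; match() is assumed to be in scope):

from collections import namedtuple
import ahocorasick

# Hypothetical value type matching what match() expects (.span and .values).
Entry = namedtuple('Entry', ['span', 'values'])

AC = ahocorasick.Automaton()
for span, values in [('New York', {'LOC'}), ('York', {'LOC', 'PER'})]:
    AC.add_word(span, Entry(span, values))
AC.make_automaton()

print(match(AC, ['I', 'love', 'New', 'York']))
# e.g. [('New York', 2, 4, {'LOC'}), ('York', 3, 4, {'LOC', 'PER'})] (order may vary)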
Example #12
def extract_tokenized(line: str, wn: Type[ExtractableWordnet], auto: Automaton,
                      id: str) -> TokenizedTagging:
    tagging = TokenizedTagging(wn)
    tokens = line.split(" ")
    starts = list(get_tokens_starts(tokens))
    extract_tokenized_iter(tagging, auto.iter(tokens), wn, tokens, starts, id)
    return tagging
Example #13
class AutomatonCache(object):

    TYPES = {
        'Person': DocumentTag.TYPE_PERSON,
        'Organization': DocumentTag.TYPE_ORGANIZATION,
        'Company': DocumentTag.TYPE_ORGANIZATION,
        'LegalEntity': DocumentTag.TYPE_PERSON,
    }

    def __init__(self):
        self.latest = None
        self.automaton = Automaton()
        self.matches = {}

    def generate(self):
        with lock:
            self._generate()

    def _generate(self):
        latest = Entity.latest()
        if latest is None:
            return
        if self.latest is not None and self.latest >= latest:
            return
        self.latest = latest

        matches = {}
        q = Entity.all()
        q = q.filter(Entity.state == Entity.STATE_ACTIVE)
        for entity in q:
            for term in entity.regex_terms:
                type_ = self.TYPES.get(entity.type)
                if type_ is None:
                    continue
                if term in matches:
                    matches[term].append((entity.name, type_))
                else:
                    matches[term] = [(entity.name, type_)]

        if not len(matches):
            return

        for term, entities in matches.items():
            self.automaton.add_word(term.encode('utf-8'), entities)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(matches))
Example #14
def test_add_concepts():
    data_path = prepare_data(
        path.join(path.dirname(__file__), "..", "data", "raw",
                  "vocabularies-tiny.zip"))

    dataframe = pd.read_csv(path.join(data_path, "CONCEPT.csv"),
                            sep="\t").dropna(subset=["concept_name"])
    automaton = Automaton()
    automaton = add_concepts(
        automaton, zip(dataframe["concept_name"], dataframe["concept_id"]))

    automaton.make_automaton()

    assert len(tuple(automaton.keys())) == 15791

    first_keys = sorted(automaton.keys())[:10]
    assert first_keys == [
        '% REF', '(1-6)-alpha-glucomannan', '1 alpha-hydroxyergocalciferol',
        "1,1',1'',1'''-(ethylenedinitrilo)tetra-2-propanol",
        '1,1,1-trichloro-2,2,2-trifluoroethane', '1,1-difluoroethane',
        '1,10-decanediol', '1,10-phenanthroline', '1,2,6-hexanetriol',
        '1,2-Dipalmitoylphosphatidylcholine'
    ]

    first_concept_id, first_concept_name = automaton.get(first_keys[0])

    assert (first_concept_id, first_concept_name) == (8514, '% REF')
Example #15
def test_match_text():
    data_path = prepare_data(
        path.join(path.dirname(__file__), "..", "data", "raw",
                  "vocabularies-tiny.zip"))

    dataframe = pd.read_csv(path.join(data_path, "CONCEPT.csv"),
                            sep="\t").dropna(subset=["concept_name"])
    automaton = Automaton()
    automaton = add_concepts(
        automaton, zip(dataframe["concept_name"], dataframe["concept_id"]))

    automaton.make_automaton()

    matches = list(generate_matches(automaton=automaton, text=dummy_abstract))
    match_soll_values = [(54, (46257025, 'ethyl acetate')),
                         (653, (45616149, 'formic acid')),
                         (785, (8512, 'day'))]
    assert matches == match_soll_values
Example #16
class AhoCorasickPathGenerator:
    def __init__(self, identifier_mapper, identifiers):
        self.identifier_mapper = identifier_mapper
        self.identifiers = identifiers
        self.automaton = Automaton()
        for identifier in identifiers:
            mapped = identifier_mapper(identifier)
            self.automaton.add_word(identifier, (len(identifier), mapped))
        self.automaton.make_automaton()
        self.dest_dirs = set()

    def blind_path(self, path):
        out = ''
        idx = 0
        for end_position, (length, mapped) in self.automaton.iter(path):
            end_idx = end_position + 1
            start_idx = end_idx - length
            out += path[idx:start_idx] + mapped
            idx = end_idx
        out += path[idx:]
        return out

    def __call__(self, input_dir, output_dir):
        for root, dirs, files in os.walk(input_dir):
            for name in files:
                source_file_name = os.path.join(root, name)
                relpath = os.path.relpath(
                    source_file_name,
                    start=input_dir,
                )
                dest_file_name = output_dir / self.blind_path(relpath)
                self.dest_dirs.add(abspath(dest_file_name.parent))
                yield (
                    abspath(source_file_name),
                    abspath(dest_file_name),
                )

    @property
    def init_lines(self):
        return "\n".join(f'mkdir -p "{dest_dir}"'
                         for dest_dir in self.dest_dirs) + "\n"
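
A minimal usage sketch for blind_path(), assuming the class above is in scope (the mapper and identifiers are invented for illustration):

# Every matched identifier in a relative path is replaced by its mapped form.
gen = AhoCorasickPathGenerator(lambda name: 'REDACTED', ['alice', 'bob'])
print(gen.blind_path('data/alice/report_bob.txt'))
# data/REDACTED/report_REDACTED.txt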
Example #17
def find_in_fastqs(A: Automaton, *fastqs):
    """Find scheme kmers in input fastq files

    Args:
        A: Aho-Corasick Automaton with scheme SNV target kmers loaded
        fastqs: Input fastq file paths

    Returns:
        Dataframe with any matches found in input fastq files
    """
    kmer_seq_counts = defaultdict(int)
    for fastq in fastqs:
        for _, sequence in parse_fastq(fastq):
            for idx, (_, kmer_seq, _) in A.iter(sequence):
                kmer_seq_counts[kmer_seq] += 1
    res = []
    for kmer_seq, freq in kmer_seq_counts.items():
        kmername, sequence, _ = A.get(kmer_seq)
        res.append((kmername, kmer_seq, freq))
    df = pd.DataFrame(res, columns=['kmername', 'seq', 'freq'])
    return df
Example #18
    def build_automaton(self):
        q = Entity.all()
        q = q.filter(Entity.schema.in_(self.TYPES.keys()))

        matches = {}
        for entity in q:
            tag = self.TYPES.get(entity.schema)
            if tag is None:
                continue
            for name in entity.names:
                if name is None or len(name) > 120:
                    continue
                match = self.match_form(name)
                if match is None:
                    continue
                if match in matches:
                    matches[match].append((name, tag))
                else:
                    matches[match] = [(name, tag)]

        if not len(matches):
            return

        automaton = Automaton()
        for term, entities in matches.items():
            automaton.add_word(term, entities)
        automaton.make_automaton()
        return automaton
Example #19
 def build_automata(vocab):
     # Build Aho-Corasick matching automata for vocabulary items
     # grouped by length. The wordpiece convention is inverted for
     # matching: continuations are unmarked (instead of "##") and
     # string start is marked by "^^".
     from ahocorasick import Automaton
     start_time = datetime.now()
     info('start building automata at {}'.format(
         start_time.strftime("%H:%M:%S")))
     strings = [v[2:] if v.startswith('##') else '^^' + v for v in vocab]
     max_len = max(len(s) for s in strings)
     strings.sort(key=lambda s: len(s))
     strings_by_len = defaultdict(list)
     for k, g in groupby(strings, lambda s: len(s)):
         strings_by_len[k] = list(g)
     automata_by_len = {}
     for i in range(1, max_len + 1):
         if i not in strings_by_len:
             continue
         a = Automaton()
         for s in strings_by_len[i]:
             a.add_word(s, i)
         a.make_automaton()
         automata_by_len[i] = a
     end_time = datetime.now()
     info('finish building automata at {} (delta {})'.format(
         end_time.strftime("%H:%M:%S"), end_time - start_time))
     return automata_by_len
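
To make the inverted wordpiece convention concrete, a tiny illustration with a hypothetical vocabulary:

vocab = ['play', '##ing']                                            # hypothetical wordpiece vocab
strings = [v[2:] if v.startswith('##') else '^^' + v for v in vocab]
print(strings)   # ['^^play', 'ing'] -- continuations unmarked, word starts marked with '^^'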
Example #20
def make_wordlist(filepath):
    with open(filepath, 'r') as f:
        wordlist = Automaton()
        for idx, word in enumerate(set(Base().encode(t) for t in f.read().split())):
            wordlist.add_word(word, (idx, word))
        # Build the automaton once, after all words have been added.
        wordlist.make_automaton()
    return wordlist
Example #21
 def build_automata(vocab):
     # Build Aho-Corasick matching automata for vocabulary items
     # grouped by length.
     from ahocorasick import Automaton
     start_time = datetime.now()
     info('start building automata at {}'.format(
         start_time.strftime("%H:%M:%S")))
     strings = list(vocab)
     max_len = max(len(s) for s in strings)
     strings.sort(key=lambda s: len(s))
     strings_by_len = defaultdict(list)
     for k, g in groupby(strings, lambda s: len(s)):
         strings_by_len[k] = list(g)
     automata_by_len = {}
     for i in range(1, max_len + 1):
         if i not in strings_by_len:
             continue
         a = Automaton()
         for s in strings_by_len[i]:
             a.add_word(s, i)
         a.make_automaton()
         automata_by_len[i] = a
     end_time = datetime.now()
     info('finish building automata at {} (delta {})'.format(
         end_time.strftime("%H:%M:%S"), end_time - start_time))
     return automata_by_len
Example #22
def extract_auto(line: str, wn: Type[ExtractableWordnet], auto: Automaton,
                 from_id: str) -> UntokenizedTagging:
    tagging = UntokenizedTagging(wn)
    for tok_idx, (end_pos, (token, wn_to_lemma)) in enumerate(auto.iter(line)):
        groups = wn.synset_group_lemmas(objify_lemmas(wn_to_lemma))
        tags = []
        for group in groups:
            tag_group = TaggedLemma(token)
            tag_group.lemma_objs = group
            tags.append(tag_group)
        tagging.add_tags(token, [Anchor(from_id, end_pos - len(token) + 1)],
                         tags)
    return tagging
Example #23
def benchmark_pyahocorasick(LINE):
    from ahocorasick import Automaton, STORE_INTS

    automaton = Automaton()
    for i, key in enumerate(KEYS):
        automaton.add_word(key, key)
    automaton.make_automaton()

    print(list(automaton.iter(LINE)))

    benchmark("list(automaton.iter(LINE))", locals())
Example #24
def init_automaton(scheme_fasta):
    """Initialize Aho-Corasick Automaton with kmers from SNV scheme fasta

    Args:
        scheme_fasta: SNV scheme fasta file path

    Returns:
         Aho-Corasick Automaton with kmers loaded
    """
    A = Automaton()
    for header, sequence in parse_fasta(scheme_fasta):
        A.add_word(sequence, (header, sequence, False))
        A.add_word(revcomp(sequence), (header, sequence, True))
    A.make_automaton()
    return A
Example #25
 async def _update_links_automaton(self):
     """
     Fetch the latest version of the links from the table, build an automaton.
     """
     logger.info(
         "_update_links_automaton: fetching links from table %s",
         self._links_table,
     )
     try:
         links = await self._api.run_db_interaction(
             "Fetch links from the table", _db_fetch_links,
             self._links_table)
         logger.info("_update_links_automaton: we received %d links",
                     len(links))
         new_link_automaton = Automaton(ahocorasick.STORE_LENGTH)
         for link in links:
             new_link_automaton.add_word(link)
         await make_deferred_yieldable(
             deferToThread(new_link_automaton.make_automaton))
         self._link_automaton = new_link_automaton
     except Exception as e:
         logger.exception("_update_links_automaton: could not update")
         raise e
Example #26
    def _get_keyword_processor(self, custom_vocab: List[str]):
        keyword_processor = Automaton()

        for i, keyword in enumerate(custom_vocab):
            if len(keyword) > 1:
                keyword_processor.add_word(keyword, (i, keyword))

        keyword_processor.make_automaton()
        return keyword_processor
Example #27
class Gazetteer:
    def __init__(self, gaze_file=data_path):
        self.locations = {}
        self.vocab_to_location = {}
        self.automaton = Automaton()

        with open(gaze_file) as cin:
            self.load_gazes(cin)

        self.automaton.make_automaton()

    def load_gazes(self, cin):
        for line in cin:
            line = line.split('\t')
            line[-1] = line[-1].rstrip()
            self.locations[line[0]] = tuple(line)

            for vocab in line[3:]:
                if vocab in self.vocab_to_location:
                    self.vocab_to_location[vocab].append(line[0])
                else:
                    self.vocab_to_location[vocab] = [line[0]]

        for vocab, value in self.vocab_to_location.items():
            self.automaton.add_word(vocab, tuple(value))

    def match(self, string):
        ret = {}

        for end_index, value in self.automaton.iter(string):
            for lid in value:
                if lid in ret:
                    ret[lid] = (ret[lid][0], ret[lid][1] + 1)
                else:
                    ret[lid] = (self.locations[lid], 1)

        return ret
Example #28
 def _make_kwtree(keywords):
     if keywords:
         kwtree = Automaton()
         for keyword in keywords:
             kwtree.add_word(keyword, keyword)
         kwtree.make_automaton()
     else:
         kwtree = None
     return kwtree
Example #29
def init_automaton(scheme_fasta):
    """Initialize Aho-Corasick Automaton with kmers from SNV scheme fasta

    Args:
        scheme_fasta: SNV scheme fasta file path

    Returns:
         Aho-Corasick Automaton with kmers loaded
    """
    A = Automaton()
    for header, sequence in parse_fasta(scheme_fasta):
        kmer_list = expand_degenerate_bases(sequence)
        for seq in kmer_list:
            A.add_word(seq, (header, seq, False))
            A.add_word(revcomp(seq), (header, seq, True))
    A.make_automaton()
    return A
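
A sketch of how an automaton built this way is typically queried (the scheme header and read are made up; iter() yields the end index of each match plus the stored (header, kmer, is_revcomp) tuple):

from ahocorasick import Automaton

A = Automaton()
A.add_word('ACGT', ('snp1-ref', 'ACGT', False))   # hypothetical scheme k-mer
A.make_automaton()

for end_idx, (header, kmer, is_revcomp) in A.iter('TTACGTTT'):
    print(end_idx, header, kmer, is_revcomp)       # 5 snp1-ref ACGT False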
Example #30
def initialize_ac_automaton(kmers: pd.DataFrame):

    A = Automaton()

    for idx, kmer in enumerate(set(kmers['kmer'])):
        A.add_word(kmer, (idx, kmer))

    A.make_automaton()

    return A
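
A small illustration of the function above (hypothetical k-mer table; the index stored alongside each k-mer depends on set iteration order):

kmers = pd.DataFrame({'kmer': ['ACGT', 'TTGA']})
A = initialize_ac_automaton(kmers)
for end_idx, (idx, kmer) in A.iter('GGACGTTT'):
    print(end_idx, kmer)   # 5 ACGT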
Example #31
def match_kmers_to_reads(A: Automaton, *reads_paths) -> pd.DataFrame:

    kmer_counts = {}

    for reads in reads_paths:

        for sequence in yield_reads(reads):

            for _, (_, kmer) in A.iter(sequence):

                try:
                    kmer_counts[kmer] += 1
                except KeyError:
                    kmer_counts[kmer] = 1

    return pd.DataFrame(pd.Series(kmer_counts, name='count', dtype=int))