def update_automaton(dataframe,
                     automaton_filename=path.join(PROCESSED_DATA_PATH,
                                                  "vocabulary_automaton.pkl")):
    # Assert we have the same amount of concept names and ids.
    assert len(dataframe["concept_name"]) == len(dataframe["concept_id"])
    try:
        with open(automaton_filename, "rb") as automaton_file:
            automaton = pickle.load(automaton_file)
        logging.info("Loaded previous automaton from path '{}'.".format(
            automaton_filename))
    except FileNotFoundError:
        logging.info("Created new automaton.")
        automaton = Automaton()
    automaton = add_concepts(
        automaton, zip(dataframe["concept_name"], dataframe["concept_id"]))
    automaton.make_automaton()
    with open(automaton_filename, "wb") as automaton_file:
        pickle.dump(automaton, automaton_file)
    logging.info(
        "Updated automaton under path '{}'.".format(automaton_filename))
    return automaton
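# Usage sketch for update_automaton (hypothetical data and pickle path;
# assumes, per the tests further down, that add_concepts stores
# (concept_id, concept_name) tuples as automaton values):
import pandas as pd

concepts = pd.DataFrame({"concept_name": ["aspirin", "ibuprofen"],
                         "concept_id": [1191, 5640]})
automaton = update_automaton(concepts, automaton_filename="vocab_automaton.pkl")
for end_index, (concept_id, concept_name) in automaton.iter("aspirin and ibuprofen"):
    print(end_index, concept_id, concept_name)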
class AutomatonCache(object):

    def __init__(self):
        self.latest = None
        self.matches = {}
        self.regexes = []

    def generate(self):
        with lock:
            self._generate()

    def _generate(self):
        latest = Entity.latest()
        if self.latest is not None and self.latest >= latest:
            return
        self.latest = latest

        matches = defaultdict(set)
        q = Entity.all()
        q = q.options(joinedload('other_names'))
        q = q.filter(Entity.state == Entity.STATE_ACTIVE)
        for entity in q:
            for term in entity.regex_terms:
                matches[term].add(entity.id)

        if not len(matches):
            self.automaton = None
            return

        self.automaton = Automaton()
        for term, entities in matches.items():
            self.automaton.add_word(term.encode('utf-8'), entities)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(matches))
def _generate(self):
    latest = Entity.latest()
    if latest is None:
        return
    if self.latest is not None and self.latest >= latest:
        return
    self.latest = latest

    matches = {}
    q = Entity.all()
    q = q.filter(Entity.state == Entity.STATE_ACTIVE)
    for entity in q:
        for term in entity.regex_terms:
            if term in matches:
                matches[term].append(entity.id)
            else:
                matches[term] = [entity.id]

    if not len(matches):
        self.automaton = None
        return

    self.automaton = Automaton()
    for term, entities in matches.iteritems():
        self.automaton.add_word(term.encode('utf-8'), entities)
    self.automaton.make_automaton()
    log.info('Generated automaton with %s terms', len(matches))
def kmer_info(A: ahocorasick.Automaton, fastq: str) -> pd.DataFrame:
    """
    Finds k-mers in the input fastq file

    :param A: Aho-Corasick automaton with all the k-mers loaded in it
    :param fastq: filepath for the input fastq file
    :return: k-mer frequency at SNP positions found in test fastq
    """
    kmer_seq_counts = defaultdict(int)
    for _, sequence in fp.parse_fastq(fastq):
        for _, (_, kmer_seq, _) in A.iter(sequence):
            kmer_seq_counts[kmer_seq] += 1
    res = []
    for kmer_seq, freq in kmer_seq_counts.items():
        kmername, sequence, _ = A.get(kmer_seq)
        res.append((kmername, kmer_seq, freq))
    # Build the result frame in one go; row-by-row DataFrame.append was
    # deprecated and has been removed in pandas 2.0.
    return pd.DataFrame(res, columns=['POS', 'kmer_seq', 'freq'])
class AutomatonCache(object):

    def __init__(self):
        self.latest = None
        self.matches = {}

    def generate(self):
        with lock:
            self._generate()

    def _generate(self):
        latest = Entity.latest()
        if self.latest is not None and self.latest >= latest:
            return
        self.latest = latest

        matches = defaultdict(set)
        q = Entity.all()
        q = q.options(joinedload('other_names'))
        q = q.filter(Entity.state == Entity.STATE_ACTIVE)
        for entity in q:
            for term in entity.regex_terms:
                matches[term].add(entity.id)

        if not len(matches):
            self.automaton = None
            return

        self.automaton = Automaton()
        for term, entities in matches.items():
            self.automaton.add_word(term.encode('utf-8'), entities)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(matches))
def __init__(
    self,
    bc_to_id: Dict[str, str],
    len_linker: int,
    len_primer: int,
    *,
    max_mm: int = 1,
    use_stats: bool = True
):
    self.bc_to_id = bc_to_id
    self.len_linker = len_linker
    self.len_primer = len_primer
    self.stats = None if not use_stats else dict(
        n_only_primer=0,
        n_multiple_bcs=0,
        n_no_barcode=0,
        n_regular=0,
        n_barcode_mismatch=0,
        n_junk=0,
    )

    self.automaton = Automaton()
    all_barcodes, self.blacklist = get_all_barcodes(bc_to_id.keys(), max_mm=max_mm)
    for pattern, barcode in all_barcodes.items():
        self.automaton.add_word(pattern, barcode)
    self.automaton.make_automaton()
def load_vocab(self, paths, add=False, separator='\t'):
    # Reuse the existing automaton only when extending an existing vocab.
    if not (add and hasattr(self, 'automaton')):
        self.automaton = Automaton()
    vocab = defaultdict(list)
    tags = set()
    if isinstance(paths, str):
        paths = [paths]
    for path in paths:
        name_space = os.path.split(path)[-1]
        print('read %s' % path)
        output = os.popen('wc -l ' + path)
        total = int(output.readline().split()[0])
        with open(path, 'r') as r_f:
            print('vocab file examples:')
            for n, line in enumerate(r_f):
                print(line.strip())
                if n >= 10:
                    break
            r_f.seek(0)
            for line in tqdm(r_f, desc='read file', total=total):
                if random.random() > self.keep_vocab_pro:
                    continue
                splits = line.strip().split(separator)
                try:
                    if len(splits) == 2:
                        word, tag = splits
                        value = True
                    elif len(splits) == 3:
                        word, tag, value = splits
                        value = char2num(value)
                    elif len(splits) == 1:
                        word = splits[0]
                        value = True
                        tag = 'WORD'
                    else:
                        continue
                    if self.vocab_name_space:
                        tag = name_space + '_' + tag
                    vocab[word].append((tag, value))
                    if tag not in tags:
                        tags.add(tag)
                except Exception as e:
                    print('vocab error: path-%s, line %s' % (path, line), e)
                    continue
    self.tags = tags if not hasattr(self, 'tags') else self.tags | tags
    for word, value in tqdm(vocab.items(), desc='add words'):
        self.automaton.add_word(word, (len(word), word, value))
    print('%s words in total' % len(vocab))
    self.automaton.make_automaton()
def __init__(self, identifier_mapper, identifiers):
    self.identifier_mapper = identifier_mapper
    self.identifiers = identifiers
    self.automaton = Automaton()
    for identifier in identifiers:
        mapped = identifier_mapper(identifier)
        self.automaton.add_word(identifier, (len(identifier), mapped))
    self.automaton.make_automaton()
    self.dest_dirs = set()
def __init__(self, gaze_file=data_path):
    self.locations = {}
    self.vocab_to_location = {}
    self.automaton = Automaton()
    with open(gaze_file) as cin:
        self.load_gazes(cin)
    self.automaton.make_automaton()
class AutomatonCache(object):
    TYPES = {
        'Person': DocumentTag.TYPE_PERSON,
        'Organization': DocumentTag.TYPE_ORGANIZATION,
        'Company': DocumentTag.TYPE_ORGANIZATION,
        'LegalEntity': DocumentTag.TYPE_PERSON,
    }

    def __init__(self):
        self.latest = None
        self.automaton = Automaton()
        self.matches = {}

    def generate(self):
        with lock:
            self._generate()

    def _generate(self):
        latest = Entity.latest()
        if latest is None:
            return
        if self.latest is not None and self.latest >= latest:
            return
        self.latest = latest

        matches = {}
        q = Entity.all()
        for entity in q:
            tag = self.TYPES.get(entity.schema)
            if tag is None:
                continue
            for name in entity.names:
                if name is None or len(name) > 120:
                    continue
                match = match_form(name)
                # TODO: this is a weird heuristic, but to avoid overly
                # aggressive matching it may make sense:
                if match is None or ' ' not in match:
                    continue
                if match in matches:
                    matches[match].append((name, tag))
                else:
                    matches[match] = [(name, tag)]

        if not len(matches):
            return

        for term, entities in matches.iteritems():
            self.automaton.add_word(term.encode('utf-8'), entities)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(matches))
def match(AC: ahocorasick.Automaton, tokens: List[str]) -> List[Tuple[str, int, int, Set[str]]]:
    """
    :param AC: the finalized Aho-Corasick automaton.
    :param tokens: the list of input tokens.
    :return: a list of tuples where each tuple consists of
             - span: str,
             - start token index (inclusive): int
             - end token index (exclusive): int
             - a set of values for the span: Set[str]
    """
    # map character offsets in the joined text to token indices
    smap, emap, idx = dict(), dict(), 0
    for i, token in enumerate(tokens):
        smap[idx] = i
        idx += len(token)
        emap[idx] = i
        idx += 1

    # find matches
    text = ' '.join(tokens)
    spans = []
    for eidx, t in AC.iter(text):
        eidx += 1
        sidx = eidx - len(t.span)
        sidx = smap.get(sidx, None)
        eidx = emap.get(eidx, None)
        if sidx is None or eidx is None:
            continue
        spans.append((t.span, sidx, eidx + 1, t.values))

    return spans
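# Usage sketch for match (hypothetical data): the automaton values are
# assumed to be objects exposing .span and .values, e.g. a namedtuple:
from collections import namedtuple

import ahocorasick

Entry = namedtuple('Entry', ['span', 'values'])
AC = ahocorasick.Automaton()
for span, values in [('New York', {'GPE'}), ('York', {'GPE'})]:
    AC.add_word(span, Entry(span, values))
AC.make_automaton()
print(match(AC, ['New', 'York', 'is', 'big']))
# e.g. [('New York', 0, 2, {'GPE'}), ('York', 1, 2, {'GPE'})]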
def extract_tokenized(line: str, wn: Type[ExtractableWordnet], auto: Automaton,
                      id: str) -> TokenizedTagging:
    tagging = TokenizedTagging(wn)
    tokens = line.split(" ")
    starts = list(get_tokens_starts(tokens))
    extract_tokenized_iter(tagging, auto.iter(tokens), wn, tokens, starts, id)
    return tagging
class AutomatonCache(object):
    TYPES = {
        'Person': DocumentTag.TYPE_PERSON,
        'Organization': DocumentTag.TYPE_ORGANIZATION,
        'Company': DocumentTag.TYPE_ORGANIZATION,
        'LegalEntity': DocumentTag.TYPE_PERSON,
    }

    def __init__(self):
        self.latest = None
        self.automaton = Automaton()
        self.matches = {}

    def generate(self):
        with lock:
            self._generate()

    def _generate(self):
        latest = Entity.latest()
        if latest is None:
            return
        if self.latest is not None and self.latest >= latest:
            return
        self.latest = latest

        matches = {}
        q = Entity.all()
        q = q.filter(Entity.state == Entity.STATE_ACTIVE)
        for entity in q:
            for term in entity.regex_terms:
                type_ = self.TYPES.get(entity.type)
                if type_ is None:
                    continue
                if term in matches:
                    matches[term].append((entity.name, type_))
                else:
                    matches[term] = [(entity.name, type_)]

        if not len(matches):
            return

        for term, entities in matches.iteritems():
            self.automaton.add_word(term.encode('utf-8'), entities)
        self.automaton.make_automaton()
        log.info('Generated automaton with %s terms', len(matches))
def test_add_concepts():
    data_path = prepare_data(
        path.join(path.dirname(__file__), "..", "data", "raw",
                  "vocabularies-tiny.zip"))
    dataframe = pd.read_csv(path.join(data_path, "CONCEPT.csv"),
                            sep="\t").dropna(subset=["concept_name"])
    automaton = Automaton()
    automaton = add_concepts(
        automaton, zip(dataframe["concept_name"], dataframe["concept_id"]))
    automaton.make_automaton()
    assert len(tuple(automaton.keys())) == 15791
    first_keys = sorted(automaton.keys())[:10]
    assert first_keys == [
        '% REF', '(1-6)-alpha-glucomannan', '1 alpha-hydroxyergocalciferol',
        "1,1',1'',1'''-(ethylenedinitrilo)tetra-2-propanol",
        '1,1,1-trichloro-2,2,2-trifluoroethane', '1,1-difluoroethane',
        '1,10-decanediol', '1,10-phenanthroline', '1,2,6-hexanetriol',
        '1,2-Dipalmitoylphosphatidylcholine'
    ]
    first_concept_id, first_concept_name = automaton.get(first_keys[0])
    assert (first_concept_id, first_concept_name) == (8514, '% REF')
def test_match_text():
    data_path = prepare_data(
        path.join(path.dirname(__file__), "..", "data", "raw",
                  "vocabularies-tiny.zip"))
    dataframe = pd.read_csv(path.join(data_path, "CONCEPT.csv"),
                            sep="\t").dropna(subset=["concept_name"])
    automaton = Automaton()
    automaton = add_concepts(
        automaton, zip(dataframe["concept_name"], dataframe["concept_id"]))
    automaton.make_automaton()
    matches = list(generate_matches(automaton=automaton, text=dummy_abstract))
    match_soll_values = [(54, (46257025, 'ethyl acetate')),
                         (653, (45616149, 'formic acid')),
                         (785, (8512, 'day'))]
    assert matches == match_soll_values
class AhoCorasickPathGenerator:

    def __init__(self, identifier_mapper, identifiers):
        self.identifier_mapper = identifier_mapper
        self.identifiers = identifiers
        self.automaton = Automaton()
        for identifier in identifiers:
            mapped = identifier_mapper(identifier)
            self.automaton.add_word(identifier, (len(identifier), mapped))
        self.automaton.make_automaton()
        self.dest_dirs = set()

    def blind_path(self, path):
        out = ''
        idx = 0
        for end_position, (length, mapped) in self.automaton.iter(path):
            end_idx = end_position + 1
            start_idx = end_idx - length
            out += path[idx:start_idx] + mapped
            idx = end_idx
        out += path[idx:]
        return out

    def __call__(self, input_dir, output_dir):
        for root, dirs, files in os.walk(input_dir):
            for name in files:
                source_file_name = os.path.join(root, name)
                relpath = os.path.relpath(
                    source_file_name,
                    start=input_dir,
                )
                dest_file_name = output_dir / self.blind_path(relpath)
                self.dest_dirs.add(abspath(dest_file_name.parent))
                yield (
                    abspath(source_file_name),
                    abspath(dest_file_name),
                )

    @property
    def init_lines(self):
        return "\n".join(f'mkdir -p "{dest_dir}"'
                         for dest_dir in self.dest_dirs) + "\n"
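# Usage sketch for AhoCorasickPathGenerator.blind_path (hypothetical
# identifiers and replacement mapping):
mapping = {'patient42': 'subj-001', 'patient7': 'subj-002'}
gen = AhoCorasickPathGenerator(mapping.get, list(mapping))
print(gen.blind_path('data/patient42/scan.nii'))  # data/subj-001/scan.nii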
def find_in_fastqs(A: Automaton, *fastqs):
    """Find scheme kmers in input fastq files

    Args:
        A: Aho-Corasick Automaton with scheme SNV target kmers loaded
        fastqs: Input fastq file paths

    Returns:
        Dataframe with any matches found in input fastq files
    """
    kmer_seq_counts = defaultdict(int)
    for fastq in fastqs:
        for _, sequence in parse_fastq(fastq):
            for idx, (_, kmer_seq, _) in A.iter(sequence):
                kmer_seq_counts[kmer_seq] += 1
    res = []
    for kmer_seq, freq in kmer_seq_counts.items():
        kmername, sequence, _ = A.get(kmer_seq)
        res.append((kmername, kmer_seq, freq))
    df = pd.DataFrame(res, columns=['kmername', 'seq', 'freq'])
    return df
def build_automaton(self):
    q = Entity.all()
    q = q.filter(Entity.schema.in_(self.TYPES.keys()))

    matches = {}
    for entity in q:
        tag = self.TYPES.get(entity.schema)
        if tag is None:
            continue
        for name in entity.names:
            if name is None or len(name) > 120:
                continue
            match = self.match_form(name)
            if match is None:
                continue
            if match in matches:
                matches[match].append((name, tag))
            else:
                matches[match] = [(name, tag)]

    if not len(matches):
        return

    automaton = Automaton()
    for term, entities in matches.iteritems():
        automaton.add_word(term, entities)
    automaton.make_automaton()
    return automaton
def build_automata(vocab):
    # Build Aho-Corasick matching automata for vocabulary items
    # grouped by length. The wordpiece convention is inverted for
    # matching: continuations are unmarked (instead of "##") and
    # string start is marked by "^^".
    from ahocorasick import Automaton
    start_time = datetime.now()
    info('start building automata at {}'.format(
        start_time.strftime("%H:%M:%S")))
    strings = [v[2:] if v.startswith('##') else '^^' + v for v in vocab]
    max_len = max(len(s) for s in strings)
    strings.sort(key=lambda s: len(s))
    strings_by_len = defaultdict(list)
    for k, g in groupby(strings, lambda s: len(s)):
        strings_by_len[k] = list(g)
    automata_by_len = {}
    for i in range(1, max_len + 1):
        if i not in strings_by_len:
            continue
        a = Automaton()
        for s in strings_by_len[i]:
            a.add_word(s, i)
        a.make_automaton()
        automata_by_len[i] = a
    end_time = datetime.now()
    info('finish building automata at {} (delta {})'.format(
        end_time.strftime("%H:%M:%S"), end_time - start_time))
    return automata_by_len
def make_wordlist(filepath):
    with open(filepath, 'r') as f:
        wordlist = Automaton()
        for idx, word in enumerate(set(Base().encode(t) for t in f.read().split())):
            wordlist.add_word(word, (idx, word))
    wordlist.make_automaton()
    return wordlist
def build_automata(vocab):
    # Build Aho-Corasick matching automata for vocabulary items
    # grouped by length.
    from ahocorasick import Automaton
    start_time = datetime.now()
    info('start building automata at {}'.format(
        start_time.strftime("%H:%M:%S")))
    strings = list(vocab)
    max_len = max(len(s) for s in strings)
    strings.sort(key=lambda s: len(s))
    strings_by_len = defaultdict(list)
    for k, g in groupby(strings, lambda s: len(s)):
        strings_by_len[k] = list(g)
    automata_by_len = {}
    for i in range(1, max_len + 1):
        if i not in strings_by_len:
            continue
        a = Automaton()
        for s in strings_by_len[i]:
            a.add_word(s, i)
        a.make_automaton()
        automata_by_len[i] = a
    end_time = datetime.now()
    info('finish building automata at {} (delta {})'.format(
        end_time.strftime("%H:%M:%S"), end_time - start_time))
    return automata_by_len
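# Usage sketch for build_automata (hypothetical vocabulary; assumes the
# helpers used above, e.g. info/datetime/groupby/defaultdict, are in scope).
# Each automaton holds only strings of one length i (stored as the value),
# so a match ending at character e starts at e - i + 1:
automata = build_automata(['the', 'cat', 'at', 'a'])
text = 'the cat sat'
for length, automaton in sorted(automata.items()):
    for end, i in automaton.iter(text):
        print(text[end - i + 1:end + 1], (end - i + 1, end))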
def extract_auto(line: str, wn: Type[ExtractableWordnet], auto: Automaton,
                 from_id: str) -> UntokenizedTagging:
    tagging = UntokenizedTagging(wn)
    for tok_idx, (end_pos, (token, wn_to_lemma)) in enumerate(auto.iter(line)):
        groups = wn.synset_group_lemmas(objify_lemmas(wn_to_lemma))
        tags = []
        for group in groups:
            tag_group = TaggedLemma(token)
            tag_group.lemma_objs = group
            tags.append(tag_group)
        tagging.add_tags(token, [Anchor(from_id, end_pos - len(token) + 1)], tags)
    return tagging
def benchmark_pyahocorasick(LINE):
    from ahocorasick import Automaton

    automaton = Automaton()
    for key in KEYS:
        automaton.add_word(key, key)
    automaton.make_automaton()

    print(list(automaton.iter(LINE)))
    benchmark("list(automaton.iter(LINE))", locals())
def init_automaton(scheme_fasta):
    """Initialize Aho-Corasick Automaton with kmers from SNV scheme fasta

    Args:
        scheme_fasta: SNV scheme fasta file path

    Returns:
        Aho-Corasick Automaton with kmers loaded
    """
    A = Automaton()
    for header, sequence in parse_fasta(scheme_fasta):
        A.add_word(sequence, (header, sequence, False))
        A.add_word(revcomp(sequence), (header, sequence, True))
    A.make_automaton()
    return A
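# Usage sketch (hypothetical file paths): build the automaton from a scheme
# fasta, then count k-mer hits with find_in_fastqs defined above:
A = init_automaton('scheme.fasta')
hits = find_in_fastqs(A, 'sample_R1.fastq', 'sample_R2.fastq')
print(hits.sort_values('freq', ascending=False).head())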
async def _update_links_automaton(self):
    """
    Fetch the latest version of the links from the table, build an automaton.
    """
    logger.info(
        "_update_links_automaton: fetching links from table %s",
        self._links_table,
    )
    try:
        links = await self._api.run_db_interaction(
            "Fetch links from the table", _db_fetch_links, self._links_table)
        logger.info("_update_links_automaton: we received %d links", len(links))

        new_link_automaton = Automaton(ahocorasick.STORE_LENGTH)
        for link in links:
            new_link_automaton.add_word(link)
        await make_deferred_yieldable(
            deferToThread(new_link_automaton.make_automaton))
        self._link_automaton = new_link_automaton
    except Exception:
        logger.exception("_update_links_automaton: could not update")
        raise
def _get_keyword_processor(self, custom_vocab: List[str]):
    keyword_processor = Automaton()
    for i, keyword in enumerate(custom_vocab):
        if len(keyword) > 1:
            keyword_processor.add_word(keyword, (i, keyword))
    keyword_processor.make_automaton()
    return keyword_processor
class Gazetteer:

    def __init__(self, gaze_file=data_path):
        self.locations = {}
        self.vocab_to_location = {}
        self.automaton = Automaton()
        with open(gaze_file) as cin:
            self.load_gazes(cin)
        self.automaton.make_automaton()

    def load_gazes(self, cin):
        for line in cin:
            line = line.split('\t')
            line[-1] = line[-1].rstrip()
            self.locations[line[0]] = tuple(line)
            for vocab in line[3:]:
                if vocab in self.vocab_to_location:
                    self.vocab_to_location[vocab].append(line[0])
                else:
                    self.vocab_to_location[vocab] = [line[0]]
        for vocab, value in self.vocab_to_location.items():
            self.automaton.add_word(vocab, tuple(value))

    def match(self, string):
        ret = {}
        for end_index, value in self.automaton.iter(string):
            for lid in value:
                if lid in ret:
                    ret[lid] = (ret[lid][0], ret[lid][1] + 1)
                else:
                    ret[lid] = (self.locations[lid], 1)
        return ret
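# Usage sketch for Gazetteer (hypothetical gazetteer file): per the parsing
# above, each tab-separated line carries a location id in column 1 and alias
# vocabulary from column 4 onward; match() returns a dict of
# {location_id: (location_tuple, hit_count)}:
gaz = Gazetteer('gazetteer.tsv')
print(gaz.match('Flooding was reported near Springfield yesterday'))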
def _make_kwtree(keywords):
    if keywords:
        kwtree = Automaton()
        for keyword in keywords:
            kwtree.add_word(keyword, keyword)
        kwtree.make_automaton()
    else:
        kwtree = None
    return kwtree
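# Usage sketch for _make_kwtree: the values are the keywords themselves,
# so iter() yields (end_index, keyword) pairs:
kwtree = _make_kwtree(['spam', 'ham'])
if kwtree is not None:
    print(list(kwtree.iter('spam and ham')))  # [(3, 'spam'), (11, 'ham')]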
def init_automaton(scheme_fasta):
    """Initialize Aho-Corasick Automaton with kmers from SNV scheme fasta

    Args:
        scheme_fasta: SNV scheme fasta file path

    Returns:
        Aho-Corasick Automaton with kmers loaded
    """
    A = Automaton()
    for header, sequence in parse_fasta(scheme_fasta):
        kmer_list = expand_degenerate_bases(sequence)
        for seq in kmer_list:
            A.add_word(seq, (header, seq, False))
            A.add_word(revcomp(seq), (header, seq, True))
    A.make_automaton()
    return A
def initialize_ac_automaton(kmers: pd.DataFrame):
    A = Automaton()
    for idx, kmer in enumerate(set(kmers['kmer'])):
        A.add_word(kmer, (idx, kmer))
    A.make_automaton()
    return A
def match_kmers_to_reads(A: Automaton, *reads_paths) -> pd.DataFrame:
    kmer_counts = {}
    for reads in reads_paths:
        for sequence in yield_reads(reads):
            for _, (_, kmer) in A.iter(sequence):
                try:
                    kmer_counts[kmer] += 1
                except KeyError:
                    kmer_counts[kmer] = 1
    return pd.DataFrame(pd.Series(kmer_counts, name='count', dtype=int))
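# Usage sketch (hypothetical inputs): initialize_ac_automaton expects a
# DataFrame with a 'kmer' column; match_kmers_to_reads then tallies hits
# per k-mer across the given read files:
kmers = pd.DataFrame({'kmer': ['ACGT', 'GGGG', 'TTAA']})
A = initialize_ac_automaton(kmers)
counts = match_kmers_to_reads(A, 'reads_1.fastq', 'reads_2.fastq')
print(counts)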