def kmer_info(A: ahocorasick.Automaton, fastq: str) -> pd.DataFrame:
    """Find k-mers from the automaton in an input fastq file.

    :param A: Aho-Corasick automaton with all the k-mers loaded in it
    :param fastq: filepath for the input fastq file
    :return: DataFrame with columns POS, kmer_seq, freq — k-mer frequency
        at SNP positions found in the test fastq
    """
    kmer_seq_counts = defaultdict(int)
    for _, sequence in fp.parse_fastq(fastq):
        # A.iter yields (end_index, payload); only the matched k-mer
        # sequence from the payload is needed for counting.
        for _, (_, kmer_seq, _) in A.iter(sequence):
            kmer_seq_counts[kmer_seq] += 1
    res = []
    for kmer_seq, freq in kmer_seq_counts.items():
        # Payload stored in the automaton is (kmername, sequence, ...);
        # kmername identifies the SNP position.
        kmername, _, _ = A.get(kmer_seq)
        res.append((kmername, kmer_seq, freq))
    # Build the frame in one shot: DataFrame.append was deprecated and then
    # removed in pandas 2.0, and calling it in a loop is quadratic.
    return pd.DataFrame(res, columns=['POS', 'kmer_seq', 'freq'])
def test_add_concepts():
    """add_concepts should load every concept name from the tiny vocabulary."""
    data_path = prepare_data(
        path.join(path.dirname(__file__), "..", "data", "raw",
                  "vocabularies-tiny.zip"))
    concepts = pd.read_csv(
        path.join(data_path, "CONCEPT.csv"),
        sep="\t").dropna(subset=["concept_name"])

    automaton = Automaton()
    automaton = add_concepts(
        automaton,
        zip(concepts["concept_name"], concepts["concept_id"]))
    automaton.make_automaton()

    all_keys = sorted(automaton.keys())
    assert len(all_keys) == 15791

    expected_first = [
        '% REF', '(1-6)-alpha-glucomannan',
        '1 alpha-hydroxyergocalciferol',
        "1,1',1'',1'''-(ethylenedinitrilo)tetra-2-propanol",
        '1,1,1-trichloro-2,2,2-trifluoroethane', '1,1-difluoroethane',
        '1,10-decanediol', '1,10-phenanthroline', '1,2,6-hexanetriol',
        '1,2-Dipalmitoylphosphatidylcholine'
    ]
    assert all_keys[:10] == expected_first

    concept_id, concept_name = automaton.get(expected_first[0])
    assert (concept_id, concept_name) == (8514, '% REF')
def find_in_fastqs(A: Automaton, *fastqs):
    """Find scheme kmers in input fastq files

    Args:
        A: Aho-Corasick Automaton with scheme SNV target kmers loaded
        fastqs: Input fastq file paths

    Returns:
        DataFrame with columns kmername/seq/freq for any matches found in
        the input fastq files
    """
    kmer_seq_counts = defaultdict(int)
    for fastq in fastqs:
        for _, sequence in parse_fastq(fastq):
            # Only the matched k-mer sequence from the payload is needed;
            # the end index yielded by iter() is unused.
            for _, (_, kmer_seq, _) in A.iter(sequence):
                kmer_seq_counts[kmer_seq] += 1
    res = []
    for kmer_seq, freq in kmer_seq_counts.items():
        # Payload is (kmername, sequence, ...); kmername labels the target.
        kmername, _, _ = A.get(kmer_seq)
        res.append((kmername, kmer_seq, freq))
    return pd.DataFrame(res, columns=['kmername', 'seq', 'freq'])
def __create_automaton(self):
    """Build an Aho-Corasick automaton mapping dictionary words to tag sets.

    Each dictionary file holds tab-separated lines: the first column is an
    identifier (ignored), the remaining columns are surface words.  A word
    may occur in several files, so its payload (word, {tags...}) accumulates
    the tags of every source file it appears in.

    :return: the finalized ``Automaton``
    """
    paths = [
        ('Brand', os.path.join(Path.dictionary, 'Brand.txt')),
        ('Car', os.path.join(Path.dictionary, 'Car.txt')),
        ('Train', os.path.join(Path.dictionary, 'Train.txt')),
        ('Predicate', os.path.join(Path.dictionary, 'config.txt'))
    ]
    # Raw string: '\(' in a normal string is an invalid escape sequence.
    # Compiled once here instead of re.sub re-parsing it per word.
    paren_pattern = re.compile(r'\(.*?\)')
    automaton = Automaton()
    for tag, path in paths:
        with open(path, 'r') as r_f:
            for line in r_f:
                line = line.rstrip('\n')
                _, *words = line.split('\t')
                for word in words:
                    # Normalize: lowercase and strip parenthesised notes,
                    # e.g. "foo(bar)" -> "foo".
                    word = paren_pattern.sub('', word.lower())
                    # Merge with any tags previously recorded for this word.
                    _, tag_set = automaton.get(word, (word, set()))
                    tag_set.add(tag)
                    automaton.add_word(word, (word, tag_set))
    automaton.make_automaton()
    return automaton
class TrieTree:
    '''Prefix-tree (Aho-Corasick) matcher over one or more vocabulary files.

    Parameters
    ----------
    vocab_paths : str or list of str
        One or more dictionary files.  Each line is tab-separated:
        column 1 is the word, column 2 the word's tag/information, column 3
        an optional value attached to the tag (defaults to True), e.g.::

            中国	LOC	0.8
            美国	国家

    vocab_match_type : str
        Match type, one of "c", "m", "mc" (default "mc"), producing:
            c:  "BIES + _ + tag"
            m:  "BIES + _"
            mc: both "BIES + _" and "BIES + _ + tag"

    Returns (when called)
    ---------------------
    defaultdict(int, {idx_0: {feature: value}, idx_1: ...})
        A mapping from character index to its feature dict.

    Examples
    --------
    >>> trietree_c = TrieTree(paths=your_vocab_files, tp='c')
    >>> trietree_c("中国是一个国家")
    defaultdict(in, {0: {'B_LOC': True}, 1: {'E_LOC': True}})

    >>> trietree_m = TrieTree(paths=your_vocab_files, tp='m')
    >>> trietree_m("中国是一个国家")
    defaultdict(in, {0: {'B': True}, 1: {'E': True}})

    >>> trietree_mc = TrieTree(paths=your_vocab_files, tp='mc')
    >>> trietree_mc("中国是一个国家")
    defaultdict(in, {0: {'B': True, 'B_LOC': True}, 1: {'E': True, 'E_LOC': True}})
    '''

    def __init__(self, vocab_paths, vocab_match_type='mc', drop_vocab_pro=0,
                 vocab_name_space=False, separator='\t'):
        self.match_cnt = Counter()
        # Plain {} until init_user_automaton() replaces it with an Automaton.
        self.user_automaton = {}
        # Probability of KEEPING a vocab line (1 - drop probability).
        self.keep_vocab_pro = 1 - drop_vocab_pro
        self.vocab_name_space = vocab_name_space
        self.vmp = vocab_match_type
        self.load_vocab(vocab_paths, separator=separator)
        self.cnt = Counter()
        print('trietree:\ntp: %s\n, vocab path:%s' % (self.vmp,
                                                      str(vocab_paths)))
        if self.keep_vocab_pro < 1:
            print('drop vocab pro', self.keep_vocab_pro)

    def __call__(self, *args, **kwargs):
        """Match a sentence; see class docstring for the returned mapping."""
        vocab_feature = self._vocab_feature(*args, **kwargs)
        return vocab_feature

    def load_vocab(self, paths, add=False, separator='\t'):
        """Load vocabulary files into the base automaton.

        :param paths: one path or a list of paths to vocabulary files
        :param add: if True and an automaton already exists, extend it
            instead of starting from scratch
        :param separator: column separator within each line
        """
        if add and hasattr(self, 'automaton'):
            pass
        else:
            self.automaton = Automaton()
        vocab = defaultdict(list)
        tags = set()
        if isinstance(paths, str):
            paths = [paths]
        for path in paths:
            name_space = os.path.split(path)[-1]
            print('read %s' % path)
            # Line count only feeds tqdm's progress total.
            # NOTE(review): assumes POSIX `wc` is available on this host.
            output = os.popen('wc -l ' + path)
            total = int(output.readline().split()[0])
            with open(path, 'r') as r_f:
                # Echo the first few lines so the file format is visible.
                print('vocab file Examples:')
                for n, line in enumerate(r_f):
                    print(line.strip())
                    if n >= 10:
                        break
                r_f.seek(0)
                for line in tqdm(r_f, desc='read file', total=total):
                    # Randomly drop lines when drop_vocab_pro > 0.
                    if random.random() > self.keep_vocab_pro:
                        continue
                    splits = line.strip().split(separator)
                    try:
                        if len(splits) == 2:
                            word, tag = splits
                            value = True
                        elif len(splits) == 3:
                            word, tag, value = splits
                            value = char2num(value)
                        elif len(splits) == 1:
                            # Bare word: tag it generically as WORD.
                            word = splits[0]
                            value = True
                            tag = 'WORD'
                        else:
                            continue
                        if self.vocab_name_space:
                            tag = name_space + '_' + tag
                        vocab[word].append((tag, value))
                        if tag not in tags:
                            tags.add(tag)
                    except Exception as e:
                        print('vocab error: path-%s, line %s' % (path, line),
                              e)
                        continue
        self.tags = tags if not hasattr(self, 'tags') else self.tags | tags
        for word, value in tqdm(vocab.items(), desc='add words'):
            # Payload: (word length, word, [(tag, value), ...]).
            self.automaton.add_word(word, (len(word), word, value))
        print('总共有%s个词' % len(vocab))
        self.automaton.make_automaton()

    def _vocab_feature(self, sentence):
        """Collect features from the base and (if present) user automaton."""
        vocab_feature = defaultdict(dict)
        self.match(sentence, vocab_feature)
        if self.user_automaton:
            # Fix: routing key was a corrupted literal; 'user' selects the
            # user-defined automaton in match().
            self.match(sentence, vocab_feature, base_or_user='user')
        return vocab_feature

    def match(self, sentence, vocab_feature, base_or_user='base'):
        """Run one automaton over *sentence*, writing BIES features.

        :param sentence: text to scan
        :param vocab_feature: dict of {char index: {feature: value}},
            updated in place
        :param base_or_user: 'base' scans the vocabulary automaton,
            anything else scans the user automaton.  (Fix: the default was
            a corrupted literal that never equalled 'base', so the default
            path hit the user automaton — initially a plain dict with no
            ``iter`` — and crashed.)
        """
        if base_or_user == 'base':
            result = self.automaton.iter(sentence)
        else:
            result = self.user_automaton.iter(sentence)
        for end_idx, (word_len, _, tag_value) in list(result):
            start_idx = end_idx - word_len + 1
            for tag, value in tag_value:
                self.match_cnt[tag] += 1
                # Which tag sequences to emit depends on the match type.
                if self.vmp == 'c':
                    tagss = [create_tag(word_len, tag)]
                elif self.vmp == 'm':
                    tagss = [create_tag(word_len, '')]
                elif self.vmp == 'mc':
                    tagss = [
                        create_tag(word_len, tag),
                        create_tag(word_len, '')
                    ]
                else:
                    tagss = []
                for tags in tagss:
                    for idx, tag in zip(range(start_idx, end_idx + 1), tags):
                        vocab_feature[idx][tag] = value

    def init_user_automaton(self):
        """Replace the placeholder dict with an empty, finalized Automaton."""
        self.user_automaton = Automaton()
        self.user_automaton.make_automaton()

    def add_word(self, word, tag, value, update=True):
        '''Add or update one word in the user automaton.

        Parameters
        ----------
        word: the word to match
        tag: the information attached to the word
        value: the value attached to the tag
        update: when True, rebuild the automaton immediately; pass False
            when batching (see add_words)

        Examples
        --------
        >>> trietree.add_word('中国', '国家', True)
        >>> trietree.user_automaton.get('中国')
        (2, '中国', [('国家', True)])
        '''
        if self.user_automaton == {}:
            self.init_user_automaton()
        wl, w, tag_values = self.user_automaton.get(word,
                                                    (len(word), word, []))
        # Overwrite the value if the tag already exists, else append it.
        for i, (t, v) in enumerate(tag_values):
            if t == tag:
                tag_values[i] = (tag, value)
                break
        else:
            tag_values.append((tag, value))
        self.user_automaton.add_word(w, (wl, w, tag_values))
        if update:
            self.user_automaton.make_automaton()

    def add_words(self, word_tag_values):
        '''Batch-add words, rebuilding the automaton once at the end.

        Equivalent to::

            for word, tag, value in word_tag_values:
                self.add_word(word, tag, value, update=False)

        Examples
        --------
        >>> word_tag_values = [('中国', '面积', 9666), ('中国', '人口', 8888)]
        >>> trietree.add_words(word_tag_values)
        >>> trietree.user_automaton.get('中国')
        (2, '中国', [('面积', 9666), ('人口', 8888)])
        '''
        for word, tag, value in word_tag_values:
            self.add_word(word, tag, value, update=False)
        self.user_automaton.make_automaton()

    def get(self, key, default=None, vocab='all'):
        '''Look up *key* like dict.get.

        Parameters
        ----------
        vocab: which dictionary to query — 'base' (built-in), 'user'
            (user-defined), or 'all' (both, returned as a dict).  Default
            is 'all'.
        '''
        if vocab == 'base':
            value = self.automaton.get(key, default)
        elif vocab == 'user':
            value = self.user_automaton.get(key, default)
        else:
            value = {
                'base': self.automaton.get(key, default),
                'user': self.user_automaton.get(key, default)
            }
        return value