Code example #1
def kmer_info(A: ahocorasick.Automaton, fastq: str) -> pd.DataFrame:
    """
    Finds k-mers in the input fastq file

    :param A: Aho-Corasick automaton with all the k-mers loaded into it
    :param fastq: filepath for the input fastq file

    :return: k-mer frequencies at SNP positions found in the test fastq
    """
    kmer_seq_counts = defaultdict(int)
    for _, sequence in fp.parse_fastq(fastq):
        for idx, (_, kmer_seq, _) in A.iter(sequence):
            kmer_seq_counts[kmer_seq] += 1
    res = []
    for kmer_seq, freq in kmer_seq_counts.items():
        kmername, sequence, _ = A.get(kmer_seq)
        res.append((kmername, kmer_seq, freq))
    # Build the result frame in one shot; row-wise DataFrame.append is
    # deprecated (removed in pandas 2.x) and the tuple-splitting helper
    # it replaced was unnecessary.
    kmer_df = pd.DataFrame(res, columns=['POS', 'kmer_seq', 'freq'])
    return kmer_df
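The automaton this helper expects stores, for every k-mer sequence, a (kmername, kmer_seq, is_revcomp) tuple, which is exactly what A.iter() and A.get() unpack above. A minimal construction sketch with made-up k-mers (the names and sequences are illustrative, not from the original project):

import ahocorasick

# Hypothetical SNP k-mers; the value layout (kmername, kmer_seq, is_revcomp)
# matches what kmer_info() unpacks from A.iter() and A.get().
scheme_kmers = [
    ('123-A', 'ACGTACGTAACGTACGT', False),
    ('123-G', 'ACGTACGTGACGTACGT', False),
]

A = ahocorasick.Automaton()
for kmername, kmer_seq, is_revcomp in scheme_kmers:
    A.add_word(kmer_seq, (kmername, kmer_seq, is_revcomp))
A.make_automaton()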
Code example #2
File: quiz5.py Project: rxlukas/cs329-1
def match(AC: ahocorasick.Automaton,
          tokens: List[str]) -> List[Tuple[str, int, int, Set[str]]]:
    """
    :param AC: the finalized Aho-Corasick automaton.
    :param tokens: the list of input tokens.
    :return: a list of tuples where each tuple consists of
             - span: str,
             - start token index (inclusive): int
             - end token index (exclusive): int
             - a set of values for the span: Set[str]
    """
    smap, emap, idx = dict(), dict(), 0
    for i, token in enumerate(tokens):
        smap[idx] = i
        idx += len(token)
        emap[idx] = i
        idx += 1

    # find matches
    text = ' '.join(tokens)
    spans = []
    for eidx, t in AC.iter(text):
        eidx += 1
        sidx = eidx - len(t.span)
        sidx = smap.get(sidx, None)
        eidx = emap.get(eidx, None)
        if sidx is None or eidx is None: continue
        spans.append((t.span, sidx, eidx + 1, t.values))

    return spans
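match() only relies on the stored value exposing .span and .values attributes. A minimal, hypothetical construction sketch (the Entry type and the example spans are assumptions for illustration, not part of the original quiz):

from typing import NamedTuple, Set
import ahocorasick

class Entry(NamedTuple):
    # Hypothetical value type: match() only reads .span and .values.
    span: str
    values: Set[str]

AC = ahocorasick.Automaton()
for entry in [Entry('New York', {'city'}), Entry('York', {'city', 'surname'})]:
    AC.add_word(entry.span, entry)
AC.make_automaton()

print(match(AC, ['I', 'love', 'New', 'York']))
# e.g. [('New York', 2, 4, {'city'}), ('York', 3, 4, {'city', 'surname'})]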
Code example #3
def extract_tokenized(line: str, wn: Type[ExtractableWordnet], auto: Automaton,
                      id: str) -> TokenizedTagging:
    tagging = TokenizedTagging(wn)
    tokens = line.split(" ")
    starts = list(get_tokens_starts(tokens))
    extract_tokenized_iter(tagging, auto.iter(tokens), wn, tokens, starts, id)
    return tagging
Code example #4
def benchmark_pyahocorasick(LINE):
    from ahocorasick import Automaton, STORE_INTS

    automaton = Automaton()
    for i, key in enumerate(KEYS):
        automaton.add_word(key, key)
    automaton.make_automaton()

    print(list(automaton.iter(LINE)))

    benchmark("list(automaton.iter(LINE))", locals())
Code example #5
def extract_auto(line: str, wn: Type[ExtractableWordnet], auto: Automaton,
                 from_id: str) -> UntokenizedTagging:
    tagging = UntokenizedTagging(wn)
    for tok_idx, (end_pos, (token, wn_to_lemma)) in enumerate(auto.iter(line)):
        groups = wn.synset_group_lemmas(objify_lemmas(wn_to_lemma))
        tags = []
        for group in groups:
            tag_group = TaggedLemma(token)
            tag_group.lemma_objs = group
            tags.append(tag_group)
        tagging.add_tags(token, [Anchor(from_id, end_pos - len(token) + 1)],
                         tags)
    return tagging
Code example #6
File: acm.py Project: dorbarker/kmer-mlst
def match_kmers_to_reads(A: Automaton, *reads_paths) -> pd.DataFrame:
    kmer_counts = {}
    for reads in reads_paths:
        for sequence in yield_reads(reads):
            for _, (_, kmer) in A.iter(sequence):
                try:
                    kmer_counts[kmer] += 1
                except KeyError:
                    kmer_counts[kmer] = 1
    return pd.DataFrame(pd.Series(kmer_counts, name='count', dtype=int))
Code example #7
def find_in_fasta(A: Automaton, fasta: str) -> pd.DataFrame:
    """Find scheme kmers in input fasta file

    Args:
        A: Aho-Corasick Automaton with scheme SNV target kmers loaded
        fasta: Input fasta path

    Returns:
        Dataframe with any matches found in input fasta file
    """
    res = []
    for contig_header, sequence in parse_fasta(fasta):
        for idx, (kmername, kmer_seq, is_revcomp) in A.iter(sequence):
            res.append((kmername, kmer_seq, is_revcomp, contig_header, idx))
    df = pd.DataFrame(
        res,
        columns=['kmername', 'seq', 'is_revcomp', 'contig_id', 'match_index'])
    return df
Code example #8
def tag_with_dict(company_trie: Automaton,
                  sents: list,
                  duplicate=None) -> tuple:
    sent_tags = []
    sent_text = []
    for sent in sents:
        text = ''.join(sent).strip()
        text = unicodedata.normalize('NFKC', text)
        chunks = []
        tags = ['O'] * len(text)
        # find all chunks
        for idx, (_, w) in company_trie.iter(text):
            end_idx = idx + 1
            start_idx = end_idx - len(w)
            chunks.append(
                [start_idx, end_idx, w]
            )  # [[48, 53, '愛知学泉大'], [122, 130, 'シャンソン化粧品'], [131, 135, 'ジャパン'], [131, 139, 'ジャパンエナジー'], [133, 134, 'パ'], [140, 144, '第一勧銀']]
        # find chunks
        if len(chunks) != 0:
            # filter chunks
            chunks = filter_chunks(
                chunks
            )  # [[122, 130, 'シャンソン化粧品'], [131, 139, 'ジャパンエナジー'], [140, 144, '第一勧銀']]
            # generate labels
            for chunk in chunks:
                start_idx, end_idx, chunk_text = chunk
                # when a duplicate list is given, only tag names that appear
                # more than once in the dataset (e.g. 'シャンソン化粧品')
                if duplicate and chunk_text not in duplicate:
                    continue
                for tag_idx in range(start_idx, end_idx):
                    if tag_idx == start_idx:
                        tags[tag_idx] = 'B-company'
                    else:
                        tags[tag_idx] = 'I-company'
        sent_tags.append(tags)
        sent_text.append([x for x in text])
    return sent_tags, sent_text
Code example #9
class AhoCorasickPathGenerator:
    def __init__(self, identifier_mapper, identifiers):
        self.identifier_mapper = identifier_mapper
        self.identifiers = identifiers
        self.automaton = Automaton()
        for identifier in identifiers:
            mapped = identifier_mapper(identifier)
            self.automaton.add_word(identifier, (len(identifier), mapped))
        self.automaton.make_automaton()
        self.dest_dirs = set()

    def blind_path(self, path):
        out = ''
        idx = 0
        for end_position, (length, mapped) in self.automaton.iter(path):
            end_idx = end_position + 1
            start_idx = end_idx - length
            out += path[idx:start_idx] + mapped
            idx = end_idx
        out += path[idx:]
        return out

    def __call__(self, input_dir, output_dir):
        for root, dirs, files in os.walk(input_dir):
            for name in files:
                source_file_name = os.path.join(root, name)
                relpath = os.path.relpath(
                    source_file_name,
                    start=input_dir,
                )
                dest_file_name = output_dir / self.blind_path(relpath)
                self.dest_dirs.add(abspath(dest_file_name.parent))
                yield (
                    abspath(source_file_name),
                    abspath(dest_file_name),
                )

    @property
    def init_lines(self):
        return "\n".join(f'mkdir -p "{dest_dir}"'
                         for dest_dir in self.dest_dirs) + "\n"
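A brief usage sketch (the mapper, identifiers, and directory names below are invented for illustration; os, os.path.abspath, and ahocorasick.Automaton are assumed to be imported as in the original module):

from pathlib import Path

# Hypothetical anonymizer: every known identifier found in a path is
# replaced by the string returned from the mapper.
gen = AhoCorasickPathGenerator(
    identifier_mapper=lambda ident: 'ANON',
    identifiers=['alice', 'bob'],
)

print(gen.blind_path('data/alice/2020/report_bob.txt'))
# -> data/ANON/2020/report_ANON.txt

# Walk an input tree and yield (source, blinded destination) pairs:
for src, dest in gen(Path('raw_data'), Path('blinded_data')):
    print(src, '->', dest)
print(gen.init_lines)  # "mkdir -p ..." for every destination directory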
Code example #10
def find_in_fastqs(A: Automaton, *fastqs):
    """Find scheme kmers in input fastq files

    Args:
        A: Aho-Corasick Automaton with scheme SNV target kmers loaded
        fastqs: Input fastq file paths

    Returns:
        Dataframe with any matches found in input fastq files
    """
    kmer_seq_counts = defaultdict(int)
    for fastq in fastqs:
        for _, sequence in parse_fastq(fastq):
            for idx, (_, kmer_seq, _) in A.iter(sequence):
                kmer_seq_counts[kmer_seq] += 1
    res = []
    for kmer_seq, freq in kmer_seq_counts.items():
        kmername, sequence, _ = A.get(kmer_seq)
        res.append((kmername, kmer_seq, freq))
    df = pd.DataFrame(res, columns=['kmername', 'seq', 'freq'])
    return df
Code example #11
class Gazetteer:
    def __init__(self, gaze_file=data_path):
        self.locations = {}
        self.vocab_to_location = {}
        self.automaton = Automaton()

        with open(gaze_file) as cin:
            self.load_gazes(cin)

        self.automaton.make_automaton()

    def load_gazes(self, cin):
        for line in cin:
            line = line.split('\t')
            line[-1] = line[-1].rstrip()
            self.locations[line[0]] = tuple(line)

            for vocab in line[3:]:
                if vocab in self.vocab_to_location:
                    self.vocab_to_location[vocab].append(line[0])
                else:
                    self.vocab_to_location[vocab] = [line[0]]

        for vocab, value in self.vocab_to_location.items():
            self.automaton.add_word(vocab, tuple(value))

    def match(self, string):
        ret = {}

        for end_index, value in self.automaton.iter(string):
            for lid in value:
                if lid in ret:
                    ret[lid] = (ret[lid][0], ret[lid][1] + 1)
                else:
                    ret[lid] = (self.locations[lid], 1)

        return ret
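A usage sketch under the file format the loader implies: tab-separated rows whose first column is a location id and whose columns from index 3 onward are name variants (the file name, the meaning of columns 1-2, and the contents here are invented):

# gazes.tsv (tab-separated), e.g.:
# TYO	35.68	139.69	Tokyo	東京
# OSA	34.69	135.50	Osaka	大阪
gazetteer = Gazetteer(gaze_file='gazes.tsv')
hits = gazetteer.match('Flights from Tokyo to Osaka and back to Tokyo')
# hits maps location id -> (location record, number of mentions),
# e.g. {'TYO': (('TYO', '35.68', '139.69', 'Tokyo', '東京'), 2),
#       'OSA': (('OSA', '34.69', '135.50', 'Osaka', '大阪'), 1)}
print(hits)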
Code example #12
File: issue_53.py Project: zhu/pyahocorasick
from ahocorasick import Automaton
auto = Automaton()
auto.add_word('wounded', 'wounded')

auto.make_automaton()

for item in auto.iter('Winning \U0001F629 so gutted, can\'t do anything for 4 weeks... Myth. #wounded'):
    print(item)

for item in auto.iter('Winning so gutted, can\'t do anything for 4 weeks... Myth. #wounded'):
    print(item)
Code example #13
class TrieTree:
    '''
    Trie tree class for matching words against dictionaries.

    Parameters
    ----------
    paths: one vocabulary file name or a list of them (str or list).
        Each line must be tab-separated:
        column 1: the word,
        column 2: the information (tag) attached to the word,
        column 3: a value attached to that information (defaults to True if absent),
        e.g.:
        中国 LOC 0.8
        美国 国家

    tp: the match type, one of "c, m, mc" (default "mc"), meaning:
        c:  "BIES + _ + tag"
        m:  "BIES + _"
        mc: "BIES + _" and "BIES + _ + tag"

    Return
    ------
    defaultdict(int, {idx_0:{feature: value}, idx_1:...})
    A feature collection keyed by character index, mapping each index to its feature dict.


    Examples
    --------
    >>> trietree_c = TrieTree(paths=your_vocab_files, tp='c')
    >>> trietree_c("中国是一个国家")
    defaultdict(int, {0: {'B_LOC': True}, 1: {'E_LOC': True}})

    >>> trietree_m = TrieTree(paths=your_vocab_files, tp='m')
    >>> trietree_m("中国是一个国家")
    defaultdict(int, {0: {'B': True}, 1: {'E': True}})

    >>> trietree_mc = TrieTree(paths=your_vocab_files, tp='mc')
    >>> trietree_mc("中国是一个国家")
    defaultdict(int,
            {0: {'B': True, 'B_LOC': True}, 1: {'E': True, 'E_LOC': True}})

    '''
    def __init__(self,
                 vocab_paths,
                 vocab_match_type='mc',
                 drop_vocab_pro=0,
                 vocab_name_space=False,
                 separator='\t'):
        self.match_cnt = Counter()
        self.user_automaton = {}
        self.keep_vocab_pro = 1 - drop_vocab_pro
        self.vocab_name_space = vocab_name_space
        self.vmp = vocab_match_type
        self.load_vocab(vocab_paths, separator=separator)
        self.cnt = Counter()

        print('trietree:\ntp: %s\n, vocab path:%s' %
              (self.vmp, str(vocab_paths)))
        if self.keep_vocab_pro < 1:
            print('drop vocab pro', self.keep_vocab_pro)

    def __call__(self, *args, **kwargs):
        vocab_feature = self._vocab_feature(*args, **kwargs)
        return vocab_feature

    def load_vocab(self, paths, add=False, separator='\t'):
        if add and hasattr(self, 'automaton'):
            pass
        else:
            self.automaton = Automaton()

        vocab = defaultdict(list)
        tags = set()
        if isinstance(paths, str):
            paths = [paths]
        for path in paths:
            name_space = os.path.split(path)[-1]
            print('read %s' % path)
            output = os.popen('wc -l ' + path)
            total = int(output.readline().split()[0])
            with open(path, 'r') as r_f:
                print('vocab file Examples:')
                for n, line in enumerate(r_f):
                    print(line.strip())
                    if n >= 10:
                        break
                r_f.seek(0)
                for line in tqdm(r_f, desc='read file', total=total):
                    if random.random() > self.keep_vocab_pro:
                        continue
                    splits = line.strip().split(separator)
                    try:
                        if len(splits) == 2:
                            word, tag = splits
                            value = True
                        elif len(splits) == 3:
                            word, tag, value = splits
                            value = char2num(value)

                        elif len(splits) == 1:
                            word = splits[0]
                            value = True
                            tag = 'WORD'

                        else:
                            continue

                        if self.vocab_name_space:
                            tag = name_space + '_' + tag
                        vocab[word].append((tag, value))
                        if tag not in tags:
                            tags.add(tag)

                    except Exception as e:
                        print('vocab error: path-%s, line %s' % (path, line),
                              e)
                        continue

        self.tags = tags if not hasattr(self, 'tags') else self.tags | tags

        for word, value in tqdm(vocab.items(), desc='add words'):
            self.automaton.add_word(word, (len(word), word, value))

        print('%s words in total' % len(vocab))
        self.automaton.make_automaton()

    def _vocab_feature(self, sentence):
        vocab_feature = defaultdict(dict)
        self.match(sentence, vocab_feature)
        if self.user_automaton:
            self.match(sentence, vocab_feature, base_or_user='user')

        return vocab_feature

    def match(self, sentence, vocab_feature, base_or_user='base'):

        if base_or_user == 'base':
            result = self.automaton.iter(sentence)
        else:
            result = self.user_automaton.iter(sentence)

        for end_idx, (word_len, _, tag_value) in list(result):

            start_idx = end_idx - word_len + 1
            for tag, value in tag_value:
                self.match_cnt[tag] += 1
                if self.vmp == 'c':
                    tagss = [create_tag(word_len, tag)]
                elif self.vmp == 'm':
                    tagss = [create_tag(word_len, '')]
                elif self.vmp == 'mc':
                    tagss = [
                        create_tag(word_len, tag),
                        create_tag(word_len, '')
                    ]
                else:
                    tagss = []
                for tags in tagss:
                    for idx, tag in zip(range(start_idx, end_idx + 1), tags):
                        vocab_feature[idx][tag] = value

    def init_user_automaton(self):
        self.user_automaton = Automaton()
        self.user_automaton.make_automaton()

    def add_word(self, word, tag, value, update=True):
        '''
        Parameters
        ----------
        word:  the word to match
        tag:   the information attached to the word
        value: the value attached to that information

        Examples
        --------
        >>> trietree.add_word('中国', '国家', True)
        >>> trietree.user_automaton.get('中国')
        (2, '中国', [('国家', True)])
        '''
        have_add = ''
        if self.user_automaton == {}:
            self.init_user_automaton()
        wl, w, tag_values = self.user_automaton.get(word,
                                                    (len(word), word, []))
        for i, (t, v) in enumerate(tag_values):
            if t == tag:
                tag_values[i] = (tag, value)
                break
        else:
            tag_values.append((tag, value))
        self.user_automaton.add_word(w, (wl, w, tag_values))
        if update:
            self.user_automaton.make_automaton()

    def add_words(self, word_tag_values):
        '''
        Equivalent to:

        for word, tag, value in word_tag_values:
            self.add_word(word, tag, value, update=False)

        Examples
        --------
        >>> word_tag_values = [('中国', '面积', 9666), ('中国', '人口', 8888)]
        >>> trietree.add_words(word_tag_values)
        >>> trietree.user_automaton.get('中国')
        (2, '中国', [('面积', 9666), ('人口', 8888)])
        '''
        for word, tag, value in word_tag_values:
            self.add_word(word, tag, value, update=False)
        self.user_automaton.make_automaton()

    def get(self, key, default=None, vocab='all'):
        '''
        Same as dict.get().

        Parameters
        ----------
        vocab:  which vocabulary to look in: 'base' (built-in), 'user' (user-defined)
                or 'all' (both); defaults to 'all'.
        '''
        if vocab == 'base':
            value = self.automaton.get(key, default)
        elif vocab == 'user':
            value = self.user_automaton.get(key, default)
        else:
            value = {
                'base': self.automaton.get(key, default),
                'user': self.user_automaton.get(key, default)
            }
        return value
Code example #14
File: crf_wordseg_util.py Project: bact/nlp-thai
def extract_features(
    doc: str,
    ngram: int = DEFAULT_NGRAM,
    dict_automaton: ahocorasick.Automaton = DICT_AUTOMATON,
) -> List[List]:
    len_doc = len(doc)
    look_range = list(range(1, int(ngram / 2) + 1))

    # Get (start, end) candidates from dictionary
    dict_start_boundaries = set()
    dict_end_boundaries = set()
    for end_index, length in dict_automaton.iter(doc):
        start_index = end_index - length + 1
        dict_start_boundaries.add(start_index)
        dict_end_boundaries.add(end_index)

    doc_features = []
    for i, char in enumerate(doc):
        ct = get_chartype(char)
        char_features = ["bias", "t={}".format(ct)]
        if ct not in GENERIC_CHARTYPES:
            if char == "\n":
                char = "EOL"
            char_features.append("c={}".format(char))

        if i == 0:
            char_features.append("BOS")  # Beginning of string
        elif i == len_doc - 1:
            char_features.append("EOS")  # End of string

        # Look backward
        for j in look_range:
            if i >= j:
                c = doc[i - j]
                ct = get_chartype(c)
                char_features.append("t-{}={}".format(j, ct))
                if ct not in GENERIC_CHARTYPES:
                    if c == "\n":
                        c = "EOL"
                    char_features.append("c-{}={}".format(j, c))
            else:
                break

        # Look forward
        for j in look_range:
            if i < len_doc - j:
                c = doc[i + j]
                ct = get_chartype(c)
                char_features.append("t{}={}".format(j, ct))
                if ct not in GENERIC_CHARTYPES:
                    if c == "\n":
                        c = "EOL"
                    char_features.append("c{}={}".format(j, c))
            else:
                break

        dict_start_boundary = "n"
        if i in dict_start_boundaries:
            dict_start_boundary = "y"
        char_features.append("ds=" + dict_start_boundary)

        dict_end_boundary = "n"
        if i in dict_end_boundaries:
            dict_end_boundary = "y"
        char_features.append("de=" + dict_end_boundary)

        doc_features.append(char_features)

    return doc_features
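extract_features() unpacks each hit from dict_automaton.iter() as (end_index, length), so the value stored for every dictionary word is simply its length. A minimal sketch of building such an automaton (the word list here is made up; the real DICT_AUTOMATON is defined elsewhere in the project):

import ahocorasick

def build_dict_automaton(words):
    # Store each word with its own length as the value, so that
    # iter() yields (end_index, length) pairs as used above.
    automaton = ahocorasick.Automaton()
    for word in words:
        automaton.add_word(word, len(word))
    automaton.make_automaton()
    return automaton

# Hypothetical wordlist for illustration only.
DICT_AUTOMATON = build_dict_automaton(["กิน", "ข้าว", "น้ำ"])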
Code example #15
class ReadTagger:
	def __init__(
		self,
		bc_to_id: Dict[str, str],
		len_linker: int,
		len_primer: int,
		*,
		max_mm: int = 1,
		use_stats: bool = True
	):
		self.bc_to_id = bc_to_id
		self.len_linker = len_linker
		self.len_primer = len_primer
		self.stats = None if not use_stats else dict(
			n_only_primer=0,
			n_multiple_bcs=0,
			n_no_barcode=0,
			n_regular=0,
			n_barcode_mismatch=0,
			n_junk=0,
		)
		
		self.automaton = Automaton()
		all_barcodes, self.blacklist = get_all_barcodes(bc_to_id.keys(), max_mm=max_mm)
		for pattern, barcode in all_barcodes.items():
			self.automaton.add_word(pattern, barcode)
		self.automaton.make_automaton()
	
	def search_barcode(self, read: str) -> Iterator[Tuple[int, int, str]]:
		for end, barcode in self.automaton.iter(read):
			start = end - len(barcode) + 1
			yield start, end + 1, barcode
	
	def tag_read(self, header: str, seq_read: str, seq_qual: str) -> TaggedRead:
		# as ordered set
		matches = OrderedDict((match, None) for match in self.search_barcode(seq_read))
		
		match_iter: Iterator[Tuple[int, int, str]] = iter(matches)
		bc_start, bc_end, barcode = next(match_iter, (None, None, None))
		
		bc_id = self.bc_to_id.get(barcode)
		other_barcodes = frozenset(set(self.bc_to_id[bc] for _, _, bc in match_iter) - {bc_id})
		
		if barcode is not None:
			linker_end = bc_end + self.len_linker if bc_end else None
			
			junk = seq_read[:bc_start] or None
			linker = seq_read[bc_end:linker_end]
			amplicon = seq_read[linker_end:]
			barcode_mismatch = seq_read[bc_start:bc_end] != barcode
		else:
			junk = None
			linker = None
			amplicon = seq_read
			barcode_mismatch = False
		
		read = TaggedRead(
			header, seq_qual, self.len_primer, junk, bc_id,
			linker, amplicon, other_barcodes, barcode_mismatch,
		)
		
		if self.stats is not None:
			for name, pred in PREDS.items():
				if pred(read):
					self.stats[name] += 1
		
		return read
	
	def get_barcode_table(self, plain=False):
		cell_templates = {
			(True, True): '{}',
			(True, False): '<span class="b">{}</span>',
			(False, True): '<span class="a">{}</span>',
			(False, False): '<span class="both">{}</span>',
		}
		
		patterns = sorted({bc for bc_pairs in self.blacklist.values() for pair in bc_pairs for bc in pair})
		sprs = pd.DataFrame(index=patterns, columns=patterns, dtype=str)
		for pattern, bc_pairs in self.blacklist.items():
			for bc1, bc2 in bc_pairs:
				sprs.loc[bc1, bc2] = ''.join(
					cell_templates[bc1[i] == base, bc2[i] == base].format(base)
					for i, base in enumerate(pattern)
				)
		
		with pd.option_context('display.max_colwidth', None):
			html = sprs.to_html(escape=False, na_rep='')
		
		if plain:
			return html
		return HTML_INTRO + html
Code example #16
            if total_words_to_search != total_words_added:
                words_to_search.append(value)
                total_words_added += 1
            if x == 0:
                total_initial_words += 1
            A.add_word(value, value)

print(f"Initial words {total_initial_words}")
print(f"Total patterns on AC trie: {total_initial_words*total_iterations+1}")
A.make_automaton()

start1 = process_time()
for word_to_search in words_to_search:
    start = process_time()
    end = 0
    for match in A.iter(word_to_search):
        pass
end1 = process_time()
print(
    f"Took {end1-start1}sec to match {len(words_to_search)} patterns on a AC automaton with {total_initial_words*total_iterations}"
)

#Took 0.0668650930000001sec for 25000 patterns (change var total_words_to_search, above.)
#Took 0.23291606600000003sec for 100000 patterns
#Took 0.4542991380000001sec for 200000 patterns
#Took 0.684820883sec for 300000 patterns

# Took 0.061951500999999964sec to match 25000 patterns on a AC automaton with 63000
# Took 0.06499120199999997sec to match 25000 patterns on a AC automaton with 126000
# Took 0.066342342sec to match 25000 patterns on a AC automaton with 189000
# Took 0.07048644500000001sec to match 25000 patterns on a AC automaton with 315000