Code Example #1
import ahocorasick
import pandas as pd
from collections import defaultdict

# `fp` is assumed to be the project's fastq-parsing module (it must expose parse_fastq).


def kmer_info(A: ahocorasick.Automaton, fastq: str) -> pd.DataFrame:
    """
    Finds k-mers in the input fastq file
    :param A: Aho-Corasick automaton with all the k-mers loaded into it
    :param fastq: filepath of the input fastq file

    :return: k-mer frequencies at SNP positions found in the test fastq
    """
    # Count how often each k-mer sequence occurs across all reads.
    kmer_seq_counts = defaultdict(int)
    for _, sequence in fp.parse_fastq(fastq):
        for idx, (_, kmer_seq, _) in A.iter(sequence):
            kmer_seq_counts[kmer_seq] += 1
    # Look up each matched k-mer's name (its SNP position) and collect rows.
    res = []
    for kmer_seq, freq in kmer_seq_counts.items():
        kmername, sequence, _ = A.get(kmer_seq)
        res.append((kmername, kmer_seq, freq))
    # Build the DataFrame in one shot; row-by-row DataFrame.append was
    # deprecated in pandas 1.4 and removed in 2.0.
    return pd.DataFrame(res, columns=['POS', 'kmer_seq', 'freq'])
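Both this function and Code Example #3 unpack a (kmername, sequence, extra) 3-tuple from A.get, so the automaton has to be loaded in that format. A minimal sketch of building such an automaton with pyahocorasick, using made-up k-mer names and sequences:

import ahocorasick

# Minimal sketch, with hypothetical SNP-position names and k-mer sequences;
# each value is the (name, sequence, extra) 3-tuple that kmer_info expects.
A = ahocorasick.Automaton()
for kmername, kmer_seq in [('1234-A', 'ACGTAACGTAACGTA'),
                           ('1234-T', 'ACGTATCGTAACGTA')]:
    A.add_word(kmer_seq, (kmername, kmer_seq, None))
A.make_automaton()  # must be called before A.iter() can be used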
Code Example #2
from os import path

import pandas as pd
from ahocorasick import Automaton

# prepare_data and add_concepts are helpers from the project under test.


def test_add_concepts():
    data_path = prepare_data(
        path.join(path.dirname(__file__), "..", "data", "raw",
                  "vocabularies-tiny.zip"))

    dataframe = pd.read_csv(path.join(data_path, "CONCEPT.csv"),
                            sep="\t").dropna(subset=["concept_name"])
    automaton = Automaton()
    automaton = add_concepts(
        automaton, zip(dataframe["concept_name"], dataframe["concept_id"]))

    automaton.make_automaton()

    assert len(tuple(automaton.keys())) == 15791

    first_keys = sorted(automaton.keys())[:10]
    assert first_keys == [
        '% REF', '(1-6)-alpha-glucomannan', '1 alpha-hydroxyergocalciferol',
        "1,1',1'',1'''-(ethylenedinitrilo)tetra-2-propanol",
        '1,1,1-trichloro-2,2,2-trifluoroethane', '1,1-difluoroethane',
        '1,10-decanediol', '1,10-phenanthroline', '1,2,6-hexanetriol',
        '1,2-Dipalmitoylphosphatidylcholine'
    ]

    first_concept_id, first_concept_name = automaton.get(first_keys[0])

    assert (first_concept_id, first_concept_name) == (8514, '% REF')
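add_concepts itself is not shown in this example; given the (concept_id, concept_name) tuple the final assertion unpacks from automaton.get, a minimal equivalent might look like the sketch below (an assumption, not the project's actual implementation):

def add_concepts(automaton, name_id_pairs):
    # Key each concept by its name; store (concept_id, concept_name) so that
    # automaton.get(concept_name) returns the tuple the test above unpacks.
    for concept_name, concept_id in name_id_pairs:
        automaton.add_word(concept_name, (concept_id, concept_name))
    return automaton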
Code Example #3
from collections import defaultdict

import pandas as pd
from ahocorasick import Automaton

# parse_fastq is assumed to come from the project's fastq-parsing module.


def find_in_fastqs(A: Automaton, *fastqs):
    """Find scheme kmers in input fastq files

    Args:
        A: Aho-Corasick Automaton with scheme SNV target kmers loaded
        fastqs: Input fastq file paths

    Returns:
        Dataframe with any matches found in input fastq files
    """
    kmer_seq_counts = defaultdict(int)
    for fastq in fastqs:
        for _, sequence in parse_fastq(fastq):
            for idx, (_, kmer_seq, _) in A.iter(sequence):
                kmer_seq_counts[kmer_seq] += 1
    res = []
    for kmer_seq, freq in kmer_seq_counts.items():
        kmername, sequence, _ = A.get(kmer_seq)
        res.append((kmername, kmer_seq, freq))
    df = pd.DataFrame(res, columns=['kmername', 'seq', 'freq'])
    return df
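Calling it is then a one-liner per sample; the file names below are placeholders:

# Hypothetical paths; A is an automaton loaded as in Code Example #1.
df = find_in_fastqs(A, 'sample_R1.fastq', 'sample_R2.fastq')
print(df.sort_values('freq', ascending=False).head())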
Code Example #4
File: dictionary_match.py  Project: skywindy/carqabot
    # Module-level imports assumed: os, re, ahocorasick.Automaton, and the
    # project's Path configuration object.
    def __create_automaton(self):
        paths = [
            ('Brand', os.path.join(Path.dictionary, 'Brand.txt')),
            ('Car', os.path.join(Path.dictionary, 'Car.txt')),
            ('Train', os.path.join(Path.dictionary, 'Train.txt')),
            ('Predicate', os.path.join(Path.dictionary, 'config.txt'))
        ]
        automaton = Automaton()
        for tag, path in paths:
            with open(path, 'r') as r_f:
                for line in r_f:
                    line = line.rstrip('\n')
                    _, *words = line.split('\t')
                    for word in words:
                        # Strip any parenthesised qualifier; a raw string keeps
                        # the regex escapes intact.
                        word = re.sub(r'\(.*?\)', '', word.lower())
                        # Merge tags for words that appear in several dictionaries.
                        _, tag_set = automaton.get(word, (word, set()))
                        tag_set.add(tag)
                        automaton.add_word(word, (word, tag_set))

        automaton.make_automaton()
        return automaton
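Once built, scanning a sentence with the automaton yields every dictionary hit together with its accumulated tag set. A minimal sketch of that lookup, with a made-up input string and assuming automaton is the value returned above:

# Hypothetical usage; `automaton` is the return value of __create_automaton.
sentence = '宝马是什么品牌'
for end_idx, (word, tag_set) in automaton.iter(sentence):
    start_idx = end_idx - len(word) + 1  # iter() reports each match's end index
    print(start_idx, end_idx, word, tag_set)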
Code Example #5
import os
import random
from collections import Counter, defaultdict

from ahocorasick import Automaton
from tqdm import tqdm

# char2num and create_tag are assumed to be project helpers (numeric value
# parsing and BIES tag construction, respectively).


class TrieTree:
    '''
    Trie class for matching text against dictionaries.

    Parameters
    ----------
    vocab_paths: one dictionary file name or a list of them (str or list).
        Files must be tab-separated, with:
        the word in the first column,
        the word's associated tag in the second column,
        and an optional numeric value attached to the tag in the third
        column (defaults to True when absent), e.g.:
        中国 LOC 0.8
        美国 国家

    vocab_match_type: match type, one of "c", "m", "mc" (default "mc"),
        which produce:
        c:  "BIES + _ + tag"
        m:  "BIES + _"
        mc: both "BIES + _" and "BIES + _ + tag"

    Return
    ------
    defaultdict(dict, {idx_0: {feature: value}, idx_1: ...})
    A feature collection mapping each character index to its feature dict.


    Examples
    --------
    >>> trietree_c = TrieTree(vocab_paths=your_vocab_files, vocab_match_type='c')
    >>> trietree_c("中国是一个国家")
    defaultdict(dict, {0: {'B_LOC': True}, 1: {'E_LOC': True}})

    >>> trietree_m = TrieTree(vocab_paths=your_vocab_files, vocab_match_type='m')
    >>> trietree_m("中国是一个国家")
    defaultdict(dict, {0: {'B': True}, 1: {'E': True}})

    >>> trietree_mc = TrieTree(vocab_paths=your_vocab_files, vocab_match_type='mc')
    >>> trietree_mc("中国是一个国家")
    defaultdict(dict,
            {0: {'B': True, 'B_LOC': True}, 1: {'E': True, 'E_LOC': True}})

    '''
    def __init__(self,
                 vocab_paths,
                 vocab_match_type='mc',
                 drop_vocab_pro=0,
                 vocab_name_space=False,
                 separator='\t'):
        self.match_cnt = Counter()
        self.user_automaton = {}
        self.keep_vocab_pro = 1 - drop_vocab_pro
        self.vocab_name_space = vocab_name_space
        self.vmp = vocab_match_type
        self.load_vocab(vocab_paths, separator=separator)
        self.cnt = Counter()

        print('trietree:\ntp: %s\nvocab path: %s' %
              (self.vmp, str(vocab_paths)))
        if self.keep_vocab_pro < 1:
            print('keep vocab pro', self.keep_vocab_pro)

    def __call__(self, *args, **kwargs):
        vocab_feature = self._vocab_feature(*args, **kwargs)
        return vocab_feature

    def load_vocab(self, paths, add=False, separator='\t'):
        if add and hasattr(self, 'automaton'):
            pass
        else:
            self.automaton = Automaton()

        vocab = defaultdict(list)
        tags = set()
        if isinstance(paths, str):
            paths = [paths]
        for path in paths:
            name_space = os.path.split(path)[-1]
            print('read %s' % path)
            # Line count for the tqdm progress bar (relies on the Unix `wc` tool).
            output = os.popen('wc -l ' + path)
            total = int(output.readline().split()[0])
            with open(path, 'r') as r_f:
                print('vocab file Examples:')
                for n, line in enumerate(r_f):
                    print(line.strip())
                    if n >= 10:
                        break
                r_f.seek(0)
                for line in tqdm(r_f, desc='read file', total=total):
                    if random.random() > self.keep_vocab_pro:
                        continue
                    splits = line.strip().split(separator)
                    try:
                        if len(splits) == 2:
                            word, tag = splits
                            value = True
                        elif len(splits) == 3:
                            word, tag, value = splits
                            value = char2num(value)

                        elif len(splits) == 1:
                            word = splits[0]
                            value = True
                            tag = 'WORD'

                        else:
                            continue

                        if self.vocab_name_space:
                            tag = name_space + '_' + tag
                        vocab[word].append((tag, value))
                        if tag not in tags:
                            tags.add(tag)

                    except Exception as e:
                        print('vocab error: path-%s, line %s' % (path, line),
                              e)
                        continue

        self.tags = tags if not hasattr(self, 'tags') else self.tags | tags

        for word, value in tqdm(vocab.items(), desc='add words'):
            self.automaton.add_word(word, (len(word), word, value))

        print('%s words in total' % len(vocab))
        self.automaton.make_automaton()

    def _vocab_feature(self, sentence):
        vocab_feature = defaultdict(dict)
        self.match(sentence, vocab_feature)
        if self.user_automaton:
            self.match(sentence, vocab_feature, base_or_user='user')

        return vocab_feature

    def match(self, sentence, vocab_feature, base_or_user='base'):

        if base_or_user == 'base':
            result = self.automaton.iter(sentence)
        else:
            result = self.user_automaton.iter(sentence)

        for end_idx, (word_len, _, tag_value) in list(result):

            start_idx = end_idx - word_len + 1
            for tag, value in tag_value:
                self.match_cnt[tag] += 1
                if self.vmp == 'c':
                    tagss = [create_tag(word_len, tag)]
                elif self.vmp == 'm':
                    tagss = [create_tag(word_len, '')]
                elif self.vmp == 'mc':
                    tagss = [
                        create_tag(word_len, tag),
                        create_tag(word_len, '')
                    ]
                else:
                    tagss = []
                for tags in tagss:
                    for idx, tag in zip(range(start_idx, end_idx + 1), tags):
                        vocab_feature[idx][tag] = value

    def init_user_automaton(self):
        self.user_automaton = Automaton()
        self.user_automaton.make_automaton()

    def add_word(self, word, tag, value, update=True):
        '''
        Parameters
        ----------
        word:  the word to match
        tag:   information associated with the word
        value: a value attached to that information

        Examples
        --------
        >>> trietree.add_word('中国', '国家', True)
        >>> trietree.user_automaton.get('中国')
        (2, '中国', [('国家', True)])
        '''
        if self.user_automaton == {}:
            self.init_user_automaton()
        wl, w, tag_values = self.user_automaton.get(word,
                                                    (len(word), word, []))
        for i, (t, v) in enumerate(tag_values):
            if t == tag:
                tag_values[i] = (tag, value)
                break
        else:
            tag_values.append((tag, value))
        self.user_automaton.add_word(w, (wl, w, tag_values))
        if update:
            self.user_automaton.make_automaton()

    def add_words(self, word_tag_values):
        '''
        Equivalent to:

        for word, tag, value in word_tag_values:
            self.add_word(word, tag, value, update=False)

        Examples
        --------
        >>> word_tag_values = [('中国', '面积', 9666), ('中国', '人口', 8888)]
        >>> trietree.add_words(word_tag_values)
        >>> trietree.user_automaton.get('中国')
        (2, '中国', [('面积', 9666), ('人口', 8888)])

        '''
        for word, tag, value in word_tag_values:
            self.add_word(word, tag, value, update=False)
        self.user_automaton.make_automaton()

    def get(self, key, default=None, vocab='all'):
        '''
        Same as dict.get.

        Parameters
        ----------
        vocab:  which dictionary to query: 'base' (built-in), 'user'
                (user-defined), or 'all' (both); defaults to 'all'
        '''
        if vocab == 'base':
            value = self.automaton.get(key, default)
        elif vocab == 'user':
            value = self.user_automaton.get(key, default)
        else:
            value = {
                'base': self.automaton.get(key, default),
                'user': self.user_automaton.get(key, default)
            }
        return value
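A hedged usage sketch for the class, assuming a two-column tab-separated vocab file (word, tag) named vocab.txt; with no third column, values default to True:

# Hypothetical usage; vocab.txt contains lines such as "中国\tLOC".
tree = TrieTree(vocab_paths='vocab.txt', vocab_match_type='mc')
features = tree('中国是一个国家')
# -> defaultdict(dict, {0: {'B': True, 'B_LOC': True}, 1: {'E': True, 'E_LOC': True}})

# Words can also be added to the user-defined dictionary at runtime:
tree.add_word('北京', 'LOC', True)
print(tree.get('北京', vocab='user'))  # (2, '北京', [('LOC', True)])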