def parse(fn):
    """Summarise the sequences read from ``fn``.

    Every record yielded by ``ReadParser(fn)`` contributes its length, its
    name, its sequence (fed to an ``HLLCounter``) and its G/C count.

    Args:
        fn: Passed straight to ``ReadParser``; presumably the path of a
            FASTA/FASTQ file -- confirm against the parser in scope.

    Returns:
        A 4-tuple ``(lengths, n_kmers, gc_fraction, n_ambiguous)``:

        * ``lengths`` -- ``pandas.Series`` of sequence lengths indexed by
          record name, sorted by length.
        * ``n_kmers`` -- ``HLLCounter.estimate_cardinality()`` after all
          sequences were consumed.
        * ``gc_fraction`` -- count of ``'G'`` and ``'C'`` characters
          divided by the summed sequence length (a fraction, not a
          percentage, despite the local name ``gc_perc``).
        * ``n_ambiguous`` -- number of records whose sequence contains at
          least one ``'N'``.

    Raises:
        RuntimeError: If a sequence is rejected by the ``DNA`` pattern.
    """
    # NOTE(review): arguments are presumably (error rate, k-mer size) as in
    # khmer's HLLCounter -- confirm against the HLLCounter in scope.
    hll = HLLCounter(.01, K)
    lens = []
    names = []
    gc_len = 0
    n_ambiguous = 0
    for contig in ReadParser(fn):
        sequence = contig.sequence
        lens.append(len(sequence))
        names.append(contig.name)
        if DNA.match(sequence) is None:
            message = ('non-ACGTN characters not supported. '
                       'Offending transcript: \n>{0}\n{1}\nbad')
            raise RuntimeError(message.format(contig.name, contig.sequence))
        if 'N' in sequence:
            # The counter is fed an N-free copy: every 'N' becomes 'A'.
            sequence = sequence.replace('N', 'A')
            n_ambiguous += 1
        hll.consume_string(sequence)
        # G/C are counted on the untouched sequence; the N->A substitution
        # above cannot change these counts.
        gc_len += contig.sequence.count('C')
        gc_len += contig.sequence.count('G')
    S = pd.Series(lens, index=names)
    try:
        # sort_values() returns a NEW Series. The result must be kept,
        # otherwise it is silently discarded and S stays in input order.
        S = S.sort_values()
    except AttributeError:
        # Legacy pandas without sort_values(): Series.sort() sorts in place.
        S.sort()
    gc_perc = float(gc_len) / S.sum()
    return S, hll.estimate_cardinality(), gc_perc, n_ambiguous
def get_unique_kmers(mmetsp, fasta):
    """Print and return the estimated cardinality for ``fasta``.

    The value of ``fasta`` is echoed to stdout, handed to a fresh
    ``HLLCounter(0.1, 25)`` via ``consume_fasta``, and the counter's
    ``estimate_cardinality()`` result is printed and returned.

    Args:
        mmetsp: Not used inside this function; kept in the signature.
        fasta: Passed to ``HLLCounter.consume_fasta``; presumably a FASTA
            file path -- confirm against the HLLCounter in scope.

    Returns:
        Whatever ``estimate_cardinality()`` reports after consuming
        ``fasta``.
    """
    print(fasta)
    # NOTE(review): arguments are presumably (error rate, k-mer size) --
    # confirm against the HLLCounter in scope.
    hll = HLLCounter(0.1, 25)
    hll.consume_fasta(fasta)
    estimate = hll.estimate_cardinality()
    print(estimate)
    return estimate
def parse(fn):
    """Summarise the sequences read from ``fn``.

    Every record yielded by ``ReadParser(fn)`` contributes its length, its
    name, its sequence (fed to an ``HLLCounter``) and its G/C count. The
    literal text ``return`` is printed to stdout just before returning.

    Args:
        fn: Passed straight to ``ReadParser``; presumably the path of a
            FASTA/FASTQ file -- confirm against the parser in scope.

    Returns:
        A 4-tuple ``(lengths, n_kmers, gc_fraction, n_ambiguous)``:

        * ``lengths`` -- ``pandas.Series`` of sequence lengths indexed by
          record name, sorted by length.
        * ``n_kmers`` -- ``HLLCounter.estimate_cardinality()`` after all
          sequences were consumed.
        * ``gc_fraction`` -- count of ``'G'`` and ``'C'`` characters
          divided by the summed sequence length (a fraction, not a
          percentage, despite the local name ``gc_perc``).
        * ``n_ambiguous`` -- number of records whose sequence contains at
          least one ``'N'``.

    Raises:
        RuntimeError: If a sequence is rejected by the ``DNA`` pattern.
    """
    # NOTE(review): arguments are presumably (error rate, k-mer size) as in
    # khmer's HLLCounter -- confirm against the HLLCounter in scope.
    hll = HLLCounter(.01, K)
    lens = []
    names = []
    gc_len = 0
    n_ambiguous = 0
    for contig in ReadParser(fn):
        sequence = contig.sequence
        lens.append(len(sequence))
        names.append(contig.name)
        if DNA.match(sequence) is None:
            message = ('non-ACGTN characters not supported. '
                       'Offending transcript: \n>{0}\n{1}\nbad')
            raise RuntimeError(message.format(contig.name, contig.sequence))
        if 'N' in sequence:
            # The counter is fed an N-free copy: every 'N' becomes 'A'.
            sequence = sequence.replace('N', 'A')
            n_ambiguous += 1
        hll.consume_string(sequence)
        # G/C are counted on the untouched sequence; the N->A substitution
        # above cannot change these counts.
        gc_len += contig.sequence.count('C')
        gc_len += contig.sequence.count('G')
    S = pd.Series(lens, index=names)
    try:
        # sort_values() returns a NEW Series. The result must be kept,
        # otherwise it is silently discarded and S stays in input order.
        S = S.sort_values()
    except AttributeError:
        # Legacy pandas without sort_values(): Series.sort() sorts in place.
        S.sort()
    gc_perc = float(gc_len) / S.sum()
    # Existing stdout marker, kept so the function's output is unchanged.
    print('return')
    return S, hll.estimate_cardinality(), gc_perc, n_ambiguous
def get_unique_kmers(sra, fasta):
    """Print and return the estimated cardinality for ``fasta``.

    The value of ``fasta`` is echoed to stdout, handed to a fresh
    ``HLLCounter(0.1, 25)`` via ``consume_fasta``, and the counter's
    ``estimate_cardinality()`` result is printed and returned.

    Args:
        sra: Not used inside this function; kept in the signature.
        fasta: Passed to ``HLLCounter.consume_fasta``; presumably a FASTA
            file path -- confirm against the HLLCounter in scope.

    Returns:
        Whatever ``estimate_cardinality()`` reports after consuming
        ``fasta``.
    """
    # Single-argument print(...) emits the same text under Python 2 and
    # Python 3; the bare ``print x`` statement is a SyntaxError on Python 3.
    print(fasta)
    # NOTE(review): arguments are presumably (error rate, k-mer size) --
    # confirm against the HLLCounter in scope.
    counter = HLLCounter(0.1, 25)
    counter.consume_fasta(fasta)
    unique_kmers = counter.estimate_cardinality()
    print(unique_kmers)
    return unique_kmers
def parse(fn):
    """Summarise the sequences read from ``fn``.

    Every record yielded by ``ReadParser(fn)`` contributes its length, its
    name, its sequence (fed to an ``HLLCounter``) and its G/C count.

    Args:
        fn: Passed straight to ``ReadParser``; presumably the path of a
            FASTA/FASTQ file -- confirm against the parser in scope.

    Returns:
        A 3-tuple ``(lengths, n_kmers, gc_fraction)``:

        * ``lengths`` -- ``pandas.Series`` of sequence lengths indexed by
          record name, sorted by length.
        * ``n_kmers`` -- ``HLLCounter.estimate_cardinality()`` after all
          sequences were consumed.
        * ``gc_fraction`` -- count of ``'G'`` and ``'C'`` characters
          divided by the summed sequence length (a fraction, not a
          percentage, despite the local name ``gc_perc``).
    """
    # NOTE(review): arguments are presumably (error rate, k-mer size) as in
    # khmer's HLLCounter -- confirm against the HLLCounter in scope.
    hll = HLLCounter(.01, K)
    lens = []
    names = []
    gc_len = 0
    for contig in ReadParser(fn):
        lens.append(len(contig.sequence))
        names.append(contig.name)
        hll.consume_string(contig.sequence)
        gc_len += contig.sequence.count('C')
        gc_len += contig.sequence.count('G')
    S = pd.Series(lens, index=names)
    try:
        # Series.sort() no longer exists in modern pandas; sort_values()
        # is its replacement and returns a NEW Series, so keep the result.
        S = S.sort_values()
    except AttributeError:
        # Legacy pandas without sort_values(): Series.sort() sorts in place.
        S.sort()
    gc_perc = float(gc_len) / S.sum()
    return S, hll.estimate_cardinality(), gc_perc
def parse(fn):
    """Summarise the sequences read from ``fn``.

    Every record yielded by ``ReadParser(fn)`` contributes its length, its
    name, its sequence (fed to an ``HLLCounter``) and its G/C count.

    Args:
        fn: Passed straight to ``ReadParser``; presumably the path of a
            FASTA/FASTQ file -- confirm against the parser in scope.

    Returns:
        A 3-tuple ``(lengths, n_kmers, gc_fraction)``:

        * ``lengths`` -- ``pandas.Series`` of sequence lengths indexed by
          record name, sorted by length.
        * ``n_kmers`` -- ``HLLCounter.estimate_cardinality()`` after all
          sequences were consumed.
        * ``gc_fraction`` -- count of ``'G'`` and ``'C'`` characters
          divided by the summed sequence length (a fraction, not a
          percentage, despite the local name ``gc_perc``).
    """
    # NOTE(review): arguments are presumably (error rate, k-mer size) as in
    # khmer's HLLCounter -- confirm against the HLLCounter in scope.
    hll = HLLCounter(.01, K)
    lens = []
    names = []
    gc_len = 0
    for contig in ReadParser(fn):
        lens.append(len(contig.sequence))
        names.append(contig.name)
        hll.consume_string(contig.sequence)
        gc_len += contig.sequence.count('C')
        gc_len += contig.sequence.count('G')
    S = pd.Series(lens, index=names)
    try:
        # sort_values() returns a NEW Series. The result must be kept,
        # otherwise it is silently discarded and S stays in input order.
        S = S.sort_values()
    except AttributeError:
        # Legacy pandas without sort_values(): Series.sort() sorts in place.
        S.sort()
    gc_perc = float(gc_len) / S.sum()
    return S, hll.estimate_cardinality(), gc_perc