Example #1
0
    def parse(fn):
        """Collect contig lengths and summary statistics from a sequence file.

        Args:
            fn: Value passed straight to ``ReadParser`` (presumably a path to
                a FASTA/FASTQ file -- confirm against the caller).

        Returns:
            A 4-tuple ``(S, cardinality, gc_perc, n_ambiguous)``:

            * ``S`` -- ``pd.Series`` of contig lengths indexed by contig name,
              sorted by length in ascending order.
            * ``cardinality`` -- result of ``hll.estimate_cardinality()``.
            * ``gc_perc`` -- count of ``'G'`` and ``'C'`` characters divided
              by the summed contig length (a fraction, not multiplied by 100).
            * ``n_ambiguous`` -- number of contigs containing at least one
              ``'N'``.

        Raises:
            RuntimeError: If a sequence does not match the ``DNA`` pattern.
        """
        # NOTE(review): first argument is presumably the HLL error rate and K
        # the k-mer size -- confirm against the HLLCounter API.
        hll = HLLCounter(.01, K)
        lens = []
        names = []
        gc_len = 0
        n_ambiguous = 0
        for contig in ReadParser(fn):
            sequence = contig.sequence
            lens.append(len(sequence))
            names.append(contig.name)

            if DNA.match(sequence) is None:
                raise RuntimeError('non-ACGTN characters not supported. '
                                   'Offending transcript: \n>{0}\n{1}\nbad'
                                   .format(contig.name, contig.sequence))
            if 'N' in sequence:
                # Only the copy fed to the counter is rewritten; lengths and
                # GC counts are unaffected by swapping N for A.
                sequence = sequence.replace('N', 'A')
                n_ambiguous += 1

            hll.consume_string(sequence)
            gc_len += contig.sequence.count('C')
            gc_len += contig.sequence.count('G')
        S = pd.Series(lens, index=names)
        try:
            # sort_values() returns a NEW Series; the result must be rebound,
            # otherwise S silently stays in input order.
            S = S.sort_values()
        except AttributeError:
            # Legacy pandas without sort_values(): Series.sort() is in-place.
            S.sort()
        gc_perc = float(gc_len) / S.sum()
        return S, hll.estimate_cardinality(), gc_perc, n_ambiguous
Example #2
0
def get_unique_kmers(mmetsp, fasta):
    """Estimate the cardinality of a FASTA file with an ``HLLCounter``.

    The input name and the resulting estimate are both echoed to stdout.

    Args:
        mmetsp: Not used by this function; kept for the callers' signature.
        fasta: Value forwarded to ``HLLCounter.consume_fasta``.

    Returns:
        Whatever ``HLLCounter.estimate_cardinality()`` reports after the
        file has been consumed.
    """
    print(fasta)
    # NOTE(review): 0.1 / 25 are presumably error rate and k-mer size --
    # confirm against the HLLCounter API.
    hll = HLLCounter(0.1, 25)
    hll.consume_fasta(fasta)
    estimate = hll.estimate_cardinality()
    print(estimate)
    return estimate
Example #3
0
    def parse(fn):
        """Collect contig lengths and summary statistics from a sequence file.

        Args:
            fn: Value passed straight to ``ReadParser`` (presumably a path to
                a FASTA/FASTQ file -- confirm against the caller).

        Returns:
            A 4-tuple ``(S, cardinality, gc_perc, n_ambiguous)``:

            * ``S`` -- ``pd.Series`` of contig lengths indexed by contig name,
              sorted by length in ascending order.
            * ``cardinality`` -- result of ``hll.estimate_cardinality()``.
            * ``gc_perc`` -- count of ``'G'`` and ``'C'`` characters divided
              by the summed contig length (a fraction, not multiplied by 100).
            * ``n_ambiguous`` -- number of contigs containing at least one
              ``'N'``.

        Raises:
            RuntimeError: If a sequence does not match the ``DNA`` pattern.
        """
        # NOTE(review): first argument is presumably the HLL error rate and K
        # the k-mer size -- confirm against the HLLCounter API.
        hll = HLLCounter(.01, K)
        lens = []
        names = []
        gc_len = 0
        n_ambiguous = 0
        for contig in ReadParser(fn):
            sequence = contig.sequence
            lens.append(len(sequence))
            names.append(contig.name)

            if DNA.match(sequence) is None:
                raise RuntimeError('non-ACGTN characters not supported. '
                                   'Offending transcript: \n>{0}\n{1}\nbad'
                                   .format(contig.name, contig.sequence))
            if 'N' in sequence:
                # Only the copy fed to the counter is rewritten; lengths and
                # GC counts are unaffected by swapping N for A.
                sequence = sequence.replace('N', 'A')
                n_ambiguous += 1

            hll.consume_string(sequence)
            gc_len += contig.sequence.count('C')
            gc_len += contig.sequence.count('G')
        S = pd.Series(lens, index=names)
        try:
            # sort_values() returns a NEW Series; the result must be rebound,
            # otherwise S silently stays in input order.
            S = S.sort_values()
        except AttributeError:
            # Legacy pandas without sort_values(): Series.sort() is in-place.
            S.sort()
        gc_perc = float(gc_len) / S.sum()
        # Progress marker written to stdout just before returning.
        print('return')
        return S, hll.estimate_cardinality(), gc_perc, n_ambiguous
Example #4
0
def get_unique_kmers(sra, fasta):
    """Estimate the cardinality of a FASTA file with an ``HLLCounter``.

    The input name and the resulting estimate are both echoed to stdout.

    Args:
        sra: Not used by this function; kept for the callers' signature.
        fasta: Value forwarded to ``HLLCounter.consume_fasta``.

    Returns:
        Whatever ``HLLCounter.estimate_cardinality()`` reports after the
        file has been consumed.
    """
    # Call-form print with a single argument emits the same text on
    # Python 2 and Python 3; the statement form is a SyntaxError on 3.
    print(fasta)
    # NOTE(review): 0.1 / 25 are presumably error rate and k-mer size --
    # confirm against the HLLCounter API.
    counter = HLLCounter(0.1, 25)
    counter.consume_fasta(fasta)
    unique_kmers = counter.estimate_cardinality()
    print(unique_kmers)
    return unique_kmers
Example #5
0
 def parse(fn):
     """Collect contig lengths and summary statistics from a sequence file.

     Args:
         fn: Value passed straight to ``ReadParser`` (presumably a path to
             a FASTA/FASTQ file -- confirm against the caller).

     Returns:
         A 3-tuple ``(S, cardinality, gc_perc)``:

         * ``S`` -- ``pd.Series`` of contig lengths indexed by contig name,
           sorted by length in ascending order.
         * ``cardinality`` -- result of ``hll.estimate_cardinality()``.
         * ``gc_perc`` -- count of ``'G'`` and ``'C'`` characters divided
           by the summed contig length (a fraction, not multiplied by 100).
     """
     # NOTE(review): first argument is presumably the HLL error rate and K
     # the k-mer size -- confirm against the HLLCounter API.
     hll = HLLCounter(.01, K)
     lens = []
     names = []
     gc_len = 0
     for contig in ReadParser(fn):
         lens.append(len(contig.sequence))
         names.append(contig.name)
         hll.consume_string(contig.sequence)
         gc_len += contig.sequence.count('C')
         gc_len += contig.sequence.count('G')
     S = pd.Series(lens, index=names)
     try:
         # Series.sort() no longer exists in current pandas; sort_values()
         # is its replacement and returns a NEW Series, so rebind S.
         S = S.sort_values()
     except AttributeError:
         # Legacy pandas without sort_values(): Series.sort() is in-place.
         S.sort()
     gc_perc = float(gc_len) / S.sum()
     return S, hll.estimate_cardinality(), gc_perc
Example #6
0
 def parse(fn):
     """Collect contig lengths and summary statistics from a sequence file.

     Args:
         fn: Value passed straight to ``ReadParser`` (presumably a path to
             a FASTA/FASTQ file -- confirm against the caller).

     Returns:
         A 3-tuple ``(S, cardinality, gc_perc)``:

         * ``S`` -- ``pd.Series`` of contig lengths indexed by contig name,
           sorted by length in ascending order.
         * ``cardinality`` -- result of ``hll.estimate_cardinality()``.
         * ``gc_perc`` -- count of ``'G'`` and ``'C'`` characters divided
           by the summed contig length (a fraction, not multiplied by 100).
     """
     # NOTE(review): first argument is presumably the HLL error rate and K
     # the k-mer size -- confirm against the HLLCounter API.
     hll = HLLCounter(.01, K)
     lens = []
     names = []
     gc_len = 0
     for contig in ReadParser(fn):
         lens.append(len(contig.sequence))
         names.append(contig.name)
         hll.consume_string(contig.sequence)
         gc_len += contig.sequence.count('C')
         gc_len += contig.sequence.count('G')
     S = pd.Series(lens, index=names)
     try:
         # sort_values() returns a NEW Series; the result must be rebound,
         # otherwise S silently stays in input order.
         S = S.sort_values()
     except AttributeError:
         # Legacy pandas without sort_values(): Series.sort() is in-place.
         S.sort()
     gc_perc = float(gc_len) / S.sum()
     return S, hll.estimate_cardinality(), gc_perc