コード例 #1
0
ファイル: calc_prob.py プロジェクト: treangen/quast_frc
def getAssemblyLength(assembly_fasta):
    """
    Return the abundance-weighted total length of an assembly.

    Each record read from ``assembly_fasta`` contributes the length of its
    second element (presumably the contig sequence) multiplied by that
    contig's entry in the module-level ``contig_abundance`` mapping.  The
    mapping is keyed by the first space-delimited token of the record's
    first element (presumably the FASTA header line).

    The weighting is applied unconditionally, so ``contig_abundance`` must
    hold an entry for every contig id in the file.
    NOTE(review): presumably the abundances are all 1 when no abundance
    file was supplied -- confirm where ``contig_abundance`` is populated.

    Arguments:
        assembly_fasta -- handed unchanged to ``SeqIO.ParseFasta``.

    Returns:
        The weighted total as a float (0.0 when there are no records).
    """
    assembly_length = 0.0
    pf = SeqIO.ParseFasta(assembly_fasta)
    record = pf.getRecord()
    while record is not None:
        # Only the leading token of the header is used as the lookup key.
        contig_id = record[0].split(' ')[0]
        assembly_length += contig_abundance[contig_id] * len(record[1])
        if debug_level > 0:
            # The value reported is the running weighted total so far, not
            # the length of this individual contig.
            sys.stderr.write('Contig ' + contig_id + ' length: ' +
                             str(assembly_length) + '\n')
        record = pf.getRecord()
    return assembly_length
コード例 #2
0
def build_kmer_to_contig_index(assembly_filename, kmer_size):
    """
    Build an inverted index of the k-mers found in an assembly.

    Every record obtained from ``SeqIO.ParseFasta(assembly_filename)`` is
    scanned with a sliding window of ``kmer_size`` characters.  Each window
    is filed under whichever of the k-mer itself and its ``revcompl()``
    result compares smaller, so both orientations share a single key.

    Returns a pair ``(kmer_index, contig_lengths)``:
        kmer_index     -- defaultdict(list) mapping the chosen k-mer to a
                          list of ``(record[0], position)`` pairs, where
                          ``position`` is the 1-based start of the window.
        contig_lengths -- defaultdict(int) mapping ``record[0]`` to the
                          length of ``record[1]``.

    Records shorter than ``kmer_size`` add no k-mers but still get a
    length entry.
    """
    kmer_index = defaultdict(list)
    lengths_by_contig = defaultdict(int)

    reader = SeqIO.ParseFasta(assembly_filename)
    record = reader.getRecord()
    while record is not None:
        header = record[0]
        sequence = record[1]
        sequence_length = len(sequence)

        for start in xrange(0, sequence_length - kmer_size + 1):
            forward = sequence[start:start + kmer_size]
            reverse = revcompl(forward)
            # Ties go to the reverse complement, which is then the same key.
            canonical = forward if forward < reverse else reverse
            kmer_index[canonical].append((header, start + 1))

        lengths_by_contig[header] = sequence_length
        record = reader.getRecord()

    return kmer_index, lengths_by_contig