def getAssemblyLength(assembly_fasta):
    """
    Return the abundance-weighted total length of an assembly FASTA file.

    Each record contributes ``len(sequence) * abundance``, where the
    abundance is looked up in the module-level ``contig_abundance`` mapping
    under the contig identifier (the header text before the first space).

    Args:
        assembly_fasta: Assembly FASTA file, handed to ``SeqIO.ParseFasta``.

    Returns:
        The weighted assembly length, accumulated from 0.0 (so 0.0 is
        returned when the parser yields no records).

    NOTE(review): a contig id missing from ``contig_abundance`` presumably
    raises KeyError unless that mapping supplies defaults -- confirm against
    the code that populates it.
    """
    assembly_length = 0.0
    parser = SeqIO.ParseFasta(assembly_fasta)

    # getRecord() returns None once the input is exhausted.
    record = parser.getRecord()
    while record is not None:
        # record[0] is the header line, record[1] the sequence.
        contig_id = record[0].split(' ')[0]
        assembly_length += contig_abundance[contig_id] * len(record[1])

        if debug_level > 0:
            # The value reported is the running total so far, not the
            # length of this contig alone.
            sys.stderr.write('Contig ' + contig_id + ' length: ' +
                             str(assembly_length) + '\n')

        record = parser.getRecord()

    return assembly_length
def build_kmer_to_contig_index(assembly_filename, kmer_size):
    """
    Build an inverted index of canonical k-mers for an assembly FASTA file.

    Every window of ``kmer_size`` characters in each record is filed under
    whichever of the k-mer and its ``revcompl`` result compares lower (when
    the two compare equal the ``revcompl`` result is used as the key).

    Args:
        assembly_filename: Assembly FASTA file, handed to
            ``SeqIO.ParseFasta``.
        kmer_size: Length of the windows to index.

    Returns:
        A ``(kmer_ii, contig_lengths)`` pair:
          * kmer_ii -- defaultdict(list) mapping canonical k-mer to a list
            of ``(record header, 1-based start position)`` tuples.
          * contig_lengths -- defaultdict(int) mapping record header to the
            length of its sequence.
    """
    index = defaultdict(list)
    lengths = defaultdict(int)

    parser = SeqIO.ParseFasta(assembly_filename)

    # getRecord() signals the end of the input by returning None.
    for record in iter(parser.getRecord, None):
        header = record[0]
        sequence = record[1]

        # Records shorter than kmer_size produce no windows at all.
        window_count = len(sequence) - kmer_size + 1
        for start in xrange(window_count):
            forward = sequence[start:start + kmer_size]
            reverse = revcompl(forward)
            canonical = forward if forward < reverse else reverse
            # Positions are stored 1-based.
            index[canonical].append((header, start + 1))

        lengths[header] = len(sequence)

    return index, lengths