def _prefix_marked_lines(path, marker, prefix):
    '''Rewrite the file at path in place, inserting prefix directly after
    marker on every line that starts with marker; other lines are kept as-is.'''
    with open(path) as fh:
        lines = fh.readlines()
    with open(path, 'w') as fh:
        for line in lines:
            if line.startswith(marker):
                line = marker + prefix + line[len(marker):]
            # lines keep their own terminators, so no newline is added here
            fh.write(line)


def run_minimus(fasta, outroot=None, restore_singletons=True, contig_prefix='', qual=None):
    '''Assemble the reads in a fasta file with minimus (AMOS package).

    fasta              -- path to the input fasta file
    outroot            -- output root; when None, fasta minus its last
                          '.'-separated extension is used
    restore_singletons -- when True, also write <outroot>.all.fasta holding
                          the assembled contigs plus every input read that
                          is not listed in <outroot>.minimus.contig
    contig_prefix      -- string prepended to contig names
    qual               -- optional quality file handed to toAmos via -q

    Runs `toAmos` to produce <outroot>.afg, then `minimus`, whose output is
    read back from <outroot>.minimus.contig and <outroot>.minimus.fasta.

    Returns the path to <outroot>.all.fasta when restore_singletons is True,
    otherwise the path to <outroot>.minimus.fasta.
    '''
    if outroot is None:
        outroot = fasta.rsplit('.', 1)[0]

    # NOTE(review): the commands below are built by plain string
    # interpolation and their exit status is ignored, so paths containing
    # spaces or shell metacharacters, or a failed run, are not detected here.
    if qual:
        sys.stderr.write('qualities invoked (%s)\n' % qual)
        os.system('toAmos -s %s -q %s -o %s.afg' % (fasta, qual, outroot))
    else:
        os.system('toAmos -s %s -o %s.afg' % (fasta, outroot))
    os.system('minimus -D TGT=%s.afg %s.minimus' % (outroot, outroot))

    contig_path = outroot + '.minimus.contig'
    minimus_fasta = outroot + '.minimus.fasta'

    if contig_prefix:
        # only lines starting with '##' are renamed; single-'#' lines stay
        _prefix_marked_lines(contig_path, '##', contig_prefix)

    if not restore_singletons:
        if contig_prefix:
            _prefix_marked_lines(minimus_fasta, '>', contig_prefix)
        return minimus_fasta

    # names captured between a '#' and the following '(' in the contig file
    # are treated as reads already placed in the assembly
    with open(contig_path) as fh:
        in_assem = re.findall(r'#(.+?)\(', fh.read())

    reads = Seq.Fasta(fasta)
    for name in in_assem:
        del reads[name]

    all_fasta = outroot + '.all.fasta'
    assem = Seq.Fasta(minimus_fasta)
    allseq = Seq.Fasta()
    # the contig fasta on disk is not rewritten in this branch; the prefix is
    # applied to the names as they are merged
    allseq.update(dict((contig_prefix + k, v) for k, v in assem.items()))
    allseq.update(reads)
    allseq.write_to_file(all_fasta)
    return all_fasta
def build_fasta_from_scaff_gff(infasta_s, gff, contig_prefix='', include_singletons=True,
                               ol_minID=0.9, outfile=None, mum_len='4'):
    '''Build a single assembly from scaffolding records that refer to infasta.

    Takes scaffolding information from a gff of the form generated by
    get_scaff_from_minimus and joins the referenced sequences into one
    record per scaffold.

    infasta_s          -- a str is loaded with Seq.Fasta(); any other value
                          is deep-copied and used as the fasta object
    gff                -- iterable of records indexable by 'seqid',
                          'attribute_contig', 'attribute_cstart' and
                          'attribute_cend'
    contig_prefix      -- string prepended to every scaffold name
    include_singletons -- when True, sequences of infasta that take part in
                          no multi-sequence scaffold are added unchanged
    ol_minID           -- minimum fraction identity accepted for overlaps
    outfile            -- when set, the assembly is also written there
    mum_len            -- minimum match length passed to Aln.mum as '-l'

    Only records whose seqid is in infasta, and only scaffolds joining two
    or more sequences, are used. Neighbouring fragments separated by a gap
    are joined with a run of 'n'; overlapping fragments are merged when the
    overlap is accepted, otherwise a new record named with the next suffix
    from ('b', 'c', 'd', 'e', 'f') is started.

    Returns the assembly as a Seq.Fasta.
    '''
    if isinstance(infasta_s, str):
        infasta = Seq.Fasta(infasta_s)
    else:
        infasta = deepcopy(infasta_s)

    # NOTE(review): a sixth split within one contig runs past this list and
    # raises IndexError.
    suffixes = ['', 'b', 'c', 'd', 'e', 'f']
    current_suffix = ''

    # use only scaffolding info relevant to the specified infasta
    in_ids = infasta.seq_names()
    this_gff = [r for r in gff if r['seqid'] in in_ids]

    # use only scaffolding info that joins 2 or more seqs
    contigs = {}.fromkeys([r['attribute_contig'] for r in this_gff], 0)
    for r in this_gff:
        contigs[r['attribute_contig']] += 1

    # get final ordered scaffolding layout
    this_gff = sorted(
        [r for r in this_gff if contigs[r['attribute_contig']] > 1],
        key=lambda r: (r['attribute_contig'],
                       int(r['attribute_cstart']),
                       int(r['attribute_cend'])))

    # extract sequences and orient for scaffolding
    assem_frags = infasta.substr_from_gff(this_gff, plus_strand=True, name_key=None)

    assem = Seq.Fasta()
    for k, v in contigs.items():
        if v > 1:
            assem[contig_prefix + k] = Seq.Sequence('')

    if this_gff:
        for i, r in enumerate(this_gff[:-1]):
            nxt = this_gff[i + 1]
            contig_name = contig_prefix + r['attribute_contig']

            if r['attribute_contig'] != nxt['attribute_contig']:
                # last fragment of this contig: append it, reset the suffix
                assem[contig_name + current_suffix] += assem_frags[r['seqid']]
                current_suffix = ''
                continue

            s1, e1, s2, e2 = [
                int(n) for n in [
                    r['attribute_cstart'], r['attribute_cend'],
                    nxt['attribute_cstart'], nxt['attribute_cend']
                ]
            ]
            sys.stderr.write('scaffolding %s %s %s %s:\n\t%s\n\t%s\n' % (
                s1, e1, s2, e2, r, nxt))

            if e1 <= s2:
                # no overlap: pad the gap between the two fragments with 'n'
                spacer = Seq.Sequence('n' * (s2 - e1))
                assem[contig_name + current_suffix] += assem_frags[r['seqid']] + spacer
                continue

            # overlapping contig spans: compare the tail of this fragment
            # with the head of the next one
            tail = assem_frags[r['seqid']][(s2 - s1):]
            head = assem_frags[nxt['seqid']][:(e1 - s2 + 1)]
            sys.stderr.write('OVERLAP:\n\t%s\n\t%s\n' % (tail, head))
            fa1 = Seq.Fasta()
            fa2 = Seq.Fasta()
            fa1['seq1'] = tail
            fa2['seq2'] = head
            shorter = min(len(fa1['seq1']), len(fa2['seq2']))
            min_mum = int(mum_len)
            mums = Aln.mum(fa1, fa2, mumargs={'-l': '%s' % min_mum})[0]
            match = float(sum([mumr['score'] for mumr in mums]))

            # Accept when the overlap is too short to judge, identical
            # enough, literally equal, or either side is simple sequence.
            # The length test comes first so match/shorter is never
            # evaluated with shorter == 0.
            accepted = (
                (shorter <= 2 * min_mum + math.ceil((1 - ol_minID) * shorter)) or
                (match / shorter >= ol_minID) or
                (fa1['seq1'][:shorter] == fa2['seq2'][:shorter]) or
                (Seq.is_simple(fa1['seq1']) or Seq.is_simple(fa2['seq2'])))

            if accepted:
                # Keep only the part of this fragment that precedes the
                # overlap. The overlap tail above starts at fragment offset
                # (s2 - s1), so the kept prefix must end at that same
                # offset; slicing at the contig coordinate s2 re-emitted
                # the overlap whenever s1 > 0.
                assem[contig_name + current_suffix] += assem_frags[r['seqid']][:(s2 - s1)]
            else:
                # implement record of splitting into a/b/etc fragments!
                sys.stdout.write('%s \n%s \n%s\n' % (fa1, fa2, mums))
                current_suffix = suffixes[suffixes.index(current_suffix) + 1]
                sys.stderr.write(
                    'overlap of %s bp %0.2f %%id unresolved (min %0.2f)\nstarting %s\n' % (
                        e1 - s2, match / (e1 - s2), ol_minID, current_suffix))
                # NOTE(review): the new suffixed record is seeded with the
                # current fragment, which is therefore never appended to the
                # previous record - confirm this is the intended split point.
                assem[contig_name + current_suffix] = assem_frags[r['seqid']]

        # the loop stops one short; the final record is always appended whole
        last = this_gff[-1]
        assem[contig_prefix + last['attribute_contig'] + current_suffix] += \
            assem_frags[last['seqid']]

    if include_singletons:
        # build the set of scaffolded ids once instead of once per sequence
        scaffolded_ids = set(r['seqid'] for r in this_gff)
        singletons = dict((k, v) for k, v in infasta.items()
                          if k not in scaffolded_ids)
        assem.update(singletons)

    if outfile:
        assem.write_to_file(outfile)
    return assem