def make_fasta_from_list(querylist, queryfasta, gaplen, seqoutname, outfilename):
    """Join the components named in *querylist* and write them as one FASTA record.

    Query list element format (every element has five fields)::

        Gap:    [61252 , (0:61252) , gap , 61252 , 0]
                [length , (T_start:Tstop) , "gap" , length , 0]
        Object: [b40-14.HS_iter4_seq3733|+ , (61252:6463804) , 93612:7595148 , -6402552 , 4526208]
                [ID|strand , (T_start:Tstop) , Q_start:Q_stop , -(alignment length) , matches]

    Gap elements are skipped.  For every other element the ID (minus its
    two-character strand suffix) is looked up in *queryfasta*, reverse
    complemented when the strand is "-", and appended to the growing
    sequence.  Consecutive components are separated by ``gaplen`` "N"
    characters.

    Args:
        querylist: iterable of 5-field elements as described above.
        queryfasta: mapping from component name to a sequence object that
            supports ``reverse_complement()`` and can be added to the running
            sequence (presumably Biopython records -- confirm against caller).
        gaplen: number of "N" characters between components; anything
            accepted by ``int()``.
        seqoutname: identifier given to the output record.
        outfilename: writable file-like object (despite the name, not a path)
            that receives the FASTA text.
    """
    gaplen = int(gaplen)
    gap_filler = "N" * gaplen

    seq = Seq("")
    seq.id = seqoutname

    for component in querylist:
        # Only the ID and the query range are used; the unpack is kept in full
        # so an element with the wrong number of fields still fails loudly.
        comp_id, _t_range, q_range, _alignment, _matches = component
        if q_range == "gap":
            continue

        comp_name = comp_id[:-2]
        orientation = comp_id[-1]

        # Add gap between components
        if str(seq) != "":
            seq = seq + gap_filler

        my_sub_seq = queryfasta[comp_name]
        if orientation == "-":
            my_sub_seq = my_sub_seq.reverse_complement()
        seq = seq + my_sub_seq

    # Write the entire sequence.  The id is (re)assigned here because the
    # object bound to ``seq`` may have been replaced by the additions above.
    seq.id = seqoutname
    seq.description = ""
    # ``print >> handle, text`` only works on Python 2; on Python 3 it raises
    # TypeError.  An explicit write plus the newline that ``print`` appended
    # yields the same output on both.
    outfilename.write(seq.format('fasta') + "\n")
def write_unmapped_scaffold(outblocks, scaffold, stats, unmapped_output, genome, chromosomes):
    """Account for one scaffold region and either write it out or discard it.

    The region runs from the start of the first block to the end of the last
    block in *outblocks*.  It is written when the assembled sequence carries
    features, or when at least one block names a chromosome other than '0'
    and more than one database row was collected; otherwise it is discarded.

    Args:
        outblocks: non-empty sequence of blocks exposing ``chromosome``,
            ``cm``, ``start``, ``end`` and ``length``.
        scaffold: scaffold name; key into ``genome.sequences`` and
            ``genome.newparts``.
        stats: dict of counters.  'scaffolds' and 'scaffold_length' are always
            incremented, then either 'written_scaffolds'/'written_length' or
            'discard_scaffolds'/'discard_length'.
        unmapped_output: list that receives ``[gd.Block(...)]`` for a written
            region.
        genome: object providing ``revised_db``, ``revised_fasta``,
            ``sequences``, ``revised``, ``revised_count``, ``revised_names``,
            ``newparts`` and ``origparts``; the counter, name map and part
            types are mutated in place.
        chromosomes: mapping from integer chromosome number to an object with
            an ``agp`` list plus ``unmapped_start`` and ``unmapped_part``
            counters.

    Raises:
        IndexError: if *outblocks* is empty.
    """
    seq = Seq('')
    dbblocks = []
    scaffold_chromosomes = []
    for block in outblocks:
        if genome.revised_db:
            dbblocks.append([
                block.chromosome, block.cm, scaffold,
                block.start, block.end, block.length,
            ])
        if genome.revised_fasta:
            # Block coordinates are used as 1-based inclusive positions.
            seq += genome.sequences[scaffold][block.start - 1:block.end]
        if block.chromosome != '0':
            scaffold_chromosomes.append(int(block.chromosome))

    stats['scaffolds'] += 1
    scaffold_chromosomes = set(scaffold_chromosomes)
    scaffold_start, scaffold_end = outblocks[0].start, outblocks[-1].end
    scaffold_length = scaffold_end - scaffold_start + 1
    stats['scaffold_length'] += scaffold_length

    # When genome.revised_fasta is unset nothing was added to ``seq`` and it is
    # still the bare Seq('') placeholder, so read ``features`` defensively.
    # NOTE(review): assumes Seq is Bio.Seq.Seq, which has no ``features`` -- confirm.
    has_features = bool(getattr(seq, 'features', None))

    if has_features or (scaffold_chromosomes and len(dbblocks) > 1):
        unmapped_output.append(
            [gd.Block(scaffold, scaffold_start, scaffold_end)])
        scaffold_name = genome.revised + "{:05d}".format(genome.revised_count)
        genome.revised_count += 1
        region_key = "{}_{}_{}".format(scaffold, scaffold_start, scaffold_end)
        genome.revised_names[region_key] = scaffold_name
        stats['written_scaffolds'] += 1
        stats['written_length'] += scaffold_length

        if genome.revised_db:
            for dbblock in dbblocks:
                genome.revised_db.execute(
                    "insert into scaffold_map values (?,?,?,?,?,?)", dbblock)

        if genome.revised_fasta:
            seq.description = "length={}".format(len(seq))
            seq.id = scaffold_name
            SeqIO.write(seq, genome.revised_fasta, "fasta")

        if scaffold_chromosomes:
            # An arbitrary member of the set decides which chromosome's
            # unmapped list receives the scaffold.
            chrom = next(iter(scaffold_chromosomes))
            target = chromosomes[chrom]
            unmapped_name = "chr{}_unmapped".format(chrom)
            chr_unmapped_end = target.unmapped_start + scaffold_length - 1

            # Component line for the scaffold itself ...
            target.agp.append(
                "{}\t{}\t{}\t{}\tD\t{}\t1\t{}\t+\n".format(
                    unmapped_name, target.unmapped_start, chr_unmapped_end,
                    target.unmapped_part, scaffold_name, scaffold_length))
            target.unmapped_part += 1
            # ... followed by a fixed 100-base gap line.
            target.agp.append(
                "{}\t{}\t{}\t{}\tN\t100\tfragment\tno\n".format(
                    unmapped_name, chr_unmapped_end + 1,
                    chr_unmapped_end + 100, target.unmapped_part))
            target.unmapped_part += 1
            target.unmapped_start = chr_unmapped_end + 101
    else:
        stats['discard_scaffolds'] += 1
        stats['discard_length'] += scaffold_length

        for dbblock in dbblocks:
            db_scaffold, db_start, db_end, dblength = dbblock[2:6]
            partslength = 0
            for newpart in genome.newparts[scaffold]:
                for origpart in genome.origparts[newpart.oldname]:
                    if db_scaffold != origpart.newname:
                        continue
                    # A part is matched when either of its ends falls inside
                    # the block.
                    # NOTE(review): a part that strictly contains the block is
                    # not matched -- confirm this is intended.
                    starts_inside = db_start <= origpart.newstart <= db_end
                    ends_inside = db_start <= origpart.newend <= db_end
                    if not (starts_inside or ends_inside):
                        continue
                    if origpart.parttype in ('active', 'retained'):
                        partslength += origpart.newend - origpart.newstart + 1
                        origpart.parttype = 'removed'
            # Report blocks whose length is not fully explained by the parts
            # that were just marked as removed.
            if dblength != partslength:
                print(scaffold, dblength, partslength, dbblock)
def write_unmapped_scaffold(outblocks, scaffold, stats, unmapped_output, genome, chromosomes):
    """Account for one scaffold region and either write it out or discard it.

    The region runs from the start of the first block to the end of the last
    block in *outblocks*.  It is written when the assembled sequence carries
    features, or when at least one block names a chromosome other than '0'
    and more than one database row was collected; otherwise it is discarded.

    Args:
        outblocks: non-empty sequence of blocks exposing ``chromosome``,
            ``cm``, ``start``, ``end`` and ``length``.
        scaffold: scaffold name; key into ``genome.sequences`` and
            ``genome.newparts``.
        stats: dict of counters.  'scaffolds' and 'scaffold_length' are always
            incremented, then either 'written_scaffolds'/'written_length' or
            'discard_scaffolds'/'discard_length'.
        unmapped_output: list that receives ``[gd.Block(...)]`` for a written
            region.
        genome: object providing ``revised_db``, ``revised_fasta``,
            ``sequences``, ``revised``, ``revised_count``, ``revised_names``,
            ``newparts`` and ``origparts``; the counter, name map and part
            types are mutated in place.
        chromosomes: mapping from integer chromosome number to an object with
            an ``agp`` list plus ``unmapped_start`` and ``unmapped_part``
            counters.

    Raises:
        IndexError: if *outblocks* is empty.
    """
    seq = Seq('')
    dbblocks = []
    scaffold_chromosomes = []
    for block in outblocks:
        if genome.revised_db:
            dbblocks.append([
                block.chromosome, block.cm, scaffold,
                block.start, block.end, block.length,
            ])
        if genome.revised_fasta:
            # Block coordinates are used as 1-based inclusive positions.
            seq += genome.sequences[scaffold][block.start - 1:block.end]
        if block.chromosome != '0':
            scaffold_chromosomes.append(int(block.chromosome))

    stats['scaffolds'] += 1
    scaffold_chromosomes = set(scaffold_chromosomes)
    scaffold_start, scaffold_end = outblocks[0].start, outblocks[-1].end
    scaffold_length = scaffold_end - scaffold_start + 1
    stats['scaffold_length'] += scaffold_length

    # When genome.revised_fasta is unset nothing was added to ``seq`` and it is
    # still the bare Seq('') placeholder, so read ``features`` defensively.
    # NOTE(review): assumes Seq is Bio.Seq.Seq, which has no ``features`` -- confirm.
    has_features = bool(getattr(seq, 'features', None))

    if has_features or (scaffold_chromosomes and len(dbblocks) > 1):
        unmapped_output.append(
            [gd.Block(scaffold, scaffold_start, scaffold_end)])
        scaffold_name = genome.revised + "{:05d}".format(genome.revised_count)
        genome.revised_count += 1
        region_key = "{}_{}_{}".format(scaffold, scaffold_start, scaffold_end)
        genome.revised_names[region_key] = scaffold_name
        stats['written_scaffolds'] += 1
        stats['written_length'] += scaffold_length

        if genome.revised_db:
            for dbblock in dbblocks:
                genome.revised_db.execute(
                    "insert into scaffold_map values (?,?,?,?,?,?)", dbblock)

        if genome.revised_fasta:
            seq.description = "length={}".format(len(seq))
            seq.id = scaffold_name
            SeqIO.write(seq, genome.revised_fasta, "fasta")

        if scaffold_chromosomes:
            # An arbitrary member of the set decides which chromosome's
            # unmapped list receives the scaffold.
            chrom = next(iter(scaffold_chromosomes))
            target = chromosomes[chrom]
            unmapped_name = "chr{}_unmapped".format(chrom)
            chr_unmapped_end = target.unmapped_start + scaffold_length - 1

            # Component line for the scaffold itself ...
            target.agp.append(
                "{}\t{}\t{}\t{}\tD\t{}\t1\t{}\t+\n".format(
                    unmapped_name, target.unmapped_start, chr_unmapped_end,
                    target.unmapped_part, scaffold_name, scaffold_length))
            target.unmapped_part += 1
            # ... followed by a fixed 100-base gap line.
            target.agp.append(
                "{}\t{}\t{}\t{}\tN\t100\tfragment\tno\n".format(
                    unmapped_name, chr_unmapped_end + 1,
                    chr_unmapped_end + 100, target.unmapped_part))
            target.unmapped_part += 1
            target.unmapped_start = chr_unmapped_end + 101
    else:
        stats['discard_scaffolds'] += 1
        stats['discard_length'] += scaffold_length

        for dbblock in dbblocks:
            db_scaffold, db_start, db_end, dblength = dbblock[2:6]
            partslength = 0
            for newpart in genome.newparts[scaffold]:
                for origpart in genome.origparts[newpart.oldname]:
                    if db_scaffold != origpart.newname:
                        continue
                    # A part is matched when either of its ends falls inside
                    # the block.
                    # NOTE(review): a part that strictly contains the block is
                    # not matched -- confirm this is intended.
                    starts_inside = db_start <= origpart.newstart <= db_end
                    ends_inside = db_start <= origpart.newend <= db_end
                    if not (starts_inside or ends_inside):
                        continue
                    if origpart.parttype in ('active', 'retained'):
                        partslength += origpart.newend - origpart.newstart + 1
                        origpart.parttype = 'removed'
            # Report blocks whose length is not fully explained by the parts
            # that were just marked as removed.
            if dblength != partslength:
                print(scaffold, dblength, partslength, dbblock)