コード例 #1
0
ファイル: HaploFunct.py プロジェクト: fennell-lab/HaploSync
def make_fasta_from_list(querylist, queryfasta, gaplen, seqoutname,
                         outfilename):
    """Concatenate the components of ``querylist`` into one FASTA record.

    Each element of ``querylist`` is a 5-item sequence in one of two shapes:

        Gap:    [length,    (T_start:T_stop), "gap",          length,              0]
        Object: [ID|strand, (T_start:T_stop), Q_start:Q_stop, -(alignment length), matches]

    Example object:
        [b40-14.HS_iter4_seq3733|+, (61252:6463804), 93612:7595148, -6402552, 4526208]

    Gap elements are skipped. Object elements are looked up in
    ``queryfasta`` (reverse-complemented when the strand is "-") and joined
    in list order, with ``gaplen`` "N" characters between consecutive
    components.

    Args:
        querylist: Iterable of 5-item entries as described above.
        queryfasta: Mapping from component ID (without the "|strand" suffix)
            to a sequence object providing ``reverse_complement()``.
            NOTE(review): presumably Biopython ``SeqRecord`` values, since the
            result must support ``.format('fasta')`` -- confirm with caller.
        gaplen: Number of "N" characters between components; passed
            through ``int()``.
        seqoutname: Identifier assigned to the output sequence.
        outfilename: Despite the name, an already-open writable file-like
            object; the record is written to it.

    Raises:
        ValueError: If an element of ``querylist`` does not have exactly
            five items.
        KeyError: If a component ID is missing from ``queryfasta`` (for
            dict-like mappings).
    """
    gaplen = int(gaplen)

    seq = Seq("")
    # The id is assigned again after the loop; that later assignment is the
    # one in effect when the record is formatted.
    seq.id = seqoutname

    for entry in querylist:
        # Only the identifier and the query-range marker are used; the
        # 5-way unpack is kept so malformed entries still fail loudly.
        Id, _, Q_range, _, _ = entry

        if Q_range == "gap":
            continue

        # Identifier layout is "<name>|<strand>": drop the last two
        # characters for the name, keep the last one as the strand.
        CompntId_name = Id[:-2]
        Orientation = Id[-1]

        # Add gap between components. str() of the initial empty Seq is "",
        # so no spacer is emitted before the first component.
        if str(seq) != "":
            seq = seq + "N" * gaplen

        if Orientation == "-":
            my_sub_seq = queryfasta[CompntId_name].reverse_complement()
        else:
            my_sub_seq = queryfasta[CompntId_name]

        seq = seq + my_sub_seq

    # Emit the entire sequence.
    seq.id = seqoutname
    seq.description = ""
    # Same bytes as the Python 2 statement ``print >> outfilename, ...``
    # (the formatted record followed by one newline), but this form also
    # runs under Python 3, where ``print >> file`` raises TypeError.
    outfilename.write(seq.format('fasta') + "\n")
コード例 #2
0
def write_unmapped_scaffold(outblocks, scaffold, stats, unmapped_output,
                            genome, chromosomes):
    """Write out, or discard, the blocks of one scaffold and update ``stats``.

    For every block in ``outblocks`` the function collects a database row
    (when ``genome.revised_db`` is truthy), appends the block's slice of
    ``genome.sequences[scaffold]`` to the output sequence (when
    ``genome.revised_fasta`` is truthy) and records the block's chromosome
    (when it is not ``'0'``).

    The scaffold is *written* when the assembled sequence carries features,
    or when at least one chromosome was recorded and more than one database
    row was collected. Writing means: a ``gd.Block`` is appended to
    ``unmapped_output``, a new name is allocated from ``genome.revised`` and
    ``genome.revised_count``, rows are inserted into ``scaffold_map``, the
    sequence is written as FASTA, and two AGP lines (the scaffold and a
    100 bp gap) are appended to one chromosome's ``agp`` list.

    Otherwise the scaffold is *discarded*: matching original parts of type
    ``'active'`` or ``'retained'`` are marked ``'removed'`` and a diagnostic
    line is printed when their summed length differs from the block length.

    Args:
        outblocks: Non-empty sequence of block objects exposing
            ``chromosome``, ``cm``, ``start``, ``end`` and ``length``.
        scaffold: Scaffold name; key into ``genome.sequences`` and
            ``genome.newparts``.
        stats: Mapping of counters, updated in place.
        unmapped_output: List that receives ``[gd.Block(...)]`` for written
            scaffolds.
        genome: Project genome object (reads ``revised_db``,
            ``revised_fasta``, ``sequences``, ``revised``, ``newparts``,
            ``origparts``; mutates ``revised_count``, ``revised_names``).
        chromosomes: Mapping from integer chromosome id to an object with
            ``unmapped_start``, ``unmapped_part`` and ``agp``.

    Raises:
        IndexError: If ``outblocks`` is empty.
    """
    seq = Seq('')
    dbblocks = []
    scaffold_chromosomes = []
    for block in outblocks:
        if genome.revised_db:
            dbblocks.append([
                block.chromosome, block.cm, scaffold, block.start, block.end,
                block.length
            ])
        if genome.revised_fasta:
            # Block coordinates are treated as 1-based inclusive here.
            seq += genome.sequences[scaffold][block.start - 1:block.end]
        if block.chromosome != '0':
            scaffold_chromosomes.append(int(block.chromosome))

    stats['scaffolds'] += 1

    scaffold_chromosomes = set(scaffold_chromosomes)
    scaffold_start, scaffold_end = outblocks[0].start, outblocks[-1].end
    scaffold_length = scaffold_end - scaffold_start + 1
    stats['scaffold_length'] += scaffold_length

    # When FASTA output is disabled nothing is appended and ``seq`` is still
    # the bare Seq created above, which has no ``features`` attribute;
    # treat that case as "no features" instead of raising AttributeError.
    has_features = bool(getattr(seq, 'features', None))
    has_chromosome = len(scaffold_chromosomes) > 0

    if has_features or (has_chromosome and len(dbblocks) > 1):
        unmapped_output.append(
            [gd.Block(scaffold, scaffold_start, scaffold_end)])

        scaffold_name = genome.revised + "{:05d}".format(genome.revised_count)
        genome.revised_count += 1
        genome.revised_names["{}_{}_{}".format(scaffold, scaffold_start,
                                               scaffold_end)] = scaffold_name

        stats['written_scaffolds'] += 1
        stats['written_length'] += scaffold_length
        if genome.revised_db:
            for row in dbblocks:
                genome.revised_db.execute(
                    "insert into scaffold_map values (?,?,?,?,?,?)", row)
        if genome.revised_fasta:
            seq.description = "length={}".format(len(seq))
            seq.id = scaffold_name
            SeqIO.write(seq, genome.revised_fasta, "fasta")

        if has_chromosome:
            # If several chromosomes were recorded, whichever one the set
            # yields first receives the AGP entries.
            chrom = next(iter(scaffold_chromosomes))
            target = chromosomes[chrom]
            agp_object = "chr{}_unmapped".format(chrom)
            chr_unmapped_end = target.unmapped_start + scaffold_length - 1

            # Component line for the scaffold itself.
            target.agp.append(
                "{}\t{}\t{}\t{}\tD\t{}\t1\t{}\t+\n".format(
                    agp_object, target.unmapped_start, chr_unmapped_end,
                    target.unmapped_part, scaffold_name, scaffold_length))
            target.unmapped_part += 1

            # Fixed 100 bp gap line following the scaffold.
            target.agp.append(
                "{}\t{}\t{}\t{}\tN\t100\tfragment\tno\n".format(
                    agp_object, chr_unmapped_end + 1, chr_unmapped_end + 100,
                    target.unmapped_part))
            target.unmapped_part += 1
            target.unmapped_start = chr_unmapped_end + 101
    else:
        stats['discard_scaffolds'] += 1
        stats['discard_length'] += scaffold_length
        for dbblock in dbblocks:
            # Row layout: [chromosome, cm, scaffold, start, end, length].
            dblength = dbblock[5]
            partslength = 0
            for newpart in genome.newparts[scaffold]:
                for origpart in genome.origparts[newpart.oldname]:
                    # A part matches when it belongs to this scaffold and
                    # either of its end points lies inside the block.
                    if dbblock[2] == origpart.newname and (
                            dbblock[3] <= origpart.newstart <= dbblock[4]
                            or dbblock[3] <= origpart.newend <= dbblock[4]):
                        if origpart.parttype in ['active', 'retained']:
                            partslength += origpart.newend - origpart.newstart + 1
                            origpart.parttype = 'removed'
            # Diagnostic: removed parts should add up to the block length.
            if dblength != partslength:
                print(scaffold, dblength, partslength, dbblock)
コード例 #3
0
def write_unmapped_scaffold(outblocks, scaffold, stats, unmapped_output, genome, chromosomes):
    """Write out, or discard, the blocks of one scaffold and update ``stats``.

    For every block in ``outblocks`` the function collects a database row
    (when ``genome.revised_db`` is truthy), appends the block's slice of
    ``genome.sequences[scaffold]`` to the output sequence (when
    ``genome.revised_fasta`` is truthy) and records the block's chromosome
    (when it is not ``'0'``).

    The scaffold is *written* when the assembled sequence carries features,
    or when at least one chromosome was recorded and more than one database
    row was collected. Writing means: a ``gd.Block`` is appended to
    ``unmapped_output``, a new name is allocated from ``genome.revised`` and
    ``genome.revised_count``, rows are inserted into ``scaffold_map``, the
    sequence is written as FASTA, and two AGP lines (the scaffold and a
    100 bp gap) are appended to one chromosome's ``agp`` list.

    Otherwise the scaffold is *discarded*: matching original parts of type
    ``'active'`` or ``'retained'`` are marked ``'removed'`` and a diagnostic
    line is printed when their summed length differs from the block length.

    Args:
        outblocks: Non-empty sequence of block objects exposing
            ``chromosome``, ``cm``, ``start``, ``end`` and ``length``.
        scaffold: Scaffold name; key into ``genome.sequences`` and
            ``genome.newparts``.
        stats: Mapping of counters, updated in place.
        unmapped_output: List that receives ``[gd.Block(...)]`` for written
            scaffolds.
        genome: Project genome object (reads ``revised_db``,
            ``revised_fasta``, ``sequences``, ``revised``, ``newparts``,
            ``origparts``; mutates ``revised_count``, ``revised_names``).
        chromosomes: Mapping from integer chromosome id to an object with
            ``unmapped_start``, ``unmapped_part`` and ``agp``.

    Raises:
        IndexError: If ``outblocks`` is empty.
    """
    seq = Seq('')
    dbblocks = []
    scaffold_chromosomes = []
    for block in outblocks:
        if genome.revised_db:
            dbblocks.append([block.chromosome, block.cm, scaffold, block.start, block.end, block.length])
        if genome.revised_fasta:
            # Block coordinates are treated as 1-based inclusive here.
            seq += genome.sequences[scaffold][block.start-1:block.end]
        if block.chromosome != '0':
            scaffold_chromosomes.append(int(block.chromosome))

    stats['scaffolds'] += 1

    scaffold_chromosomes = set(scaffold_chromosomes)
    scaffold_start, scaffold_end = outblocks[0].start, outblocks[-1].end
    scaffold_length = scaffold_end - scaffold_start + 1
    stats['scaffold_length'] += scaffold_length

    # When FASTA output is disabled nothing is appended and ``seq`` is still
    # the bare Seq created above, which has no ``features`` attribute;
    # treat that case as "no features" instead of raising AttributeError.
    has_features = bool(getattr(seq, 'features', None))
    has_chromosome = len(scaffold_chromosomes) > 0

    if has_features or (has_chromosome and len(dbblocks) > 1):
        unmapped_output.append([gd.Block(scaffold, scaffold_start, scaffold_end)])

        scaffold_name = genome.revised + "{:05d}".format(genome.revised_count)
        genome.revised_count += 1
        genome.revised_names["{}_{}_{}".format(scaffold, scaffold_start, scaffold_end)] = scaffold_name

        stats['written_scaffolds'] += 1
        stats['written_length'] += scaffold_length
        if genome.revised_db:
            for row in dbblocks:
                genome.revised_db.execute("insert into scaffold_map values (?,?,?,?,?,?)", row)
        if genome.revised_fasta:
            seq.description = "length={}".format(len(seq))
            seq.id = scaffold_name
            SeqIO.write(seq, genome.revised_fasta, "fasta")

        if has_chromosome:
            # If several chromosomes were recorded, whichever one the set
            # yields first receives the AGP entries.
            chrom = next(iter(scaffold_chromosomes))
            target = chromosomes[chrom]
            agp_object = "chr{}_unmapped".format(chrom)
            chr_unmapped_end = target.unmapped_start + scaffold_length - 1

            # Component line for the scaffold itself.
            target.agp.append("{}\t{}\t{}\t{}\tD\t{}\t1\t{}\t+\n".format(agp_object, target.unmapped_start, chr_unmapped_end, target.unmapped_part, scaffold_name, scaffold_length))
            target.unmapped_part += 1

            # Fixed 100 bp gap line following the scaffold.
            target.agp.append("{}\t{}\t{}\t{}\tN\t100\tfragment\tno\n".format(agp_object, chr_unmapped_end+1, chr_unmapped_end+100, target.unmapped_part))
            target.unmapped_part += 1
            target.unmapped_start = chr_unmapped_end + 101
    else:
        stats['discard_scaffolds'] += 1
        stats['discard_length'] += scaffold_length
        for dbblock in dbblocks:
            # Row layout: [chromosome, cm, scaffold, start, end, length].
            dblength = dbblock[5]
            partslength = 0
            for newpart in genome.newparts[scaffold]:
                for origpart in genome.origparts[newpart.oldname]:
                    # A part matches when it belongs to this scaffold and
                    # either of its end points lies inside the block.
                    if dbblock[2] == origpart.newname and (dbblock[3] <= origpart.newstart <= dbblock[4] or dbblock[3] <= origpart.newend <= dbblock[4]):
                        if origpart.parttype in ['active', 'retained']:
                            partslength += origpart.newend - origpart.newstart + 1
                            origpart.parttype = 'removed'
            # Diagnostic: removed parts should add up to the block length.
            if dblength != partslength:
                print(scaffold, dblength, partslength, dbblock)