def ace2fasta(in_file, out_file):
    ace_gen = Ace.parse(open(in_file, 'r'))
    with open(out_file, "w") as output_file:
        while 1:
            try:
                contig = ace_gen.next()
            except:
                print "All contigs treated"
                break
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
            
            # Now we have started our alignment we can add sequences to it 
            # Add concensus sequence to alignment
            align.add_sequence(contig.name, contig.sequence)
            
            for readn in xrange(len(contig.reads)):
                clipst = contig.reads[readn].qa.qual_clipping_start
                clipe = contig.reads[readn].qa.qual_clipping_end
                start = contig.af[readn].padded_start
                seq = cut_ends(contig.reads[readn].rd.sequence, clipst, clipe)
                seq = pad_read(seq, start, len(contig.sequence))
                if "pseudo" not in contig.reads[readn].rd.name:
                    align.add_sequence(contig.reads[readn].rd.name, seq)
            
            output_file.write(align.format("fasta"))
Exemple #2
0
 def __init__(self, ace_file):
     """Load an ACE file that must contain exactly one contig.

     ace_file -- path of the ACE file to read

     Stores the parsed records, the single contig, its consensus sequence
     and name, and the number of reads.  The reference sequence and its
     name start out as empty strings.

     Raises AssertionError if the file does not hold exactly one contig.
     """
     self.ace_file = ace_file
     # Ace.read() parses the whole file up front, so the handle can be
     # closed as soon as it returns instead of being leaked.
     with open(ace_file, 'r') as ace_handle:
         self.records = Ace.read(ace_handle)
     assert len(self.records.contigs) == 1, \
         "expected exactly one contig in %s" % ace_file
     self.contig = self.records.contigs[0]
     self.consensus = self.contig.sequence
     self.consensus_name = self.contig.name
     self.number_sequences = len(self.contig.reads)
     self.reference = ""
     self.reference_name = ""
def gene_expression_2matrix(in_ace, out_file, tags, min_seq):
    """Count sequences with each tags in all contigs.
    
    """
    print
    print "USING MATRIX OUTPUT FORMAT"
    print
    ace_gen = Ace.parse(open(in_ace, 'r'))
    with open(out_file, "w") as output_file:
        output_file.write("gene_name\tgene_length")
        for tag in tags:
            output_file.write("\t" + tag)
        output_file.write("\tXX_noTag")
        output_file.write("\n")
        while 1:
            try:
                contig = ace_gen.next()
            except:
                print "***All contigs treated***"
                break
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
            align.add_sequence(contig.name, contig.sequence)
            for readn in xrange(len(contig.reads)):
                clipst = contig.reads[readn].qa.qual_clipping_start
                clipe = contig.reads[readn].qa.qual_clipping_end
                start = contig.af[readn].padded_start
                seq = cut_ends(contig.reads[readn].rd.sequence, clipst, clipe)
                seq = pad_read(seq, start, len(contig.sequence))
                if "pseudo" not in contig.reads[readn].rd.name:
                    align.add_sequence(contig.reads[readn].rd.name, seq)
            sequences = read_fasta_2list(align.format("fasta"))
            if len(sequences) < min_seq:
                continue
            contig_name = re.findall("(Contig_[0-9]+)", sequences[0][0])[0]
            contig_seq = sequences[0][1].replace("*", "")
            contig_length = str(len(contig_seq))
            output_file.write(contig_name + "\t" + contig_length)
            print "Treating", contig_name
            d = defaultdict(int)
            for tag in tags:
                d[tag] = 0
            d["XX_noTag"] = 0
            fasta_counter = 0
            for fasta in sequences:
                fasta_counter += 1
                found_tag = 0
                for tag in tags:
                    if fasta[0].find(tag) > -1:
                        d[tag] += 1
                        found_tag = 1
                if found_tag == 0 and fasta[0].find("Consensus") < 0:
                    d["XX_noTag"] += 1
            for tag in sorted(d):
                output_file.write("\t" + str(d[tag]))
            output_file.write("\n")
Exemple #4
0
def report(args):
    """
    %prog report [--options] ace_file > report

    Prepare a report of read location, consensus location or quality segment per contig
    """
    from jcvi.utils.table import tabulate

    p = OptionParser(report.__doc__)

    types = {"read":      ["padded_start", "padded_end", "orient"],
             "consensus": ["padded_consensus_start", "padded_consensus_end"],
             "quality"  : ["qual_clipping_start", "qual_clipping_end", "align_clipping_start", "align_clipping_end"]
            }
    valid_types = tuple(types.keys())
    p.add_option("--type", default="read", choices=valid_types,
            help="choose report type [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    acefile, = args
    ace = Ace.read(must_open(acefile))
    logging.debug('Loaded ace file {0}'.format(acefile))

    for c in ace.contigs:
        print c.name
        table = dict()
        if opts.type == "read":
            ps, pe = [], []
            ps = [read.padded_start for read in c.af]
            for i in xrange(1, len(ps)):
                pe.append(ps[i] - ps[i-1])
            pe.append(c.nbases)
            map = dict(zip(ps, pe))
            for i, read in enumerate(c.af):
                values = [str(x) for x in (read.padded_start, map[read.padded_start], read.coru)]
                for i, label in enumerate(types[opts.type]):
                    table[(str(read.name), label)] = values[i]
        elif opts.type == "consensus":
            for read in c.bs:
                values = [str(x) for x in (read.padded_start, read.padded_end)]
                for i, label in enumerate(types[opts.type]):
                    table[(str(read.name), label)] = values[i]
        elif opts.type == "quality":
            for read in c.reads:
                (r1, r2) = (read.rd, read.qa)
                values = [str(x) for x in (r2.qual_clipping_start, r2.qual_clipping_end, r2.align_clipping_start, r2.align_clipping_end)]
                for i, label in enumerate(types[opts.type]):
                    table[(str(r1.name), label)] = values[i]
        print tabulate(table), "\n"
def main():
    """Debugging entry point: load a hard-coded ACE file, rewrite it, open pdb.

    The input is read from and the output written to fixed paths under
    /Users/bcf/Tmp; the function ends in an interactive pdb session.
    """
    # Read name used by the disabled contig search below; kept so it stays
    # available in the pdb session.
    base_name = 'FX5ZTWB02D1DFX'
    # Disabled search for the contig holding base_name:
    #contigs = Ace.parse(open('/Users/bcf/Tmp/tmp2.fa.cap.ace'))
    #for c in contigs:
    #    for r in c.reads:
    #        if r.rd.name == base_name:
    #            contig = c
    #            break
    #        else:
    #            pass
    # Ace.read() parses the whole file, so the handle is closed right after.
    with open('/Users/bcf/Tmp/tmp2.fa.cap.ace') as ace_handle:
        c = Ace.read(ace_handle)
    write(c, '/Users/bcf/Tmp/tmp2_rewrite.fa.cap.ace')
    pdb.set_trace()
def parse_ace(ace_file):
    """Build an alignment from the first contig of an ACE file.

    The consensus is added first, then every read.  Each read sequence is
    passed through cut_ends() with its quality clipping start/end and through
    pad_read() with its padded start and the consensus length (helpers
    defined elsewhere).  Reads are named "<read name>_<coru>", where coru is
    taken from the read's AF entry.

    ace_file -- path of the ACE file to read

    Returns a (contig, alignment) tuple.
    Raises StopIteration if the file contains no contig.
    """
    with open(ace_file, 'r') as ace_handle:
        # Only the first contig is needed.  The next() builtin works on
        # Python 2.6+ and Python 3, unlike the .next() method.
        contig = next(Ace.parse(ace_handle))

    align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
    align.add_sequence(contig.name, contig.sequence)
    consensus_length = len(contig.sequence)

    for readn, read in enumerate(contig.reads):
        # contig.af is indexed in parallel with contig.reads.
        placement = contig.af[readn]
        seq = cut_ends(read.rd.sequence,
                       read.qa.qual_clipping_start,
                       read.qa.qual_clipping_end)
        seq = pad_read(seq, placement.padded_start, consensus_length)
        align.add_sequence(read.rd.name + "_" + placement.coru, seq)

    return contig, align
Exemple #7
0
def extract(args):
    """
    %prog extract [--options] ace_file

    Extract contigs from ace file and if necessary reformat header with
    a pipe(|) separated list of constituent reads.
    """
    # NOTE: the docstring above is also handed to OptionParser as usage text.
    p = OptionParser(extract.__doc__)
    p.add_option("--format", default=False, action="store_true",
            help="enable flag to reformat header into a symbol separated list of constituent reads "+ \
            "[default: %default]")
    p.add_option("--sep", default="|",
            help="choose a separator used to list the reads in the FASTA header [default: '%default']")
    p.add_option("--singlets", default=False, action="store_true",
            help="ask the program to look in the singlets file (should be in the same folder) for " +\
            "unused reads and put them in the resultant fasta file [default: %default]")
    opts, args = p.parse_args(args)

    # Exactly one positional argument (the ACE file) is required.
    if len(args) != 1:
        sys.exit(not p.print_help())

    acefile, = args
    ace = Ace.read(must_open(acefile))
    logging.debug('Loaded ace file {0}'.format(acefile))

    # Output and singlets files share the ACE file's name minus its extension.
    prefix = acefile.rsplit(".", 1)[0]
    fastafile = prefix + ".fasta"
    # The context manager closes the output even if writing a record fails.
    with open(fastafile, "w") as fw:
        for c in ace.contigs:
            header = c.name
            if opts.format:
                header = opts.sep.join([read.name for read in c.af])

            seqrec = SeqRecord(Seq(c.sequence), id=header, description="")
            SeqIO.write([seqrec], fw, "fasta")

        if opts.singlets:
            singletsfile = prefix + ".singlets"
            # An empty singlets file is skipped without being parsed.
            if os.path.getsize(singletsfile) > 0:
                for rec in SeqIO.parse(must_open(singletsfile), "fasta"):
                    SeqIO.write(rec, fw, "fasta")

    logging.debug('Wrote contigs to fasta file {0}'.format(fastafile))
Exemple #8
0
def extract(args):
    """
    %prog extract [--options] ace_file

    Extract contigs from ace file and if necessary reformat header with
    a pipe(|) separated list of constituent reads.
    """
    # NOTE: the docstring above is also handed to OptionParser as usage text.
    p = OptionParser(extract.__doc__)
    p.add_option("--format", default=False, action="store_true",
            help="enable flag to reformat header into a symbol separated list of constituent reads "+ \
            "[default: %default]")
    p.add_option("--sep", default="|",
            help="choose a separator used to list the reads in the FASTA header [default: '%default']")
    p.add_option("--singlets", default=False, action="store_true",
            help="ask the program to look in the singlets file (should be in the same folder) for " +\
            "unused reads and put them in the resultant fasta file [default: %default]")
    opts, args = p.parse_args(args)

    # Exactly one positional argument (the ACE file) is required.
    if len(args) != 1:
        sys.exit(not p.print_help())

    acefile, = args
    ace = Ace.read(must_open(acefile))
    logging.debug('Loaded ace file {0}'.format(acefile))

    # Output and singlets files share the ACE file's name minus its extension.
    stem = acefile.rsplit(".", 1)[0]
    fastafile = stem + ".fasta"
    # Using "with" guarantees the output is closed even when a write raises.
    with open(fastafile, "w") as fw:
        for c in ace.contigs:
            record_id = c.name
            if opts.format:
                record_id = opts.sep.join([read.name for read in c.af])

            seqrec = SeqRecord(Seq(c.sequence), id=record_id, description="")
            SeqIO.write([seqrec], fw, "fasta")

        if opts.singlets:
            singletsfile = stem + ".singlets"
            # An empty singlets file is skipped without being parsed.
            if os.path.getsize(singletsfile) > 0:
                for rec in SeqIO.parse(must_open(singletsfile), "fasta"):
                    SeqIO.write(rec, fw, "fasta")

    logging.debug('Wrote contigs to fasta file {0}'.format(fastafile))
Exemple #9
0
def AceIterator(handle):
    """Returns SeqRecord objects from an ACE file.

    This uses the Bio.Sequencing.Ace module to do the hard work.  Note that
    by iterating over the file in a single pass, we are forced to ignore any
    WA, CT, RT or WR footer tags.

    handle -- input handle, passed unchanged to Ace.parse()

    Yields one SeqRecord per contig, holding the consensus sequence with the
    contig name as both id and name.
    Raises AssertionError if a consensus contains both "*" and "-".
    """
    for ace_contig in Ace.parse(handle):
        #Convert the ACE contig record into a SeqRecord...
        consensus_seq_str = ace_contig.sequence
        #Assume it is DNA unless there is a U in it,
        if "U" in consensus_seq_str:
            if "T" in consensus_seq_str:
                #Very odd! Error?  Both U and T are present, so use the
                #generic nucleotide alphabet.  Imported here so the name is
                #guaranteed to be in scope (the previous spelling
                #"generic_ncleotide" raised NameError on this path).
                from Bio.Alphabet import generic_nucleotide
                alpha = generic_nucleotide
            else:
                alpha = generic_rna
        else:
            alpha = generic_dna

        if "*" in consensus_seq_str:
            #For consistency with most other file formats, map
            #any * gaps into - gaps.
            assert "-" not in consensus_seq_str
            consensus_seq = Seq(consensus_seq_str.replace("*", "-"),
                                Gapped(alpha, gap_char="-"))
        else:
            consensus_seq = Seq(consensus_seq_str, alpha)

        #TODO - Consensus base quality (BQ lines).  Note that any gaps
        #(* character) in the consensus does not get a quality entry.
        #This really needs Biopython support for per-letter-annotation.

        #TODO? - Base segments (BS lines) which indicates which read
        #phrap has chosen to be the consensus at a particular position.
        #Perhaps as SeqFeature objects?

        #TODO - Supporting reads (RD lines, plus perhaps QA and DS lines)
        #Perhaps as SeqFeature objects?

        seq_record = SeqRecord(consensus_seq,
                               id=ace_contig.name,
                               name=ace_contig.name)
        yield seq_record
Exemple #10
0
def AceIterator(handle):
    """Returns SeqRecord objects from an ACE file.

    This uses the Bio.Sequencing.Ace module to do the hard work.  Note that
    by iterating over the file in a single pass, we are forced to ignore any
    WA, CT, RT or WR footer tags.

    handle -- input handle, passed unchanged to Ace.parse()

    Yields one SeqRecord per contig, holding the consensus sequence with the
    contig name as both id and name.
    Raises AssertionError if a consensus contains both "*" and "-".
    """
    for ace_contig in Ace.parse(handle):
        #Convert the ACE contig record into a SeqRecord...
        consensus_seq_str = ace_contig.sequence

        #Pick the alphabet: DNA unless there is a U in the consensus.
        if "U" not in consensus_seq_str:
            alpha = generic_dna
        elif "T" not in consensus_seq_str:
            alpha = generic_rna
        else:
            #Very odd! Error?  U and T together fit neither DNA nor RNA, so
            #use the generic nucleotide alphabet.  The local import
            #guarantees the name is in scope (the earlier misspelling
            #"generic_ncleotide" raised NameError on this path).
            from Bio.Alphabet import generic_nucleotide
            alpha = generic_nucleotide

        if "*" in consensus_seq_str:
            #For consistency with most other file formats, map
            #any * gaps into - gaps.
            assert "-" not in consensus_seq_str
            consensus_seq = Seq(consensus_seq_str.replace("*", "-"),
                                Gapped(alpha, gap_char="-"))
        else:
            consensus_seq = Seq(consensus_seq_str, alpha)

        #TODO - Consensus base quality (BQ lines).  Note that any gaps
        #(* character) in the consensus does not get a quality entry.
        #This really needs Biopython support for per-letter-annotation.

        #TODO? - Base segments (BS lines) which indicates which read
        #phrap has chosen to be the consensus at a particular position.
        #Perhaps as SeqFeature objects?

        #TODO - Supporting reads (RD lines, plus perhaps QA and DS lines)
        #Perhaps as SeqFeature objects?

        yield SeqRecord(consensus_seq,
                        id=ace_contig.name,
                        name=ace_contig.name)
Exemple #11
0
def parse_singletons_fasta_in_ace(contig_ace_dir, singleton_seq_dir):
    """Write every single-read contig found in a directory of ACE files.

    Each "*.ace" file of contig_ace_dir is read (in sorted name order) and,
    for every contig with nreads == 1, the read is written to
    "<singleton_seq_dir>/<read name>.fsa" as a FASTA record whose header is
    ">" followed by the read name.

    contig_ace_dir    -- directory holding the ACE files
    singleton_seq_dir -- existing directory receiving the .fsa files

    NOTE: the working directory is changed to contig_ace_dir and not
    restored, so a relative singleton_seq_dir is resolved against it.
    """
    os.chdir(contig_ace_dir)
    for ace_file in sorted(glob.glob("*.ace")):
        # Ace.read() parses the whole file, so the handle is closed at once.
        with open(ace_file) as ace_handle:
            ace_record = Ace.read(ace_handle)
        for contig in ace_record.contigs:
            if contig.nreads != 1:
                continue
            read = contig.reads[0].rd
            singleton_name = read.name
            # Using the read name as the record id (with an empty
            # description) makes the FASTA writer emit ">name" directly.
            # This replaces the former post-processing through a shell
            # "sed -i" command, which broke on file names containing spaces
            # or shell metacharacters and is not portable across sed
            # implementations.
            singleton_record = SeqRecord(seq=Seq(read.sequence),
                                         id=singleton_name,
                                         name="",
                                         description="")
            singleton_file = singleton_seq_dir + "/" + singleton_name + ".fsa"
            with open(singleton_file, "w") as singleton_fd:
                SeqIO.write([singleton_record], singleton_fd, "fasta")
Exemple #12
0
def ace2fasta(in_file, out_file):
    ace_gen = Ace.parse(open(in_file, 'r'))
    with open(out_file, "w") as output_file:
        while 1:
            try:
                contig = ace_gen.next()
            except:
                print "All contigs treated"
                break
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))

            # Now we have started our alignment we can add sequences to it
            # Add concensus sequence to alignment
            align.add_sequence(contig.name, contig.sequence.replace("*", ""))
            """for readn in xrange(len(contig.reads)):
                clipst = contig.reads[readn].qa.qual_clipping_start
                clipe = contig.reads[readn].qa.qual_clipping_end
                start = contig.af[readn].padded_start
                seq = cut_ends(contig.reads[readn].rd.sequence, clipst, clipe)
                seq = pad_read(seq, start, len(contig.sequence))
                if "pseudo" not in contig.reads[readn].rd.name:
                    align.add_sequence(contig.reads[readn].rd.name, seq)"""

            output_file.write(align.format("fasta"))
Exemple #13
0
from Bio.Sequencing import Ace
# Dump the contigs of a sample ACE file, and the reads of each contig, to
# standard output.
fn = '../../samples/contig1.ace'
# Ace.read() parses the whole file, so the handle can be closed right after.
with open(fn) as ace_handle:
    acefilerecord = Ace.read(ace_handle)
# For each contig:
for ctg in acefilerecord.contigs:
    print('==========================================')
    print('Contig name: %s'%ctg.name)
    print('Bases: %s'%ctg.nbases)
    print('Reads: %s'%ctg.nreads)
    print('Segments: %s'%ctg.nsegments)
    print('Sequence: %s'%ctg.sequence)
    print('Quality: %s'%ctg.quality)
    # For each read in contig:
    for read in ctg.reads:
        print('Read name: %s'%read.rd.name)
        print('Align start: %s'%read.qa.align_clipping_start)
        print('Align end: %s'%read.qa.align_clipping_end)
        print('Qual start: %s'%read.qa.qual_clipping_start)
        print('Qual end: %s'%read.qa.qual_clipping_end)
        print('Read sequence: %s'%read.rd.sequence)
        print('==========================================')
Exemple #14
0
def pairwise(in_ace, out_file):
    """Calculate pairwise differentiation indexes.
    
    """
    ace_gen = Ace.parse(open(in_ace, 'r'))
    with open(out_file, "w") as output_file:
        while 1:
            try:
                contig = ace_gen.next()
            except:
                print "***All contigs treated***"
                break
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
            align.add_sequence(contig.name, contig.sequence)
            for readn in xrange(len(contig.reads)):
                clipst = contig.reads[readn].qa.qual_clipping_start
                clipe = contig.reads[readn].qa.qual_clipping_end
                start = contig.af[readn].padded_start
                seq = cut_ends(contig.reads[readn].rd.sequence, clipst, clipe)
                seq = pad_read(seq, start, len(contig.sequence))
                if "pseudo" not in contig.reads[readn].rd.name:
                    align.add_sequence(contig.reads[readn].rd.name, seq)
            sequences = read_fasta(align.format("fasta"))
            contig_name = re.findall("(Contig_[0-9]+)", sequences[0][0])[0]
            print "Treating", contig_name
            window_len = 8  # PARAMETER
            max_diff = 3  # PARAMETER
            len_contig = len(sequences[0][1])
            number_indexes = 0
            total_indexes = 0
            for seq in sequences[1:]:
                try:
                    start = len(re.findall("^-+", seq[1])[0])
                except:
                    start = 0
                len_seq = 0
                min_len_seq = 100  # PARAMETER
                count = 0
                for window in range(start, len_contig, window_len):
                    nuc_contig = sequences[0][1][window:window + window_len]
                    nuc_seq = seq[1][window:window + window_len]
                    if "-" in nuc_seq:
                        len_seq += len(nuc_seq.replace("-", ""))
                    else:
                        diff = count_diff(nuc_contig, nuc_seq, max_diff)
                        if diff[1] == False:
                            count += diff[0]
                            len_seq += window_len
                len_seq -= seq.count("*")
                if len_seq >= min_len_seq:
                    index = float(count) / len_seq
                    if count > 0:
                        number_indexes += 1
                        total_indexes += index
                else:
                    index = "NA"
                #output_file.write(contig_name + "\t" + str(index) + "\n")
            try:
                mean_index = float(total_indexes) / number_indexes
            except:
                mean_index = "NA"
            output_file.write(contig_name + "\t" + str(mean_index) + "\n")
Exemple #15
0
def pairwise(in_ace, out_file):
    """Calculate pairwise differentiation indexes.
    
    """
    ace_gen = Ace.parse(open(in_ace, 'r'))
    with open(out_file, "w") as output_file:
        while 1:
            try:
                contig = ace_gen.next()
            except:
                print "***All contigs treated***"
                break
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
            align.add_sequence(contig.name, contig.sequence)
            for readn in xrange(len(contig.reads)):
                clipst = contig.reads[readn].qa.qual_clipping_start
                clipe = contig.reads[readn].qa.qual_clipping_end
                start = contig.af[readn].padded_start
                seq = cut_ends(contig.reads[readn].rd.sequence, clipst, clipe)
                seq = pad_read(seq, start, len(contig.sequence))
                if "pseudo" not in contig.reads[readn].rd.name:
                    align.add_sequence(contig.reads[readn].rd.name, seq)
            sequences = read_fasta(align.format("fasta"))
            contig_name = re.findall("(Contig_[0-9]+)", sequences[0][0])[0]
            print "Treating", contig_name
            window_len = 8 # PARAMETER
            max_diff = 3 # PARAMETER
            len_contig = len(sequences[0][1])
            number_indexes = 0
            total_indexes = 0
            for seq in sequences[1:]:
                try:
                    start = len(re.findall("^-+", seq[1])[0])
                except:
                    start = 0
                len_seq = 0
                min_len_seq = 100 # PARAMETER
                count = 0
                for window in range(start, len_contig, window_len):
                    nuc_contig = sequences[0][1][window:window + window_len]
                    nuc_seq = seq[1][window:window + window_len]
                    if "-" in nuc_seq:
                        len_seq += len(nuc_seq.replace("-", ""))
                    else:
                        diff = count_diff(nuc_contig, nuc_seq, max_diff)
                        if diff[1] == False:
                            count += diff[0]
                            len_seq += window_len
                len_seq -= seq.count("*")
                if len_seq >= min_len_seq:
                    index = float(count) / len_seq
                    if count > 0:
                        number_indexes +=1
                        total_indexes += index
                else:
                    index = "NA"
                #output_file.write(contig_name + "\t" + str(index) + "\n")
            try:
                mean_index = float(total_indexes) / number_indexes
            except:
                mean_index = "NA"
            output_file.write(contig_name + "\t" + str(mean_index) + "\n")
Exemple #16
0
def get_haplotypes(in_ace, out_file, out_bamova, win_len, step,
                   coverage, stars, ngroups, nhaplo):
    """Get haplotypes from contigs in an ace file
    
    """
    marker_number = 0
    min_freq = 0.05
    ace_gen = Ace.parse(open(in_ace, 'r'))
    with open(out_file, "w") as output_file:
        with open(out_bamova, "w") as bamova_file:
            output_file.write("Contig_nb\tWindow\tHaplotype\n")
            contig_counter = 0
            ntreated = 0
            for contig in ace_gen:
                pass_haplo = False
                contig_counter += 1
                align = Alignment(Gapped(IUPAC.ambiguous_dna, "X"))
                align.add_sequence(contig.name, contig.sequence)
                if len(contig.reads) -1 < coverage:
                    continue
                ntreated += 1
                for readn in xrange(len(contig.reads)):
                    clipst = contig.reads[readn].qa.qual_clipping_start
                    clipe = contig.reads[readn].qa.qual_clipping_end
                    clipst2 = contig.reads[readn].qa.align_clipping_start
                    clipe2 = contig.reads[readn].qa.align_clipping_end
                    if clipst2 > clipst:
                        clipst = clipst2
                    if clipe2 < clipe2:
                        clipe = clipe2
                    start = contig.af[readn].padded_start
                    seq = cut_ends(contig.reads[readn].rd.sequence, clipst, clipe)
                    seq = pad_read(seq, start, len(contig.sequence))
                    if "pseudo" not in contig.reads[readn].rd.name:
                        align.add_sequence(contig.reads[readn].rd.name, seq)
                sequences = read_fasta(align.format("fasta"))
                sequences = [[s[0].replace(">", ""), s[1]] for s in sequences]
                contig_name = sequences[0][0]
                concensus = sequences[0][1]
                error_positions = multi_find("*", concensus)[::-1]
                for p in error_positions:
                    sequences = [[s[0], s[1][0:p] + s[1][p+1:]] for s in sequences]
                concensus = sequences[0][1]
                sequences = [[s[0], correct_sequence(concensus, s[1])]
                             for s in sequences[1:]]
                sequences, snp_pos = snp_positions(sequences)
                haplotypes = best_snps(sequences, snp_pos, coverage)
                if haplotypes != "Empty":
                    bamova = []
                    variants = list(sorted(list(set([h[-1] for h in haplotypes[-1]]))))
                    groups = list(sorted(set([h[0][:3] for h in haplotypes[-1]])))
                    if len(groups) >= ngroups:
                        pass_haplo = True
                        for g in groups:
                            if len([h[0] for h in haplotypes[-1] if h[0].startswith(g)]) < nhaplo:
                                pass_haplo = False
                    if pass_haplo:
                        print contig.name
                        bamova_file.write("Marker" + str(marker_number) + "\n")
                        group_number = 0
                        for g in groups:
                            bamova_file.write("Population\t" + str(group_number))
                            group_number += 1
                            for v in variants:
                                bamova_file.write("\t" + str(len([h for h in haplotypes[-1]
                                                  if h[-1] == v and h[0].startswith(g)])))
                            bamova_file.write("\n")
                        with open ("fasta_output/" + contig.name + ".fasta", "w") as f:
                            output_file.write(contig.name + "\n")
                            for h in haplotypes[-1]:
                                f.write(">" + h[0] + str(marker_number) + "\n" + h[2] + "\n")
                                h[1] = [x - h[1][0] + 1 for x in h[1]]
                                output_file.write("Marker" + str(marker_number) + "\t" +
                                                  "\t".join([str(x) for x in h]) + "\t" +
                                                  ":".join(variants) + "\n")
                        marker_number += 1
                output_file.flush()
                bamova_file.flush()
                cutoff = 100000
                if contig_counter > cutoff:
                    break
        print "\n", str(ntreated), "contigs out of", str(contig_counter), "were treated"
Exemple #17
0
def report(args):
    """
    %prog report [--options] ace_file > report

    Prepare a report of read location, consensus location or quality segment per contig
    """
    from jcvi.utils.table import tabulate

    p = OptionParser(report.__doc__)

    types = {
        "read": ["padded_start", "padded_end", "orient"],
        "consensus": ["padded_consensus_start", "padded_consensus_end"],
        "quality": [
            "qual_clipping_start", "qual_clipping_end", "align_clipping_start",
            "align_clipping_end"
        ]
    }
    valid_types = tuple(types.keys())
    p.add_option("--type",
                 default="read",
                 choices=valid_types,
                 help="choose report type [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    acefile, = args
    ace = Ace.read(must_open(acefile))
    logging.debug('Loaded ace file {0}'.format(acefile))

    for c in ace.contigs:
        print c.name
        table = dict()
        if opts.type == "read":
            ps, pe = [], []
            ps = [read.padded_start for read in c.af]
            for i in xrange(1, len(ps)):
                pe.append(ps[i] - ps[i - 1])
            pe.append(c.nbases)
            map = dict(zip(ps, pe))
            for i, read in enumerate(c.af):
                values = [
                    str(x) for x in (read.padded_start, map[read.padded_start],
                                     read.coru)
                ]
                for i, label in enumerate(types[opts.type]):
                    table[(str(read.name), label)] = values[i]
        elif opts.type == "consensus":
            for read in c.bs:
                values = [str(x) for x in (read.padded_start, read.padded_end)]
                for i, label in enumerate(types[opts.type]):
                    table[(str(read.name), label)] = values[i]
        elif opts.type == "quality":
            for read in c.reads:
                (r1, r2) = (read.rd, read.qa)
                values = [
                    str(x)
                    for x in (r2.qual_clipping_start, r2.qual_clipping_end,
                              r2.align_clipping_start, r2.align_clipping_end)
                ]
                for i, label in enumerate(types[opts.type]):
                    table[(str(r1.name), label)] = values[i]
        print tabulate(table), "\n"
            part_site_comp_fh.writerow(row)

    cutoff = []

    if (p.use_reads):

        print "Working on ace file {}".format(p.read_fn)

        contig_read_dict = {}
        contig_read_len_dict = {}

        from Bio.Sequencing import Ace

        with open(p.use_reads, 'rU') as ace_fh:

            for contig in Ace.parse(ace_fh):
                """rd (reads) - read with name, sequence, etc
				qa (read qual) - which parts used as consensus
				ds - file name of read's chromatogram file
				af - loc of read within contig
				bs (base segment) - which read chosen at consensus at each pos
				rt (transient read tags) - generated by crossmatch and phrap
				ct (consensus tag)
				wa (whole assembly tag) - hosts assembly program name, version, etc
				wr
				reads - info about read supporting ace contig
				contig - holds info about contig from ace record"""

                contig_name = "{}".format(contig.name)  # contig00001

                if not contig_name in contig_read_dict:
def get_haplotypes(in_ace, out_file, out_bamova, win_len, step, coverage,
                   stars, ngroups, nhaplo):
    """Get haplotypes from contigs in an ace file
    
    """
    marker_number = 0
    min_freq = 0.05
    ace_gen = Ace.parse(open(in_ace, 'r'))
    with open(out_file, "w") as output_file:
        with open(out_bamova, "w") as bamova_file:
            output_file.write("Contig_nb\tWindow\tHaplotype\n")
            contig_counter = 0
            ntreated = 0
            for contig in ace_gen:
                pass_haplo = False
                contig_counter += 1
                align = Alignment(Gapped(IUPAC.ambiguous_dna, "X"))
                align.add_sequence(contig.name, contig.sequence)
                if len(contig.reads) - 1 < coverage:
                    continue
                ntreated += 1
                for readn in xrange(len(contig.reads)):
                    clipst = contig.reads[readn].qa.qual_clipping_start
                    clipe = contig.reads[readn].qa.qual_clipping_end
                    clipst2 = contig.reads[readn].qa.align_clipping_start
                    clipe2 = contig.reads[readn].qa.align_clipping_end
                    if clipst2 > clipst:
                        clipst = clipst2
                    if clipe2 < clipe2:
                        clipe = clipe2
                    start = contig.af[readn].padded_start
                    seq = cut_ends(contig.reads[readn].rd.sequence, clipst,
                                   clipe)
                    seq = pad_read(seq, start, len(contig.sequence))
                    if "pseudo" not in contig.reads[readn].rd.name:
                        align.add_sequence(contig.reads[readn].rd.name, seq)
                sequences = read_fasta(align.format("fasta"))
                sequences = [[s[0].replace(">", ""), s[1]] for s in sequences]
                contig_name = sequences[0][0]
                concensus = sequences[0][1]
                error_positions = multi_find("*", concensus)[::-1]
                for p in error_positions:
                    sequences = [[s[0], s[1][0:p] + s[1][p + 1:]]
                                 for s in sequences]
                concensus = sequences[0][1]
                sequences = [[s[0], correct_sequence(concensus, s[1])]
                             for s in sequences[1:]]
                sequences, snp_pos = snp_positions(sequences)
                haplotypes = best_snps(sequences, snp_pos, coverage)
                if haplotypes != "Empty":
                    bamova = []
                    variants = list(
                        sorted(list(set([h[-1] for h in haplotypes[-1]]))))
                    groups = list(
                        sorted(set([h[0][:3] for h in haplotypes[-1]])))
                    if len(groups) >= ngroups:
                        pass_haplo = True
                        for g in groups:
                            if len([
                                    h[0] for h in haplotypes[-1]
                                    if h[0].startswith(g)
                            ]) < nhaplo:
                                pass_haplo = False
                    if pass_haplo:
                        print contig.name
                        bamova_file.write("Marker" + str(marker_number) + "\n")
                        group_number = 0
                        for g in groups:
                            bamova_file.write("Population\t" +
                                              str(group_number))
                            group_number += 1
                            for v in variants:
                                bamova_file.write("\t" + str(
                                    len([
                                        h for h in haplotypes[-1]
                                        if h[-1] == v and h[0].startswith(g)
                                    ])))
                            bamova_file.write("\n")
                        with open("fasta_output/" + contig.name + ".fasta",
                                  "w") as f:
                            output_file.write(contig.name + "\n")
                            for h in haplotypes[-1]:
                                f.write(">" + h[0] + str(marker_number) +
                                        "\n" + h[2] + "\n")
                                h[1] = [x - h[1][0] + 1 for x in h[1]]
                                output_file.write(
                                    "Marker" + str(marker_number) + "\t" +
                                    "\t".join([str(x) for x in h]) + "\t" +
                                    ":".join(variants) + "\n")
                        marker_number += 1
                output_file.flush()
                bamova_file.flush()
                cutoff = 100000
                if contig_counter > cutoff:
                    break
        print "\n", str(ntreated), "contigs out of", str(
            contig_counter), "were treated"
Exemple #20
0
from Bio.Sequencing import Ace

# Dump the contigs and reads of a sample ACE assembly to standard output.
fn = '../../samples/contig1.ace'
# Ace.read() returns the whole record, so the handle is only needed for this
# call; the context manager makes sure it is closed afterwards.
with open(fn) as ace_handle:
    acefilerecord = Ace.read(ace_handle)
# For each contig:
for ctg in acefilerecord.contigs:
    print('==========================================')
    print('Contig name: %s' % ctg.name)
    print('Bases: %s' % ctg.nbases)
    print('Reads: %s' % ctg.nreads)
    print('Segments: %s' % ctg.nsegments)
    print('Sequence: %s' % ctg.sequence)
    print('Quality: %s' % ctg.quality)
    # For each read in contig:
    for read in ctg.reads:
        print('Read name: %s' % read.rd.name)
        print('Align start: %s' % read.qa.align_clipping_start)
        print('Align end: %s' % read.qa.align_clipping_end)
        print('Qual start: %s' % read.qa.qual_clipping_start)
        print('Qual end: %s' % read.qa.qual_clipping_end)
        print('Read sequence: %s' % read.rd.sequence)
        print('==========================================')
Exemple #21
0
 def _get_gen(self):
     """Open ``self.ace_filename`` and return ``ace.parse`` applied to it."""
     ace_handle = open(self.ace_filename)
     return ace.parse(ace_handle)
Exemple #22
0
def snp_count(in_ace, out_file, snp_dict, tags, win_len, max_del, stars):
    """Genotype individuals at SNPs loci.
    
    """
    win_buffer = (win_len - 1) / 2
    ace_gen = Ace.parse(open(in_ace, 'r'))
    with open(out_file, "w") as output_file:
        output_file.write("Contig_nb\tPos\ttag_name\tA\tC\tG\tT\tN\t*\t-\n")
        while 1:
            try:
                contig = ace_gen.next()
            except:
                print "***All contigs treated***"
                break
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
            align.add_sequence(contig.name, contig.sequence)
            for readn in xrange(len(contig.reads)):
                clipst = contig.reads[readn].qa.qual_clipping_start  # GOOD
                clipe = contig.reads[readn].qa.qual_clipping_end  # GOOD
                clipst2 = contig.reads[readn].qa.align_clipping_start  # Added
                clipe2 = contig.reads[readn].qa.align_clipping_end  # Added
                if clipst2 > clipst:  # Added
                    clipst = clipst2  # Added
                if clipe2 < clipe2:  # Added
                    clipe = clipe2  # Added
                start = contig.af[readn].padded_start
                seq = cut_ends(contig.reads[readn].rd.sequence, clipst, clipe)
                seq = pad_read(seq, start, len(contig.sequence))
                if "pseudo" not in contig.reads[readn].rd.name:
                    align.add_sequence(contig.reads[readn].rd.name, seq)
            sequences = read_fasta(align.format("fasta"))
            contig_name = re.findall("(Contig_[0-9]+)", sequences[0][0])[0]
            print "Treating", contig_name
            positions = []
            try:
                positions = snp_dict[contig_name]
            except:
                continue
            d = {}
            for pos in positions:
                if stars == True:
                    pos_ok = correct_position(pos, sequences[0][1])
                else:
                    pos_ok = pos
                left = pos_ok - 5
                if left < 0:
                    left = 0
                right = pos_ok + 1 + 5  # takes into account the middle nucleotide
                ref_window = sequences[0][1][left:right]
                d.setdefault(pos, {})
                d[pos].setdefault("XX_noTag", {})
                for nuc in list("ACGTN*-"):
                    d[pos]["XX_noTag"].setdefault(nuc, 0)
                for tag in tags:
                    d[pos].setdefault(tag, {})
                    for nuc in list("ACGTN*-"):
                        d[pos][tag].setdefault(nuc, 0)
                for fasta in sequences:
                    window = fasta[1][left:right]
                    del_count = 0
                    if window.count("-") > win_buffer - 3:
                        continue  # Need at least 3 nucleotides on each side
                    for tag in tags:
                        if tag in fasta[0]:
                            t = tag
                            break
                        else:
                            t = "XX_noTag"
                    if len(ref_window) == len(window):
                        for i in xrange(len(window)):
                            if ref_window[i].isalpha() and window[i] == "*" or \
                               window[i].isalpha() and ref_window[i] == "*":
                                del_count += 1
                    if del_count > max_del:
                        continue
                    p = pos
                    s = fasta[1]  # Sequence
                    n = s[pos_ok - 1].upper()
                    d[p][t][n] += 1
            for p in sorted(d):
                for t in sorted(d[p]):
                    output_file.write(contig_name + "\t" + str(p) + "\t" +
                                      str(t))
                    for n in list("ACGTN*-"):
                        output_file.write("\t" + str(d[p][t][n]))
                    output_file.write("\n")
Exemple #23
0
def AceIterator(handle):
    """Yield one SeqRecord per contig of an ACE file.

    The parsing itself is delegated to the Bio.Sequencing.Ace module.
    Because the file is walked in a single pass, any WA, CT, RT or WR footer
    tags are ignored.

    The base qualities stored in an ACE file are taken to be PHRED style
    scores and, as for FASTQ or QUAL files read through Bio.SeqIO, they are
    stored in the SeqRecord's letter_annotations dictionary under the
    "phred_quality" key.

    >>> from Bio import SeqIO
    >>> handle = open("Ace/consed_sample.ace", "rU")
    >>> for record in SeqIO.parse(handle, "ace"):
    ...     print record.id, record.seq[:10]+"...", len(record)
    ...     print max(record.letter_annotations["phred_quality"])
    Contig1 agccccgggc... 1475
    90

    ACE files hold no base quality for gaps in the consensus sequence; such
    positions are given a quality of zero. Zero is perhaps misleading, as
    the gap may be very well supported. Previous versions of Biopython used
    None instead, but this complicated usage and prevented output of the
    gapped sequence as FASTQ format.

    >>> from Bio import SeqIO
    >>> handle = open("Ace/contig1.ace", "rU")
    >>> for record in SeqIO.parse(handle, "ace"):
    ...     print record.id, "..." + record.seq[85:95]+"..."
    ...     print record.letter_annotations["phred_quality"][85:95]
    ...     print max(record.letter_annotations["phred_quality"])
    Contig1 ...AGAGG-ATGC...
    [57, 57, 54, 57, 57, 0, 57, 72, 72, 72]
    90
    Contig2 ...GAATTACTAT...
    [68, 68, 68, 68, 68, 68, 68, 68, 68, 68]
    90

    """
    for ace_contig in Ace.parse(handle):
        consensus_seq_str = ace_contig.sequence

        # Pick the alphabet: DNA unless a U is present; a mix of U and T is
        # odd, so fall back to the generic nucleotide alphabet in that case.
        if "U" not in consensus_seq_str:
            alpha = generic_dna
        elif "T" in consensus_seq_str:
            alpha = generic_nucleotide
        else:
            alpha = generic_rna

        # ACE writes gaps as "*"; expose them as "-" like most other formats.
        if "*" in consensus_seq_str:
            assert "-" not in consensus_seq_str
            dashed = consensus_seq_str.replace("*", "-")
            consensus_seq = Seq(dashed, Gapped(alpha, gap_char="-"))
        else:
            consensus_seq = Seq(consensus_seq_str, alpha)

        #TODO? - Base segments (BS lines), which indicate the read phrap
        #chose as consensus at each position. Perhaps as SeqFeature objects?

        #TODO - Supporting reads (RD lines, plus perhaps QA and DS lines).
        #Perhaps as SeqFeature objects?

        seq_record = SeqRecord(consensus_seq,
                               id=ace_contig.name,
                               name=ace_contig.name)

        # Consensus base quality (BQ lines). Gap positions have no entry in
        # ace_contig.quality, so they get 0 and do not advance the index.
        quals = []
        used = 0
        for base in consensus_seq:
            if base != "-":
                quals.append(ace_contig.quality[used])
                used += 1
            else:
                quals.append(0)
        # Every quality value must have been matched to a non-gap base.
        assert used == len(ace_contig.quality)
        seq_record.letter_annotations["phred_quality"] = quals

        yield seq_record
Exemple #24
0
def AceIterator(source):
    """Yield one SeqRecord per contig of an ACE file.

    The parsing itself is delegated to the Bio.Sequencing.Ace module.
    Because the file is walked in a single pass, any WA, CT, RT or WR footer
    tags are ignored.

    The base qualities stored in an ACE file are taken to be PHRED style
    scores and, as for FASTQ or QUAL files read through Bio.SeqIO, they are
    stored in the SeqRecord's letter_annotations dictionary under the
    "phred_quality" key.

    >>> from Bio import SeqIO
    >>> with open("Ace/consed_sample.ace") as handle:
    ...     for record in SeqIO.parse(handle, "ace"):
    ...         print("%s %s... %i" % (record.id, record.seq[:10], len(record)))
    ...         print(max(record.letter_annotations["phred_quality"]))
    Contig1 agccccgggc... 1475
    90

    ACE files hold no base quality for gaps in the consensus sequence; such
    positions are given a quality of zero. Zero is perhaps misleading, as
    the gap may be very well supported. Previous versions of Biopython used
    None instead, but this complicated usage and prevented output of the
    gapped sequence as FASTQ format.

    >>> from Bio import SeqIO
    >>> with open("Ace/contig1.ace") as handle:
    ...     for record in SeqIO.parse(handle, "ace"):
    ...         print("%s ...%s..." % (record.id, record.seq[85:95]))
    ...         print(record.letter_annotations["phred_quality"][85:95])
    ...         print(max(record.letter_annotations["phred_quality"]))
    Contig1 ...AGAGG-ATGC...
    [57, 57, 54, 57, 57, 0, 57, 72, 72, 72]
    90
    Contig2 ...GAATTACTAT...
    [68, 68, 68, 68, 68, 68, 68, 68, 68, 68]
    90

    """
    for ace_contig in Ace.parse(source):
        consensus_seq_str = ace_contig.sequence

        # Pick the alphabet: DNA unless a U is present; a mix of U and T is
        # odd, so fall back to the generic nucleotide alphabet in that case.
        if "U" not in consensus_seq_str:
            alpha = generic_dna
        elif "T" in consensus_seq_str:
            alpha = generic_nucleotide
        else:
            alpha = generic_rna

        # ACE writes gaps as "*"; expose them as "-" like most other formats.
        if "*" in consensus_seq_str:
            assert "-" not in consensus_seq_str
            dashed = consensus_seq_str.replace("*", "-")
            consensus_seq = Seq(dashed, alpha)
        else:
            consensus_seq = Seq(consensus_seq_str, alpha)

        # TODO? - Base segments (BS lines), which indicate the read phrap
        # chose as consensus at each position. Perhaps as SeqFeature objects?

        # TODO - Supporting reads (RD lines, plus perhaps QA and DS lines).
        # Perhaps as SeqFeature objects?

        seq_record = SeqRecord(consensus_seq,
                               id=ace_contig.name,
                               name=ace_contig.name)

        # Consensus base quality (BQ lines). Gap positions have no entry in
        # ace_contig.quality, so they get 0 and do not advance the index.
        quals = []
        used = 0
        for base in consensus_seq:
            if base != "-":
                quals.append(ace_contig.quality[used])
                used += 1
            else:
                quals.append(0)
        # Every quality value must have been matched to a non-gap base.
        assert used == len(ace_contig.quality)
        seq_record.letter_annotations["phred_quality"] = quals

        yield seq_record
Exemple #25
0
def snp_count(in_ace, out_file, snp_dict, tags, win_len, max_del, stars):
    """Genotype individuals at SNPs loci.
    
    """
    win_buffer = (win_len - 1) / 2
    ace_gen = Ace.parse(open(in_ace, 'r'))
    with open(out_file, "w") as output_file:
        output_file.write("Contig_nb\tPos\ttag_name\tA\tC\tG\tT\tN\t*\t-\n")
        while 1:
            try:
                contig = ace_gen.next()
            except:
                print "***All contigs treated***"
                break
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
            align.add_sequence(contig.name, contig.sequence)
            for readn in xrange(len(contig.reads)):
                clipst = contig.reads[readn].qa.qual_clipping_start # GOOD
                clipe = contig.reads[readn].qa.qual_clipping_end # GOOD
                clipst2 = contig.reads[readn].qa.align_clipping_start # Added
                clipe2 = contig.reads[readn].qa.align_clipping_end # Added
                if clipst2 > clipst: # Added
                    clipst = clipst2 # Added
                if clipe2 < clipe2: # Added
                    clipe = clipe2 # Added
                start = contig.af[readn].padded_start
                seq = cut_ends(contig.reads[readn].rd.sequence, clipst, clipe)
                seq = pad_read(seq, start, len(contig.sequence))
                if "pseudo" not in contig.reads[readn].rd.name:
                    align.add_sequence(contig.reads[readn].rd.name, seq)
            sequences = read_fasta(align.format("fasta"))
            contig_name = re.findall("(Contig_[0-9]+)", sequences[0][0])[0]
            print "Treating", contig_name
            positions = []
            try:
                positions = snp_dict[contig_name]
            except:
                continue
            d = {}
            for pos in positions:
                if stars == True:
                    pos_ok = correct_position(pos, sequences[0][1])
                else:
                    pos_ok = pos
                left = pos_ok - 5
                if left < 0:
                    left = 0
                right = pos_ok + 1 + 5 # takes into account the middle nucleotide
                ref_window = sequences[0][1][left:right]
                d.setdefault(pos, {})
                d[pos].setdefault("XX_noTag", {})
                for nuc in list("ACGTN*-"):
                    d[pos]["XX_noTag"].setdefault(nuc, 0)
                for tag in tags:
                    d[pos].setdefault(tag, {})
                    for nuc in list("ACGTN*-"):
                        d[pos][tag].setdefault(nuc, 0)
                for fasta in sequences:
                    window = fasta[1][left:right]
                    del_count = 0
                    if window.count("-") > win_buffer - 3:
                        continue # Need at least 3 nucleotides on each side
                    for tag in tags:
                        if tag in fasta[0]:
                            t = tag
                            break
                        else:
                            t = "XX_noTag"
                    if len(ref_window) == len(window):
                        for i in xrange(len(window)):
                            if ref_window[i].isalpha() and window[i] == "*" or \
                               window[i].isalpha() and ref_window[i] == "*":
                                del_count += 1
                    if del_count > max_del:
                        continue
                    p = pos
                    s = fasta[1] # Sequence
                    n = s[pos_ok - 1].upper()
                    d[p][t][n] += 1
            for p in sorted(d):
                for t in sorted(d[p]):
                    output_file.write(contig_name + "\t" + str(p) + "\t" + 
                                      str(t))
                    for n in list("ACGTN*-"):
                        output_file.write("\t" + str(d[p][t][n]))
                    output_file.write("\n")