def ace2fasta(in_file, out_file):
    """Write every contig of an ACE file as a FASTA-formatted alignment.

    For each contig the consensus sequence is written first, followed by
    every read whose name does not contain "pseudo".  Each read sequence is
    passed through ``cut_ends`` with its quality clipping bounds and then
    through ``pad_read`` with its padded start and the consensus length.

    Args:
        in_file: Path of the ACE file to read.
        out_file: Path of the FASTA file to create (overwritten if present).
    """
    with open(in_file, 'r') as input_file, \
            open(out_file, "w") as output_file:
        # Plain iteration ends cleanly at the end of the file and lets real
        # parsing errors propagate instead of reporting them as a normal end.
        for contig in Ace.parse(input_file):
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
            # The consensus is always the first record of the alignment.
            align.add_sequence(contig.name, contig.sequence)
            consensus_length = len(contig.sequence)
            for readn, read in enumerate(contig.reads):
                start = contig.af[readn].padded_start
                seq = cut_ends(read.rd.sequence,
                               read.qa.qual_clipping_start,
                               read.qa.qual_clipping_end)
                seq = pad_read(seq, start, consensus_length)
                if "pseudo" not in read.rd.name:
                    align.add_sequence(read.rd.name, seq)
            output_file.write(align.format("fasta"))
        print("All contigs treated")
def __init__(self, ace_file):
    """Load a single-contig ACE file and cache its consensus information.

    Args:
        ace_file: Path of the ACE file; it must contain exactly one contig.

    Raises:
        AssertionError: If the file does not hold exactly one contig.
    """
    self.ace_file = ace_file
    # Ace.read returns the fully parsed record, so the handle can be closed
    # as soon as the call returns.
    with open(ace_file, 'r') as handle:
        self.records = Ace.read(handle)
    assert len(self.records.contigs) == 1
    self.contig = self.records.contigs[0]
    self.consensus = self.contig.sequence
    self.consensus_name = self.contig.name
    self.number_sequences = len(self.contig.reads)
    # No reference is known at construction time.
    self.reference = ""
    self.reference_name = ""
def gene_expression_2matrix(in_ace, out_file, tags, min_seq):
    """Count sequences with each tags in all contigs.

    One tab-separated row is written per contig whose alignment (consensus
    included) holds at least ``min_seq`` sequences: the contig name, the
    consensus length without "*" characters, one count per tag and the
    count of sequences matching no tag.  Counts are written in the same
    order as the header columns.

    Args:
        in_ace: Path of the ACE file to read.
        out_file: Path of the tab-separated matrix to create.
        tags: Sequence of tag strings searched for in the sequence names.
        min_seq: Minimum number of sequences a contig alignment must hold
            to be reported.

    Raises:
        IndexError: If the first sequence name of a reported contig does not
            contain a "Contig_<number>" identifier.
    """
    print("")
    print("USING MATRIX OUTPUT FORMAT")
    print("")
    no_tag = "XX_noTag"
    with open(in_ace, 'r') as input_file, \
            open(out_file, "w") as output_file:
        output_file.write("gene_name\tgene_length")
        for tag in tags:
            output_file.write("\t" + tag)
        output_file.write("\t" + no_tag)
        output_file.write("\n")
        # Plain iteration stops at the end of the file and lets genuine
        # parsing errors propagate instead of hiding them.
        for contig in Ace.parse(input_file):
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
            align.add_sequence(contig.name, contig.sequence)
            consensus_length = len(contig.sequence)
            for readn, read in enumerate(contig.reads):
                start = contig.af[readn].padded_start
                seq = cut_ends(read.rd.sequence,
                               read.qa.qual_clipping_start,
                               read.qa.qual_clipping_end)
                seq = pad_read(seq, start, consensus_length)
                if "pseudo" not in read.rd.name:
                    align.add_sequence(read.rd.name, seq)
            sequences = read_fasta_2list(align.format("fasta"))
            if len(sequences) < min_seq:
                continue
            contig_name = re.findall("(Contig_[0-9]+)", sequences[0][0])[0]
            contig_seq = sequences[0][1].replace("*", "")
            contig_length = str(len(contig_seq))
            output_file.write(contig_name + "\t" + contig_length)
            print("Treating %s" % contig_name)
            counts = dict.fromkeys(tags, 0)
            counts[no_tag] = 0
            for fasta in sequences:
                found_tag = False
                # A name may match several tags; each match is counted.
                for tag in tags:
                    if tag in fasta[0]:
                        counts[tag] += 1
                        found_tag = True
                if not found_tag and "Consensus" not in fasta[0]:
                    counts[no_tag] += 1
            # Write the counts in header order so that every value sits
            # under its own column even when ``tags`` is not sorted.
            for tag in tags:
                output_file.write("\t" + str(counts[tag]))
            output_file.write("\t" + str(counts[no_tag]))
            output_file.write("\n")
        print("***All contigs treated***")
def report(args): """ %prog report [--options] ace_file > report Prepare a report of read location, consensus location or quality segment per contig """ from jcvi.utils.table import tabulate p = OptionParser(report.__doc__) types = {"read": ["padded_start", "padded_end", "orient"], "consensus": ["padded_consensus_start", "padded_consensus_end"], "quality" : ["qual_clipping_start", "qual_clipping_end", "align_clipping_start", "align_clipping_end"] } valid_types = tuple(types.keys()) p.add_option("--type", default="read", choices=valid_types, help="choose report type [default: %default]") opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) acefile, = args ace = Ace.read(must_open(acefile)) logging.debug('Loaded ace file {0}'.format(acefile)) for c in ace.contigs: print c.name table = dict() if opts.type == "read": ps, pe = [], [] ps = [read.padded_start for read in c.af] for i in xrange(1, len(ps)): pe.append(ps[i] - ps[i-1]) pe.append(c.nbases) map = dict(zip(ps, pe)) for i, read in enumerate(c.af): values = [str(x) for x in (read.padded_start, map[read.padded_start], read.coru)] for i, label in enumerate(types[opts.type]): table[(str(read.name), label)] = values[i] elif opts.type == "consensus": for read in c.bs: values = [str(x) for x in (read.padded_start, read.padded_end)] for i, label in enumerate(types[opts.type]): table[(str(read.name), label)] = values[i] elif opts.type == "quality": for read in c.reads: (r1, r2) = (read.rd, read.qa) values = [str(x) for x in (r2.qual_clipping_start, r2.qual_clipping_end, r2.align_clipping_start, r2.align_clipping_end)] for i, label in enumerate(types[opts.type]): table[(str(r1.name), label)] = values[i] print tabulate(table), "\n"
def main():
    """Read a hard-coded ACE file, rewrite it with ``write`` and stop in pdb.

    Scratch entry point: both paths are fixed, and the function ends in an
    interactive debugger session.
    """
    # Ace.read returns the fully parsed record, so the handle can be closed
    # before the record is written back out.
    with open('/Users/bcf/Tmp/tmp2.fa.cap.ace') as handle:
        c = Ace.read(handle)
    write(c, '/Users/bcf/Tmp/tmp2_rewrite.fa.cap.ace')
    # NOTE(review): breakpoint kept from the original so the parsed record
    # can be inspected; remove it if this is ever run unattended.
    pdb.set_trace()
def parse_ace(ace_file):
    """Return the first contig of an ACE file together with its alignment.

    The alignment holds the consensus followed by every read of the contig.
    Each read sequence is passed through ``cut_ends`` with its quality
    clipping bounds and through ``pad_read`` with its padded start and the
    consensus length; its record name is the read name, an underscore and
    the read's ``coru`` value.

    Args:
        ace_file: Path of the ACE file to read.

    Returns:
        A ``(contig, align)`` tuple.

    Raises:
        StopIteration: If the file contains no contig.
    """
    with open(ace_file, 'r') as handle:
        # Only the first contig is needed; next() works on Python 2 and 3.
        contig = next(Ace.parse(handle))
    align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
    align.add_sequence(contig.name, contig.sequence)
    consensus_length = len(contig.sequence)
    for readn, read in enumerate(contig.reads):
        location = contig.af[readn]
        seq = cut_ends(read.rd.sequence,
                       read.qa.qual_clipping_start,
                       read.qa.qual_clipping_end)
        seq = pad_read(seq, location.padded_start, consensus_length)
        align.add_sequence(read.rd.name + "_" + location.coru, seq)
    return contig, align
def extract(args):
    """
    %prog extract [--options] ace_file

    Extract contigs from ace file and if necessary reformat header with
    a pipe(|) separated list of constituent reads.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(extract.__doc__)
    p.add_option("--format", default=False, action="store_true",
                 help="enable flag to reformat header into a symbol separated list of constituent reads " +
                      "[default: %default]")
    p.add_option("--sep", default="|",
                 help="choose a separator used to list the reads in the FASTA header [default: '%default']")
    p.add_option("--singlets", default=False, action="store_true",
                 help="ask the program to look in the singlets file (should be in the same folder) for " +
                      "unused reads and put them in the resultant fasta file [default: %default]")

    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    acefile, = args
    ace = Ace.read(must_open(acefile))
    logging.debug('Loaded ace file {0}'.format(acefile))

    # Output and singlets files share the ACE file's name without its last
    # extension.
    prefix = acefile.rsplit(".", 1)[0]
    fastafile = prefix + ".fasta"
    # The context manager closes the output even if writing fails midway.
    with open(fastafile, "w") as fw:
        for c in ace.contigs:
            seqid = c.name
            if opts.format:
                seqid = opts.sep.join([read.name for read in c.af])
            seqrec = SeqRecord(Seq(c.sequence), id=seqid, description="")
            SeqIO.write([seqrec], fw, "fasta")

        if opts.singlets:
            singletsfile = prefix + ".singlets"
            # An empty singlets file is skipped; a missing one still raises.
            if os.path.getsize(singletsfile) > 0:
                fp = SeqIO.parse(must_open(singletsfile), "fasta")
                for rec in fp:
                    SeqIO.write(rec, fw, "fasta")

    logging.debug('Wrote contigs to fasta file {0}'.format(fastafile))
def AceIterator(handle):
    """Returns SeqRecord objects from an ACE file.

    This uses the Bio.Sequencing.Ace module to do the hard work.  Note that
    by iterating over the file in a single pass, we are forced to ignore any
    WA, CT, RT or WR footer tags."""
    for ace_contig in Ace.parse(handle):
        # Convert the ACE contig record into a SeqRecord...
        consensus_seq_str = ace_contig.sequence
        # Assume it is DNA unless there is a U in it.
        if "U" in consensus_seq_str:
            if "T" in consensus_seq_str:
                # Both U and T present - very odd, possibly an error, so
                # fall back to the generic nucleotide alphabet.  Imported
                # here so this rare branch cannot fail on a missing name.
                from Bio.Alphabet import generic_nucleotide
                alpha = generic_nucleotide
            else:
                alpha = generic_rna
        else:
            alpha = generic_dna

        if "*" in consensus_seq_str:
            # For consistency with most other file formats, map
            # any * gaps into - gaps.
            assert "-" not in consensus_seq_str
            consensus_seq = Seq(consensus_seq_str.replace("*", "-"),
                                Gapped(alpha, gap_char="-"))
        else:
            consensus_seq = Seq(consensus_seq_str, alpha)

        # TODO - Consensus base quality (BQ lines).  Note that any gaps
        # (* character) in the consensus does not get a quality entry.
        # This really needs Biopython support for per-letter-annotation.

        # TODO? - Base segments (BS lines) which indicates which read
        # phrap has chosen to be the consensus at a particular position.
        # Perhaps as SeqFeature objects?

        # TODO - Supporting reads (RD lines, plus perhaps QA and DS lines)
        # Perhaps as SeqFeature objects?

        seq_record = SeqRecord(consensus_seq,
                               id=ace_contig.name,
                               name=ace_contig.name)
        yield seq_record
def AceIterator(handle):
    """Returns SeqRecord objects from an ACE file.

    This uses the Bio.Sequencing.Ace module to do the hard work.  Note that
    by iterating over the file in a single pass, we are forced to ignore any
    WA, CT, RT or WR footer tags."""
    for ace_contig in Ace.parse(handle):
        # Convert the ACE contig record into a SeqRecord...
        consensus_seq_str = ace_contig.sequence
        # Assume it is DNA unless there is a U in it.
        if "U" in consensus_seq_str:
            if "T" in consensus_seq_str:
                # Both U and T present - very odd, possibly an error, so
                # fall back to the generic nucleotide alphabet.  Imported
                # here so this rare branch cannot fail on a missing name.
                from Bio.Alphabet import generic_nucleotide
                alpha = generic_nucleotide
            else:
                alpha = generic_rna
        else:
            alpha = generic_dna

        if "*" in consensus_seq_str:
            # For consistency with most other file formats, map
            # any * gaps into - gaps.
            assert "-" not in consensus_seq_str
            consensus_seq = Seq(consensus_seq_str.replace("*", "-"),
                                Gapped(alpha, gap_char="-"))
        else:
            consensus_seq = Seq(consensus_seq_str, alpha)

        # TODO - Consensus base quality (BQ lines).  Note that any gaps
        # (* character) in the consensus does not get a quality entry.
        # This really needs Biopython support for per-letter-annotation.

        # TODO? - Base segments (BS lines) which indicates which read
        # phrap has chosen to be the consensus at a particular position.
        # Perhaps as SeqFeature objects?

        # TODO - Supporting reads (RD lines, plus perhaps QA and DS lines)
        # Perhaps as SeqFeature objects?

        seq_record = SeqRecord(consensus_seq,
                               id=ace_contig.name,
                               name=ace_contig.name)
        yield seq_record
def parse_singletons_fasta_in_ace(contig_ace_dir, singleton_seq_dir):
    """Write each single-read contig of a directory of ACE files as FASTA.

    Every ``*.ace`` file in ``contig_ace_dir`` is read in sorted order.  For
    each contig made of exactly one read, the read sequence is written to
    ``<singleton_seq_dir>/<read name>.fsa`` with ``>read name`` as header.

    Side effect: the process working directory is changed to
    ``contig_ace_dir`` and is not restored.

    Args:
        contig_ace_dir: Directory holding the ``*.ace`` files.
        singleton_seq_dir: Directory receiving the ``.fsa`` files; a
            relative path is resolved after the directory change.
    """
    # get contig info
    os.chdir(contig_ace_dir)
    for ace_file in sorted(glob.glob("*.ace")):
        with open(ace_file) as ace_handle:
            ace_record = Ace.read(ace_handle)
        for contig in ace_record.contigs:
            if contig.nreads != 1:
                continue
            read = contig.reads[0].rd
            singleton_name = read.name
            # Using the read name as the record id (with an empty
            # description) makes the FASTA writer emit ">name" directly, so
            # no shell "sed" pass is needed to strip a space after ">".
            singleton_record = SeqRecord(seq=Seq(read.sequence),
                                         id=singleton_name, name="",
                                         description="")
            singleton_file = singleton_seq_dir + "/" + singleton_name + ".fsa"
            with open(singleton_file, "w") as singleton_fd:
                SeqIO.write([singleton_record], singleton_fd, "fasta")
def ace2fasta(in_file, out_file):
    """Write the consensus of every contig of an ACE file in FASTA format.

    Only the consensus sequences are written, with every "*" character
    removed; reads are not included.

    Args:
        in_file: Path of the ACE file to read.
        out_file: Path of the FASTA file to create (overwritten if present).
    """
    with open(in_file, 'r') as input_file, \
            open(out_file, "w") as output_file:
        # Plain iteration ends cleanly at the end of the file and lets real
        # parsing errors propagate instead of reporting them as a normal end.
        for contig in Ace.parse(input_file):
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
            # Add the consensus sequence, stripped of its "*" characters.
            align.add_sequence(contig.name, contig.sequence.replace("*", ""))
            output_file.write(align.format("fasta"))
        print("All contigs treated")
from Bio.Sequencing import Ace

# Dump the contigs of a sample ACE file, and the reads of each contig.
fn = '../../samples/contig1.ace'
# Ace.read returns the fully parsed record, so the file can be closed
# before anything is printed.
with open(fn) as handle:
    acefilerecord = Ace.read(handle)

# For each contig:
for ctg in acefilerecord.contigs:
    print('==========================================')
    print('Contig name: %s' % ctg.name)
    print('Bases: %s' % ctg.nbases)
    print('Reads: %s' % ctg.nreads)
    print('Segments: %s' % ctg.nsegments)
    print('Sequence: %s' % ctg.sequence)
    print('Quality: %s' % ctg.quality)
    # For each read in contig:
    for read in ctg.reads:
        print('Read name: %s' % read.rd.name)
        print('Align start: %s' % read.qa.align_clipping_start)
        print('Align end: %s' % read.qa.align_clipping_end)
        print('Qual start: %s' % read.qa.qual_clipping_start)
        print('Qual end: %s' % read.qa.qual_clipping_end)
        print('Read sequence: %s' % read.rd.sequence)
        print('==========================================')
def pairwise(in_ace, out_file):
    """Calculate pairwise differentiation indexes.

    For every contig, each read is compared with the consensus in
    consecutive windows of 8 positions, starting after the read's leading
    "-" padding.  A read's index is its difference count divided by its
    compared length; the mean over the reads of at least 100 compared
    positions with at least one difference is written, one
    "contig name<TAB>mean" line per contig ("NA" when no read qualifies).

    Args:
        in_ace: Path of the ACE file to read.
        out_file: Path of the tab-separated result file to create.

    Raises:
        IndexError: If the first sequence name of a contig does not contain
            a "Contig_<number>" identifier.
    """
    window_len = 8  # PARAMETER
    max_diff = 3  # PARAMETER
    min_len_seq = 100  # PARAMETER
    with open(in_ace, 'r') as input_file, \
            open(out_file, "w") as output_file:
        # Plain iteration stops at the end of the file and lets genuine
        # parsing errors propagate instead of hiding them.
        for contig in Ace.parse(input_file):
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
            align.add_sequence(contig.name, contig.sequence)
            consensus_length = len(contig.sequence)
            for readn, read in enumerate(contig.reads):
                start = contig.af[readn].padded_start
                seq = cut_ends(read.rd.sequence,
                               read.qa.qual_clipping_start,
                               read.qa.qual_clipping_end)
                seq = pad_read(seq, start, consensus_length)
                if "pseudo" not in read.rd.name:
                    align.add_sequence(read.rd.name, seq)
            sequences = read_fasta(align.format("fasta"))
            contig_name = re.findall("(Contig_[0-9]+)", sequences[0][0])[0]
            print("Treating %s" % contig_name)
            consensus = sequences[0][1]
            len_contig = len(consensus)
            number_indexes = 0
            total_indexes = 0
            for seq in sequences[1:]:
                # Skip the "-" padding placed in front of the read.
                leading_gap = re.match("-+", seq[1])
                start = len(leading_gap.group(0)) if leading_gap else 0
                len_seq = 0
                count = 0
                for window in range(start, len_contig, window_len):
                    nuc_contig = consensus[window:window + window_len]
                    nuc_seq = seq[1][window:window + window_len]
                    if "-" in nuc_seq:
                        len_seq += len(nuc_seq.replace("-", ""))
                    else:
                        # presumably count_diff returns (differences,
                        # too_many_flag) - verify against its definition
                        diff = count_diff(nuc_contig, nuc_seq, max_diff)
                        if diff[1] == False:
                            count += diff[0]
                            len_seq += window_len
                # Count the "*" characters of the sequence itself; calling
                # count() on the [name, sequence] pair always returned 0.
                len_seq -= seq[1].count("*")
                if len_seq >= min_len_seq and count > 0:
                    number_indexes += 1
                    total_indexes += float(count) / len_seq
            if number_indexes:
                mean_index = float(total_indexes) / number_indexes
            else:
                mean_index = "NA"
            output_file.write(contig_name + "\t" + str(mean_index) + "\n")
        print("***All contigs treated***")
def pairwise(in_ace, out_file):
    """Calculate pairwise differentiation indexes.

    For every contig, each read is compared with the consensus in
    consecutive windows of 8 positions, starting after the read's leading
    "-" padding.  A read's index is its difference count divided by its
    compared length; the mean over the reads of at least 100 compared
    positions with at least one difference is written, one
    "contig name<TAB>mean" line per contig ("NA" when no read qualifies).

    Args:
        in_ace: Path of the ACE file to read.
        out_file: Path of the tab-separated result file to create.

    Raises:
        IndexError: If the first sequence name of a contig does not contain
            a "Contig_<number>" identifier.
    """
    window_len = 8  # PARAMETER
    max_diff = 3  # PARAMETER
    min_len_seq = 100  # PARAMETER
    with open(in_ace, 'r') as input_file, \
            open(out_file, "w") as output_file:
        # Plain iteration stops at the end of the file and lets genuine
        # parsing errors propagate instead of hiding them.
        for contig in Ace.parse(input_file):
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
            align.add_sequence(contig.name, contig.sequence)
            consensus_length = len(contig.sequence)
            for readn, read in enumerate(contig.reads):
                start = contig.af[readn].padded_start
                seq = cut_ends(read.rd.sequence,
                               read.qa.qual_clipping_start,
                               read.qa.qual_clipping_end)
                seq = pad_read(seq, start, consensus_length)
                if "pseudo" not in read.rd.name:
                    align.add_sequence(read.rd.name, seq)
            sequences = read_fasta(align.format("fasta"))
            contig_name = re.findall("(Contig_[0-9]+)", sequences[0][0])[0]
            print("Treating %s" % contig_name)
            consensus = sequences[0][1]
            len_contig = len(consensus)
            number_indexes = 0
            total_indexes = 0
            for seq in sequences[1:]:
                # Skip the "-" padding placed in front of the read.
                leading_gap = re.match("-+", seq[1])
                start = len(leading_gap.group(0)) if leading_gap else 0
                len_seq = 0
                count = 0
                for window in range(start, len_contig, window_len):
                    nuc_contig = consensus[window:window + window_len]
                    nuc_seq = seq[1][window:window + window_len]
                    if "-" in nuc_seq:
                        len_seq += len(nuc_seq.replace("-", ""))
                    else:
                        # presumably count_diff returns (differences,
                        # too_many_flag) - verify against its definition
                        diff = count_diff(nuc_contig, nuc_seq, max_diff)
                        if diff[1] == False:
                            count += diff[0]
                            len_seq += window_len
                # Count the "*" characters of the sequence itself; calling
                # count() on the [name, sequence] pair always returned 0.
                len_seq -= seq[1].count("*")
                if len_seq >= min_len_seq and count > 0:
                    number_indexes += 1
                    total_indexes += float(count) / len_seq
            if number_indexes:
                mean_index = float(total_indexes) / number_indexes
            else:
                mean_index = "NA"
            output_file.write(contig_name + "\t" + str(mean_index) + "\n")
        print("***All contigs treated***")
def get_haplotypes(in_ace, out_file, out_bamova, win_len, step, coverage,
                   stars, ngroups, nhaplo):
    """Get haplotypes from contigs in an ace file

    Contigs with fewer than ``coverage`` reads besides the first one are
    skipped.  For the others, reads are clipped to the intersection of
    their quality and alignment clipping ranges, consensus "*" positions
    are removed from every sequence, and ``best_snps`` selects haplotypes.
    A marker is reported when at least ``ngroups`` groups (first three
    characters of the haplotype names) are present and each holds at least
    ``nhaplo`` haplotypes.

    Args:
        in_ace: Path of the ACE file to read.
        out_file: Path of the haplotype report to create.
        out_bamova: Path of the per-population variant count file to create.
        win_len: Not used by this function.
        step: Not used by this function.
        coverage: Minimum read count, also forwarded to ``best_snps``.
        stars: Not used by this function.
        ngroups: Minimum number of groups a marker must contain.
        nhaplo: Minimum number of haplotypes required in every group.

    Side effects:
        Writes one ``fasta_output/<contig name>.fasta`` file per reported
        marker; the ``fasta_output`` directory must already exist.
    """
    marker_number = 0
    cutoff = 100000  # stop once more than this many contigs were read
    contig_counter = 0
    ntreated = 0
    with open(in_ace, 'r') as input_file, \
            open(out_file, "w") as output_file, \
            open(out_bamova, "w") as bamova_file:
        output_file.write("Contig_nb\tWindow\tHaplotype\n")
        for contig in Ace.parse(input_file):
            pass_haplo = False
            contig_counter += 1
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "X"))
            align.add_sequence(contig.name, contig.sequence)
            if len(contig.reads) - 1 < coverage:
                continue
            ntreated += 1
            consensus_length = len(contig.sequence)
            for readn, read in enumerate(contig.reads):
                # Keep the part of the read inside both clipping ranges.
                # The end bound used to be compared with itself, so the
                # alignment clipping end was never applied.
                clipst = max(read.qa.qual_clipping_start,
                             read.qa.align_clipping_start)
                clipe = min(read.qa.qual_clipping_end,
                            read.qa.align_clipping_end)
                start = contig.af[readn].padded_start
                seq = cut_ends(read.rd.sequence, clipst, clipe)
                seq = pad_read(seq, start, consensus_length)
                if "pseudo" not in read.rd.name:
                    align.add_sequence(read.rd.name, seq)
            sequences = read_fasta(align.format("fasta"))
            sequences = [[s[0].replace(">", ""), s[1]] for s in sequences]
            concensus = sequences[0][1]
            # Drop the consensus "*" columns from every sequence, from
            # right to left so the remaining positions stay valid.
            for p in multi_find("*", concensus)[::-1]:
                sequences = [[s[0], s[1][0:p] + s[1][p + 1:]]
                             for s in sequences]
            concensus = sequences[0][1]
            sequences = [[s[0], correct_sequence(concensus, s[1])]
                         for s in sequences[1:]]
            sequences, snp_pos = snp_positions(sequences)
            haplotypes = best_snps(sequences, snp_pos, coverage)
            if haplotypes != "Empty":
                haplos = haplotypes[-1]
                variants = sorted(set([h[-1] for h in haplos]))
                groups = sorted(set([h[0][:3] for h in haplos]))
                if len(groups) >= ngroups:
                    # Every group must hold at least nhaplo haplotypes.
                    pass_haplo = all(
                        len([h for h in haplos if h[0].startswith(g)])
                        >= nhaplo
                        for g in groups)
                if pass_haplo:
                    print(contig.name)
                    bamova_file.write("Marker" + str(marker_number) + "\n")
                    for group_number, g in enumerate(groups):
                        bamova_file.write("Population\t" + str(group_number))
                        for v in variants:
                            n_variant = len([h for h in haplos
                                             if h[-1] == v
                                             and h[0].startswith(g)])
                            bamova_file.write("\t" + str(n_variant))
                        bamova_file.write("\n")
                    fasta_path = "fasta_output/" + contig.name + ".fasta"
                    with open(fasta_path, "w") as f:
                        output_file.write(contig.name + "\n")
                        for h in haplos:
                            f.write(">" + h[0] + str(marker_number) + "\n" +
                                    h[2] + "\n")
                            # Make positions relative to the first one.
                            h[1] = [x - h[1][0] + 1 for x in h[1]]
                            output_file.write(
                                "Marker" + str(marker_number) + "\t" +
                                "\t".join([str(x) for x in h]) + "\t" +
                                ":".join(variants) + "\n")
                    marker_number += 1
            output_file.flush()
            bamova_file.flush()
            if contig_counter > cutoff:
                break
    print("\n%s contigs out of %s were treated" % (ntreated, contig_counter))
def report(args): """ %prog report [--options] ace_file > report Prepare a report of read location, consensus location or quality segment per contig """ from jcvi.utils.table import tabulate p = OptionParser(report.__doc__) types = { "read": ["padded_start", "padded_end", "orient"], "consensus": ["padded_consensus_start", "padded_consensus_end"], "quality": [ "qual_clipping_start", "qual_clipping_end", "align_clipping_start", "align_clipping_end" ] } valid_types = tuple(types.keys()) p.add_option("--type", default="read", choices=valid_types, help="choose report type [default: %default]") opts, args = p.parse_args(args) if len(args) != 1: sys.exit(not p.print_help()) acefile, = args ace = Ace.read(must_open(acefile)) logging.debug('Loaded ace file {0}'.format(acefile)) for c in ace.contigs: print c.name table = dict() if opts.type == "read": ps, pe = [], [] ps = [read.padded_start for read in c.af] for i in xrange(1, len(ps)): pe.append(ps[i] - ps[i - 1]) pe.append(c.nbases) map = dict(zip(ps, pe)) for i, read in enumerate(c.af): values = [ str(x) for x in (read.padded_start, map[read.padded_start], read.coru) ] for i, label in enumerate(types[opts.type]): table[(str(read.name), label)] = values[i] elif opts.type == "consensus": for read in c.bs: values = [str(x) for x in (read.padded_start, read.padded_end)] for i, label in enumerate(types[opts.type]): table[(str(read.name), label)] = values[i] elif opts.type == "quality": for read in c.reads: (r1, r2) = (read.rd, read.qa) values = [ str(x) for x in (r2.qual_clipping_start, r2.qual_clipping_end, r2.align_clipping_start, r2.align_clipping_end) ] for i, label in enumerate(types[opts.type]): table[(str(r1.name), label)] = values[i] print tabulate(table), "\n"
part_site_comp_fh.writerow(row) cutoff = [] if (p.use_reads): print "Working on ace file {}".format(p.read_fn) contig_read_dict = {} contig_read_len_dict = {} from Bio.Sequencing import Ace with open(p.use_reads, 'rU') as ace_fh: for contig in Ace.parse(ace_fh): """rd (reads) - read with name, sequence, etc qa (read qual) - which parts used as consensus ds - file name of read's chromatogram file af - loc of read within contig bs (base segment) - which read chosen at consensus at each pos rt (transient read tags) - generated by crossmatch and phrap ct (consensus tag) wa (whole assembly tag) - hosts assembly program name, version, etc wr reads - info about read supporting ace contig contig - holds info about contig from ace record""" contig_name = "{}".format(contig.name) # contig00001 if not contig_name in contig_read_dict:
def get_haplotypes(in_ace, out_file, out_bamova, win_len, step, coverage,
                   stars, ngroups, nhaplo):
    """Get haplotypes from contigs in an ace file

    Contigs with fewer than ``coverage`` reads besides the first one are
    skipped.  For the others, reads are clipped to the intersection of
    their quality and alignment clipping ranges, consensus "*" positions
    are removed from every sequence, and ``best_snps`` selects haplotypes.
    A marker is reported when at least ``ngroups`` groups (first three
    characters of the haplotype names) are present and each holds at least
    ``nhaplo`` haplotypes.

    Args:
        in_ace: Path of the ACE file to read.
        out_file: Path of the haplotype report to create.
        out_bamova: Path of the per-population variant count file to create.
        win_len: Not used by this function.
        step: Not used by this function.
        coverage: Minimum read count, also forwarded to ``best_snps``.
        stars: Not used by this function.
        ngroups: Minimum number of groups a marker must contain.
        nhaplo: Minimum number of haplotypes required in every group.

    Side effects:
        Writes one ``fasta_output/<contig name>.fasta`` file per reported
        marker; the ``fasta_output`` directory must already exist.
    """
    marker_number = 0
    cutoff = 100000  # stop once more than this many contigs were read
    contig_counter = 0
    ntreated = 0
    with open(in_ace, 'r') as input_file, \
            open(out_file, "w") as output_file, \
            open(out_bamova, "w") as bamova_file:
        output_file.write("Contig_nb\tWindow\tHaplotype\n")
        for contig in Ace.parse(input_file):
            pass_haplo = False
            contig_counter += 1
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "X"))
            align.add_sequence(contig.name, contig.sequence)
            if len(contig.reads) - 1 < coverage:
                continue
            ntreated += 1
            consensus_length = len(contig.sequence)
            for readn, read in enumerate(contig.reads):
                # Keep the part of the read inside both clipping ranges.
                # The end bound used to be compared with itself, so the
                # alignment clipping end was never applied.
                clipst = max(read.qa.qual_clipping_start,
                             read.qa.align_clipping_start)
                clipe = min(read.qa.qual_clipping_end,
                            read.qa.align_clipping_end)
                start = contig.af[readn].padded_start
                seq = cut_ends(read.rd.sequence, clipst, clipe)
                seq = pad_read(seq, start, consensus_length)
                if "pseudo" not in read.rd.name:
                    align.add_sequence(read.rd.name, seq)
            sequences = read_fasta(align.format("fasta"))
            sequences = [[s[0].replace(">", ""), s[1]] for s in sequences]
            concensus = sequences[0][1]
            # Drop the consensus "*" columns from every sequence, from
            # right to left so the remaining positions stay valid.
            for p in multi_find("*", concensus)[::-1]:
                sequences = [[s[0], s[1][0:p] + s[1][p + 1:]]
                             for s in sequences]
            concensus = sequences[0][1]
            sequences = [[s[0], correct_sequence(concensus, s[1])]
                         for s in sequences[1:]]
            sequences, snp_pos = snp_positions(sequences)
            haplotypes = best_snps(sequences, snp_pos, coverage)
            if haplotypes != "Empty":
                haplos = haplotypes[-1]
                variants = sorted(set([h[-1] for h in haplos]))
                groups = sorted(set([h[0][:3] for h in haplos]))
                if len(groups) >= ngroups:
                    # Every group must hold at least nhaplo haplotypes.
                    pass_haplo = all(
                        len([h for h in haplos if h[0].startswith(g)])
                        >= nhaplo
                        for g in groups)
                if pass_haplo:
                    print(contig.name)
                    bamova_file.write("Marker" + str(marker_number) + "\n")
                    for group_number, g in enumerate(groups):
                        bamova_file.write("Population\t" + str(group_number))
                        for v in variants:
                            n_variant = len([h for h in haplos
                                             if h[-1] == v
                                             and h[0].startswith(g)])
                            bamova_file.write("\t" + str(n_variant))
                        bamova_file.write("\n")
                    fasta_path = "fasta_output/" + contig.name + ".fasta"
                    with open(fasta_path, "w") as f:
                        output_file.write(contig.name + "\n")
                        for h in haplos:
                            f.write(">" + h[0] + str(marker_number) + "\n" +
                                    h[2] + "\n")
                            # Make positions relative to the first one.
                            h[1] = [x - h[1][0] + 1 for x in h[1]]
                            output_file.write(
                                "Marker" + str(marker_number) + "\t" +
                                "\t".join([str(x) for x in h]) + "\t" +
                                ":".join(variants) + "\n")
                    marker_number += 1
            output_file.flush()
            bamova_file.flush()
            if contig_counter > cutoff:
                break
    print("\n%s contigs out of %s were treated" % (ntreated, contig_counter))
from Bio.Sequencing import Ace

# Dump the contigs of a sample ACE file, and the reads of each contig.
fn = '../../samples/contig1.ace'
# Ace.read returns the fully parsed record, so the file can be closed
# before anything is printed.
with open(fn) as handle:
    acefilerecord = Ace.read(handle)

# For each contig:
for ctg in acefilerecord.contigs:
    print('==========================================')
    print('Contig name: %s' % ctg.name)
    print('Bases: %s' % ctg.nbases)
    print('Reads: %s' % ctg.nreads)
    print('Segments: %s' % ctg.nsegments)
    print('Sequence: %s' % ctg.sequence)
    print('Quality: %s' % ctg.quality)
    # For each read in contig:
    for read in ctg.reads:
        print('Read name: %s' % read.rd.name)
        print('Align start: %s' % read.qa.align_clipping_start)
        print('Align end: %s' % read.qa.align_clipping_end)
        print('Qual start: %s' % read.qa.qual_clipping_start)
        print('Qual end: %s' % read.qa.qual_clipping_end)
        print('Read sequence: %s' % read.rd.sequence)
        print('==========================================')
def _get_gen(self):
    """Yield the contigs parsed from ``self.ace_filename``, one at a time.

    The file is opened when iteration starts and closed once the generator
    is exhausted or discarded, so no handle is left open behind the caller.
    """
    with open(self.ace_filename) as handle:
        for contig in ace.parse(handle):
            yield contig
def snp_count(in_ace, out_file, snp_dict, tags, win_len, max_del, stars):
    """Genotype individuals at SNPs loci.

    For every contig listed in ``snp_dict`` and every SNP position of that
    contig, the nucleotide carried by each sequence of the alignment is
    counted per tag (the first tag found in the sequence name, or
    "XX_noTag").  One line per contig, position and tag is written with the
    counts of A, C, G, T, N, "*" and "-".

    Args:
        in_ace: Path of the ACE file to read.
        out_file: Path of the tab-separated count file to create.
        snp_dict: Mapping of contig name to an iterable of SNP positions.
        tags: Sequence of tag strings searched for in the sequence names.
        win_len: Window length from which the "-" tolerance is derived.
        max_del: Maximum number of "*"-versus-letter mismatches allowed
            between a sequence window and the reference window.
        stars: When equal to True, positions are first converted with
            ``correct_position``.

    Raises:
        IndexError: If the first sequence name of a contig does not contain
            a "Contig_<number>" identifier.
    """
    nucleotides = "ACGTN*-"
    no_tag = "XX_noTag"
    # Floor division keeps the Python 2 integer result on Python 3 as well.
    win_buffer = (win_len - 1) // 2
    with open(in_ace, 'r') as input_file, \
            open(out_file, "w") as output_file:
        output_file.write("Contig_nb\tPos\ttag_name\tA\tC\tG\tT\tN\t*\t-\n")
        # Plain iteration stops at the end of the file and lets genuine
        # parsing errors propagate instead of hiding them.
        for contig in Ace.parse(input_file):
            align = Alignment(Gapped(IUPAC.ambiguous_dna, "-"))
            align.add_sequence(contig.name, contig.sequence)
            consensus_length = len(contig.sequence)
            for readn, read in enumerate(contig.reads):
                # Keep the part of the read inside both clipping ranges.
                # The end bound used to be compared with itself, so the
                # alignment clipping end was never applied.
                clipst = max(read.qa.qual_clipping_start,
                             read.qa.align_clipping_start)
                clipe = min(read.qa.qual_clipping_end,
                            read.qa.align_clipping_end)
                start = contig.af[readn].padded_start
                seq = cut_ends(read.rd.sequence, clipst, clipe)
                seq = pad_read(seq, start, consensus_length)
                if "pseudo" not in read.rd.name:
                    align.add_sequence(read.rd.name, seq)
            sequences = read_fasta(align.format("fasta"))
            contig_name = re.findall("(Contig_[0-9]+)", sequences[0][0])[0]
            print("Treating %s" % contig_name)
            try:
                positions = snp_dict[contig_name]
            except KeyError:
                # No SNP is known for this contig.
                continue
            reference = sequences[0][1]
            d = {}
            for pos in positions:
                if stars == True:
                    pos_ok = correct_position(pos, reference)
                else:
                    pos_ok = pos
                left = max(pos_ok - 5, 0)
                right = pos_ok + 1 + 5  # takes into account the middle nucleotide
                ref_window = reference[left:right]
                # setdefault keeps counts already gathered if the same
                # position is listed more than once.
                per_tag = d.setdefault(pos, {})
                for tag_name in [no_tag] + list(tags):
                    counts = per_tag.setdefault(tag_name, {})
                    for nuc in nucleotides:
                        counts.setdefault(nuc, 0)
                for fasta in sequences:
                    window = fasta[1][left:right]
                    if window.count("-") > win_buffer - 3:
                        continue  # Need at least 3 nucleotides on each side
                    for tag in tags:
                        if tag in fasta[0]:
                            t = tag
                            break
                    else:
                        t = no_tag
                    del_count = 0
                    if len(ref_window) == len(window):
                        # Count columns where exactly one side is a "*"
                        # facing a letter on the other side.
                        for ref_nuc, nuc in zip(ref_window, window):
                            if (ref_nuc.isalpha() and nuc == "*") or \
                                    (nuc.isalpha() and ref_nuc == "*"):
                                del_count += 1
                    if del_count > max_del:
                        continue
                    n = fasta[1][pos_ok - 1].upper()
                    d[pos][t][n] += 1
            for p in sorted(d):
                for t in sorted(d[p]):
                    output_file.write(contig_name + "\t" + str(p) + "\t" +
                                      str(t))
                    for n in nucleotides:
                        output_file.write("\t" + str(d[p][t][n]))
                    output_file.write("\n")
        print("***All contigs treated***")
def AceIterator(handle):
    """Returns SeqRecord objects from an ACE file.

    This uses the Bio.Sequencing.Ace module to do the hard work.  Note that
    by iterating over the file in a single pass, we are forced to ignore any
    WA, CT, RT or WR footer tags.

    Ace files include the base quality for each position, which are taken
    to be PHRED style scores. Just as if you had read in a FASTQ or QUAL file
    using PHRED scores using Bio.SeqIO, these are stored in the SeqRecord's
    letter_annotations dictionary under the "phred_quality" key.

    >>> from Bio import SeqIO
    >>> handle = open("Ace/consed_sample.ace", "rU")
    >>> for record in SeqIO.parse(handle, "ace"):
    ...     print record.id, record.seq[:10]+"...", len(record)
    ...     print max(record.letter_annotations["phred_quality"])
    Contig1 agccccgggc... 1475
    90

    However, ACE files do not include a base quality for any gaps in the
    consensus sequence, and these are represented in Biopython with a quality
    of zero. Using zero is perhaps misleading as there may be very strong
    evidence to support the gap in the consensus. Previous versions of
    Biopython therefore used None instead, but this complicated usage, and
    prevented output of the gapped sequence as FASTQ format.

    >>> from Bio import SeqIO
    >>> handle = open("Ace/contig1.ace", "rU")
    >>> for record in SeqIO.parse(handle, "ace"):
    ...     print record.id, "..." + record.seq[85:95]+"..."
    ...     print record.letter_annotations["phred_quality"][85:95]
    ...     print max(record.letter_annotations["phred_quality"])
    Contig1 ...AGAGG-ATGC...
    [57, 57, 54, 57, 57, 0, 57, 72, 72, 72]
    90
    Contig2 ...GAATTACTAT...
    [68, 68, 68, 68, 68, 68, 68, 68, 68, 68]
    90

    """
    for ace_contig in Ace.parse(handle):
        # Build the consensus Seq of this contig.
        raw_consensus = ace_contig.sequence

        # DNA is assumed unless a U is present; U together with T is very
        # odd (an error?), so only "nucleotide" is claimed in that case.
        if "U" not in raw_consensus:
            alpha = generic_dna
        elif "T" in raw_consensus:
            alpha = generic_nucleotide
        else:
            alpha = generic_rna

        if "*" not in raw_consensus:
            consensus_seq = Seq(raw_consensus, alpha)
        else:
            # For consistency with most other file formats, * gaps are
            # mapped into - gaps.
            assert "-" not in raw_consensus
            consensus_seq = Seq(raw_consensus.replace("*", "-"),
                                Gapped(alpha, gap_char="-"))

        # TODO? - Base segments (BS lines) which indicates which read
        # phrap has chosen to be the consensus at a particular position.
        # Perhaps as SeqFeature objects?

        # TODO - Supporting reads (RD lines, plus perhaps QA and DS lines)
        # Perhaps as SeqFeature objects?

        seq_record = SeqRecord(consensus_seq,
                               id=ace_contig.name,
                               name=ace_contig.name)

        # Consensus base quality (BQ lines).  Gaps (originally * characters)
        # have no quality entry in the file, so they are given a quality of
        # zero here (see the docstring for why zero is used and not None).
        quals = []
        used = 0  # number of BQ entries consumed so far
        for base in consensus_seq:
            if base == "-":
                quals.append(0)
                continue
            quals.append(ace_contig.quality[used])
            used += 1
        assert used == len(ace_contig.quality)
        seq_record.letter_annotations["phred_quality"] = quals

        yield seq_record
def AceIterator(source):
    """Return SeqRecord objects from an ACE file.

    This uses the Bio.Sequencing.Ace module to do the hard work.  Note that
    by iterating over the file in a single pass, we are forced to ignore any
    WA, CT, RT or WR footer tags.

    Ace files include the base quality for each position, which are taken
    to be PHRED style scores. Just as if you had read in a FASTQ or QUAL file
    using PHRED scores using Bio.SeqIO, these are stored in the SeqRecord's
    letter_annotations dictionary under the "phred_quality" key.

    >>> from Bio import SeqIO
    >>> with open("Ace/consed_sample.ace") as handle:
    ...     for record in SeqIO.parse(handle, "ace"):
    ...         print("%s %s... %i" % (record.id, record.seq[:10], len(record)))
    ...         print(max(record.letter_annotations["phred_quality"]))
    Contig1 agccccgggc... 1475
    90

    However, ACE files do not include a base quality for any gaps in the
    consensus sequence, and these are represented in Biopython with a quality
    of zero. Using zero is perhaps misleading as there may be very strong
    evidence to support the gap in the consensus. Previous versions of
    Biopython therefore used None instead, but this complicated usage, and
    prevented output of the gapped sequence as FASTQ format.

    >>> from Bio import SeqIO
    >>> with open("Ace/contig1.ace") as handle:
    ...     for record in SeqIO.parse(handle, "ace"):
    ...         print("%s ...%s..." % (record.id, record.seq[85:95]))
    ...         print(record.letter_annotations["phred_quality"][85:95])
    ...         print(max(record.letter_annotations["phred_quality"]))
    Contig1 ...AGAGG-ATGC...
    [57, 57, 54, 57, 57, 0, 57, 72, 72, 72]
    90
    Contig2 ...GAATTACTAT...
    [68, 68, 68, 68, 68, 68, 68, 68, 68, 68]
    90

    """
    for ace_contig in Ace.parse(source):
        # Build the consensus Seq of this contig.
        raw_consensus = ace_contig.sequence

        # DNA is assumed unless a U is present; U together with T is very
        # odd (an error?), so only "nucleotide" is claimed in that case.
        if "U" not in raw_consensus:
            alpha = generic_dna
        elif "T" in raw_consensus:
            alpha = generic_nucleotide
        else:
            alpha = generic_rna

        if "*" in raw_consensus:
            # For consistency with most other file formats, * gaps are
            # mapped into - gaps.
            assert "-" not in raw_consensus
            raw_consensus = raw_consensus.replace("*", "-")
        consensus_seq = Seq(raw_consensus, alpha)

        # TODO? - Base segments (BS lines) which indicates which read
        # phrap has chosen to be the consensus at a particular position.
        # Perhaps as SeqFeature objects?

        # TODO - Supporting reads (RD lines, plus perhaps QA and DS lines)
        # Perhaps as SeqFeature objects?

        seq_record = SeqRecord(consensus_seq,
                               id=ace_contig.name,
                               name=ace_contig.name)

        # Consensus base quality (BQ lines).  Gaps (originally * characters)
        # have no quality entry in the file, so they are given a quality of
        # zero here (see the docstring for why zero is used and not None).
        quals = []
        used = 0  # number of BQ entries consumed so far
        for base in consensus_seq:
            if base == "-":
                quals.append(0)
                continue
            quals.append(ace_contig.quality[used])
            used += 1
        assert used == len(ace_contig.quality)
        seq_record.letter_annotations["phred_quality"] = quals

        yield seq_record