def main():
    args = get_args()
    names = {}
    temp1 = "{}.temp1".format(args.infile)
    temp2 = "{}.temp2".format(args.infile)
    outf = fasta.FastaWriter(temp1)
    mask_file = os.path.splitext(args.infile)[0] + ".fa.out"
    f = fasta.FastaReader(args.infile)
    for seq in f:
        print seq.identifier
        # map the accession (4th pipe-delimited field) to the short name
        # preceding the first comma in the description
        gb = seq.identifier.split('|')[3]
        newname = seq.identifier.split(',')[0].split(' ')[-1]
        names[gb] = newname
        seq.identifier = ">{}".format(gb)
        outf.write(seq)
    outf.close()
    # soft-mask the renamed fasta against the repeat annotation (.fa.out)
    cmd = ["maskOutFa", "-softAdd", temp1, mask_file, temp2]
    subprocess.Popen(cmd).wait()
    final = "{}.masked".format(args.infile)
    outf = fasta.FastaWriter(final)
    for seq in fasta.FastaReader(temp2):
        # restore the original short names on the masked sequences
        iden = seq.identifier.strip('>')
        seq.identifier = "{}".format(names[iden])
        print seq.identifier
        outf.write(seq)
    outf.close()
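# Illustration (not part of the original script) of the identifier parsing
# above, assuming NCBI-style FASTA headers; the header below is hypothetical.
header = ">gi|158333233|gb|ABV21426.1| apolipoprotein B, partial [Homo sapiens]"
print header.split('|')[3]                 # 'ABV21426.1' (the accession)
print header.split(',')[0].split(' ')[-1]  # 'B' (last word before the first comma)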
def worker(params):
    locus, opts = params
    name, sequences = locus
    sate, cfg = opts
    # create a tempdir to hold all our stuff
    working = tempfile.mkdtemp()
    # write the locus sequences to a tempfile within the tempdir
    descriptor, path = tempfile.mkstemp(dir=working, suffix='.mpi.fasta')
    os.close(descriptor)
    tf = fasta.FastaWriter(path)
    for seq in sequences:
        tf.write(seq)
    tf.close()
    # run SATe
    cli = [
            'python', sate,
            '--input', path,
            '--output-directory', working,
            '--temporaries', working,
            cfg
        ]
    # communicate() returns (stdout, stderr)
    stdout, stderr = subprocess.Popen(
            cli,
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE
        ).communicate()
    # get contents of output file(s)
    aln_name = "satejob.marker001.{0}.aln".format(
            os.path.splitext(os.path.basename(path))[0]
        )
    aln_file = os.path.join(working, aln_name)
    aln = open(aln_file, 'rU').read()
    # zap working tempdir
    shutil.rmtree(working)
    # return name and alignment so we can store resulting alignments reasonably
    return (name, aln)
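# A minimal sketch (not from the original source) of driving worker() with a
# multiprocessing Pool; `loci`, `sate_script`, `sate_cfg`, and `cores` are
# assumed names.
from multiprocessing import Pool

def run_sate_in_parallel(loci, sate_script, sate_cfg, cores=4):
    # each locus is a (name, sequences) tuple, matching worker()'s unpacking
    params = [(locus, (sate_script, sate_cfg)) for locus in loci]
    pool = Pool(cores)
    results = pool.map(worker, params)
    pool.close()
    pool.join()
    # map locus name -> alignment text
    return dict(results)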
def create_locus_specific_fasta(sequences):
    fd, fasta_file = tempfile.mkstemp(suffix='.fasta')
    os.close(fd)
    fasta_writer = fasta.FastaWriter(fasta_file)
    for seq in sequences:
        fasta_writer.write(seq)
    fasta_writer.close()
    return fasta_file
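# Hypothetical usage note (not in the original): mkstemp() files are not
# auto-deleted, so the caller of create_locus_specific_fasta() owns the
# cleanup; `sequences` and align_locus() are placeholder names.
import os

fasta_file = create_locus_specific_fasta(sequences)
try:
    align_locus(fasta_file)
finally:
    os.remove(fasta_file)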
def test_fasta_write(self):
    """[fasta] fasta writing"""
    d = tempfile.mkdtemp()
    outf = fasta.FastaWriter(os.path.join(d, 'test_write.fasta'))
    for s in self.seq:
        outf.write(s)
    outf.close()
    old = self._read_raw_contents('test-data/sequence.fasta')
    new = self._read_raw_contents(os.path.join(d, 'test_write.fasta'))
    assert old == new
    shutil.rmtree(d)
def test_fasta_qual_write(self):
    """[fasta] fasta+qual writing"""
    d = tempfile.mkdtemp()
    f = os.path.join(d, 'test_write.fasta')
    q = os.path.join(d, 'test_write.qual')
    outf = fasta.FastaWriter(f, q)
    for s in self.seq:
        outf.write(s)
    outf.close()
    old_s = self._read_raw_contents('test-data/sequence.fasta')
    old_q = self._read_raw_contents('test-data/sequence.qual')
    new_s = self._read_raw_contents(f)
    new_q = self._read_raw_contents(q)
    assert old_s == new_s
    assert old_q == new_q
    shutil.rmtree(d)
def main():
    args = get_args()
    conf = ConfigParser.ConfigParser()
    conf.read(args.conf)
    all_files = get_all_files_from_conf(conf)
    for genome in all_files:
        name, twobit_name = genome
        out_file = os.path.join(args.output, name) + ".fasta"
        out = fasta.FastaWriter(out_file)
        tb = twobit.TwoBitFile(file(twobit_name))
        lz = os.path.join(args.lastz, name) + ".lastz"
        count = 0
        for row in lastz.Reader(lz, long_format=True):
            sequence = slice_and_return_fasta(tb, row, args.flank)
            out.write(sequence)
            count += 1
        print "\t{} sequences written to {}".format(count, out_file)
        out.close()
def main():
    args = get_args()
    conf = ConfigParser.ConfigParser()
    # preserve the case of option names read from the config file
    conf.optionxform = str
    conf.read(args.conf)
    all_files = get_all_files_from_conf(conf, args.pattern)
    for genome in all_files:
        short_name, long_name, twobit_name = genome
        if not args.exclude or (short_name not in args.exclude):
            out_file = os.path.join(args.output, short_name) + ".fasta"
            out = fasta.FastaWriter(out_file)
            tb = twobit.TwoBitFile(file(twobit_name))
            lz = os.path.join(args.lastz, long_name)
            count = 0
            for row in lastz.Reader(lz, long_format=True):
                sequence = slice_and_return_fasta(tb, row, args.flank)
                out.write(sequence)
                count += 1
            print "\t{} sequences written to {}".format(count, out_file)
            out.close()
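# slice_and_return_fasta() is not shown in these snippets; this is a sketch
# of what it plausibly does, assuming bx-python-style 2bit slicing and lastz
# rows exposing name1/zstart1/end1 (all field names here are assumptions).
def slice_and_return_fasta(tb, row, flank):
    start = max(row.zstart1 - flank, 0)
    end = row.end1 + flank  # real code should clamp this to the sequence length
    seq = fasta.FastaSequence()
    seq.identifier = ">slice|{0}|{1}-{2}".format(row.name1, start, end)
    seq.sequence = tb[row.name1][start:end]
    return seq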
def main():
    args = get_args()
    config = ConfigParser.RawConfigParser(allow_no_value=True)
    config.read(args.config)
    conn = sqlite3.connect(args.db)
    c = conn.cursor()
    if args.extend_db:
        query = "ATTACH DATABASE '{0}' AS extended".format(args.extend_db)
        c.execute(query)
    organisms = get_names_from_config(config, 'Organisms')
    uces = get_names_from_config(config, 'Loci')
    uce_fasta_out = fasta.FastaWriter(args.output)
    # match short runs (1-21 bp) of ambiguous bases
    regex = re.compile("[Nn]{1,21}")
    for organism in organisms:
        print "Getting {0} reads...".format(organism)
        written = []
        # going to need to do something more generic w/ suffixes
        name = organism.replace('_', '-')
        if args.notstrict:
            if not organism.endswith('*'):
                reads = find_file(args.contigs, name)
                node_dict, missing = get_nodes_for_uces(c, organism, uces,
                        extend=False, notstrict=True)
            elif args.extend_dir:
                # remove the asterisk
                name = name.rstrip('*')
                reads = find_file(args.extend_dir, name)
                node_dict, missing = get_nodes_for_uces(c, organism.rstrip('*'),
                        uces, extend=True, notstrict=True)
        else:
            if not name.endswith('*'):
                reads = find_file(args.contigs, name)
                node_dict, missing = get_nodes_for_uces(c, organism, uces)
            elif name.endswith('*') and args.extend_dir:
                # remove the asterisk
                name = name.rstrip('*')
                reads = find_file(args.extend_dir, name)
                node_dict, missing = get_nodes_for_uces(c, organism.rstrip('*'),
                        uces, extend=True)
        for read in fasta.FastaReader(reads):
            name = get_name(read.identifier).lower()
            coverage = get_coverage(read.identifier)
            if name in node_dict:
                uce_seq = fasta.FastaSequence()
                uce_seq.identifier = ">{0}_{1} |{0}|{2}".format(
                        node_dict[name][0], organism, coverage)
                # deal with strandedness because aligners don't, which
                # is annoying
                if node_dict[name][1] == '-':
                    uce_seq.sequence = transform.DNA_reverse_complement(read.sequence)
                else:
                    uce_seq.sequence = read.sequence
                # remove any short runs of ambiguous bases
                if regex.search(uce_seq.sequence):
                    uce_seq.sequence = re.sub(regex, "", uce_seq.sequence)
                    print "\tReplaced short runs of ambiguous bases in {0}".format(
                            uce_seq.identifier.split(' ')[0])
                uce_fasta_out.write(uce_seq)
                written.append(str(node_dict[name][0]))
        if args.notstrict and missing:
            args.notstrict.write("[{0}]\n".format(organism))
            for name in missing:
                args.notstrict.write("{0}\n".format(name))
                written.append(name)
        assert set(written) == set(uces), "UCE names do not match"
    uce_fasta_out.close()
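# Toy demonstration (not in the original) of the ambiguity-stripping regex
# above: short runs of Ns are deleted outright, joining the flanking bases.
import re

regex = re.compile("[Nn]{1,21}")
print re.sub(regex, "", "ACGTNNNNNACGT")  # prints 'ACGTACGT'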
def main():
    args = get_args()
    # compile some regular expressions we'll use later
    stripnum = re.compile("s_[0-9]+$")
    manyn = re.compile("[Nn]{20,}")
    # get names of loci and taxa
    uces = get_uce_names_from_probes(args.probes)
    taxa = get_taxa_names_from_fastas(args.fasta)
    print "\n"
    if not args.extend:
        if args.db is None:
            db = os.path.join(args.output, 'probe.matches.sqlite')
        else:
            db = args.db
        # create db to hold results
        conn, c = create_probe_database(db, taxa, uces, True)
    else:
        conn, c = extend_probe_database(args.db, taxa)
    # get duplicate probe sequences for filtering
    if args.dupefile:
        print "Determining duplicate probes..."
        dupes = get_dupes(args.dupefile, longfile=False)
    else:
        dupes = None
    # iterate over LASTZ files for each taxon
    for lz in glob.glob(os.path.join(args.lastz, '*')):
        # get fasta name from lastz file
        ff = get_fasta_name_from_lastz_pth(lz, args.fasta, args.pattern)
        # get taxon name from lastz file
        taxon = get_taxon_from_filename(ff)
        print "\n{0}\n{1}\n{0}".format('=' * 30, taxon)
        # get lastz matches
        print "\tGetting LASTZ matches from GENOME alignments..."
        matches, probes = get_matches(lz)
        # remove bad loci (dupes)
        print "\tGetting bad (potentially duplicate) GENOME matches..."
        loci_to_skip = []
        for k, v in matches.iteritems():
            # check matches to make sure all is well - keep names lc
            loci_to_skip.extend(quality_control_matches(matches, probes, dupes, k, v, False))
        # convert to set, to keep only uniques
        loci_to_skip = set(loci_to_skip)
        print "\tSkipping {} bad (duplicate hit) loci...".format(len(loci_to_skip))
        # get (and possibly assemble) non-skipped
        seqdict = defaultdict(list)
        # determine those contigs to skip and group those to assemble
        for contig in fasta.FastaReader(ff):
            # make sure all names are lowercase
            contig.identifier = contig.identifier.lower()
            name = contig.identifier.split('|')[-4].strip()
            locus = name.split('_')[0]
            # skip what we identified as bad loci
            if locus not in loci_to_skip:
                seqdict[locus].append(contig)
        output_name = "{}.fasta".format(taxon.replace('_', '-'))
        fout_name = os.path.join(args.output, output_name)
        print "\tOutput filename is {}".format(output_name)
        fout = fasta.FastaWriter(fout_name)
        # this tracks "fake" contig number
        count = 0
        # this tracks loci kept
        kept = 0
        # when > 1 contig, assemble contigs across matches
        sys.stdout.write("\tWriting and Aligning/Assembling UCE loci with multiple probes (dot/1000 loci)")
        for k, v in seqdict.iteritems():
            bad = False
            contig_names = []
            if count % 1000 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()
            if len(v) == 1:
                # trim ambiguous bases on flanks
                record = v[0]
                orient = [matches[k][0][1]]
                if args.flank:
                    record = trim_uce_reads(record, args.flank)
                contig_names.append(record.identifier)
                record.sequence = record.sequence.strip('N')
                # trim many ambiguous bases within contig
                result = manyn.search(record.sequence)
                if result:
                    uce_start, uce_end = get_probe_positions(record)
                    uce = record.sequence[uce_start:uce_end]
                    record.sequence = snip_if_many_N_bases(manyn, k, record.sequence, uce, verbose=False)
                # change header
                record.identifier = ">Node_{0}_length_{1}_cov_1000".format(
                        count,
                        len(record.sequence)
                    )
                fout.write(record)
            else:
                orient = list(set([m[1] for m in matches[k]]))
                # skip any loci having matches of mixed orientation
                # ['+', '-']
                if len(orient) == 1:
                    # create tempfile for the reads
                    fd, temp = tempfile.mkstemp(suffix='.fasta')
                    os.close(fd)
                    temp_out = fasta.FastaWriter(temp)
                    # write all slices to outfile, trimming if we want
                    for record in v:
                        if args.flank:
                            record = trim_uce_reads(record, args.flank)
                        # keep names of contigs we assembled to store in db assoc
                        # w/ resulting assembled contig name
                        contig_names.append(record.identifier)
                        record.sequence = record.sequence.strip('N')
                        # trim many ambiguous bases within contig
                        result = manyn.search(record.sequence)
                        if result:
                            uce_start, uce_end = get_probe_positions(record)
                            uce = record.sequence[uce_start:uce_end]
                            record.sequence = snip_if_many_N_bases(manyn, k, record.sequence, uce, verbose=False)
                        temp_out.write(record)
                    # make sure to close the file
                    temp_out.close()
                    # assemble
                    aln = Align(temp)
                    aln.run_alignment()
                    record = fasta.FastaSequence()
                    record.sequence = aln.alignment_consensus.tostring()
                    record.identifier = ">Node_{0}_length_{1}_cov_1000".format(
                            count,
                            len(record.sequence)
                        )
                    fout.write(record)
                else:
                    bad = True
            if not bad:
                # track contig assembly and renaming data in db
                q = "UPDATE matches SET {0} = 1 WHERE uce = '{1}'".format(taxon, k)
                c.execute(q)
                # generate db match and match map tables for data
                orient_key = "node_{0}({1})".format(count, orient[0])
                q = "UPDATE match_map SET {0} = '{1}' WHERE uce = '{2}'".format(taxon, orient_key, k)
                c.execute(q)
                # keep track of new name :: old name mapping
                for old_name in contig_names:
                    q = "INSERT INTO contig_map VALUES ('{0}', '{1}', '{2}', '{3}')".format(taxon, k, old_name, record.identifier)
                    c.execute(q)
                kept += 1
            # tracking "fake" contig number
            count += 1
        conn.commit()
        print "\n\t{0} loci of {1} matched ({2:.0f}%), {3} dupes dropped ({4:.0f}%), {5} ({6:.0f}%) kept".format(
                count,
                len(uces),
                float(count) / len(uces) * 100,
                len(loci_to_skip),
                float(len(loci_to_skip)) / len(uces) * 100,
                kept,
                float(kept) / len(uces) * 100
            )
    c.close()
    conn.close()
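# Side note (not from the original source): the column name (taxon) must be
# string-interpolated, but the *values* in the statements above can go through
# sqlite3 placeholders, which avoids quoting problems; an equivalent,
# quoting-safe form of the INSERT inside the contig_names loop:
q = "INSERT INTO contig_map VALUES (?, ?, ?, ?)"
c.execute(q, (taxon, k, old_name, record.identifier))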