def main(): args = get_args() pth = os.path.join(args.fastas, "*.fasta") outf = FastaWriter(args.outfile) conn = sqlite3.connect(args.db) cur = conn.cursor() counter = 0 for infile in glob.glob(pth): sp = os.path.basename(infile).split('.')[0].replace('-','_') species = sp.replace('_',' ').capitalize() print "Working on {}".format(species) partial = species.split(' ')[0].lower()[:3] for read in FastaReader(infile): # check for header match, if match get locus name for header nn = read.identifier.split("_")[:2] nn = "{}_{}".format(nn[0].strip('>').lower(), nn[1].lower()) query = "SELECT uce FROM match_map WHERE {0} = '{1}(+)' OR {0} = '{1}(-)'".format(sp, nn) cur.execute(query) result = cur.fetchall() #pdb.set_trace() if result: assert len(result) == 1, "More than 1 result" #pdb.set_trace() if args.fish: uce = result[0][0].split('_')[0] else: uce = result[0][0] read.identifier = """{3}{2} [organism={0}] [molecule=DNA] [moltype=genomic] [location=genomic] [note=ultra conserved element locus {1}] {0} ultra-conserved element locus {1}.""".format(species, uce, partial, counter) # write all to a common fasta outf.write(read) # if not match, pass counter += 1 else: pass outf.close()
def parse_fasta_and_write_new_file(results, contigs, output):
    """For each taxon, copy the contigs named in results[taxon] to a new fasta.

    results maps taxon -> container of normalized contig names; contigs is the
    directory holding "<taxon>.contigs.fasta" files (with '-' instead of '_');
    output is the directory where per-taxon fasta files are written.
    """
    for taxon, rows in results.iteritems():
        writer = FastaWriter(os.path.join(output, "{}.fasta".format(taxon)))
        contig_name = "{}.contigs.fasta".format(taxon.replace('_', '-'))
        for record in FastaReader(os.path.join(contigs, contig_name)):
            # key on the first two underscore fields of the header, e.g. "node_1234"
            key = '_'.join(record.identifier.lstrip('>').split('_')[:2]).lower()
            if key in rows:
                writer.write(record)
        writer.close()
def parse_fasta_and_write_new_file(results, contigs, output):
    """Write one fasta per taxon containing only the contigs listed for it.

    results maps taxon -> contig names to keep; input files are expected at
    contigs/"<taxon-with-dashes>.contigs.fasta"; outputs land in output/.
    """
    for taxon, rows in results.iteritems():
        out_path = os.path.join(output, "{}.fasta".format(taxon))
        in_path = os.path.join(
            contigs, "{}.contigs.fasta".format(taxon.replace('_', '-'))
        )
        outp = FastaWriter(out_path)
        for fasta in FastaReader(in_path):
            # headers look like ">node_1234_...": match on the first two fields
            header_fields = fasta.identifier.lstrip('>').split('_')[:2]
            if '_'.join(header_fields).lower() in rows:
                outp.write(fasta)
        outp.close()
def main(): args = get_args() conf = ConfigParser.ConfigParser(allow_no_value=True) conf.read(args.conf) # get metadata from conf file taxon_excludes = get_excludes(conf, "exclude taxa") locus_excludes = get_excludes(conf, "exclude loci") metadata = get_metadata(conf) vouchers = get_vouchers(conf) #pdb.set_trace() remap = get_remaps(conf) # get fasta and db locations pth = os.path.join(args.fastas, "*.fasta") outf = FastaWriter(args.outfile) conn = sqlite3.connect(args.db) cur = conn.cursor() counter = args.start_value # iterate over fasta files for infile in glob.glob(pth): sp, species, partial, oldname = get_species_name(infile, remap) if species.lower() not in taxon_excludes: print "Working on {}".format(species) for read in FastaReader(infile): nodename = get_node_name(read) query = "SELECT uce FROM match_map WHERE {0} = '{1}(+)' OR {0} = '{1}(-)'".format(oldname, nodename) cur.execute(query) result = cur.fetchall() if result: # ensure we get only 1 result assert len(result) == 1, "More than 1 result" # if getting fish data TODO: deprecate if args.fish: uce = result[0][0].split('_')[0] else: uce = result[0][0] if uce not in locus_excludes: read.identifier = get_new_identifier(species, uce, partial, counter, metadata, vouchers) #read.identifier = """{3}{2} [organism={0}] [molecule=DNA] [moltype=genomic] [location=genomic] [note=ultra conserved element locus {1}] {0} ultra-conserved element locus {1}.""".format(species, uce, partial, counter) # write all to a common fasta outf.write(read) # if not match, pass counter += 1 else: pass else: print "Skipping {0}".format(species) outf.close()
def write_sequences(record, header, output, sample_map, count):
    """Append one record to its cluster's fasta/qual pair under output/.

    Resolves the cluster's display name (remapped via sample_map when given),
    tags the record identifier with it, and appends the record to the
    per-cluster fasta/qual files. Returns (updated count, header).
    """
    # resolve the name for this cluster
    if sample_map is None:
        header.name = header.cluster
    else:
        header.name = sample_map[header.cluster.lower()]
    record.identifier += " name={}".format(header.name)
    # make sure the cluster-specific output directory exists
    outdir = os.path.join(output, header.name)
    mkdir_p(outdir)
    fasta_pth = os.path.join(outdir, "{}.fasta".format(header.name))
    qual_pth = os.path.join(outdir, "{}.qual".format(header.name))
    writer = FastaWriter(fasta_pth, qual_pth, mode="a")
    # emit a progress dot every 1000 records
    if count != 0 and count % 1000 == 0:
        sys.stdout.write(".")
        sys.stdout.flush()
    writer.write(record)
    writer.close()
    return count + 1, header
def main(): args = get_args() pth = os.path.join(args.fastas, "*.fasta") outf = FastaWriter(args.outfile) conn = sqlite3.connect(args.db) cur = conn.cursor() counter = 0 for infile in glob.glob(pth): sp = os.path.basename(infile).split('.')[0].replace('-', '_') species = sp.replace('_', ' ').capitalize() print "Working on {}".format(species) partial = species.split(' ')[0].lower()[:3] for read in FastaReader(infile): # check for header match, if match get locus name for header nn = read.identifier.split("_")[:2] nn = "{}_{}".format(nn[0].strip('>').lower(), nn[1].lower()) query = "SELECT uce FROM match_map WHERE {0} = '{1}(+)' OR {0} = '{1}(-)'".format( sp, nn) cur.execute(query) result = cur.fetchall() #pdb.set_trace() if result: assert len(result) == 1, "More than 1 result" #pdb.set_trace() if args.fish: uce = result[0][0].split('_')[0] else: uce = result[0][0] read.identifier = """{3}{2} [organism={0}] [molecule=DNA] [moltype=genomic] [location=genomic] [note=ultra conserved element locus {1}] {0} ultra-conserved element locus {1}.""".format( species, uce, partial, counter) # write all to a common fasta outf.write(read) # if not match, pass counter += 1 else: pass outf.close()
def main():
    """Main loop"""
    start_time = time.time()
    motd()
    args = get_args()
    print 'Started: ', time.strftime("%a %b %d, %Y %H:%M:%S", time.localtime(start_time))
    # build our configuration object w/ input params
    conf = ConfigParser.ConfigParser()
    conf.read(args.config)
    params = Parameters(conf)
    # create the db and tables, returning connection
    # and cursor
    conn, cur = db.create_db_and_new_tables(params.db)
    # get num reads and split up work
    num_reads, work = get_work(params)
    # setup monolithic output files
    outf = FastaWriter(params.output_fasta, params.output_qual)
    # MULTICORE
    if params.multiprocessing and params.num_procs > 1:
        jobs = Queue()
        results = JoinableQueue()
        # We're stacking groups of jobs on the work
        # Queue, conceivably to save the overhead of
        # placing them on there one-by-one.
        for unit in work:
            jobs.put(unit)
        # setup the processes for the jobs
        sys.stdout.write("Starting {} workers\n".format(params.num_procs))
        sys.stdout.flush()
        sys.stdout.write('Running')
        # start the worker processes
        [Process(target = multiproc, args=(jobs, results, params)).start()
            for i in xrange(params.num_procs)]
        # we're putting single results on the results Queue so
        # that the db can (in theory) consume them at
        # a rather consistent rate rather than in spurts
        #for unit in xrange(num_reads):
        for unit in xrange(num_reads):
            tagged = results.get()
            results.task_done()
            db.insert_record_to_db(cur, tagged)
            # annotate the read's identifier with its cluster assignment
            # before writing it to the monolithic output
            if tagged.cluster:
                tagged.read.identifier += " cluster={0} outer={1} inner={2}".format(
                        tagged.cluster,
                        tagged.outer_type,
                        tagged.inner_type
                    )
            outf.write(tagged.read)
        # make sure we put None at end of Queue
        # in an amount equiv. to num_procs
        # (None is the sentinel each worker reads to shut down)
        for unit in xrange(params.num_procs):
            jobs.put(None)
        # join the results, so that they can finish
        results.join()
        # close up our queues
        jobs.close()
        results.close()
    # SINGLECORE
    else:
        # fake a multiprocessing queue, so stacking and accessing results
        # is identical.
        results = ListQueue()
        singleproc(work, results, params)
        for tagged in results:
            db.insert_record_to_db(cur, tagged)
            # same cluster annotation as the multicore path
            if tagged.cluster:
                tagged.read.identifier += " cluster={0} outer={1} inner={2}".format(
                        tagged.cluster,
                        tagged.outer_type,
                        tagged.inner_type
                    )
            outf.write(tagged.read)
    # flush db work and release all handles before reporting run time
    conn.commit()
    cur.close()
    conn.close()
    outf.close()
    end_time = time.time()
    pretty_end_time = time.strftime("%a %b %d, %Y %H:%M:%S", time.localtime(end_time))
    print "\nEnded: {} (run time {} minutes)".format(pretty_end_time,
        round((end_time - start_time)/60, 3))