def main():
    """Parse one or more GenBank files, logging and tolerating parser failures.

    NOTE(review): this function arrived whitespace-mangled (collapsed onto one
    physical line); the indentation below is reconstructed from syntax only —
    no tokens were changed.  It also appears TRUNCATED: the while-loop has no
    success path after the try/except, so ``next_record``, ``seen``, ``alpha``,
    ``observed_records`` and the three output handles are never used in the
    visible code.  Presumably the record-processing body was lost — confirm
    against the original script before relying on this.

    Python 2 syntax throughout (``except X, e``, ``records.next()``).
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Command-line options: comma-separated GenBank inputs, output directory,
    # a run tag used to name output files, a file of already-seen accessions,
    # and a cap on tolerated parse failures per input file.
    input_gbs = opts.input_gbs.split(',')
    output_dir = opts.output_dir
    verbose = opts.verbose
    tag = opts.tag
    existing_fp = opts.existing
    max_failures = opts.max_failures

    # NOTE(review): makedirs raises OSError if output_dir already exists —
    # presumably intentional (refuse to clobber a prior run); confirm.
    makedirs(output_dir)
    logger = WorkflowLogger(generate_log_fp(output_dir), script_name=argv[0])

    # Accessions observed in previous runs (single-column file).
    observed_records = parse_column(open(existing_fp))

    # Output paths carry the run tag; .gz suffixes suggest gzip output was
    # intended, but the files are opened with plain open() below — TODO confirm.
    sequences_fp = os.path.join(output_dir, '%s_sequences.fasta.gz' % tag)
    gg_records_fp = os.path.join(output_dir, '%s_ggrecords.txt.gz' % tag)
    obs_records_fp = os.path.join(output_dir, '%s_obsrecords.txt.gz' % tag)

    # NOTE(review): handles are opened but never written or closed in the
    # visible (truncated) portion of this function.
    sequences = open(sequences_fp,'w')
    gg_records = open(gg_records_fp, 'w')
    obs_records = open(obs_records_fp, 'w')

    seen = set([])  # unused in the visible portion — likely fed by the lost body
    for gb_fp in input_gbs:
        logline = log_f("Start parsing of %s..." % gb_fp)
        logger.write(logline)
        if verbose:
            stdout.write(logline)

        records = MinimalGenbankParser(open(gb_fp))
        failure_count = 0
        # Permitted nucleotide alphabet (IUPAC codes, both cases) — unused in
        # the visible portion; presumably used for sequence validation in the
        # lost remainder of the loop body.
        alpha = set(['A','T','G','C', 'a','t','g','c', 'N','n',
                     'R','Y','S','M', 'r','y','s','m', 'K','k','W','w',
                     'V','v','H','h','B','b','D','d'])

        # NOTE(review): "True and" is redundant; the loop exits via
        # StopIteration or by exceeding max_failures.
        while True and (failure_count < max_failures):
            # gracefully handle parser errors to a limit
            try:
                next_record = records.next()
            except PartialRecordError, e:
                # Incomplete record: count it and move on.
                failure_count += 1
                continue
            except StopIteration:
                # Normal end of file.
                break
            except Exception, e:
                # NOTE(review): 'accession' is never assigned in the visible
                # code — it was presumably set from each successfully parsed
                # record in the lost success path; as written this handler
                # would raise NameError on the first failure. TODO confirm.
                logline = log_f("Caught: %s, previous accession: %s" % (e, accession))
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                failure_count += 1
def main():
    """Merge aligned/unaligned FASTA sequences into Greengenes records,
    assigning new gg_ids, then write records plus a gg_id->accession map.

    NOTE(review): this function arrived whitespace-mangled (split across two
    physical lines mid-statement); the indentation below is reconstructed from
    syntax only — no tokens were changed.  Several defects are visible and are
    flagged inline; chunks of the original (notably a loop over
    ``opts.aligned``) appear to have been LOST in the mangling.

    Python 2 syntax throughout (``filter`` returning a list, %-formatting).
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Inputs: existing GG records, output directory, run tag, previously
    # observed accessions, the first gg_id to hand out, and the invariant
    # (conserved-position) template used for scoring alignments.
    gg_records_fp = opts.gg_records
    output_dir = opts.output_dir
    verbose = opts.verbose
    existing_fp = opts.existing
    tag = opts.tag
    gg_id = opts.starting_gg_id
    invariants = parse_invariants(open(opts.invariants))

    makedirs(output_dir)
    logger = WorkflowLogger(generate_log_fp(output_dir), script_name=argv[0])

    # gg records are not going out as gzip as python's gzip is slow relative
    # to native linux gzip and doesn't compress as well out the door (latter
    # probably fixable)
    output_gg_fp = os.path.join(output_dir, "%s.records.txt" % tag)
    output_map_fp = os.path.join(output_dir, "%s.mapping.txt.gz" % tag)
    output_gg_noggid_fp = os.path.join(output_dir, "%s.records.noggid.txt" \
            % tag)

    existing_records = parse_column(open(existing_fp))

    # NOTE(review): this accession->record dict is commented out, yet
    # 'records' is used extensively below — it almost certainly must be
    # reinstated for this function to run. TODO confirm.
    #records = dict([(r['ncbi_acc_w_ver'], r) \
    #        for r in MinimalGreengenesParser(open(gg_records_fp))])

    # Validation pass: every GG record should have both an aligned and an
    # unaligned sequence available in the indexed inputs.
    for record in MinimalGreengenesParser(open(gg_records_fp)):
        acc = record['ncbi_acc_w_ver']
        ### NEED DOMAIN!
        # NOTE(review): 'aligned', 'unaligned' and get_indexed_sequence are
        # not defined anywhere in the visible code — presumably module-level
        # sequence indexes; verify against the full script.
        aln = filter(None, [get_indexed_sequence(i, acc) for i in aligned])
        noaln = filter(None, [get_indexed_sequence(i, acc) for i in unaligned])
        if not aln:
            logline = log_f("GG record %s does not have aligned seq!" % acc)
            logger.write(logline)
            if verbose:
                stdout.write(logline)
            continue
        if not unaln:
            # NOTE(review): 'unaln' is undefined — presumably should be
            # 'noaln'; the message also says "aligned" where "unaligned" is
            # clearly meant. TODO confirm and fix.
            logline = log_f("GG record %s does not have aligned seq!" % acc)
            logger.write(logline)
            if verbose:
                stdout.write(logline)
            continue
        # if > 1 rec, complain

    # NOTE(review): 'f' is not defined at this point — an enclosing
    # "for f in opts.aligned.split(',')" loop (parallel to the unaligned loop
    # further down, which would also set 'domain') appears to have been lost
    # in the mangling. TODO recover from the original script.
    for aln_id, aln_seq in MinimalFastaParser(open(f)):
        id_ = aln_id.split()[0] # strip of any comments
        record = records.get(id_, None)
        if record is None:
            logline = log_f("Aligned seq %s does not have a GG record" % id_)
            logger.write(logline)
            if verbose:
                stdout.write(logline)
            continue
        if id_ in existing_records:
            # Already assigned a gg_id in a previous release — skip.
            logline = log_f("%s has previously been observed!" % id_)
            logger.write(logline)
            if verbose:
                stdout.write(logline)
            continue
        if record['gg_id'] is not None:
            # Duplicate sequence for the same record — first one wins.
            logline = log_f("%s already has gg_id %d!" %\
                    (id_,record['gg_id']))
            logger.write(logline)
            if verbose:
                stdout.write(logline)
            continue

        # Assign the next gg_id; non-eukaryotes also get a prokMSA_id.
        record['gg_id'] = gg_id
        if domain != 'eukarya':
            record['prokMSA_id'] = gg_id
        gg_id += 1

        # NOTE(review): 'seq' is undefined in this loop — presumably should
        # be 'aln_seq' throughout the next six statements. TODO confirm.
        inv_score = calc_invariant(seq, invariants)
        non_ACGT = calc_nonACGT(seq)
        record['perc_ident_to_invariant_core'] = inv_score
        record['non_ACGT_percent'] = non_ACGT
        record['aligned_seq'] = seq
        record['n_pos_aligned'] = len(seq) - seq.count('-')

    # Attach unaligned sequences to the records populated above.
    for f in opts.unaligned.split(','):
        logline = log_f("Parsing %s..." % f)
        logger.write(logline)
        if verbose:
            stdout.write(logline)

        domain = get_domain(f)

        for unaln_id, unaln_seq in MinimalFastaParser(open(f)):
            id_ = unaln_id.split()[0] # strip off any comments
            record = records.get(id_, None)
            if record is None:
                logline = log_f("Unaligned seq %s does not have a GG record" %\
                        id_)
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                continue
            # a gg_id should be assigned while trolling the alignment seqs
            if record['gg_id'] is None:
                logline = log_f("%s should have a gg_id by now!" % (id_))
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                continue
            # NOTE(review): 'seq' undefined here too — presumably 'unaln_seq'.
            record['unaligned_seq'] = seq
            record['n_pos_unaligned'] = len(seq)

    logline = log_f("Beginning output...")
    logger.write(logline)
    if verbose:
        stdout.write(logline)

    output_map = open(output_map_fp, 'w')
    output_gg = open(output_gg_fp, 'w')
    output_gg_noggid = open(output_gg_noggid_fp, 'w')
    # NOTE(review): output_gg_broken_fp is never defined in the visible code —
    # NameError as written; its path construction was presumably lost along
    # with the aligned-files loop. TODO confirm.
    output_gg_broken = open(output_gg_broken_fp, 'w')

    # Route each record: no gg_id -> noggid file; fails sanityCheck -> broken
    # file; otherwise the main records file plus the gg_id/accession map.
    for record in records.items():
        # NOTE(review): .items() yields (key, record) tuples, so
        # record['gg_id'] would raise TypeError — .values() was almost
        # certainly intended. TODO confirm.
        if record['gg_id'] is None:
            write_gg_record(output_gg_noggid, record)
        else:
            try:
                # NOTE(review): bare except below swallows all errors from
                # sanityCheck — broken records are diverted, never reported.
                record.sanityCheck()
            except:
                write_gg_record(output_gg_broken, record)
            else:
                write_gg_record(output_gg, record)
                output_map.write("%s\t%s\n" % (record['gg_id'],
                                               record['ncbi_acc_w_ver']))
    # NOTE(review): only output_gg is closed; output_map, output_gg_noggid and
    # output_gg_broken are left to interpreter exit.
    output_gg.close()
def main():
    """Merge aligned/unaligned FASTA sequences into Greengenes records,
    assigning new gg_ids, then write records plus a gg_id->accession map.

    NOTE(review): this is a near byte-for-byte DUPLICATE of the preceding
    ``def main():`` in this file (the file contains several redefinitions of
    ``main``; at import time the last definition wins).  It arrived
    whitespace-mangled; indentation below is reconstructed from syntax only —
    no tokens were changed.  Visible defects are flagged inline.

    Python 2 syntax throughout (``filter`` returning a list, %-formatting).
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # Inputs: existing GG records, output directory, run tag, previously
    # observed accessions, the first gg_id to hand out, and the invariant
    # (conserved-position) template used for scoring alignments.
    gg_records_fp = opts.gg_records
    output_dir = opts.output_dir
    verbose = opts.verbose
    existing_fp = opts.existing
    tag = opts.tag
    gg_id = opts.starting_gg_id
    invariants = parse_invariants(open(opts.invariants))

    makedirs(output_dir)
    logger = WorkflowLogger(generate_log_fp(output_dir), script_name=argv[0])

    # gg records are not going out as gzip as python's gzip is slow relative
    # to native linux gzip and doesn't compress as well out the door (latter
    # probably fixable)
    output_gg_fp = os.path.join(output_dir, "%s.records.txt" % tag)
    output_map_fp = os.path.join(output_dir, "%s.mapping.txt.gz" % tag)
    output_gg_noggid_fp = os.path.join(output_dir, "%s.records.noggid.txt" \
            % tag)

    existing_records = parse_column(open(existing_fp))

    # NOTE(review): this accession->record dict is commented out, yet
    # 'records' is used extensively below — it almost certainly must be
    # reinstated for this function to run. TODO confirm.
    #records = dict([(r['ncbi_acc_w_ver'], r) \
    #        for r in MinimalGreengenesParser(open(gg_records_fp))])

    # Validation pass: every GG record should have both an aligned and an
    # unaligned sequence available in the indexed inputs.
    for record in MinimalGreengenesParser(open(gg_records_fp)):
        acc = record['ncbi_acc_w_ver']
        ### NEED DOMAIN!
        # NOTE(review): 'aligned', 'unaligned' and get_indexed_sequence are
        # not defined anywhere in the visible code — presumably module-level
        # sequence indexes; verify against the full script.
        aln = filter(None, [get_indexed_sequence(i, acc) for i in aligned])
        noaln = filter(None, [get_indexed_sequence(i, acc) for i in unaligned])
        if not aln:
            logline = log_f("GG record %s does not have aligned seq!" % acc)
            logger.write(logline)
            if verbose:
                stdout.write(logline)
            continue
        if not unaln:
            # NOTE(review): 'unaln' is undefined — presumably should be
            # 'noaln'; the message also says "aligned" where "unaligned" is
            # clearly meant. TODO confirm and fix.
            logline = log_f("GG record %s does not have aligned seq!" % acc)
            logger.write(logline)
            if verbose:
                stdout.write(logline)
            continue
        # if > 1 rec, complain

    # NOTE(review): 'f' is not defined at this point — an enclosing
    # "for f in opts.aligned.split(',')" loop (parallel to the unaligned loop
    # further down, which would also set 'domain') appears to have been lost
    # in the mangling. TODO recover from the original script.
    for aln_id, aln_seq in MinimalFastaParser(open(f)):
        id_ = aln_id.split()[0] # strip of any comments
        record = records.get(id_, None)
        if record is None:
            logline = log_f("Aligned seq %s does not have a GG record" % id_)
            logger.write(logline)
            if verbose:
                stdout.write(logline)
            continue
        if id_ in existing_records:
            # Already assigned a gg_id in a previous release — skip.
            logline = log_f("%s has previously been observed!" % id_)
            logger.write(logline)
            if verbose:
                stdout.write(logline)
            continue
        if record['gg_id'] is not None:
            # Duplicate sequence for the same record — first one wins.
            logline = log_f("%s already has gg_id %d!" %\
                    (id_,record['gg_id']))
            logger.write(logline)
            if verbose:
                stdout.write(logline)
            continue

        # Assign the next gg_id; non-eukaryotes also get a prokMSA_id.
        record['gg_id'] = gg_id
        if domain != 'eukarya':
            record['prokMSA_id'] = gg_id
        gg_id += 1

        # NOTE(review): 'seq' is undefined in this loop — presumably should
        # be 'aln_seq' throughout the next six statements. TODO confirm.
        inv_score = calc_invariant(seq, invariants)
        non_ACGT = calc_nonACGT(seq)
        record['perc_ident_to_invariant_core'] = inv_score
        record['non_ACGT_percent'] = non_ACGT
        record['aligned_seq'] = seq
        record['n_pos_aligned'] = len(seq) - seq.count('-')

    # Attach unaligned sequences to the records populated above.
    for f in opts.unaligned.split(','):
        logline = log_f("Parsing %s..." % f)
        logger.write(logline)
        if verbose:
            stdout.write(logline)

        domain = get_domain(f)

        for unaln_id, unaln_seq in MinimalFastaParser(open(f)):
            id_ = unaln_id.split()[0] # strip off any comments
            record = records.get(id_, None)
            if record is None:
                logline = log_f("Unaligned seq %s does not have a GG record" %\
                        id_)
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                continue
            # a gg_id should be assigned while trolling the alignment seqs
            if record['gg_id'] is None:
                logline = log_f("%s should have a gg_id by now!" % (id_))
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                continue
            # NOTE(review): 'seq' undefined here too — presumably 'unaln_seq'.
            record['unaligned_seq'] = seq
            record['n_pos_unaligned'] = len(seq)

    logline = log_f("Beginning output...")
    logger.write(logline)
    if verbose:
        stdout.write(logline)

    output_map = open(output_map_fp,'w')
    output_gg = open(output_gg_fp,'w')
    output_gg_noggid = open(output_gg_noggid_fp, 'w')
    # NOTE(review): output_gg_broken_fp is never defined in the visible code —
    # NameError as written; its path construction was presumably lost along
    # with the aligned-files loop. TODO confirm.
    output_gg_broken = open(output_gg_broken_fp, 'w')

    # Route each record: no gg_id -> noggid file; fails sanityCheck -> broken
    # file; otherwise the main records file plus the gg_id/accession map.
    for record in records.items():
        # NOTE(review): .items() yields (key, record) tuples, so
        # record['gg_id'] would raise TypeError — .values() was almost
        # certainly intended. TODO confirm.
        if record['gg_id'] is None:
            write_gg_record(output_gg_noggid, record)
        else:
            try:
                # NOTE(review): bare except below swallows all errors from
                # sanityCheck — broken records are diverted, never reported.
                record.sanityCheck()
            except:
                write_gg_record(output_gg_broken, record)
            else:
                write_gg_record(output_gg, record)
                output_map.write("%s\t%s\n" % (record['gg_id'],
                                               record['ncbi_acc_w_ver']))
    # NOTE(review): only output_gg is closed; output_map, output_gg_noggid and
    # output_gg_broken are left to interpreter exit.
    output_gg.close()