def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    if opts.counts_dir is None:
        raise RuntimeError('No data available')

    if os.path.exists(opts.save_path) and not opts.force_overwrite:
        raise RuntimeError(
            'Output BED file exists and overwriting not enabled. Exiting.\n')

    output_bed_file = open(opts.save_path, 'w')
    for chrom_file in os.listdir(opts.counts_dir):
        if chrom_file.endswith('.txt.gz'):
            # lstrip/rstrip strip *character sets*, not affixes, so slice the
            # 'chr' prefix and '.txt.gz' suffix off explicitly instead
            chrom_number = chrom_file[len('chr'):-len('.txt.gz')]
            print 'Chromosome file name: %s, chromosome number: %s\n' % (
                chrom_file, chrom_number)
            read_file = os.path.join(opts.counts_dir, chrom_file)
            # NOTE: chrom_number is passed as both the second and the last
            # argument, as in the original; verify against the
            # make_bed_entries signature
            make_bed_entries(read_file, chrom_number, opts.feature_name,
                             output_bed_file, opts.max_read_length,
                             opts.count_max_length, chrom_number)
        else:
            print "\nSkipping file: %s as not a data file.\n" % chrom_file
    output_bed_file.close()
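# Why the slicing fix above: str.lstrip/str.rstrip remove any leading/trailing
# run of the given *characters*, not a literal prefix/suffix, so the original
# lstrip('chr')/rstrip('.txt.gz') silently mangles some chromosome names.
# A quick illustration (pure stdlib behavior):
assert 'chrhs37d5.txt.gz'.lstrip('chr') == 's37d5.txt.gz'  # leading 'h' eaten too
assert 'chr20.txt.gz'[len('chr'):-len('.txt.gz')] == '20'  # slicing is exact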
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    taxlookup = load_consensus_map(open(opts.ref_taxonomy_map), False)
    cs_results = parse_cs_chimeras(open(opts.input_cs))
    b3_results = parse_b3_chimeras(open(opts.input_bellerophon))

    output = open(opts.output, 'w')
    output.write("#accession\treason\tnote\tnote\n")

    # sequences flagged by both tools need no taxonomy check
    overlap = get_overlap(b3_results, cs_results)
    for id_ in overlap:
        output.write("%s\tFound by both Bellerophon and ChimeraSlayer\n" % id_)

    # Bellerophon results carry a score; ChimeraSlayer results do not
    for id_, score, parent_a, parent_b in b3_results:
        if id_ in overlap:
            continue
        if determine_taxon_conflict(taxlookup, parent_a, parent_b):
            o = [id_, "Class conflict found by Bellerophon"]
            o.append("%s: %s" % (parent_a, '; '.join(taxlookup[parent_a])))
            o.append("%s: %s" % (parent_b, '; '.join(taxlookup[parent_b])))
            output.write('\t'.join(o))
            output.write('\n')

    for id_, parent_a, parent_b in cs_results:
        if id_ in overlap:
            continue
        if determine_taxon_conflict(taxlookup, parent_a, parent_b):
            o = [id_, "Class conflict found by ChimeraSlayer"]
            o.append("%s: %s" % (parent_a, '; '.join(taxlookup[parent_a])))
            o.append("%s: %s" % (parent_b, '; '.join(taxlookup[parent_b])))
            output.write('\t'.join(o))
            output.write('\n')

    output.close()
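# A minimal sketch of the taxonomy-conflict test used above, assuming
# taxlookup maps a parent accession to its lineage as an ordered list
# (domain, phylum, class, ...); rank index 2 matches the "Class conflict"
# wording in the output. The real determine_taxon_conflict may differ.
def determine_taxon_conflict_sketch(taxlookup, parent_a, parent_b, rank=2):
    lineage_a = taxlookup.get(parent_a)
    lineage_b = taxlookup.get(parent_b)
    if lineage_a is None or lineage_b is None:
        return False  # cannot call a conflict without both lineages
    if len(lineage_a) <= rank or len(lineage_b) <= rank:
        return False  # lineage too shallow to compare at this rank
    return lineage_a[rank] != lineage_b[rank]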
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_gbs = opts.input_gbs.split(',')
    output_dir = opts.output_dir
    verbose = opts.verbose
    tag = opts.tag
    existing_fp = opts.existing
    max_failures = opts.max_failures

    makedirs(output_dir)
    logger = WorkflowLogger(generate_log_fp(output_dir), script_name=argv[0])
    observed_records = parse_column(open(existing_fp))

    sequences_fp = os.path.join(output_dir, '%s_sequences.fasta.gz' % tag)
    gg_records_fp = os.path.join(output_dir, '%s_ggrecords.txt.gz' % tag)
    obs_records_fp = os.path.join(output_dir, '%s_obsrecords.txt.gz' % tag)

    sequences = open(sequences_fp, 'w')
    gg_records = open(gg_records_fp, 'w')
    obs_records = open(obs_records_fp, 'w')

    seen = set()
    # IUPAC nucleotide codes (both cases) accepted in sequences
    alpha = set('ATGCatgcNnRYSMrysmKkWwVvHhBbDd')

    for gb_fp in input_gbs:
        logline = log_f("Start parsing of %s..." % gb_fp)
        logger.write(logline)
        if verbose:
            stdout.write(logline)

        records = MinimalGenbankParser(open(gb_fp))
        failure_count = 0
        accession = None  # last successfully parsed accession, for error logs

        # gracefully handle parser errors, up to a limit
        while failure_count < max_failures:
            try:
                next_record = records.next()
            except PartialRecordError, e:
                failure_count += 1
                continue
            except StopIteration:
                break
            except Exception, e:
                logline = log_f("Caught: %s, previous accession: %s"
                                % (e, accession))
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                failure_count += 1
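# The bounded-failure loop above is a reusable pattern. A generic sketch
# (hypothetical helper, not part of the original script):
def iter_with_failures(iterator, max_failures, on_error=None):
    """Yield items, tolerating up to max_failures parse exceptions."""
    failures = 0
    while failures < max_failures:
        try:
            yield iterator.next()
        except StopIteration:
            return
        except Exception, e:
            failures += 1
            if on_error is not None:
                on_error(e)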
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # all-or-none: if any MySQL option is missing, all three must be None
    if None in (opts.hostname, opts.user, opts.passwd):
        assert len(set((opts.hostname, opts.user, opts.passwd))) == 1, \
            'You must provide all MySQL options, or none at all.'

    if opts.hostname is not None:
        account = HostAccount(opts.hostname, opts.user, opts.passwd)
    elif 'ENSEMBL_ACCOUNT' in os.environ:
        # expects a space-separated 'host user passwd' triple
        h, u, p = os.environ['ENSEMBL_ACCOUNT'].split()
        account = HostAccount(h, u, p)
    else:
        account = None

    if opts.test_run:
        print account

    outdir = os.path.abspath(opts.outdir)
    if not os.path.exists(outdir):
        print 'FAIL: %s directory does not exist' % outdir
        exit(-1)

    if not opts.by_chrom:
        outfile_name = os.path.join(outdir, '%s-%s.fasta'
                                    % (opts.species, opts.release))
        if not opts.test_run:
            outfile = open(outfile_name, 'w')

    if opts.test_run:
        print 'Will write to: %s' % outdir
        if not opts.by_chrom:
            print outfile_name

    for chrom in get_chrom_seqs(opts.species, opts.release, account,
                                debug=opts.test_run):
        fasta = chrom.toFasta()
        if opts.by_chrom:
            outfile_name = os.path.join(outdir, '%s.fasta' % chrom.Name)

        if opts.test_run:
            print 'Will write to: %s' % outfile_name
            break

        if opts.by_chrom:
            outfile = open(outfile_name, 'w')

        outfile.write(fasta + '\n')

        if opts.by_chrom:
            outfile.close()

    # the single-file case left the handle open in the original
    if not opts.by_chrom and not opts.test_run:
        outfile.close()
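# The all-or-none assert above reads obliquely; an equivalent, more explicit
# formulation of the same rule as a standalone helper (sketch, not a
# behavior change):
def check_mysql_opts(hostname, user, passwd):
    """Raise unless all of the MySQL options, or none of them, are given."""
    n_given = sum(1 for o in (hostname, user, passwd) if o is not None)
    if n_given not in (0, 3):
        raise ValueError('You must provide all MySQL options, or none at all.')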
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    taxlookup = load_consensus_map(open(opts.ref_taxonomy_map))
    uchime_results = parse_uchime_chimeras(open(opts.input_uchime))

    output = open(opts.output, 'w')
    output.write("#accession\treason\tnote\tnote\n")

    for id_, score, parent_a, parent_b in uchime_results:
        if determine_taxon_conflict(taxlookup, parent_a, parent_b):
            o = [id_, "Class conflict found by UCHIME"]
            o.append("%s: %s" % (parent_a, '; '.join(taxlookup[parent_a])))
            o.append("%s: %s" % (parent_b, '; '.join(taxlookup[parent_b])))
            output.write('\t'.join(o))
            output.write('\n')

    output.close()
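# A sketch of the UCHIME parsing assumed above, based on the classic
# tab-delimited .uchime output in which the first four fields are score,
# query, parent A and parent B, and the final field is the Y/N/? chimera
# call. The real parse_uchime_chimeras may differ.
def parse_uchime_chimeras_sketch(lines):
    for line in lines:
        fields = line.rstrip('\n').split('\t')
        if fields[-1] != 'Y':
            continue  # keep only records flagged as chimeric
        score, id_, parent_a, parent_b = fields[:4]
        yield (id_, float(score), parent_a, parent_b)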
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # set up DB connection
    cred = Credentials()
    con = connect(cred.liveMetadataDatabaseConnectionString)
    cursor = con.cursor()

    existing_kit_ids = get_used_kit_ids(cursor)
    if opts.input:
        kit_passwd_map, kit_barcode_map = preassigned_kits(opts, cursor,
                                                           existing_kit_ids)
    else:
        if not opts.tag or not opts.number_of_kits or not opts.swabs_per_kit:
            option_parser.error("Must specify tag, number of kits and "
                                "number of swabs per kit")
        kit_passwd_map, kit_barcode_map = unassigned_kits(opts, cursor,
                                                          existing_kit_ids)

    f = open(opts.output + '.printouts', 'w')
    f.write('\n'.join(get_printout_data(kit_passwd_map, kit_barcode_map)))
    f.write('\n')
    f.close()
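# A sketch of the duplicate-avoidance lookup used above; the table and
# column names (ag_kit, supplied_kit_id) are assumptions about the schema,
# not confirmed by this script.
def get_used_kit_ids_sketch(cursor):
    """Return the set of kit IDs already issued, so new IDs don't collide."""
    cursor.execute("SELECT supplied_kit_id FROM ag_kit")
    return set(row[0] for row in cursor.fetchall())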
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    gg_records_fp = opts.gg_records
    output_dir = opts.output_dir
    verbose = opts.verbose
    existing_fp = opts.existing
    tag = opts.tag
    gg_id = opts.starting_gg_id
    invariants = parse_invariants(open(opts.invariants))

    makedirs(output_dir)
    logger = WorkflowLogger(generate_log_fp(output_dir), script_name=argv[0])

    # gg records are not going out as gzip as python's gzip is slow relative
    # to native linux gzip and doesn't compress as well out the door (latter
    # probably fixable)
    output_gg_fp = os.path.join(output_dir, "%s.records.txt" % tag)
    output_map_fp = os.path.join(output_dir, "%s.mapping.txt.gz" % tag)
    output_gg_noggid_fp = os.path.join(output_dir,
                                       "%s.records.noggid.txt" % tag)
    # NOTE: opened below but never defined in the original; this name is an
    # assumption patterned on the other output paths
    output_gg_broken_fp = os.path.join(output_dir,
                                       "%s.records.broken.txt" % tag)

    existing_records = parse_column(open(existing_fp))

    # the FASTA loops and output section below need random access to the
    # records, so this index (commented out in the original) is required
    records = dict([(r['ncbi_acc_w_ver'], r)
                    for r in MinimalGreengenesParser(open(gg_records_fp))])

    # `aligned` and `unaligned` are assumed to be sequence indices defined
    # elsewhere; the original does not define them in this function
    for record in records.values():
        acc = record['ncbi_acc_w_ver']
        ### NEED DOMAIN!
        aln = filter(None, [get_indexed_sequence(i, acc) for i in aligned])
        noaln = filter(None, [get_indexed_sequence(i, acc) for i in unaligned])

        if not aln:
            logline = log_f("GG record %s does not have aligned seq!" % acc)
            logger.write(logline)
            if verbose:
                stdout.write(logline)
            continue

        if not noaln:  # was `unaln`, a NameError in the original
            logline = log_f("GG record %s does not have unaligned seq!" % acc)
            logger.write(logline)
            if verbose:
                stdout.write(logline)
            continue
        # if > 1 rec, complain

    # NOTE: the loop header over aligned FASTA files was lost in the
    # original; reconstructed here to mirror the unaligned loop below
    for f in opts.aligned.split(','):
        logline = log_f("Parsing %s..." % f)
        logger.write(logline)
        if verbose:
            stdout.write(logline)

        domain = get_domain(f)

        for aln_id, aln_seq in MinimalFastaParser(open(f)):
            id_ = aln_id.split()[0]  # strip off any comments
            record = records.get(id_, None)

            if record is None:
                logline = log_f("Aligned seq %s does not have a GG record"
                                % id_)
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                continue

            if id_ in existing_records:
                logline = log_f("%s has previously been observed!" % id_)
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                continue

            if record['gg_id'] is not None:
                logline = log_f("%s already has gg_id %d!"
                                % (id_, record['gg_id']))
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                continue

            record['gg_id'] = gg_id
            if domain != 'eukarya':
                record['prokMSA_id'] = gg_id
            gg_id += 1

            inv_score = calc_invariant(aln_seq, invariants)
            non_ACGT = calc_nonACGT(aln_seq)

            record['perc_ident_to_invariant_core'] = inv_score
            record['non_ACGT_percent'] = non_ACGT
            record['aligned_seq'] = aln_seq
            record['n_pos_aligned'] = len(aln_seq) - aln_seq.count('-')

    for f in opts.unaligned.split(','):
        logline = log_f("Parsing %s..." % f)
        logger.write(logline)
        if verbose:
            stdout.write(logline)

        domain = get_domain(f)

        for unaln_id, unaln_seq in MinimalFastaParser(open(f)):
            id_ = unaln_id.split()[0]  # strip off any comments
            record = records.get(id_, None)

            if record is None:
                logline = log_f("Unaligned seq %s does not have a GG record"
                                % id_)
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                continue

            # a gg_id should be assigned while trolling the alignment seqs
            if record['gg_id'] is None:
                logline = log_f("%s should have a gg_id by now!" % id_)
                logger.write(logline)
                if verbose:
                    stdout.write(logline)
                continue

            record['unaligned_seq'] = unaln_seq
            record['n_pos_unaligned'] = len(unaln_seq)

    logline = log_f("Beginning output...")
    logger.write(logline)
    if verbose:
        stdout.write(logline)

    output_map = open(output_map_fp, 'w')
    output_gg = open(output_gg_fp, 'w')
    output_gg_noggid = open(output_gg_noggid_fp, 'w')
    output_gg_broken = open(output_gg_broken_fp, 'w')

    # was `records.items()`, which yields (key, value) tuples, not records
    for record in records.values():
        if record['gg_id'] is None:
            write_gg_record(output_gg_noggid, record)
        else:
            try:
                record.sanityCheck()
            except Exception:
                write_gg_record(output_gg_broken, record)
            else:
                write_gg_record(output_gg, record)
                output_map.write("%s\t%s\n" % (record['gg_id'],
                                               record['ncbi_acc_w_ver']))

    output_gg.close()
    output_map.close()
    output_gg_noggid.close()
    output_gg_broken.close()
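# parse_column is used above (and in the harvester scripts) to load
# single-column ID lists; a minimal sketch of what it presumably does
# (the real implementation may differ):
def parse_column_sketch(lines):
    """Return the set of IDs from a one-ID-per-line file, ignoring comments."""
    return set(l.strip() for l in lines
               if l.strip() and not l.startswith('#'))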
def main():
    from optparse import make_option
    from cogent.util.misc import parse_command_line_parameters
    from sys import exit, stdout

    script_info = {}
    script_info['brief_description'] = "Parse raw Greengenes 16S records"
    script_info['script_description'] = """Parse out specific fields from raw Greengenes 16S records. These records are rich, but often only a subset of each record is required for downstream processing."""
    script_info['script_usage'] = []
    script_info['script_usage'].append((
        """Example:""",
        """Greengenes taxonomy and raw sequences are needed:""",
        """python greengenes.py -i greengenes16SrRNAgenes.txt -o gg_seq_and_tax.txt -f prokMSA_id,greengenes_tax_string,aligned_seq"""))
    script_info['script_usage'].append((
        """Example:""",
        """Spitting out the available fields from Greengenes:""",
        """python greengenes.py -i greengenes16SrRNAgenes.txt --print-fields"""))
    script_info['output_description'] = """The resulting output file will contain a header that is prefixed with a # and delimited by the specified delimiter (default is tab). All records will follow in the same order with the same delimiter. It is possible for some key/value pairs within a record to lack a value. In this case, the value placed will be ''"""
    script_info['required_options'] = [
        make_option('--input', '-i', dest='input', help='Greengenes records')]
    script_info['optional_options'] = [
        make_option('--output', '-o', dest='output', help='Output file'),
        make_option('--fields', '-f', dest='fields',
                    help='Greengenes fields to keep'),
        make_option('--delim', '-d', dest='delim', help='Output delimiter',
                    default="\t"),
        make_option('--list-of-ids', '-l', dest='ids', default=None,
                    help='File with a single-column list of ids to retrieve'),
        make_option('--print-fields', '-p', dest='print_fields',
                    help='Print available fields from the first Greengenes '
                         'record', action='store_true', default=False)]
    script_info['version'] = __version__

    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if opts.print_fields:
        gg_parser = MinimalGreengenesParser(open(opts.input))
        rec = gg_parser.next()
        print '\n'.join(sorted(rec.keys()))
        exit(0)

    if not opts.fields:
        option_parser.print_usage()  # usage is an attribute, not a method
        print
        print "Greengenes fields must be specified!"
        exit(1)

    if not opts.output:
        output = stdout
    else:
        output = open(opts.output, 'w')

    fields = opts.fields.split(',')
    output.write("#%s\n" % opts.delim.join(fields))

    if opts.ids:
        ids = set([l.strip() for l in open(opts.ids, 'U')])
    else:
        ids = None

    gg_parser = SpecificGreengenesParser(open(opts.input), fields, ids)
    for record in gg_parser:
        output.write(opts.delim.join(record))
        output.write('\n')

    if opts.output:
        output.close()
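# A sketch of what SpecificGreengenesParser presumably does with the fields
# and the optional id list (the real parser lives elsewhere and may differ;
# keying the id filter on 'prokMSA_id' is an assumption): yield one tuple of
# field values per record, with '' for any missing field, matching the
# output_description above.
def specific_greengenes_parser_sketch(lines, fields, ids=None):
    for rec in MinimalGreengenesParser(lines):
        if ids is not None and rec['prokMSA_id'] not in ids:
            continue
        yield tuple(rec.get(f, '') for f in fields)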
def collapse_names(mapping):
    # NOTE: the def line was missing in the original; the name comes from
    # the call in the __main__ block below, and any grouping of the raw
    # query rows into this (name, email) -> (kits, codes) mapping happens
    # in code elided upstream
    by_length = {}
    for (name, email), (kits, codes) in mapping.items():
        n_kits = len(kits)
        if n_kits not in by_length:
            by_length[n_kits] = []
        new_rec = [name, email]
        new_rec.extend(kits)
        new_rec.extend(codes)
        by_length[n_kits].append(new_rec)
    return by_length


if __name__ == '__main__':
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    con = connect(user=opts.user, password=opts.password, dsn=opts.dsn)
    cur = con.cursor()

    if opts.full_query:
        cur.execute(FULL_QUERY)
    else:
        cur.execute(UNVER_QUERY)

    results = cur.fetchall()
    collapsed = collapse_names(results)

    for n_kits in sorted(collapsed.keys()):
        f = open(opts.outfile_fp + '_%d_kits.txt' % n_kits, 'w')
        f.write("#name\temail\t")
        f.write('\t'.join(["kit_id"] * n_kits))
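# An illustration of the mapping shape collapse_names consumes and the
# grouped-by-kit-count output it returns (the name and kit ids here are
# made up for the example):
example = {
    ('Jane Doe', 'jane@example.org'): (['kit_1', 'kit_2'],
                                       ['code_a', 'code_b']),
}
# collapse_names(example) ==
#     {2: [['Jane Doe', 'jane@example.org',
#           'kit_1', 'kit_2', 'code_a', 'code_b']]}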
def main(): """ dump stableIDs for expressing genes in a study based on commonality """ script_info = set_environment() option_parser, opts, args =\ parse_command_line_parameters(**script_info) rr = RunRecord() if opts.sample1 is None: raise RuntimeError('No samples given') # These will hold the ids we want to intersect, etc sample1_ids = set() sample2_ids = set() sample3_ids = set() # Get all the genes and build ensembl ID sets session = _create_session() sample1_genes, rr = getExpressedGenes( session, opts.sample1, opts.sample1_type, opts.m1, opts.sample_extremes, ignore_bulk=opts.ignore_bulk, ignore_top_extreme=opts.ignore_top_extreme, ignore_bottom_extreme=opts.ignore_bottom_extreme, rr=rr) for gene in sample1_genes: sample1_ids.add(gene.ensembl_id) session.close() session = _create_session() sample2_genes, rr = getExpressedGenes( session, opts.sample2, opts.sample2_type, opts.m2, opts.sample_extremes, ignore_bulk=opts.ignore_bulk, ignore_top_extreme=opts.ignore_top_extreme, ignore_bottom_extreme=opts.ignore_bottom_extreme, rr=rr) for gene in sample2_genes: sample2_ids.add(gene.ensembl_id) session.close() if opts.sample3 is not None: session = _create_session() sample3_genes, rr = getExpressedGenes( session, opts.sample3, opts.sample3_type, opts.m3, opts.sample_extremes, ignore_bulk=opts.ignore_bulk, ignore_top_extreme=opts.ignore_top_extreme, ignore_bottom_extreme=opts.ignore_bottom_extreme, rr=rr) for gene in sample3_genes: sample3_ids.add(gene.ensembl_id) session.close() # Find the IDs we're interested in based on sample relationship comparison_type = opts.comparison_type.split(':')[0] output_id_set, rr = compare_ids(sample1_ids, sample2_ids, sample3_ids, comparison_type, rr) # Narrow search if needed by top genes if opts.num_genes is not None: output_id_set, rr = restrict_by_num_genes( output_id_set, opts.num_genes, opts.expression_sample1, opts.expression_sample2, opts.expression_sample3, opts.favoured_expression_sample, rr) rr.addInfo('gene_overlap', 'Total genes in output', len(output_id_set)) # now save to file outfile = open(opts.genefile, 'w') # Add the standard header so that we can import with add_expression_db.py outfile.write('gene\n') for id in output_id_set: outfile.write(str(id) + '\n') outfile.close() rr.display()
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # if we already have these records, then we do not need to reobtain them
    if opts.existing_gb:
        existing_gis = set([l.strip() for l in open(opts.existing_gb)])
    else:
        existing_gis = set([])

    if opts.verbose:
        print "Number of existing GIs: %d" % len(existing_gis)

    if opts.possible_new_gb_out is None:
        option_parser.error("Need to specify --possible-new-gb-output")

    if opts.cached_ids:
        possible_gis = set([l.strip() for l in open(opts.cached_ids)])
    else:
        # ncbi_record_queries = ['16S', '18S', 'small subunit',
        #                        'rrna[fkey]', 'ribosomal']
        ncbi_record_queries = ["16S AND tm7"]

        # grab all the ids
        possible_gis = set([])
        for query in ncbi_record_queries:
            if opts.verbose:
                cur_size = len(possible_gis)
            possible_gis.update(esearch(query, retmax=10000000))
            if opts.verbose:
                print "Query %s added %d to set" % (
                    query, len(possible_gis) - cur_size)

    # drop out any existing ids
    possible_gis = possible_gis - existing_gis

    if opts.verbose:
        print "Total number of GIs to query: %d" % len(possible_gis)

    chunk_count = 0
    total_bytes = 0

    if opts.use_gz:
        poss_output = open_gz(opts.possible_new_gb_out, "w")
    else:
        poss_output = open(opts.possible_new_gb_out, "w")

    collected = set([])
    retries = 0
    while possible_gis and retries < 100:
        try:
            for chunk in bulk_efetch(possible_gis):
                chunk_count += 1
                total_bytes += len(chunk)

                # Occasionally, and silently, NCBI corrupts records.
                if "<html>" in chunk:
                    if opts.verbose:  # was bare `verbose`, a NameError
                        print "Erroneous record in chunk, disregarding full chunk"
                    continue

                # pull out the GIs from the VERSION lines
                records = []
                for l in chunk.splitlines():
                    if l.startswith("VERSION"):
                        records.append(l.split(":")[1])

                if opts.verbose:
                    print "%s - retry: %d, Chunk %d, covering %d records, " \
                          "writing %d bytes, %d written in total" % (
                              time.strftime("%m-%d-%y %H:%M:%S"), retries,
                              chunk_count, len(records), len(chunk),
                              total_bytes)

                poss_output.write(chunk)
                collected.update(set(records))
        except Exception, e:
            retries += 1
            print "Caught exception: ", e
            possible_gis = possible_gis - collected
            collected = set([])

            # snapshot the remaining GIs so an aborted run can resume;
            # written as plain text (the original name ended in .gz although
            # the file was never gzipped)
            remaining = open("possible_retries_at_retry_%d.txt" % retries, "w")
            remaining.write("\n".join(possible_gis))
            remaining.close()
        else:
            # a clean pass fetched everything requested; without this the
            # loop would spin forever re-fetching the same ids
            possible_gis = possible_gis - collected
            collected = set([])
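# The VERSION-line split above relies on the classic GenBank flatfile
# layout, where (before NCBI phased GIs out) the line carried the GI after
# a colon. Illustrative accession/GI values:
line = "VERSION     AB000001.1  GI:1234567"
assert line.split(":")[1] == "1234567"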
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if ',' not in opts.ylim:
        raise RuntimeError('ylim must be comma separated')
    ylim = map(float, opts.ylim.strip().split(','))

    print 'Loading counts data'
    data_collection1 = RegionCollection(filename=opts.collection1)
    window_size = data_collection1.info['args']['window_size']
    data_collection2 = RegionCollection(filename=opts.collection2)

    # filter both
    if opts.cutoff < 0 or opts.cutoff > 1:
        raise RuntimeError('The cutoff must be between 0 and 1')
    data_collection1 = data_collection1.filteredChebyshevUpper(opts.cutoff)
    data_collection2 = data_collection2.filteredChebyshevUpper(opts.cutoff)

    # make sure each collection consists of the same genes
    shared_labels = set(data_collection1.labels) & \
        set(data_collection2.labels)
    data_collection1 = data_collection1.filteredByLabel(shared_labels)
    data_collection2 = data_collection2.filteredByLabel(shared_labels)
    assert set(data_collection1.labels) == set(data_collection2.labels)

    if opts.sample_top is None:
        sample_top = data_collection1.N
    else:
        sample_top = opts.sample_top

    indices = range(sample_top)
    data_collection1 = data_collection1.take(indices)
    data_collection2 = data_collection2.take(indices)

    print 'Starting to plot'
    if opts.bgcolor == 'black':
        grid = {'color': 'w'}
        bgcolor = '0.1'
        vline_color = 'w'
    else:
        grid = {'color': 'k'}
        vline_color = 'k'
        bgcolor = '1.0'

    vline = dict(x=0, linewidth=opts.vline_width,
                 linestyle=opts.vline_style, color=vline_color)

    plot = PlottableSingle(height=opts.fig_height / 2.5,
                           width=opts.fig_width / 2.5,
                           bgcolor=bgcolor, grid=grid,
                           ylim=ylim, xlim=(-window_size, window_size),
                           xtick_space=opts.xgrid_lines,
                           ytick_space=opts.ygrid_lines,
                           xtick_interval=opts.xlabel_interval,
                           ytick_interval=opts.ylabel_interval,
                           xlabel_fontsize=opts.xfontsize,
                           ylabel_fontsize=opts.yfontsize,
                           vline=vline, ioff=True)

    x = numpy.arange(-window_size, window_size)

    if opts.metric == 'Mean counts':
        stat = averaged
    else:
        data_collection1 = data_collection1.asfreqs()
        data_collection2 = data_collection2.asfreqs()
        stat = summed

    plot_sample(plot, data_collection1, stat_maker(stat, data_collection1),
                x, opts.title, opts.xlabel, opts.ylabel, 'b', opts.legend1,
                opts.plot_stderr)
    plot_sample(plot, data_collection2, stat_maker(stat, data_collection2),
                x, opts.title, opts.xlabel, opts.ylabel, 'r', opts.legend2,
                opts.plot_stderr)

    plot.legend()
    plot.show()

    if opts.plot_filename and not opts.test_run:
        plot.savefig(opts.plot_filename)
    else:
        print opts.plot_filename
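# filteredChebyshevUpper presumably drops regions whose total counts exceed
# a Chebyshev upper bound for the chosen tail probability (opts.cutoff,
# required above to lie in [0, 1]). A rough sketch of that bound; the real
# method may differ:
import numpy

def chebyshev_upper_bound(counts, p):
    """Bound such that P(X >= bound) <= p by Chebyshev's inequality."""
    mu = counts.mean()
    sigma = counts.std()
    k = numpy.sqrt(1.0 / p)  # P(|X - mu| >= k*sigma) <= 1/k^2 = p
    return mu + k * sigma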
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    rr = RunRecord()

    if opts.sample is None:
        rr.display()
        raise RuntimeError('No samples available')

    sample_name = _samples_name(opts.sample)
    rr.addMessage('export_feature_counts', LOG_INFO,
                  'Chosen sample', opts.sample)
    rr.addMessage('export_feature_counts', LOG_INFO,
                  'Immunoprecipitated counts path', opts.IP_counts_path)
    rr.addMessage('export_feature_counts', LOG_INFO,
                  'Input counts path', opts.IN_counts_path)
    rr.addMessage('export_feature_counts', LOG_INFO,
                  'No. of most expressed genes sampled', opts.sample_top)
    rr.addMessage('export_feature_counts', LOG_INFO,
                  'maximum_read_length', opts.maximum_read_length)
    rr.addMessage('export_feature_counts', LOG_INFO,
                  'count_max_length', opts.count_max_length)
    rr.addMessage('export_feature_counts', LOG_INFO,
                  'upstream_size', opts.upstream_size)
    rr.addMessage('export_feature_counts', LOG_INFO,
                  'pseudo_count', opts.pseudo_count)

    # session is assumed to exist at module scope; it is not created here
    genes = db_query.get_ranked_expression(session, sample_name,
                                           biotype='protein_coding',
                                           rank_by='mean',
                                           test_run=opts.test_run)
    genes = genes[:opts.sample_top]
    chrom_gene_groups = grouped_by_chrom(genes)

    # IP stands for immunoprecipitated
    ip_table = get_sum_counts_table(session, chrom_gene_groups,
                                    opts.IP_counts_path,
                                    opts.maximum_read_length,
                                    opts.count_max_length,
                                    opts.upstream_size,
                                    test_run=opts.test_run)
    ip_table.Title = 'IP'
    # rename the IP counts header for consistency with the result of
    # joining the IP and IN tables
    ip_table = ip_table.withNewHeader(['counts'], ['IP_counts'])

    # IN stands for input
    in_table = get_sum_counts_table(session, chrom_gene_groups,
                                    opts.IN_counts_path,
                                    opts.maximum_read_length,
                                    opts.count_max_length,
                                    opts.upstream_size,
                                    test_run=opts.test_run)
    in_table.Title = 'IN'

    combined = ip_table.joined(in_table,
                               columns_self=('region_type', 'ensembl_id',
                                             'region_rank'))
    combined.Title = ''

    ratio = CalcRatio(opts.pseudo_count)
    combined = combined.withNewColumn('ratio', ratio,
                                      columns=['IP_counts', 'IN_counts'])

    if not opts.test_run:
        combined.writeToFile(opts.save_table_name, sep='\t')
        rr.addMessage('export_feature_counts', LOG_INFO,
                      'Wrote counts to', opts.save_table_name)
    else:
        print combined[:10]

    rr.display()
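# CalcRatio with a pseudo-count presumably yields (IP + c) / (IN + c) so
# that zero input counts do not divide by zero; a minimal sketch of a
# factory matching how it is used with withNewColumn above (the real
# implementation may differ):
def calc_ratio_sketch(pseudo_count):
    """Return a callable computing a pseudo-counted IP/IN ratio."""
    def ratio(ip_counts, in_counts):
        return (ip_counts + pseudo_count) / float(in_counts + pseudo_count)
    return ratio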