def test_bulk_efetch(self):
    """efetch a set of ids"""
    exp = [''.join([id_AGIY01000001_1_gb, id_FO117587_1_gb])]
    obs = list(bulk_efetch(['354825968', 'FO117587.1']))

    # NCBI records are dynamic even with the same accession versions, so an
    # exact full-record comparison here is not reliable. Compare a stable
    # prefix and count the record terminators instead.
    self.assertEqual(exp[0][:100], obs[0][:100])
    self.assertEqual(obs[0].count('//\n'), 2)
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # if we already have these records, then we do not need to reobtain them
    if opts.existing_gb:
        existing_gis = set([l.strip() for l in open(opts.existing_gb)])
    else:
        existing_gis = set([])

    if opts.verbose:
        print "Number of existing GIs: %d" % len(existing_gis)

    if opts.possible_new_gb_out is None:
        option_parser.error("Need to specify --possible-new-gb-output")

    if opts.cached_ids:
        possible_gis = set([l.strip() for l in open(opts.cached_ids)])
    else:
        # ncbi_record_queries = ['16S', '18S', 'small subunit', 'rrna[fkey]', 'ribosomal']
        ncbi_record_queries = ['16S AND tm7']

        # grab all the ids
        possible_gis = set([])
        for query in ncbi_record_queries:
            if opts.verbose:
                cur_size = len(possible_gis)

            possible_gis.update(esearch(query, retmax=10000000))

            if opts.verbose:
                print "Query %s added %d to set" % (query,
                                                    len(possible_gis) - cur_size)

    # drop out any existing ids
    possible_gis = possible_gis - existing_gis

    if opts.verbose:
        print "Total number of GIs to query: %d" % len(possible_gis)

    chunk_count = 0
    total_bytes = 0

    if opts.use_gz:
        poss_output = open_gz(opts.possible_new_gb_out, 'w')
    else:
        poss_output = open(opts.possible_new_gb_out, 'w')

    collected = set([])
    retries = 0
    while possible_gis and retries < 100:
        try:
            for chunk in bulk_efetch(possible_gis):
                chunk_count += 1
                total_bytes += len(chunk)

                # Occasionally, and silently, NCBI corrupts records.
                if '<html>' in chunk:
                    if opts.verbose:
                        print "Erroneous record in chunk, disregarding full chunk"
                    continue

                # pull out the GIs from the VERSION lines of the fetched records
                records = []
                for l in chunk.splitlines():
                    if l.startswith('VERSION'):
                        records.append(l.split(':')[1])

                if opts.verbose:
                    print "%s - retry: %d, chunk %d, covering %d records, writing %d bytes, %d written in total" % \
                        (time.strftime("%m-%d-%y %H:%M:%S"), retries, chunk_count,
                         len(records), len(chunk), total_bytes)

                poss_output.write(chunk)
                collected.update(set(records))
        except Exception, e:
            retries += 1
            print "Caught exception: ", e

        # retry only the GIs that have not been written out yet, and keep a
        # record of what is still outstanding for this pass
        possible_gis = possible_gis - collected
        collected = set([])

        possible_gis_at_retry = open('possible_retries_at_retry_%d.txt.gz' % retries, 'w')
        possible_gis_at_retry.write('\n'.join(possible_gis))
        possible_gis_at_retry.close()
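# A minimal sketch (hypothetical helper, not part of the script above) of the
# VERSION-line parsing that main() uses to recover GIs from fetched GenBank
# flat-file chunks. It assumes classic VERSION lines that still carry a GI
# field, e.g. "VERSION     AGIY01000001.1  GI:354825968"; splitting on ':'
# leaves the GI number in the second field.
def extract_gi(version_line):
    """Return the GI portion of a GenBank VERSION line."""
    return version_line.split(':')[1]

# e.g. extract_gi("VERSION     AGIY01000001.1  GI:354825968") -> "354825968"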