def ExtractExperimentIDs_tax(taxid):
    '''
    Look up the experiment accessions (experiments which have runs
    associated) that the ENA warehouse reports for a taxonomy ID.

    The warehouse is queried with a tax_tree(<taxid>) search twice: once
    for the result count, then page by page for the accession report.
    Lines whose three-character prefix is not registered in `acctypes`
    as an 'experiment' are reported on stdout and left out.

    Returns a list of accession strings (empty when nothing was found).

    >>> ExtractExperimentIDs_tax('211968')
    ['SRX1308653', 'SRX1308716', 'SRX1308789', 'SRX1308879', 'SRX337751']
    '''
    base_url = ('http://www.ebi.ac.uk/ena/data/warehouse/search?'
                'query="tax_tree(%s)"&'
                'result=read_experiment') % (taxid)
    count_suffix = '&resultcount'
    report_suffix = '&display=report&fields=experiment_accession'

    # Step 1: ask how many entries exist for the provided taxid.
    total = 0
    with openurl(base_url + count_suffix) as response:
        for raw in response:
            fields = raw.strip().split(':')
            if len(fields) > 1 and fields[0] == 'Number of results':
                # The count may contain thousands separators, e.g. "1,234".
                total = int(fields[1].replace(',', ''))

    found = []
    if total <= 0:
        return found

    # Step 2: fetch the accession report one page at a time.
    page_size = 100000
    n_pages = ceil(total / float(page_size))
    for page in xrange(n_pages):
        # The offset sent to the service is 1-based.
        paging = '&offset=%s&length=%s' % (page * page_size + 1, page_size)
        with openurl(base_url + report_suffix + paging) as response:
            # Discard the first line, which is treated as the report header.
            response.readline()
            for raw in response:
                accession = raw.strip()
                prefix = accession[:3]
                is_experiment = (prefix in acctypes
                                 and acctypes[prefix] == 'experiment')
                if not is_experiment:
                    print("Unknown Experiment ID: %s (taxid=%s)" % (accession, taxid))
                    continue
                found.append(accession)
    return found
def main():
    """Resolve experiment accessions and download them in parallel batches.

    Command-line arguments are read via ``parse_args(sys.argv[1:])``:

    * ``-a`` is either a path to an accession list or a comma-separated
      string of accession IDs (recognised by their 3-character prefix in
      ``acctypes``).
    * ``-t`` is either a path to a taxonomy ID list or a comma-separated
      string of taxonomy IDs.

    ``-a`` and ``-t`` may not be combined; doing so exits with the usage
    message. The collected experiment IDs are de-duplicated, split into
    at most ``args.nodes`` batches, and each batch is handed to a
    ``download-accession-list`` subprocess running inside the output
    directory. The function then waits for every subprocess and prints
    whether all of them succeeded.

    Side effects: may write ``tmp.acc`` / ``tmp.tax`` in the current
    directory, creates ``args.out`` if needed and changes the working
    directory to it, and writes one ``<batch>.acc`` file per batch.
    """
    args = parse_args(sys.argv[1:])
    if args.a is not None and args.t is not None:
        sys.exit("Usage: -a PATH/ACC -t PATH/TAX -out PATH [-m JSON]")

    experiments = []
    if args.a is not None:
        # Extract accession related experiments
        if os.path.exists(args.a):
            accfile = args.a
        elif args.a[:3] in acctypes:
            # Inline accession IDs: materialise them as a one-per-line file.
            accfile = "tmp.acc"
            with open(accfile, "w") as f:
                f.write("\n".join(args.a.split(",")))
        else:
            # Previously this path left `accfile` unbound and crashed with
            # an UnboundLocalError; fail with an explicit message instead.
            sys.exit(
                "Error: '%s' is neither an existing file nor a known "
                "accession ID!" % (args.a)
            )
        experiments.extend(SetupParallelDownload(accfile))

    if args.t is not None:
        # Extract tax id related experiments
        if os.path.exists(args.t):
            # Fixed: this used to assign args.a, which is always None here
            # because -a and -t are mutually exclusive.
            taxfile = args.t
        else:
            # Inline taxonomy IDs: materialise them as a one-per-line file.
            taxfile = "tmp.tax"
            with open(taxfile, "w") as f:
                f.write("\n".join(args.t.split(",")))
        experiments.extend(SetupParallelDownload(taxfile))

    # Remove duplicate experiments
    experiments = list(set(experiments))
    elen = len(experiments)
    print("Found %s unique experiment Accessions IDs!" % (elen))

    if elen == 0:
        print("No experiments could be found!")
        return

    # Create out directory and work from inside it
    cwd = os.getcwd()
    out_dir = "%s/%s/" % (cwd, args.out)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    os.chdir(out_dir)

    # Split experiments in batches of `epb` (experiments per batch)
    epb = ceil(elen / float(args.nodes))
    batches = [experiments[s : s + epb] for s in xrange(0, elen, epb)]

    # Run batch downloads
    ps = []
    for batch_dir, eids in enumerate(batches):
        # Save experiment IDs to file
        batch_acc_list = "%s/%s.acc" % (out_dir, batch_dir)
        with open(batch_acc_list, "w") as f:
            f.write("\n".join(eids))

        # Prepare cmdline; the batch index doubles as the batch's out dir
        nargs = ["-a", batch_acc_list, "-out", str(batch_dir)]
        if args.preserve:
            nargs.append("-p")
        if args.m is not None:
            # Resolve -m against the original cwd since we have chdir'ed.
            nargs.extend(["-m", "%s/%s" % (cwd, args.m)])
        if args.all_runs_as_samples:
            nargs.append("--all_runs_as_samples")
        cmd = GetCMD("download-accession-list", nargs)

        # Execute batch download
        # NOTE(review): cmd is run through a shell; confirm GetCMD quotes
        # user-supplied values appropriately.
        ps.append(Popen(cmd, shell=True, executable="/bin/bash"))

    # Wait for all batches to finish. Exit codes are checked individually:
    # summing them could cancel out (a process killed by a signal reports a
    # negative code) and falsely signal success.
    exit_codes = [p.wait() for p in ps]
    if any(code != 0 for code in exit_codes):
        print("Something failed!")
    else:
        print("All batches finished succesfully!")