Ejemplo n.º 1
0
def ExtractExperimentIDs_tax(taxid):
    ''' Extract experiments which have runs associated from taxid

    Queries the ENA warehouse search for ``read_experiment`` records matching
    ``tax_tree(taxid)``. A first request asks for the result count; the
    ``experiment_accession`` report is then fetched page by page.

    Arguments:
      taxid -- taxonomy identifier, interpolated into the query with ``%s``

    Returns:
      A list of experiment accession strings. Report lines whose first three
      characters are not registered in ``acctypes`` as 'experiment' are
      reported on stdout and left out of the result. Blank lines are ignored.

    >>> ExtractExperimentIDs_tax('211968')
    ['SRX1308653', 'SRX1308716', 'SRX1308789', 'SRX1308879', 'SRX337751']
    '''
    ena_url = ('http://www.ebi.ac.uk/ena/data/warehouse/search?'
               'query="tax_tree(%s)"&'
               'result=read_experiment')%(taxid)
    countquery = '&resultcount'
    display = '&display=report&fields=experiment_accession'
    # Find number of entries for the provided taxid
    count = 0
    with openurl(ena_url+countquery) as u:
        for l in u:
            l = l.strip()
            if ':' in l:
                tmp = l.split(':')
                if tmp[0] == 'Number of results':
                    # The count may contain thousands separators, e.g. "1,234"
                    count = int(tmp[1].replace(',',''))
    # Extract experiment IDs
    experiments = []
    if count > 0:
        length = 100000
        # ceil() is defined outside this block; int() keeps the page count
        # usable as a loop bound even if ceil() hands back a float.
        pages = int(ceil(count/float(length)))
        for p in xrange(pages):
            # Offsets sent to the service are 1-based
            page_offset = '&offset=%s&length=%s'%(p*length+1, length)
            with openurl(ena_url+display+page_offset) as u:
                # The first line of every page is the column header
                u.readline()
                for l in u:
                    l = l.strip()
                    if not l:
                        # Blank lines are not accessions; do not report them
                        continue
                    if l[:3] in acctypes and acctypes[l[:3]] == 'experiment':
                        experiments.append(l)
                    else:
                        print("Unknown Experiment ID: %s (taxid=%s)"%(l,taxid))
    return experiments
Ejemplo n.º 2
0
def main():
    ''' Collect experiment accessions and download them in parallel batches

    Reads the command line from ``sys.argv``. Experiments are gathered with
    ``SetupParallelDownload`` from either an accession source (``-a``) or a
    tax id source (``-t``). Each source is an existing file, or a
    comma-separated list that is written to ``tmp.acc`` / ``tmp.tax`` first.

    The unique experiments are split into ``args.nodes`` batches inside the
    output directory. One ``download-accession-list`` command is started per
    batch, and the function waits for all of them before reporting the
    combined outcome on stdout.

    Exits through ``sys.exit`` with a message when both ``-a`` and ``-t`` are
    given, or when ``-a`` is neither an existing file nor starts with a prefix
    known to ``acctypes``.
    '''
    args = parse_args(sys.argv[1:])
    # NOTE(review): this rejects supplying both -a and -t together. If the
    # intent was to require at least one of them, the test should be for both
    # being None -- confirm before changing.
    if args.a is not None and args.t is not None:
        sys.exit("Usage: -a PATH/ACC -t PATH/TAX -out PATH [-m JSON]")
    experiments = []
    if args.a is not None:
        # Extract accession related experiments
        if os.path.exists(args.a):
            accfile = args.a
        elif args.a[:3] in acctypes:
            accfile = "tmp.acc"
            with open(accfile, "w") as f:
                f.write("\n".join(args.a.split(",")))
        else:
            # Without this branch accfile would be unbound below
            sys.exit("Error: -a is neither an existing file nor a known "
                     "accession type: %s" % (args.a))
        experiments.extend(SetupParallelDownload(accfile))
    if args.t is not None:
        # Extract tax id related experiments
        if os.path.exists(args.t):
            # Use the tax id file itself (args.a is always None on this path)
            taxfile = args.t
        else:
            taxfile = "tmp.tax"
            with open(taxfile, "w") as f:
                f.write("\n".join(args.t.split(",")))
        experiments.extend(SetupParallelDownload(taxfile))
    # Remove duplicate experiments
    experiments = list(set(experiments))
    elen = len(experiments)
    print("Found %s unique experiment Accessions IDs!" % (elen))
    if elen > 0:
        # Create out directory
        cwd = os.getcwd()
        out_dir = "%s/%s/" % (cwd, args.out)
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)
        os.chdir(out_dir)
        # Split experiments in batches
        # ceil() is defined outside this block; int() keeps the batch size
        # usable as a slice bound and range step even if it returns a float.
        epb = int(ceil(elen / float(args.nodes)))
        batches = [experiments[s : s + epb] for s in xrange(0, elen, epb)]
        # Run batch downloads
        ps = []
        for batch_dir, eids in enumerate(batches):
            # Save experiment IDs to file
            batch_acc_list = "%s/%s.acc" % (out_dir, batch_dir)
            with open(batch_acc_list, "w") as f:
                f.write("\n".join(eids))
            # Prepare cmdline
            nargs = ["-a", batch_acc_list, "-out", str(batch_dir)]
            if args.preserve:
                nargs.append("-p")
            if args.m is not None:
                # The working directory changed above, so anchor -m to the
                # directory the program was started from
                nargs.extend(["-m", "%s/%s" % (cwd, args.m)])
            if args.all_runs_as_samples:
                nargs.append("--all_runs_as_samples")
            cmd = GetCMD("download-accession-list", nargs)
            # Execute batch download
            ps.append(Popen(cmd, shell=True, executable="/bin/bash"))
        # Wait for all batches to finish
        esum = 0
        for p in ps:
            esum += p.wait()
        if esum == 0:
            print("All batches finished succesfully!")
        else:
            print("Something failed!")
    else:
        print("No experiments could be found!")