Beispiel #1
0
def downloadRun(run, myAPI, dryRun, files=[], force=False):
    """Download the files of one run, or only total their sizes on a dry run.

    Args:
        run: run object; must provide getFiles(myAPI, queryParameters) and
            ExperimentName.
        myAPI: API handle passed through to every BaseSpace call.
        dryRun: when true, nothing is written to disk; sizes are only summed.
        files: optional list of file names to restrict the download to.
            The caller's list is never modified.
        force: when true, download even if the target path already exists.

    Returns:
        The summed 'Size' of every file that matched the selection
        (including files that were skipped because they already exist).
    """
    # you can only pull 1024 items at once, so we have to loop over "pages" of items, 1024 at a time
    # this is done by incrementing the offset by 1024 each time, so the next loop gets the next page
    # the limit can be adjusted as long as the limit is equal to the offset
    pageSize = 1024
    # work on a private copy: found names are removed below, and callers
    # must be able to reuse their own list for the next run
    wanted = list(files) if files else []
    # did the user select files?
    fileSel = bool(wanted)
    totalSize = 0
    page = 0
    # todo: insert regex matching to pull down only those required for demultiplex
    while True:
        pageFiles = run.getFiles(
            myAPI,
            QueryParameters.QueryParameters({
                'Limit': pageSize,
                'Offset': int(pageSize * page)
            }))
        if len(pageFiles) == 0:
            # an empty page means we have walked past the last file
            break
        for fn in pageFiles:
            if fileSel:
                if fn.Name not in wanted:
                    # user selected some particular files, but this aint one of em
                    continue
                # we found it! cut it from the list
                wanted.remove(fn.Name)
            totalSize += fn.__dict__['Size']
            if dryRun:
                continue
            savePath = str(run) + "/" + pathFromFile(fn, myAPI)
            if not os.path.exists(savePath):
                os.makedirs(savePath)
            target = os.path.join(savePath, fn.Name)
            if not force and os.path.exists(target):
                print("already have " + target + ". Skipping...")
                continue
            fn.downloadFile(myAPI, savePath)
        if fileSel and not wanted:
            # user selected some file(s) and we found them all; stop paging
            break
        page += 1
    if wanted:
        # files was user-defined, but not every element was matched
        # i.e. something was selected and never found
        print("warning: could not find these selected files")
        for missing in wanted:
            print('\t' + missing)
    print(
        humanFormat(totalSize) + '\t' + str(run) + '\t' +
        str(run.ExperimentName))
    return totalSize
def main():
    """Command-line entry point: download BAM or FASTQ files for the named projects.

    Parses the command line, looks up the current user's projects, warns
    about requested project names that were not matched, runs the selected
    downloader on each matched project and prints a grand total when more
    than one project was processed.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-p', '--profile',
        default="DEFAULT",
        help="the .basespacepy.cfg profile to load")
    cli.add_argument(
        '-d', '--dry',
        action='store_true',
        default=False,
        help="dry run; return size of selected items")
    cli.add_argument(
        '-f', '--force',
        action='store_true',
        default=False,
        help="force overwrite; otherwise cat counters on new filenames")
    cli.add_argument(
        '-j', '--project',
        required=True,
        nargs="+",
        help="project to download; can accept multiple values")
    cli.add_argument(
        '-t', '--type',
        choices=['b','f','bam','fastq'],
        default='f',
        help='type of file to download')
    args = cli.parse_args()

    myAPI = BaseSpaceAPI(profile=args.profile, timeout=500)
    user = myAPI.getUserById('current')
    projects = user.getProjects(
        myAPI, QueryParameters.QueryParameters({'Limit':1024}))

    # pick the downloader that matches the requested file type
    if args.type in ('b', 'bam'):
        download = downloadProjectBam
    elif args.type in ('f', 'fastq'):
        download = downloadProjectFastq

    userProjs = stringsToBSObj(projects, args.project)
    # anything requested by name that did not come back as a project object
    matchedNames = {str(proj) for proj in userProjs}
    for lostProj in set(args.project) - matchedNames:
        warning("cannot find " + str(lostProj))

    TotalSize = sum(
        download(project, myAPI, args.dry, force=args.force)
        for project in userProjs)
    if len(userProjs) > 1:
        print(humanFormat(TotalSize) + "\tTotal")
def downloadProjectBam(project, myAPI, dryRun, samples=[], force=False, qp=QueryParameters.QueryParameters({'Limit':1024})):
    """Download every file whose string form contains "bam" from a project's app results.

    Files are downloaded into a "partial" sub-directory and then moved to
    their final name, because the download call itself gives no control
    over the file name on disk.

    Args:
        project: project object; must provide getAppResults(myAPI, qp).
        myAPI: API handle passed through to every BaseSpace call.
        dryRun: when true, only print and sum the file sizes; nothing is
            created or written on disk.
        samples: optional sample selection. Selecting samples is not
            implemented for BAM downloads yet (see the WIP branch below).
        force: when true, download even if fileExists() reports the file
            is already present.
        qp: query parameters used for every listing call.

    Returns:
        Summed size of the files that were downloaded (or, on a dry run,
        of the files that would be considered).
    """
    totalSize = 0
    # stays None when no app result contained any bam file
    savePath = None
    results = project.getAppResults(myAPI, qp)
    for result in results:
        bams = [ x for x in result.getFiles(myAPI, qp) if "bam" in str(x) ]
        if samples:
            if isinstance(samples[0], str):
                samples = stringsToBSObj(project.getSamples(myAPI, qp), samples)
            # user picked particular samples
            # subset the list of bams accordingly
            #WIP
            print("\n\nuser picked particular samples, but this isn't coded in yet\n")
            # NOTE(review): stop() is not defined in the visible source;
            # presumably it aborts here - confirm
            stop()
        if not bams:
            # nothing to download for this result, and no file to derive a path from
            continue
        savePath = str(project).replace(" ","_") + "/" + pathFromFile(bams[0], myAPI)
        tmpPath = savePath + "/partial/"
        if not dryRun:
            # a dry run must not leave directories behind
            if not os.path.exists(savePath):
                os.makedirs(savePath)
            if not os.path.exists(tmpPath):
                os.makedirs(tmpPath)
        for fn in bams:
            thisSize = fn.__dict__['Size']
            if dryRun:
                totalSize += thisSize
                print(humanFormat(thisSize) + '\t' + fn.Name)
                continue
            pathToFn = os.path.join(savePath, fn.Name)
            if not force and fileExists(pathToFn, fn):
                print("already have " + savePath + "/" + fn.Name + ". Skipping...")
                continue
            # if the path exists, append a counter to the name to avoid overwriting;
            # the counter must live outside the loop or the same ".1" name
            # would be retried forever
            counter = 1
            while os.path.exists(os.path.join(savePath, fn.Name)):
                fn.Name = os.path.basename(fn.Path) + "." + str(counter)
                counter += 1
            print(os.path.join(savePath, fn.Name))
            # only count the size once we know the file is being downloaded
            totalSize += thisSize
            fn.downloadFile(myAPI, tmpPath)
            shutil.move(os.path.join(tmpPath, os.path.split(fn.Path)[1] ) , os.path.join(savePath,fn.Name) )
        if not dryRun and os.path.exists(tmpPath) and not os.listdir(tmpPath):
            # delete the temp directory if it is empty
            os.rmdir(tmpPath)
    if not dryRun and savePath is not None:
        # NOTE(review): relies on downloadProjectMetadata accepting an
        # `outdir` keyword - confirm against its definition
        downloadProjectMetadata(project, myAPI, samples=samples, outdir=savePath)
    print( humanFormat(totalSize) + '\t' + str(project) )
    return totalSize
def downloadProjectFastq(project, myAPI, dryRun, samples=[], force=False, qp=QueryParameters.QueryParameters({'Limit':1024})):
    """Download the files of every (selected) sample in a project.

    Files are downloaded by ID, so there is no control over their name
    until they are on disk: each file is downloaded into a "partial"
    sub-directory and then moved to its final name.

    Args:
        project: project object; must provide getSamples(myAPI, qp).
        myAPI: API handle passed through to every BaseSpace call.
        dryRun: when true, only print and sum the file sizes; nothing is
            created or written on disk.
        samples: optional selection, either sample objects or sample name
            strings (strings are resolved through stringsToBSObj). When
            empty, every sample of the project is used.
        force: when true, download even if fileExists() reports the file
            is already present.
        qp: query parameters used for every listing call.

    Returns:
        Summed size of the files that were downloaded (or, on a dry run,
        of the files that would be considered).
    """
    totalSize = 0
    if not samples:
        samples = project.getSamples(myAPI, qp)
    elif isinstance(samples[0], str):
        #convert samples strings to sample objects
        samples = stringsToBSObj(project.getSamples(myAPI, qp), samples)
    for sample in samples:
        fns = sample.getFiles(myAPI, qp)
        if len(fns) == 0:
            # a sample without files has nothing to download and no file
            # to derive the target directory from
            continue
        savePath = str(project).replace(" ","_") + "/" + pathFromFile(fns[0], myAPI)
        tmpPath = savePath + "/partial/"
        if not dryRun:
            # a dry run must not leave directories behind
            if not os.path.exists(savePath):
                os.makedirs(savePath)
            if not os.path.exists(tmpPath):
                os.makedirs(tmpPath)
        for fn in fns:
            thisSize = fn.__dict__['Size']
            if dryRun:
                totalSize += thisSize
                print(humanFormat(thisSize) + '\t' + fn.Name)
                continue
            pathToFn = os.path.join(savePath, fn.Name)
            if not force and fileExists(pathToFn, fn):
                print("already have " + pathToFn + ". Skipping...")
                continue
            # if the path exists, append a counter to the name to avoid overwriting;
            # the counter must live outside the loop or the same ".1" name
            # would be retried forever
            counter = 1
            while os.path.exists(os.path.join(savePath, fn.Name)):
                fn.Name = os.path.basename(fn.Path) + "." + str(counter)
                counter += 1
            # skip addition until we know this will be downloaded
            totalSize += thisSize
            print(os.path.join(savePath, fn.Name))
            fn.downloadFile(myAPI, tmpPath)
            shutil.move(os.path.join(tmpPath,os.path.split(fn.Path)[1] ) , os.path.join(savePath,fn.Name) )
        if not dryRun and os.path.exists(tmpPath) and not os.listdir(tmpPath):
            # delete the temp directory if it is empty
            os.rmdir(tmpPath)
    if not dryRun:
        downloadProjectMetadata(project, myAPI, samples=samples)
    print( humanFormat(totalSize) + '\t' + str(project) )
    return totalSize
Beispiel #5
0
def main():
    """Command-line entry point: download files for the named runs.

    Parses the command line, lists the current user's runs, narrows them
    to the names given with --run (all runs when none are given), warns
    about names that were not matched and calls downloadRun on each run.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-p', '--profile',
        default="DEFAULT",
        help="the .basespacepy.cfg profile to load")
    cli.add_argument(
        '-d', '--dry',
        action='store_true',
        default=False,
        help="dry run; return size of selected items")
    cli.add_argument(
        '-f', '--force',
        action='store_true',
        default=False,
        help="force overwrite; otherwise cat counters on new filenames")
    cli.add_argument(
        '-r', '--run',
        default=[],
        nargs="+",
        help="run name to download; can accept multiple values")
    cli.add_argument(
        '--file',
        default=[],
        nargs="+",
        help="specific file(s) to pull from each run; can accept multiple values")
    args = cli.parse_args()

    myAPI = BaseSpaceAPI(profile=args.profile, timeout=500)
    user = myAPI.getUserById('current')
    runs = user.getRuns(myAPI, QueryParameters.QueryParameters({'Limit': 1024}))

    userRuns = stringsToBSObj(runs, args.run)
    if not args.run:
        # no explicit selection means every run
        userRuns = runs

    matchedNames = {str(r) for r in userRuns}
    for lostRun in set(args.run) - matchedNames:
        warning("cannot find " + str(lostRun))

    TotalSize = 0
    userFiles = args.file
    for run in userRuns:
        # hand downloadRun its own copy of the selection, otherwise it
        # would strip entries from this instance of the list
        selection = list(userFiles)
        TotalSize += downloadRun(
            run, myAPI, args.dry, files=selection, force=args.force)
Beispiel #6
0
def downloadProjectMetadata(project,
                            myAPI,
                            samples=[],
                            qp=QueryParameters.QueryParameters({'Limit':
                                                                1024}),
                            outdir=None):
    """Collect sample and file metadata for a project and write both tables as TSV.

    Args:
        project: project object; must provide getSamples(myAPI, qp).
        myAPI: API handle passed through to every BaseSpace call.
        samples: optional selection, either sample objects or sample name
            strings (strings are resolved through stringsToBSObj). When
            empty, every sample of the project is used.
        qp: query parameters used for every listing call.
        outdir: optional directory to write the two files into. When
            omitted, the directory is derived from the first file of the
            last sample that has files, falling back to the project
            directory when no sample has any file.

    Returns:
        A (sampleMetadata, fileMetadata) tuple of pandas DataFrames; each
        is an empty DataFrame when there was nothing to collect.
    """
    sampleFrames = []
    fileFrames = []
    # file used to derive the output directory when outdir is not given
    pathSource = None
    if not samples:
        samples = project.getSamples(myAPI, qp)
    elif isinstance(samples[0], str):
        #convert samples strings to sample objects
        samples = stringsToBSObj(project.getSamples(myAPI, qp), samples)
    for sample in samples:
        # each one-row frame is indexed by its position in the final table
        sampleFrames.append(
            pd.DataFrame(pullMetadata(sample), index=[len(sampleFrames)]))
        fns = sample.getFiles(myAPI, qp)
        for fn in fns:
            thisFileMeta = pd.DataFrame(pullMetadata(fn),
                                        index=[len(fileFrames)])
            thisFileMeta['SID'] = str(sample)
            fileFrames.append(thisFileMeta)
        if len(fns) > 0:
            pathSource = fns[0]
    # DataFrame.append no longer exists in pandas 2.x; concatenate once instead.
    # pd.concat refuses an empty list, hence the explicit empty-frame fallback.
    sampleMetadata = pd.concat(sampleFrames) if sampleFrames else pd.DataFrame()
    fileMetadata = pd.concat(fileFrames) if fileFrames else pd.DataFrame()
    timestamp = str(datetime.datetime.today()).replace(' ', '_')
    projectDir = str(project).replace(" ", "_")
    if outdir is not None:
        savePath = outdir
    elif pathSource is not None:
        savePath = projectDir + "/" + pathFromFile(pathSource, myAPI)
    else:
        # no file to derive a sub-directory from
        savePath = projectDir
    if not os.path.exists(savePath):
        os.makedirs(savePath)
    sampleMetadata.to_csv(os.path.join(
        savePath,
        str(project) + '_SampleMetadata.' + timestamp + '.txt'),
                          sep='\t',
                          header=True,
                          index=False)
    fileMetadata.to_csv(os.path.join(
        savePath,
        str(project) + '_FileMetadata.' + timestamp + '.txt'),
                        sep='\t',
                        header=True,
                        index=False)
    return sampleMetadata, fileMetadata
Beispiel #7
0
def main():
    """Command-line entry point: export sample and file metadata for the named projects.

    Calls downloadProjectMetadata for every matched project (which writes
    the per-project tables) and additionally writes the combined tables
    to 'fullSampleMetadata.<timestamp>.txt' and
    'fullFileMetadata.<timestamp>.txt' in the current directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-p',
                        '--profile',
                        default="DEFAULT",
                        help="the .basespacepy.cfg profile to load")
    parser.add_argument('-j',
                        '--project',
                        required=True,
                        nargs="+",
                        help="project to download; can accept multiple values")

    args = parser.parse_args()
    myAPI = BaseSpaceAPI(profile=args.profile, timeout=500)
    user = myAPI.getUserById('current')
    qp = QueryParameters.QueryParameters({'Limit': 1024})

    projects = user.getProjects(myAPI, qp)
    userProjs = stringsToBSObj(projects, args.project)
    # anything requested by name that did not come back as a project object
    for lostProj in set(args.project) - {str(x) for x in userProjs}:
        warning("cannot find " + str(lostProj))

    sampleFrames = []
    fileFrames = []
    for project in userProjs:
        smout, fmout = downloadProjectMetadata(project, myAPI)
        sampleFrames.append(smout)
        fileFrames.append(fmout)
    # DataFrame.append no longer exists in pandas 2.x; concatenate once instead.
    # pd.concat refuses an empty list, hence the explicit empty-frame fallback.
    fullSampleMetadata = (pd.concat(sampleFrames)
                          if sampleFrames else pd.DataFrame())
    fullFileMetadata = pd.concat(fileFrames) if fileFrames else pd.DataFrame()
    thisInstant = str(datetime.datetime.today()).replace(' ', ';')
    fullSampleMetadata.to_csv('fullSampleMetadata.' + thisInstant + '.txt',
                              sep='\t',
                              header=True,
                              index=False)
    fullFileMetadata.to_csv('fullFileMetadata.' + thisInstant + '.txt',
                            sep='\t',
                            header=True,
                            index=False)