def upload_archive(config, sdrf_metadata, archive2metadata, exclude_samples, archive_fields, upload_archives, seen_files, nonupload_files, access, log):
    '''
    uploads the qualifying files from the archive to cloud storage

    parameters:
        config: the configuration map
        sdrf_metadata: metadata map to update
        archive2metadata: archive metadata
        exclude_samples: list of ffpe preserved samples or samples without a project assigned not to upload
        archive_fields: archive name, creation date, and URL
        upload_archives: map of level to center to platform of archives to upload
        seen_files: files that have been seen in a previously processed archive
        nonupload_files: list of file extensions of files not to upload
        access: either open or controlled
        log: logger to log any messages
    '''
    # guard clause: bail out early unless downloads are enabled and this
    # archive is on the upload list
    if not (config['download_archives'] and util.is_upload_archive(archive_fields[0], upload_archives, archive2metadata)):
        log.info('\tskipping %s-access archive %s' % (access, archive_fields[0]))
        return

    log.info('\tuploading %s-access archive %s.' % (access, archive_fields[0]))
    staging_dir = None
    try:
        # the fourth-from-last dot-delimited field of the archive name encodes the data level
        data_level = archive_fields[0].split('.')[-4].replace('_', ' ')
        creds = config['user_info']
        staging_dir = util.setup_archive(config, archive_fields, log, creds['user'], creds['password'])
        files_to_upload = process_files(config, staging_dir, sdrf_metadata, seen_files, nonupload_files, exclude_samples, data_level, log)
        if len(files_to_upload) > 0:
            upload_files(config, staging_dir, files_to_upload, log)
        else:
            log.warning('did not find files to load for %s' % (archive_fields[0]))
    finally:
        # only remove the staging directory if setup_archive() actually created one
        if staging_dir:
            shutil.rmtree(staging_dir)
    log.info('\tfinished uploading %s-access archive %s' % (access, archive_fields[0]))
def upload_archive(config, log, archive_fields, archive2metadata,
                   sdrf_metadata, access):
    '''
    uploads and gathers metadata on the maf files in the archive

    parameters:
        config: the configuration map
        log: logger to log any messages
        archive_fields: archive name, creation date, and URL
        archive2metadata: archive metadata
        sdrf_metadata: metadata map to update
        access: either open or controlled
    '''
    user_info = config['user_info']
    log.info('\tchecking %s-access maf archive %s.' %
             (access, archive_fields[0]))

    # initialize before the try block: previously archive_path was only bound
    # inside the download branch, so the finally clause raised
    # UnboundLocalError whenever download_archives was off or
    # setup_archive() raised
    archive_path = None
    try:
        if config['download_archives']:
            archive_path = util.setup_archive(archive_fields, log,
                                              user_info['user'],
                                              user_info['password'])
            filenames = process_files(archive_path, log)
            if 0 < len(filenames):
                file2metadata = {}
                for file_name in filenames:
                    # only maf files are parsed for metadata in this archive type
                    if file_name.endswith('maf'):
                        file2metadata[file_name] = parse_maf_file(
                            file_name, archive_path, log, archive_fields,
                            archive2metadata, sdrf_metadata)
                upload_archives.upload_files(config, archive_path,
                                             file2metadata, log)
            else:
                log.warning('\tdid not find files to load for %s' %
                            (archive_fields[0]))
        else:
            log.info('\tskipping %s-access archive %s' %
                     (access, archive_fields[0]))
    finally:
        # only clean up if an archive directory was actually created
        if archive_path:
            shutil.rmtree(archive_path)
# Example #3
def process_sdrf(config, log, magetab_archives, archive2metadata, barcode2annotations):
    """
    parses and uploads the sdrf and antibody-annotation files found in the
    magetab archives

    return types:
        barcode2files2term2values: maps aliquot barcode to a map with filenames for that barcode as keys to another map of terms
          based on the ['metadata_locations']['sdrf'] section of the config file 
    """
    log.info('start processing sdrf')
    sdrf_pat = re.compile("^.*sdrf.txt$")
    anti_pat = re.compile("^.*antibody_annotation.txt$")
    barcode2files2term2values = {}
    archive2barcodes = {}
    for archive_fields in magetab_archives:
        # bind before the try so the finally clause never hits an unbound
        # name if setup_archive() raises
        archive_path = None
        try:
            log.info('\tprocessing %s' % (archive_fields[0]))
            archive_path = util.setup_archive(archive_fields, log)
            files = os.listdir(archive_path)
            antibody_files = []
            cur_barcode2files2term2values = {}
            for file_name in files:
                if sdrf_pat.match(file_name):
                    parse_sdrf(config, log, archive_path + file_name, archive2metadata, cur_barcode2files2term2values, 
                        archive2barcodes, archive_fields, file_name, barcode2annotations)
                    # the comprehension uses its own variable name ('fields'):
                    # in python 2 a list comprehension leaks its loop variable
                    # into the enclosing scope, so reusing 'archive_fields'
                    # here clobbered the outer loop variable mid-iteration
                    util.merge_metadata(barcode2files2term2values, cur_barcode2files2term2values, archive_fields[0] + ': ' + ','.join([fields[0] for fields in magetab_archives]), log)
                    # NOTE(review): dict.values()[0] is python-2-only indexing
                    # of an arbitrary entry — confirm intent before porting
                    upload_sdrf_file(config, archive_path, file_name, barcode2files2term2values.values()[0].values()[0], log)
                elif anti_pat.match(file_name):
                    antibody_files += [file_name]
            for file_name in antibody_files:
                upload_sdrf_file(config, archive_path, file_name, barcode2files2term2values.values()[0].values()[0], log)
        finally:
            if archive_path:
                shutil.rmtree(archive_path)
    log.info('finished processing sdrf')
    return barcode2files2term2values
# Example #4
def parse_archives(config, log, archives, study, archive2metadata, clinical_metadata, biospecimen_metadata, exclude_samples):
    '''
    downloads and unpacks the archives.  then parses, and if appropriate for the archive, uploads the files to GCS

    parameters:
        config: the configuration map
        log: logger to log any messages
        archives: information on the archives to unpack
        study: name of the TCGA study the files belongs to
        archive2metadata: metadata of the archive
        clinical_metadata: the return map for clinical metadata
        biospecimen_metadata: the return map for biospecimen metadata
        exclude_samples: samples to skip when parsing the files
    '''
    tmp_dir_parent = os.environ.get('ISB_TMP', '/tmp/')
    for archive_fields in archives:
        # only Level_1 bio archives are of interest here
        if not 'Level_1' in archive_fields[0]:
            log.info('skipping bio archive %s' % (archive_fields[0]))
            continue
        log.info('processing archive %s' % (archive_fields[0]))
        archive_path = os.path.join(tmp_dir_parent, archive_fields[0] + '/')
        if not os.path.isdir(archive_path):
            os.makedirs(archive_path)
        # setup_archive() returns the directory it actually unpacked into,
        # replacing the path assembled above
        archive_path = util.setup_archive(config, archive_fields, log)
        try:
            files = os.listdir(archive_path)
            parse_files(config, log, files, archive_path, archive_fields, study, archive2metadata, exclude_samples, clinical_metadata, biospecimen_metadata)
        finally:
            # clean up the unpacked archive even when parsing raises, so
            # failed runs do not leak tmp directories
            shutil.rmtree(archive_path)
# Example #5
def parse_archives(config, log, archives, study, archive2metadata, clinical_metadata, biospecimen_metadata):
    '''
    downloads and unpacks the Level_1 bio archives, then parses their files

    parameters:
        config: the configuration map
        log: logger to log any messages
        archives: information on the archives to unpack
        study: name of the TCGA study the files belongs to
        archive2metadata: metadata of the archive
        clinical_metadata: the return map for clinical metadata
        biospecimen_metadata: the return map for biospecimen metadata
    '''
    tmp_dir_parent = os.environ.get('ISB_TMP', '/tmp/')
    for archive_fields in archives:
        # only Level_1 bio archives are of interest here
        if not 'Level_1' in archive_fields[0]:
            log.info('skipping bio archive %s' % (archive_fields[0]))
            continue
        log.info('processing archive %s' % (archive_fields[0]))
        archive_path = os.path.join(tmp_dir_parent, archive_fields[0] + '/')
        if not os.path.isdir(archive_path):
            os.makedirs(archive_path)
        # setup_archive() returns the directory it actually unpacked into,
        # replacing the path assembled above
        archive_path = util.setup_archive(archive_fields, log)
        try:
            files = os.listdir(archive_path)
            parse_files(config, log, files, archive_path, archive_fields, study, archive2metadata, clinical_metadata, biospecimen_metadata)
        finally:
            # clean up the unpacked archive even when parsing raises, so
            # failed runs do not leak tmp directories
            shutil.rmtree(archive_path)
# Example #6
def upload_archive(config, sdrf_metadata, archive2metadata, ffpe_samples, archive_fields, upload_archives, seen_files, nonupload_files, access, log):
    '''
    uploads the qualifying files in the archive

    parameters:
        config: the configuration map
        sdrf_metadata: metadata map to update
        archive2metadata: archive metadata
        ffpe_samples: samples to exclude from the upload
        archive_fields: archive name, creation date, and URL
        upload_archives: map of level to center to platform of archives to upload
        seen_files: files that have been seen in a previously processed archive
        nonupload_files: list of file extensions of files not to upload
        access: either open or controlled
        log: logger to log any messages
    '''
    archive_name = archive_fields[0]
    should_upload = config['download_archives'] and util.is_upload_archive(archive_name, upload_archives, archive2metadata)
    if should_upload:
        log.info('\tuploading %s-access archive %s.' % (access, archive_name))
        archive_path = None
        try:
            # the fourth-from-last dot-delimited field of the archive name encodes the data level
            level = archive_name.split('.')[-4].replace('_', ' ')
            user_info = config['user_info']
            archive_path = util.setup_archive(archive_fields, log, user_info['user'], user_info['password'])
            file2metadata = process_files(config, archive_path, sdrf_metadata, seen_files, nonupload_files, ffpe_samples, level, log)
            if file2metadata:
                upload_files(config, archive_path, file2metadata, log)
            else:
                log.warning('did not find files to load for %s' % (archive_name))
        finally:
            # only remove the directory if setup_archive() actually created one
            if archive_path:
                shutil.rmtree(archive_path)
        log.info('\tfinished uploading %s-access archive %s' % (access, archive_name))
    else:
        log.info('\tskipping %s-access archive %s' % (access, archive_name))
def upload_archive(config, log, archive_fields, archive2metadata, sdrf_metadata, access):
    '''
    uploads and gathers metadata on the maf files in the archive

    parameters:
        config: the configuration map
        log: logger to log any messages
        archive_fields: archive name, creation date, and URL
        archive2metadata: archive metadata
        sdrf_metadata: metadata map to update
        access: either open or controlled
    '''
    user_info = config['user_info']
    log.info('\tchecking %s-access maf archive %s.' % (access, archive_fields[0]))

    # initialize before the try block: previously archive_path was only bound
    # inside the download branch, so the finally clause raised
    # UnboundLocalError whenever download_archives was off or setup_archive() raised
    archive_path = None
    try:
        if config['download_archives']:
            archive_path = util.setup_archive(archive_fields, log, user_info['user'], user_info['password'])
            filenames = process_files(archive_path, log)
            if 0 < len(filenames):
                file2metadata = {}
                for file_name in filenames:
                    # only maf files are parsed for metadata in this archive type
                    if file_name.endswith('maf'):
                        file2metadata[file_name] = parse_maf_file(file_name, archive_path, log, archive_fields, archive2metadata, sdrf_metadata)
                upload_archives.upload_files(config, archive_path, file2metadata, log)
            else:
                log.warning('\tdid not find files to load for %s' % (archive_fields[0]))
        else:
            log.info('\tskipping %s-access archive %s' % (access, archive_fields[0]))
    finally:
        # only clean up if an archive directory was actually created
        if archive_path:
            shutil.rmtree(archive_path)
def upload_archive(config, log, archive_fields, archive2metadata, sdrf_metadata, seen_files, access):
    '''
    uploads and gathers metadata on the maf-related files in the archive
    
    parameters:
        config: the configuration map
        log: logger to log any messages
        archive_fields: archive name, creation date, and URL
        archive2metadata: archive metadata
        sdrf_metadata: metadata map to update
        seen_files: files that have been seen in a previously processed archive
        access: either open or controlled
    '''
    user_info = config['user_info']
    log.info('\tchecking %s-access maf archive %s.' % (access, archive_fields[0]))

    # bound before the try so the finally clause can safely test it even when
    # downloads are disabled or setup_archive() raises
    archive_path = None
    try:
        if config['download_archives']:
            archive_path = util.setup_archive(config, archive_fields, log, user_info['user'], user_info['password'])
            maf_upload_files = config['maf_upload_files']
            filenames = process_files(archive_path, maf_upload_files, seen_files, log)
            if 0 < len(filenames):
                file2metadata = {}
                for file_name in filenames:
                    # maf and vcf files are parsed by format-specific helpers;
                    # anything else returned by process_files is uploaded without metadata
                    if file_name.endswith('maf'):
                        file2metadata[file_name] = parse_maf_file(file_name, archive_path, log, archive_fields, archive2metadata, sdrf_metadata)
                    elif file_name.endswith('vcf') or file_name.endswith('vcf.gz'):
                        file2metadata[file_name] = parse_vcf_file(file_name, archive_path, log, archive_fields, archive2metadata, sdrf_metadata)
                upload_archives.upload_files(config, archive_path, file2metadata, log)
            else:
                log.warning('\tdid not find files to load for %s' % (archive_fields[0]))
        else:
            log.info('\tskipping %s-access archive %s' % (access, archive_fields[0]))
    finally:
        # only remove the directory if setup_archive() actually created one
        if archive_path:
            shutil.rmtree(archive_path)