def upload_archive(config, sdrf_metadata, archive2metadata, exclude_samples, archive_fields, upload_archives, seen_files, nonupload_files, access, log):
    '''
    download an archive, upload its qualifying files, and clean up the local copy
    
    parameters:
        config: the configuration map
        sdrf_metadata: metadata map to update
        archive2metadata: archive metadata
        exclude_samples: list of ffpe preserved samples or samples without a project assigned not to upload
        archive_fields: archive name, creation date, and URL
        upload_archives: map of level to center to platform of archives to upload
        seen_files: files that have been seen in a previously processed archive
        nonupload_files: list of file extensions of files not to upload
        access: either open or controlled
        log: logger to log any messages
    '''
    archive_name = archive_fields[0]
    # guard clause: bail out early unless downloads are enabled and this archive qualifies
    if not (config['download_archives'] and util.is_upload_archive(archive_name, upload_archives, archive2metadata)):
        log.info('\tskipping %s-access archive %s' % (access, archive_name))
        return

    log.info('\tuploading %s-access archive %s.' % (access, archive_name))
    extracted_path = None
    try:
        # the data level is encoded in the dotted archive name, fourth field from the end
        data_level = archive_name.split('.')[-4].replace('_', ' ')
        credentials = config['user_info']
        extracted_path = util.setup_archive(config, archive_fields, log, credentials['user'], credentials['password'])
        file2metadata = process_files(config, extracted_path, sdrf_metadata, seen_files, nonupload_files, exclude_samples, data_level, log)
        if file2metadata:
            upload_files(config, extracted_path, file2metadata, log)
        else:
            log.warning('did not find files to load for %s' % (archive_name))
    finally:
        # always remove the extracted archive, even if processing raised
        if extracted_path:
            shutil.rmtree(extracted_path)
    log.info('\tfinished uploading %s-access archive %s' % (access, archive_name))
# Example #2
def parse_files(config, log, files, archive_path, archive_fields, study, archive2metadata, exclude_samples, clinical_metadata, biospecimen_metadata):
    '''
    iterate through the list of filenames to parse, and if appropriate, upload them
    
    parameters:
        config: the configuration map
        log: logger to log any messages
        files: the file names to iterate through
        archive_path: path to the file being parsed
        archive_fields: list of archive_name, date of upload, and URL
        study: name of the TCGA study the files belongs to
        archive2metadata: metadata of the archive
        exclude_samples: list of samples not to upload, passed to the biospecimen parse
        clinical_metadata: the return map for clinical metadata
        biospecimen_metadata: the return map for biospecimen metadata
    '''
    sample_code2letter = config['sample_code2letter']
    sample_code2type = config['sample_code2type']

    upload_archive = util.is_upload_archive(archive_fields[0], config['upload_archives'], archive2metadata) and config['upload_open']
    clinical_barcode2field2value = {}
    auxiliary_barcode2field2value = {}
    ssf_clinical_barcode2field2value = {}
    ssf_sample_uuid2field2value = {}
    omf_barcode2field2value = {}
    biospecimen_uuid2field2value = {}
    for file_name in files:
        if clinical_pat.match(file_name):
            parse_file(parse_clinical, config, archive_path, file_name, study, upload_archive, log, clinical_barcode2field2value, 'bcr_patient_barcode')
        elif auxiliary_pat.match(file_name):
            parse_file(parse_auxiliary, config, archive_path, file_name, study, upload_archive, log, auxiliary_barcode2field2value, 'bcr_patient_barcode')
        elif ssf_pat.match(file_name):
            parse_file(parse_ssf_clinical, config, archive_path, file_name, study, upload_archive, log, ssf_clinical_barcode2field2value, 'bcr_patient_barcode')
            # parsing the clinical will upload the file, set upload_archive to False to prevent the 'found <filename> in <bucket>' error
            parse_file(parse_ssf_biospecimen, config, archive_path, file_name, study, False, log, ssf_sample_uuid2field2value, 'sample:bcr_sample_uuid')
        elif biospecimen_pat.match(file_name):
            parse_file(parse_biospecimen, config, archive_path, file_name, study, upload_archive, log, biospecimen_uuid2field2value, 'sample:bcr_sample_uuid', exclude_samples, sample_code2letter, sample_code2type)
        elif omf_pat.match(file_name):
            parse_file(parse_omf, config, archive_path, file_name, study, upload_archive, log, omf_barcode2field2value, 'bcr_patient_barcode')
    
    # fold the auxiliary, ssf, and omf maps into the master clinical map
    clinical_auxiliary_barcode2field2value = merge_maps_master_other(clinical_barcode2field2value, auxiliary_barcode2field2value, 'aux')
    clinical_ssf_auxiliary_barcode2field2value = merge_maps_master_other(clinical_auxiliary_barcode2field2value, ssf_clinical_barcode2field2value, 'ssf')
    clinical_omf_ssf_auxiliary_barcode2field2value = merge_maps_master_other(clinical_ssf_auxiliary_barcode2field2value, omf_barcode2field2value, 'omf')
    # copy the filter map so the shared config['metadata_locations'] dicts are not
    # mutated in place (update() on the config entry would leak across calls/callers)
    clinical_auxiliary_omf_ssf_filters = dict(config['metadata_locations']['clinical'])
    clinical_auxiliary_omf_ssf_filters.update(config['metadata_locations']['auxiliary'])
    clinical_auxiliary_omf_ssf_filters.update(config['metadata_locations']['omf'])
    clinical_auxiliary_omf_ssf_filters.update(config['metadata_locations']['ssf_clinical'])
    clinical_omf_ssf_auxiliary_barcode2field2value = filter_data(log, clinical_omf_ssf_auxiliary_barcode2field2value, clinical_auxiliary_omf_ssf_filters)

    biospecimen_ssf_uuid2field2value = merge_maps_master_other(biospecimen_uuid2field2value, ssf_sample_uuid2field2value, 'ssf')
    # same here: work on a copy, never on the config entry itself
    biospecimen_filters = dict(config['metadata_locations']['biospecimen'])
    biospecimen_filters.update(config['metadata_locations']['ssf_biospecimen'])
    biospecimen_uuid2field2value = filter_data(log, biospecimen_ssf_uuid2field2value, biospecimen_filters)
    
    # populate the caller-supplied return maps
    clinical_metadata.update(clinical_omf_ssf_auxiliary_barcode2field2value)
    biospecimen_metadata.update(biospecimen_uuid2field2value)
# Example #3
def upload_archive(config, sdrf_metadata, archive2metadata, ffpe_samples, archive_fields, upload_archives, seen_files, nonupload_files, access, log):
    '''
    uploads the files in the archive that meet the conditions
    
    parameters:
        config: the configuration map
        sdrf_metadata: metadata map to update
        archive2metadata: archive metadata
        ffpe_samples: list of ffpe preserved samples not to upload
        archive_fields: archive name, creation date, and URL
        upload_archives: map of level to center to platform of archives to upload
        seen_files: files that have been seen in a previously processed archive
        nonupload_files: list of file extensions of files not to upload
        access: either open or controlled
        log: logger to log any messages
    '''
    archive_path = None
    if config['download_archives'] and util.is_upload_archive(archive_fields[0], upload_archives, archive2metadata):
        log.info('\tuploading %s-access archive %s.' % (access, archive_fields[0]))
        try:
            # the data level is the fourth-from-last field of the dotted archive name
            level = archive_fields[0].split('.')[-4].replace('_', ' ')
            user_info = config['user_info']
            # NOTE(review): this variant calls setup_archive without config, unlike the
            # sibling upload_archive above — confirm which signature util currently exposes
            archive_path = util.setup_archive(archive_fields, log, user_info['user'], user_info['password'])
            file2metadata = process_files(config, archive_path, sdrf_metadata, seen_files, nonupload_files, ffpe_samples, level, log)
            if 0 < len(file2metadata):
                upload_files(config, archive_path, file2metadata, log)
            else:
                log.warning('did not find files to load for %s' % (archive_fields[0]))
        finally:
            # clean up the downloaded archive even if processing raised
            if archive_path:
                shutil.rmtree(archive_path)
        log.info('\tfinished uploading %s-access archive %s' % (access, archive_fields[0]))
    else:
        log.info('\tskipping %s-access archive %s' % (access, archive_fields[0]))
# Example #4
def parse_files(config, log, files, archive_path, archive_fields, study, archive2metadata, clinical_metadata, biospecimen_metadata):
    '''
    iterate through the list of filenames to parse, and if appropriate, upload them
    
    parameters:
        config: the configuration map
        log: logger to log any messages
        files: the file names to iterate through
        archive_path: path to the file being parsed
        archive_fields: list of archive_name, date of upload, and URL
        study: name of the TCGA study the files belongs to
        archive2metadata: metadata of the archive
        clinical_metadata: the return map for clinical metadata
        biospecimen_metadata: the return map for biospecimen metadata
    '''
    sample_code2letter = config['sample_code2letter']
    sample_code2type = config['sample_code2type']
    # copy the filter maps so the shared config['metadata_locations'] dicts are not
    # mutated in place (update() on the config entry would leak across calls/callers)
    clinical_auxiliary_filters = dict(config['metadata_locations']['clinical'])
    clinical_auxiliary_filters.update(config['metadata_locations']['auxiliary'])
    biospecimen_filters = dict(config['metadata_locations']['biospecimen'])

    upload_archive = util.is_upload_archive(archive_fields[0], config['upload_archives'], archive2metadata) and config['upload_open']
    clinical_barcode2field2value = {}
    auxiliary_barcode2field2value = {}
    biospecimen_barcode2field2value = {}
    for file_name in files:
        # dispatch to the matching parser; unmatched files are skipped entirely
        if clinical_pat.match(file_name):
            parse_clinical(archive_path + file_name, log, clinical_barcode2field2value, 'bcr_patient_barcode')
        elif auxiliary_pat.match(file_name):
            parse_auxiliary(archive_path + file_name, log, auxiliary_barcode2field2value, 'bcr_patient_barcode')
        elif biospecimen_pat.match(file_name):
            parse_biospecimen(archive_path + file_name, log, biospecimen_barcode2field2value, 'sample:bcr_sample_barcode', sample_code2letter, sample_code2type)
        else:
            continue
        # every parsed file goes through the same upload decision
        if upload_archive:
            upload_bio_file(config, archive_path, file_name, study, log)
        else:
            log.info('\tskipping upload of %s' % file_name)
    
    clinical_auxiliary_barcode2field2value = merge_clinical_auxiliary(clinical_barcode2field2value, auxiliary_barcode2field2value)
    clinical_auxiliary_barcode2field2value = filter_data(log, clinical_auxiliary_barcode2field2value, clinical_auxiliary_filters)
    biospecimen_barcode2field2value = filter_data(log, biospecimen_barcode2field2value, biospecimen_filters)
    
    # populate the caller-supplied return maps; dict.update(dict) is equivalent to the
    # old update(d.iteritems()) and works on both Python 2 and 3
    clinical_metadata.update(clinical_auxiliary_barcode2field2value)
    biospecimen_metadata.update(biospecimen_barcode2field2value)