def upload_archive(config, sdrf_metadata, archive2metadata, exclude_samples, archive_fields, upload_archives, seen_files, nonupload_files, access, log):
    '''
    uploads the qualifying files of one archive, or logs that the archive is skipped

    the archive is only processed when config['download_archives'] is set and
    util.is_upload_archive() accepts it.  the path returned by
    util.setup_archive() is removed again once processing ends, including when
    an exception is raised part way through

    parameters:
        config: the configuration map
        sdrf_metadata: metadata map to update
        archive2metadata: archive metadata
        exclude_samples: list of ffpe preserved samples or samples without a project assigned not to upload
        archive_fields: archive name, creation date, and URL
        upload_archives: map of level to center to platform of archives to upload
        seen_files: files that have been seen in a previously processed archive
        nonupload_files: list of file extensions of files not to upload
        access: either open or controlled
        log: logger to log any messages
    '''
    if not config['download_archives'] or not util.is_upload_archive(archive_fields[0], upload_archives, archive2metadata):
        log.info('\tskipping %s-access archive %s' % (access, archive_fields[0]))
        return

    archive_name = archive_fields[0]
    log.info('\tuploading %s-access archive %s.' % (access, archive_name))

    staged_path = None
    try:
        # the level is the fourth dot-separated field from the end of the archive name
        data_level = archive_name.split('.')[-4].replace('_', ' ')
        credentials = config['user_info']
        staged_path = util.setup_archive(config, archive_fields, log, credentials['user'], credentials['password'])
        uploadable = process_files(config, staged_path, sdrf_metadata, seen_files, nonupload_files, exclude_samples, data_level, log)
        if len(uploadable) == 0:
            log.warning('did not find files to load for %s' % (archive_name))
        else:
            upload_files(config, staged_path, uploadable, log)
    finally:
        # staged_path stays None when setup never completed, so there is nothing to remove
        if staged_path:
            shutil.rmtree(staged_path)

    log.info('\tfinished uploading %s-access archive %s' % (access, archive_name))
def parse_files(config, log, files, archive_path, archive_fields, study, archive2metadata, exclude_samples, clinical_metadata, biospecimen_metadata):
    '''
    iterate through the list of filenames to parse, and if appropriate, upload them

    each file name is matched against the clinical, auxiliary, ssf, biospecimen
    and omf patterns in that order and handed to parse_file() with the matching
    parser.  the parsed maps are then merged, filtered and added to the two
    return maps.  file names matching none of the patterns are ignored

    parameters:
        config: the configuration map
        log: logger to log any messages
        files: the file names to iterate through
        archive_path: path to the file being parsed
        archive_fields: list of archive_name, date of upload, and URL
        study: name of the TCGA study the files belongs to
        archive2metadata: metadata of the archive
        exclude_samples: samples not to upload, passed through to the biospecimen parser
        clinical_metadata: the return map for clinical metadata
        biospecimen_metadata: the return map for biospecimen metadata
    '''
    sample_code2letter = config['sample_code2letter']
    sample_code2type = config['sample_code2type']
    # named do_upload so the local does not shadow the module-level upload_archive()
    do_upload = util.is_upload_archive(archive_fields[0], config['upload_archives'], archive2metadata) and config['upload_open']

    clinical_barcode2field2value = {}
    auxiliary_barcode2field2value = {}
    ssf_clinical_barcode2field2value = {}
    ssf_sample_uuid2field2value = {}
    omf_barcode2field2value = {}
    biospecimen_uuid2field2value = {}
    for file_name in files:
        if clinical_pat.match(file_name):
            parse_file(parse_clinical, config, archive_path, file_name, study, do_upload, log, clinical_barcode2field2value, 'bcr_patient_barcode')
        elif auxiliary_pat.match(file_name):
            parse_file(parse_auxiliary, config, archive_path, file_name, study, do_upload, log, auxiliary_barcode2field2value, 'bcr_patient_barcode')
        elif ssf_pat.match(file_name):
            parse_file(parse_ssf_clinical, config, archive_path, file_name, study, do_upload, log, ssf_clinical_barcode2field2value, 'bcr_patient_barcode')
            # parsing the clinical will upload the file, pass False for the upload flag here
            # to prevent the 'found <filename> in <bucket>' error
            parse_file(parse_ssf_biospecimen, config, archive_path, file_name, study, False, log, ssf_sample_uuid2field2value, 'sample:bcr_sample_uuid')
        elif biospecimen_pat.match(file_name):
            parse_file(parse_biospecimen, config, archive_path, file_name, study, do_upload, log, biospecimen_uuid2field2value, 'sample:bcr_sample_uuid',
                       exclude_samples, sample_code2letter, sample_code2type)
        elif omf_pat.match(file_name):
            parse_file(parse_omf, config, archive_path, file_name, study, do_upload, log, omf_barcode2field2value, 'bcr_patient_barcode')

    # fold the auxiliary, ssf and omf patient maps into the clinical map, one at a time
    merged_clinical = merge_maps_master_other(clinical_barcode2field2value, auxiliary_barcode2field2value, 'aux')
    merged_clinical = merge_maps_master_other(merged_clinical, ssf_clinical_barcode2field2value, 'ssf')
    merged_clinical = merge_maps_master_other(merged_clinical, omf_barcode2field2value, 'omf')

    metadata_locations = config['metadata_locations']

    # copy before merging: updating config['metadata_locations']['clinical'] directly
    # would add the other sections' entries to the shared configuration on every call
    clinical_filters = dict(metadata_locations['clinical'])
    for section in ('auxiliary', 'omf', 'ssf_clinical'):
        clinical_filters.update(metadata_locations[section])
    merged_clinical = filter_data(log, merged_clinical, clinical_filters)

    merged_biospecimen = merge_maps_master_other(biospecimen_uuid2field2value, ssf_sample_uuid2field2value, 'ssf')
    # same reason as above: merge into a copy, not into the config entry
    biospecimen_filters = dict(metadata_locations['biospecimen'])
    biospecimen_filters.update(metadata_locations['ssf_biospecimen'])
    merged_biospecimen = filter_data(log, merged_biospecimen, biospecimen_filters)

    clinical_metadata.update(merged_clinical)
    biospecimen_metadata.update(merged_biospecimen)
def upload_archive(config, sdrf_metadata, archive2metadata, ffpe_samples, archive_fields, upload_archives, seen_files, nonupload_files, access, log):
    '''
    uploads the qualifying files of one archive, or logs that the archive is skipped

    the archive is only processed when config['download_archives'] is set and
    util.is_upload_archive() accepts it.  the path returned by
    util.setup_archive() is removed again once processing ends, including when
    an exception is raised part way through

    parameters:
        config: the configuration map; 'download_archives' and 'user_info' are read here
        sdrf_metadata: passed through to process_files()
        archive2metadata: passed to util.is_upload_archive()
        ffpe_samples: passed through to process_files()
        archive_fields: sequence whose first element is the archive name
        upload_archives: passed to util.is_upload_archive()
        seen_files: passed through to process_files()
        nonupload_files: passed through to process_files()
        access: access label used in the log messages
        log: logger to log any messages
    '''
    if not config['download_archives'] or not util.is_upload_archive(archive_fields[0], upload_archives, archive2metadata):
        log.info('\tskipping %s-access archive %s' % (access, archive_fields[0]))
        return

    archive_name = archive_fields[0]
    log.info('\tuploading %s-access archive %s.' % (access, archive_name))

    staged_path = None
    try:
        # the level is the fourth dot-separated field from the end of the archive name
        data_level = archive_name.split('.')[-4].replace('_', ' ')
        credentials = config['user_info']
        staged_path = util.setup_archive(archive_fields, log, credentials['user'], credentials['password'])
        uploadable = process_files(config, staged_path, sdrf_metadata, seen_files, nonupload_files, ffpe_samples, data_level, log)
        if len(uploadable) == 0:
            log.warning('did not find files to load for %s' % (archive_name))
        else:
            upload_files(config, staged_path, uploadable, log)
    finally:
        # staged_path stays None when setup never completed, so there is nothing to remove
        if staged_path:
            shutil.rmtree(staged_path)

    log.info('\tfinished uploading %s-access archive %s' % (access, archive_name))
def parse_files(config, log, files, archive_path, archive_fields, study, archive2metadata, clinical_metadata, biospecimen_metadata):
    '''
    iterate through the list of filenames to parse, and if appropriate, upload them

    each file name is matched against the clinical, auxiliary and biospecimen
    patterns in that order and parsed with the matching parser.  a matched file
    is uploaded with upload_bio_file() when the archive is selected for upload
    and config['upload_open'] is set, otherwise the skip is logged.  the parsed
    maps are then merged, filtered and added to the two return maps.  file names
    matching none of the patterns are ignored

    parameters:
        config: the configuration map
        log: logger to log any messages
        files: the file names to iterate through
        archive_path: prefix that is concatenated directly with each file name to form its path
        archive_fields: sequence whose first element is the archive name
        study: passed through to upload_bio_file()
        archive2metadata: passed to util.is_upload_archive()
        clinical_metadata: map updated in place with the filtered clinical/auxiliary metadata
        biospecimen_metadata: map updated in place with the filtered biospecimen metadata
    '''
    sample_code2letter = config['sample_code2letter']
    sample_code2type = config['sample_code2type']

    metadata_locations = config['metadata_locations']
    # copy before merging: updating config['metadata_locations']['clinical'] directly
    # would add the auxiliary entries to the shared configuration on every call
    clinical_auxiliary_filters = dict(metadata_locations['clinical'])
    clinical_auxiliary_filters.update(metadata_locations['auxiliary'])
    biospecimen_filters = metadata_locations['biospecimen']

    # named do_upload so the local does not shadow the module-level upload_archive()
    do_upload = util.is_upload_archive(archive_fields[0], config['upload_archives'], archive2metadata) and config['upload_open']

    clinical_barcode2field2value = {}
    auxiliary_barcode2field2value = {}
    biospecimen_barcode2field2value = {}
    for file_name in files:
        if clinical_pat.match(file_name):
            parse_clinical(archive_path + file_name, log, clinical_barcode2field2value, 'bcr_patient_barcode')
        elif auxiliary_pat.match(file_name):
            parse_auxiliary(archive_path + file_name, log, auxiliary_barcode2field2value, 'bcr_patient_barcode')
        elif biospecimen_pat.match(file_name):
            parse_biospecimen(archive_path + file_name, log, biospecimen_barcode2field2value, 'sample:bcr_sample_barcode', sample_code2letter, sample_code2type)
        else:
            # not a file this function handles: nothing to parse, upload or log
            continue

        # every parsed file gets the same upload-or-skip treatment
        if do_upload:
            upload_bio_file(config, archive_path, file_name, study, log)
        else:
            log.info('\tskipping upload of %s' % file_name)

    clinical_auxiliary_barcode2field2value = merge_clinical_auxiliary(clinical_barcode2field2value, auxiliary_barcode2field2value)
    clinical_auxiliary_barcode2field2value = filter_data(log, clinical_auxiliary_barcode2field2value, clinical_auxiliary_filters)
    biospecimen_barcode2field2value = filter_data(log, biospecimen_barcode2field2value, biospecimen_filters)

    # dict.update() accepts a mapping directly, so the python 2 only iteritems() is not needed
    clinical_metadata.update(clinical_auxiliary_barcode2field2value)
    biospecimen_metadata.update(biospecimen_barcode2field2value)