def _anonymise_outputs(args, metadata, application_dir,
                       vcfs, bams, bais, fastqs):
    """Anonymise the metadata and the selected files.

    Generates randomised sample IDs, rewrites and saves the metadata to
    ``args.metaout``, then passes each group of files through
    ``anonymise_files``.

    Returns a list of the new output files in the order VCF, BAM, BAI,
    FASTQ.
    """
    # generate random IDs for all output samples
    randomised_ids = make_random_ids(args.usedids, metadata.sample_ids)
    metadata.anonymise(randomised_ids)
    metadata.write(args.metaout)
    logging.info("Anonymised metadata written to: {}".format(
        args.metaout))
    new_vcfs = anonymise_files(vcfs, randomised_ids, application_dir,
                               VCF_filename, vcf_edit)
    new_bams = anonymise_files(bams, randomised_ids, application_dir,
                               BAM_filename, bam_edit)
    # BAIs and FASTQs are just sym-linked to output with randomised name
    # (no edit function is passed for them)
    new_bais = anonymise_files(bais, randomised_ids, application_dir,
                               BAI_filename)
    new_fastqs = anonymise_files(fastqs, randomised_ids, application_dir,
                                 FASTQ_filename)
    outputs = []
    outputs.extend(new_vcfs + new_bams + new_bais + new_fastqs)
    logging.info("Output files are anonymised")
    return outputs


def _link_outputs(args, metadata, application_dir,
                  vcfs, bams, bais, fastqs):
    """Link the selected files into the application directory as-is.

    Calls ``link_files`` on all selected files and then writes the
    (un-anonymised) metadata to ``args.metaout``.

    Returns a list of the new links reported by ``link_files``.
    """
    new_links = link_files(application_dir, vcfs + bams + bais + fastqs)
    outputs = []
    outputs.extend(new_links)
    logging.info(
        "Files linked in directory: {}".format(application_dir))
    metadata.write(args.metaout)
    logging.info("Output files are re-identifiable")
    return outputs


def main():
    """Process a data application from the command line arguments.

    Parses the application file named by ``args.app``, creates its output
    directory, collects sample metadata for the requested cohorts, selects
    the requested files, and publishes them either anonymised or
    re-identifiable depending on the allowed data types. MD5 checksums are
    then generated for the output files.

    If the application allows no data types, a warning is logged and
    nothing else is done.

    Raises:
        SystemExit: with ``ERROR_BAD_ALLOWED_DATA`` when the allowed data
            types contain neither 'Anonymised' nor 'Re-identifiable'.
    """
    # Function-scope import: sys.exit replaces the bare exit() helper,
    # which is injected by the site module for interactive use and is not
    # guaranteed to exist (e.g. under ``python -S``).
    import sys

    args = parse_args()
    init_log(args.log)
    # NOTE(review): the application file is kept open for the whole run;
    # this could be narrowed if Application reads the file eagerly.
    with open(args.app) as app_file:
        # parse and validate the requested data application JSON file
        application = Application(app_file)
        logging.info("Input data application parsed: {}".format(args.app))
        # Create output directory for the results
        application_dir = create_app_dir(application)
        # check what data types are allowed for this application
        allowed_data_types = application.allowed_data_types()
        logging.info("Allowed data types: {}".format(
            ' '.join(allowed_data_types)))

        # Guard clause: nothing to publish for this application.
        if not allowed_data_types:
            logging.warning("No data available for this application")
            return

        # Get all the sample metadata for all requested cohorts
        requested_cohorts = application.cohorts()
        metadata = Metadata(args.data, requested_cohorts)
        logging.info("Metadata collected for requested cohorts: {}".format(
            ' '.join(requested_cohorts)))
        metadata_sample_ids = sorted(metadata.get_sample_ids())
        logging.info("Metadata for sample IDs: {}".format(
            ' '.join(metadata_sample_ids)))

        # Filter the sample metadata based on patient consent
        metadata.filter_consent(args.consent, allowed_data_types)
        logging.warning("Consent not handled yet. FIXME")

        # Find all the file paths for requested file types for each
        # consented sample
        requested_file_types = application.file_types()
        logging.info("Requested file types: {}".format(
            ' '.join(requested_file_types)))
        fastqs, bams, bais, vcfs = get_files(
            args.data, requested_file_types, metadata)
        logging.info("VCF files selected:\n{}".format('\n'.join(vcfs)))
        logging.info("BAM files selected:\n{}".format('\n'.join(bams)))
        logging.info("BAI files selected:\n{}".format('\n'.join(bais)))
        logging.info("FASTQ files selected:\n{}".format('\n'.join(fastqs)))

        # 'Anonymised' takes precedence when both data types are allowed.
        if 'Anonymised' in allowed_data_types:
            output_files = _anonymise_outputs(
                args, metadata, application_dir, vcfs, bams, bais, fastqs)
        elif 'Re-identifiable' in allowed_data_types:
            output_files = _link_outputs(
                args, metadata, application_dir, vcfs, bams, bais, fastqs)
        else:
            print_error(
                "Allowed data is neither anonymised nor re-identifiable")
            sys.exit(ERROR_BAD_ALLOWED_DATA)

        logging.info("Generating MD5 checksums on output files")
        md5_files(args.md5, output_files)