def find_mutation_files(
        input_files,
        combined_maf=False,
        max_peptide_length=31):
    """
    Collect all .vcf/.maf file paths in the `input_files` list.

    Returns a dictionary mapping patient IDs to DataFrames containing basic
    variant information (chr, pos, ref, alt).

    The patient IDs will be each filename without its extension, unless the
    argument combined_maf is True. In this case, patient IDs are derived from
    the tumor barcode column in each MAF file.

    Parameters
    ----------
    input_files : iterable of str
        Paths to candidate mutation files. Paths whose extension is not in
        MUTATION_FILE_EXTENSIONS are silently skipped.

    combined_maf : bool
        If True, every file whose extension ends in 'maf' is split into one
        entry per distinct value of its 'Tumor_Sample_Barcode' column.

    max_peptide_length : int
        Accepted for interface compatibility; not used by this function.

    If the module-level `args.debug_patient_id` is set, the result is reduced
    to that single patient (KeyError if that patient was not found).
    """
    mutation_files = OrderedDict()
    # Iterate over the parameter itself; the body previously referenced an
    # undefined name (`input_filenames`).
    for path in input_files:
        _, filename = split(path)
        base, ext = splitext(filename)
        if ext not in MUTATION_FILE_EXTENSIONS:
            continue

        if ext.endswith('maf') and combined_maf:
            maf_df = load_maf(path)
            file_patients = {}
            # Group by a scalar column name so each key is the barcode value
            # itself; a one-element list makes newer pandas yield 1-tuples.
            for barcode, group_df in maf_df.groupby('Tumor_Sample_Barcode'):
                vcf_df = maf_to_vcf(group_df)
                patient_id = get_patient_id(barcode)
                file_patients[patient_id] = vcf_df
        else:
            patient_id = get_patient_id(base)
            vcf_df = load_variants(path)
            file_patients = {patient_id: vcf_df}

        # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
        for patient_id, vcf_df in file_patients.items():
            # Keep only the first three dash-separated fields of the ID.
            patient_id = "-".join(patient_id.split("-")[:3])
            if patient_id in mutation_files:
                # First file seen for a patient wins; later ones are ignored.
                logging.warning(
                    "Already processed patient %s before file %s",
                    patient_id,
                    path)
            else:
                mutation_files[patient_id] = vcf_df

    # `args` is not defined in this function; it is expected to be a
    # module-level object exposing `debug_patient_id`.
    if args.debug_patient_id:
        patient_id = args.debug_patient_id
        mutation_files = {patient_id: mutation_files[patient_id]}
    return mutation_files
def find_mutation_files(input_files,
                        combined_maf=False,
                        max_peptide_length=31):
    """
    Collect all .vcf/.maf file paths in the `input_files` list.

    Returns a dictionary mapping patient IDs to DataFrames containing basic
    variant information (chr, pos, ref, alt).

    The patient IDs will be each filename without its extension, unless the
    argument combined_maf is True. In this case, patient IDs are derived from
    the tumor barcode column in each MAF file.

    Files whose extension is not listed in MUTATION_FILE_EXTENSIONS are
    skipped. `max_peptide_length` is accepted but not used here. When the
    module-level `args.debug_patient_id` is set, only that patient's entry
    is returned (KeyError if it was never collected).
    """
    mutation_files = OrderedDict()
    # Fixed: loop over the `input_files` parameter rather than the undefined
    # name `input_filenames`.
    for path in input_files:
        _, filename = split(path)
        base, ext = splitext(filename)
        if ext not in MUTATION_FILE_EXTENSIONS:
            continue

        if ext.endswith('maf') and combined_maf:
            maf_df = load_maf(path)
            file_patients = {}
            # A scalar grouping key yields scalar barcodes on every pandas
            # version; a length-1 list yields 1-tuples on recent versions.
            for barcode, group_df in maf_df.groupby('Tumor_Sample_Barcode'):
                vcf_df = maf_to_vcf(group_df)
                patient_id = get_patient_id(barcode)
                file_patients[patient_id] = vcf_df
        else:
            patient_id = get_patient_id(base)
            vcf_df = load_variants(path)
            file_patients = {patient_id: vcf_df}

        # dict.items() is available on Python 2 and 3 alike.
        for patient_id, vcf_df in file_patients.items():
            # Truncate the ID to its first three dash-separated components.
            patient_id = "-".join(patient_id.split("-")[:3])
            if patient_id in mutation_files:
                # Duplicate patient: keep the earlier data, just warn.
                logging.warning(
                    "Already processed patient %s before file %s",
                    patient_id,
                    path)
            else:
                mutation_files[patient_id] = vcf_df

    # `args` comes from outside this function (module-level name).
    if args.debug_patient_id:
        patient_id = args.debug_patient_id
        mutation_files = {patient_id: mutation_files[patient_id]}
    return mutation_files
def collect_files(input_dir_string, read_file_fn, permissive_parsing):
    """
    Gather per-patient data from every file in one or more directories.

    `input_dir_string` is a comma-separated list of directory paths. Each
    directory entry has its extension stripped via `splitext_permissive`;
    entries whose remaining base name passes `is_valid_tcga` are read with
    `read_file_fn(path, permissive_parsing)`. Truthy results are stored
    under the patient ID derived from the base name, so a later file for
    the same patient replaces an earlier one.

    If the module-level `args.debug_patient_id` is set, the returned dict
    contains only that patient (KeyError if no data was collected for it).
    """
    collected = {}
    directories = input_dir_string.split(",")
    for directory in directories:
        for entry in listdir(directory):
            stem, _ext = splitext_permissive(entry, [".txt"])
            if not is_valid_tcga(stem):
                continue
            pid = get_patient_id(stem)
            full_path = join(directory, entry)
            data = read_file_fn(full_path, permissive_parsing)
            if not data:
                # Nothing usable came back for this file; skip it.
                continue
            collected[pid] = data

    # `args` is resolved from the enclosing module, not passed in.
    debug_id = args.debug_patient_id
    if debug_id:
        collected = {debug_id: collected[debug_id]}
    return collected