def create_variants(file, patient_id):
    """Store an uploaded variant file and build one Variant per row.

    The upload's filename is passed through ``secure_filename`` and the file
    is saved under ``app.config['UPLOAD_FOLDER']``. The saved file is then
    parsed with ``load_variants`` and each row yielded by ``iterrows()`` is
    turned into a ``Variant`` tied to ``patient_id``.

    Args:
        file: Upload object exposing ``filename`` and ``save(path)``.
        patient_id: Identifier attached to every Variant that is created.

    Returns:
        A list of Variant objects, in the order the rows were iterated.
    """
    safe_name = secure_filename(file.filename)
    destination = join(app.config['UPLOAD_FOLDER'], safe_name)
    file.save(destination)

    # Each row must provide the 'chr', 'pos', 'ref' and 'alt' fields.
    return [
        Variant(
            patient_id=patient_id,
            chr=record['chr'],
            pos=record['pos'],
            ref=record['ref'],
            alt=record['alt'],
        )
        for _, record in load_variants(destination).iterrows()
    ]
def find_mutation_files(
        input_files,
        combined_maf=False,
        max_peptide_length=31):
    """
    Load the variants of every mutation file listed in `input_files`.

    Paths whose extension is not in MUTATION_FILE_EXTENSIONS are ignored.

    Returns a dictionary mapping patient IDs to DataFrames containing basic
    variant information (chr, pos, ref, alt).

    The patient IDs will be each filename without its extension, unless the
    argument combined_maf is True. In this case, patient IDs are derived from
    the tumor barcode column in each MAF file. Every patient ID is truncated
    to its first three dash-separated fields, and only the first file seen
    for a given (truncated) patient ID is kept; later ones are logged and
    skipped.

    If the module-level `args.debug_patient_id` is set, the result is
    restricted to that single patient (KeyError if it was not found).

    `max_peptide_length` is accepted for interface compatibility but is not
    used by this function.

    NOTE(review): this file defines `find_mutation_files` a second time
    further down; the later definition replaces this one when the module is
    loaded.
    """
    mutation_files = OrderedDict()

    # Iterate over the parameter itself; the previous code referenced an
    # undefined name (`input_filenames`) here.
    for path in input_files:
        _, filename = split(path)
        base, ext = splitext(filename)
        if ext not in MUTATION_FILE_EXTENSIONS:
            continue

        if ext.endswith('maf') and combined_maf:
            # One MAF file may hold several patients: split it by barcode.
            maf_df = load_maf(path)
            file_patients = {}
            # Group by a scalar key so `barcode` is the plain column value;
            # a one-element list key yields 1-tuples on pandas >= 2.0
            # (assuming load_maf returns a pandas DataFrame).
            for barcode, group_df in maf_df.groupby('Tumor_Sample_Barcode'):
                vcf_df = maf_to_vcf(group_df)
                patient_id = get_patient_id(barcode)
                file_patients[patient_id] = vcf_df
        else:
            patient_id = get_patient_id(base)
            vcf_df = load_variants(path)
            file_patients = {patient_id: vcf_df}

        # .items() works on both Python 2 and 3 (.iteritems() is 2-only).
        for patient_id, vcf_df in file_patients.items():
            # Keep only the first three dash-separated fields of the ID
            # (presumably a TCGA-style barcode prefix -- confirm).
            patient_id = "-".join(patient_id.split("-")[:3])
            if patient_id in mutation_files:
                logging.warning(
                    "Already processed patient %s before file %s",
                    patient_id,
                    path)
            else:
                mutation_files[patient_id] = vcf_df

    if args.debug_patient_id:
        patient_id = args.debug_patient_id
        mutation_files = {patient_id: mutation_files[patient_id]}
    return mutation_files
def find_mutation_files(input_files,
                        combined_maf=False,
                        max_peptide_length=31):
    """
    Load the variants of every mutation file listed in `input_files`.

    Paths whose extension is not in MUTATION_FILE_EXTENSIONS are ignored.

    Returns a dictionary mapping patient IDs to DataFrames containing basic
    variant information (chr, pos, ref, alt).

    The patient IDs will be each filename without its extension, unless the
    argument combined_maf is True. In this case, patient IDs are derived from
    the tumor barcode column in each MAF file. Every patient ID is truncated
    to its first three dash-separated fields, and only the first file seen
    for a given (truncated) patient ID is kept; later ones are logged and
    skipped.

    If the module-level `args.debug_patient_id` is set, the result is
    restricted to that single patient (KeyError if it was not found).

    `max_peptide_length` is accepted for interface compatibility but is not
    used by this function.
    """
    mutation_files = OrderedDict()

    # Iterate over the parameter itself; the previous code referenced an
    # undefined name (`input_filenames`) here.
    for path in input_files:
        _, filename = split(path)
        base, ext = splitext(filename)
        if ext not in MUTATION_FILE_EXTENSIONS:
            continue

        if ext.endswith('maf') and combined_maf:
            # One MAF file may hold several patients: split it by barcode.
            maf_df = load_maf(path)
            file_patients = {}
            # Group by a scalar key so `barcode` is the plain column value;
            # a one-element list key yields 1-tuples on pandas >= 2.0
            # (assuming load_maf returns a pandas DataFrame).
            for barcode, group_df in maf_df.groupby('Tumor_Sample_Barcode'):
                vcf_df = maf_to_vcf(group_df)
                patient_id = get_patient_id(barcode)
                file_patients[patient_id] = vcf_df
        else:
            patient_id = get_patient_id(base)
            vcf_df = load_variants(path)
            file_patients = {patient_id: vcf_df}

        # .items() works on both Python 2 and 3 (.iteritems() is 2-only).
        for patient_id, vcf_df in file_patients.items():
            # Keep only the first three dash-separated fields of the ID
            # (presumably a TCGA-style barcode prefix -- confirm).
            patient_id = "-".join(patient_id.split("-")[:3])
            if patient_id in mutation_files:
                logging.warning(
                    "Already processed patient %s before file %s",
                    patient_id,
                    path)
            else:
                mutation_files[patient_id] = vcf_df

    if args.debug_patient_id:
        patient_id = args.debug_patient_id
        mutation_files = {patient_id: mutation_files[patient_id]}
    return mutation_files