Esempio n. 1
0
def find_mutation_files(
        input_files, combined_maf=False, max_peptide_length=31):
    """
    Load the variants of every mutation file listed in `input_files`.

    Parameters
    ----------
    input_files : iterable of str
        Paths to inspect. A path is only loaded when its extension is
        in MUTATION_FILE_EXTENSIONS; every other path is skipped.

    combined_maf : bool
        If True, a file whose extension ends in 'maf' is split on its
        'Tumor_Sample_Barcode' column and each group is treated as a
        separate patient. Otherwise each file is one patient whose ID is
        derived from the filename without its extension.

    max_peptide_length : int
        Not used by this function; kept so existing callers still work.

    Returns
    -------
    OrderedDict mapping patient IDs to DataFrames containing basic
    variant information (chr, pos, ref, alt). Patient IDs are cut down
    to their first three dash-separated fields. When the same patient ID
    is seen again, the first entry is kept and a warning is logged.

    If the module-level `args.debug_patient_id` is set, only that
    patient's entry is returned (KeyError if it was never collected).
    """
    mutation_files = OrderedDict()

    # Iterate over the parameter itself; the previous name
    # `input_filenames` was not defined inside this function.
    for path in input_files:
        _, filename = split(path)
        base, ext = splitext(filename)
        if ext not in MUTATION_FILE_EXTENSIONS:
            continue

        if ext.endswith('maf') and combined_maf:
            maf_df = load_maf(path)
            file_patients = {}
            # Group by the scalar column name rather than a one-element
            # list: recent pandas versions yield 1-tuples as keys for
            # list groupers, but the bare barcode value is needed here.
            for barcode, group_df in maf_df.groupby('Tumor_Sample_Barcode'):
                vcf_df = maf_to_vcf(group_df)
                patient_id = get_patient_id(barcode)
                file_patients[patient_id] = vcf_df
        else:
            patient_id = get_patient_id(base)
            vcf_df = load_variants(path)
            file_patients = {patient_id: vcf_df}

        # .items() exists on both Python 2 and 3 (iteritems() does not).
        for patient_id, vcf_df in file_patients.items():
            # Keep only the first three dash-separated fields of the ID.
            patient_id = "-".join(patient_id.split("-")[:3])
            if patient_id in mutation_files:
                logging.warning(
                    "Already processed patient %s before file %s",
                    patient_id,
                    path)
            else:
                mutation_files[patient_id] = vcf_df

    # `args` is expected to be a module-level object defined elsewhere
    # in this file.
    if args.debug_patient_id:
        patient_id = args.debug_patient_id
        # Return the same mapping type as the non-debug path.
        mutation_files = OrderedDict(
            [(patient_id, mutation_files[patient_id])])
    return mutation_files
Esempio n. 2
0
def find_mutation_files(input_files,
                        combined_maf=False,
                        max_peptide_length=31):
    """
    Load the variants of every mutation file listed in `input_files`.

    Parameters
    ----------
    input_files : iterable of str
        Paths to inspect. A path is only loaded when its extension is
        in MUTATION_FILE_EXTENSIONS; every other path is skipped.

    combined_maf : bool
        If True, a file whose extension ends in 'maf' is split on its
        'Tumor_Sample_Barcode' column and each group is treated as a
        separate patient. Otherwise each file is one patient whose ID is
        derived from the filename without its extension.

    max_peptide_length : int
        Not used by this function; kept so existing callers still work.

    Returns
    -------
    OrderedDict mapping patient IDs to DataFrames containing basic
    variant information (chr, pos, ref, alt). Patient IDs are cut down
    to their first three dash-separated fields. When the same patient ID
    is seen again, the first entry is kept and a warning is logged.

    If the module-level `args.debug_patient_id` is set, only that
    patient's entry is returned (KeyError if it was never collected).
    """
    mutation_files = OrderedDict()

    # Iterate over the parameter itself; the previous name
    # `input_filenames` was not defined inside this function.
    for path in input_files:
        _, filename = split(path)
        base, ext = splitext(filename)
        if ext not in MUTATION_FILE_EXTENSIONS:
            continue

        if ext.endswith('maf') and combined_maf:
            maf_df = load_maf(path)
            file_patients = {}
            # Group by the scalar column name rather than a one-element
            # list: recent pandas versions yield 1-tuples as keys for
            # list groupers, but the bare barcode value is needed here.
            for barcode, group_df in maf_df.groupby('Tumor_Sample_Barcode'):
                vcf_df = maf_to_vcf(group_df)
                patient_id = get_patient_id(barcode)
                file_patients[patient_id] = vcf_df
        else:
            patient_id = get_patient_id(base)
            vcf_df = load_variants(path)
            file_patients = {patient_id: vcf_df}

        # .items() exists on both Python 2 and 3 (iteritems() does not).
        for patient_id, vcf_df in file_patients.items():
            # Keep only the first three dash-separated fields of the ID.
            patient_id = "-".join(patient_id.split("-")[:3])
            if patient_id in mutation_files:
                logging.warning(
                    "Already processed patient %s before file %s",
                    patient_id, path)
            else:
                mutation_files[patient_id] = vcf_df

    # `args` is expected to be a module-level object defined elsewhere
    # in this file.
    if args.debug_patient_id:
        patient_id = args.debug_patient_id
        # Return the same mapping type as the non-debug path.
        mutation_files = OrderedDict(
            [(patient_id, mutation_files[patient_id])])
    return mutation_files
Esempio n. 3
0
def load_variants(input_filename):
    """
    Read the input file into a DataFrame containing (at least)
    the basic columns of a VCF:
        - chr
        - pos
        - ref
        - alt

    The loader is chosen from the filename suffix:
        - ".vcf" -> load_vcf
        - ".maf" -> load_maf, then maf_to_vcf
        - "tab"  -> tab-separated file with a header row, then tab_to_vcf

    Raises AssertionError if the suffix matches none of the above.
    """
    # VCF and MAF files give us the raw mutations in genomic coordinates
    if input_filename.endswith(".vcf"):
        vcf_df = load_vcf(input_filename)
    elif input_filename.endswith(".maf"):
        maf_df = load_maf(input_filename)
        vcf_df = maf_to_vcf(maf_df)
    # NOTE(review): this suffix has no leading dot, unlike the two
    # branches above, so any name ending in "tab" matches -- confirm
    # whether ".tab" was intended.
    elif input_filename.endswith("tab"):
        tab_df = pd.read_csv(input_filename, sep='\t', header=0)
        vcf_df = tab_to_vcf(tab_df)
    else:
        # Raise explicitly instead of `assert False`: assert statements
        # are removed under `python -O`, which would let execution fall
        # through to the return with `vcf_df` unbound. The exception
        # type and message are the same as before.
        raise AssertionError(
            "Unrecognized file type %s" % input_filename)
    return vcf_df
Esempio n. 4
0
def load_variants(input_filename):
    """
    Read the input file into a DataFrame containing (at least)
    the basic columns of a VCF:
        - chr
        - pos
        - ref
        - alt

    The loader is chosen from the filename suffix:
        - ".vcf" -> load_vcf
        - ".maf" -> load_maf, then maf_to_vcf
        - "tab"  -> tab-separated file with a header row, then tab_to_vcf

    Raises AssertionError if the suffix matches none of the above.
    """
    # VCF and MAF files give us the raw mutations in genomic coordinates
    if input_filename.endswith(".vcf"):
        vcf_df = load_vcf(input_filename)
    elif input_filename.endswith(".maf"):
        maf_df = load_maf(input_filename)
        vcf_df = maf_to_vcf(maf_df)
    # NOTE(review): this suffix has no leading dot, unlike the two
    # branches above, so any name ending in "tab" matches -- confirm
    # whether ".tab" was intended.
    elif input_filename.endswith("tab"):
        tab_df = pd.read_csv(input_filename, sep='\t', header=0)
        vcf_df = tab_to_vcf(tab_df)
    else:
        # Raise explicitly instead of `assert False`: assert statements
        # are removed under `python -O`, which would let execution fall
        # through to the return with `vcf_df` unbound. The exception
        # type and message are the same as before.
        raise AssertionError(
            "Unrecognized file type %s" % input_filename)
    return vcf_df