Example #1
    def sci_variant_bldr(self):
        import allel
        import subprocess
        import collections
        import pandas as pd
        import os
        vcf_files = [_ for _ in os.listdir(self.path) if _.endswith('.vcf')]
        if len(vcf_files) > 1:
            print("Multiple VCFs detected. Files will be merged")
            gz_files = [_ for _ in os.listdir(self.path) if _.endswith('.vcf.gz')]
            if len(gz_files) < len(vcf_files):
                print("VCFs not compressed - compressing")
                for i in vcf_files:
                    vcf = self.path + i
                    # bgzip -c writes to stdout; redirect it into the .gz file
                    with open(vcf + ".gz", "wb") as gz_out:
                        subprocess.run(['bgzip', '-c', vcf], stdout=gz_out)
                    # bcftools merge needs a tabix index for each compressed input
                    subprocess.run(['tabix', '-p', 'vcf', vcf + ".gz"])
            command = ('bcftools merge --force-samples ' + self.path + '*.gz' +
                       ' -o ' + self.path + 'INPUT.vcf')
            subprocess.run(command, shell=True)
            vcfdata = allel.read_vcf(self.path + 'INPUT.vcf',
                                     fields=[
                                         'samples', 'calldata/GT',
                                         'variants/ALT', 'variants/REF',
                                         'variants/CHROM', 'variants/POS',
                                         'variants/svlen'
                                     ])
            vcfdf = allel.vcf_to_dataframe(
                self.path + 'INPUT.vcf',
                exclude_fields=['QUAL', 'FILTER_PASS', 'ID'])
        else:
            vcffile = [_ for _ in os.listdir(self.path) if _.endswith('.vcf')]
            vcfdata = allel.read_vcf(self.path + vcffile[0],
                                     fields=[
                                         'samples', 'calldata/GT',
                                         'variants/ALT', 'variants/REF',
                                         'variants/CHROM', 'variants/POS',
                                         'variants/svlen'
                                     ])
            vcfdf = allel.vcf_to_dataframe(
                self.path + vcffile[0],
                exclude_fields=['QUAL', 'FILTER_PASS', 'ID'])
        sample_set = list(collections.OrderedDict.fromkeys(vcfdata['samples']))
        gt = allel.GenotypeArray(
            vcfdata['calldata/GT']).to_n_alt()  # drop additional information
        gt_data = pd.DataFrame(gt, columns=sample_set)

        data = pd.concat([vcfdf, gt_data], axis=1, join='inner')
        return data
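A brief aside on the `to_n_alt()` call above: it collapses each diploid genotype to its count of non-reference alleles, which is what makes the per-sample columns numeric and concatenable with the variant dataframe. A toy illustration:

import allel

# 2 variants x 2 samples, diploid calls
g = allel.GenotypeArray([[[0, 0], [0, 1]],
                         [[1, 1], [0, 0]]])
print(g.to_n_alt())
# [[0 1]
#  [2 0]]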
Example #2
def save_mutect2(*args):

    patient = args[0]
    sample = args[1]

    path_to_mutect2 = f"/media/emir/Storage/Cancer/mutect/output/pat{patient}/s{sample}"
    mutect2_name = f"output_transcr_new_predicted_dbsnp.vcf"
    mutect2_wo_ann = allel.vcf_to_dataframe(os.path.join(
        path_to_mutect2, mutect2_name),
                                            fields=["numalt"],
                                            alt_number=1)
    alt_len = max(mutect2_wo_ann["numalt"])
    col_list = [
        "CHROM", "POS", "ID", "REF", "DP", "FILTER_PASS", "ANN_Annotation",
        "ANN_Annotation_Impact", "ANN_Gene_Name", "ANN_Gene_ID", "ANN_HGVS_c",
        "ANN_HGVS_p", "ANN_AA_pos"
    ]
    triple_list = []
    alt_cols = [
        "ALT", "dbNSFP_Polyphen2_HVAR_score", "dbNSFP_SIFT_score",
        "dbNSFP_MetaLR_score", "dbNSFP_Polyphen2_HDIV_score",
        "dbNSFP_Uniprot_acc", "dbNSFP_CADD_phred",
        "dbNSFP_Polyphen2_HDIV_pred", "dbNSFP_MutationTaster_score",
        "dbNSFP_SIFT_pred", "dbNSFP_MutationTaster_pred",
        "dbNSFP_Polyphen2_HVAR_pred", "dbNSFP_MetaLR_pred"
    ]
    for j in alt_cols:
        for i in range(1, alt_len + 1):
            triple_list.append(j + f"_{i}")
            col_list.append(j + f"_{i}")
    mutect2_wo_ann = allel.vcf_to_dataframe(os.path.join(
        path_to_mutect2, mutect2_name),
                                            fields="*",
                                            alt_number=alt_len,
                                            exclude_fields="ANN")
    mutect2_w_ann = allel.vcf_to_dataframe(os.path.join(
        path_to_mutect2, mutect2_name),
                                           fields="ANN",
                                           alt_number=alt_len,
                                           transformers=allel.ANNTransformer())
    mutect2 = pd.concat([mutect2_wo_ann, mutect2_w_ann], axis=1)
    mutect2 = mutect2[col_list]
    mutect2.fillna("-", inplace=True)
    # collapse the per-ALT columns back into one comma-separated column,
    # using the actual number of ALT alleles rather than assuming three
    for i in alt_cols:
        mutect2[i] = mutect2[[i + f"_{k}" for k in range(1, alt_len + 1)]] \
            .astype(str).agg(",".join, axis=1)
    mutect2.drop(triple_list, axis=1, inplace=True)
    mutect2 = pfam_annotate(mutect2)
    mutect2.to_pickle(os.path.join(path_to_mutect2, "mutect2.pkl"))
    return mutect2
Example #3
def compute_sample_freqs(
    variants_vcf: str,
    output: str,
    snp_only: bool,
):
    """
	Creates JSON file of form 
	{
		<position>: {
			"REF": [ <ref_alt1>, <ref_alt2>, ...]
			"ALT": [ <alt1>, <alt2>, ...]
			"probs": [<p_alt1>, <p_alt2>, ... <p_ref>] 
		}
	}
	"""
    variants = (
        allel.vcf_to_dataframe(variants_vcf, fields=[
            'POS', 'REF', 'ALT'
        ]).drop(['ALT_2', 'ALT_3'], axis=1)  # ALT_2, ALT_3 are always empty
    )
    genotypes = allel.read_vcf(variants_vcf, fields=['calldata/GT'])
    genotypes = genotypes['calldata/GT']
    # scikit-allel reads missing values as -1
    genotypes = np.where(genotypes == -1, 0, genotypes)
    # wrap in DataFrames so they can be concatenated with the variant table
    haplo_1 = pd.DataFrame(genotypes[:, :, 0])
    haplo_2 = pd.DataFrame(genotypes[:, :, 1])

    num_samples = haplo_1.shape[1] * 2

    haplos = pd.concat([variants, haplo_1, haplo_2], axis=1)
    if snp_only:
        haplos = haplos[(haplos['REF'].str.len() == 1) &
                        (haplos['ALT_1'].str.len()
                         == 1)]  # subset to only keep SNPs

    var_freqs = {}

    for pos in tqdm(haplos['POS'].unique()):
        sum_alt = 0
        pos_variants = haplos[haplos['POS'] == pos]
        refs = []
        alts = []  # renamed from `variants` to avoid shadowing the dataframe
        freqs = []
        # frequencies of each variant at this position
        for row in pos_variants.itertuples(index=False):
            # each tuple is of form (POS, REF, ALT_1, <genotypes...>)
            num_alt = np.sum(row[3:])
            sum_alt += num_alt
            refs.append(row[1])
            alts.append(row[2])
            freqs.append(num_alt / num_samples)
        # frequency of the reference
        freqs.append((num_samples - sum_alt) / num_samples)
        # add result to main dictionary
        var_freqs[int(pos)] = {'REF': refs, 'ALT': alts, 'freq': freqs}

    with open(output, 'w') as fp:
        json.dump(var_freqs, fp, sort_keys=True)

    return
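A usage sketch (file names hypothetical): the output maps each position to its alleles and frequencies under the 'freq' key, and JSON stringifies the integer position keys on the way back in:

import json

compute_sample_freqs('variants.vcf', 'sample_freqs.json', snp_only=True)
with open('sample_freqs.json') as fp:
    var_freqs = json.load(fp)  # position keys come back as strings, e.g. '12345'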
Example #4
def read_vcf(fp, all_columns):
    try:
        print('Reading file ' + fp + ' - size: ' +
              str(osutils.format_bytes(os.path.getsize(fp))))
        if all_columns:
            df = allel.vcf_to_dataframe(fp, fields='*', alt_number=1)
        else:
            df = allel.vcf_to_dataframe(fp)
        # Possible other similar fields are 'REF':'Ref', 'ALT':'Obs'
        df.rename(columns={'CHROM': 'Chr', 'POS': 'Start'}, inplace=True)

        return df
    except (OSError, IOError) as e:
        print('>>> read_vcf: {0} - error: {1} <<<'.format(
            fp, os.strerror(e.errno)))
        return None
Example #5
def extract_pharmcat_pgx_regions(tabix_executable_path, input_vcf, output_dir,
                                 input_ref_pgx_vcf):
    '''
    extract pgx regions in input_ref_pgx_vcf from input_vcf and save variants to path_output
    '''

    print(
        'Modify chromosome names.\nExtract PGx regions based on the input reference PGx position file.'
    )
    path_output = os.path.join(
        output_dir,
        obtain_vcf_file_prefix(input_vcf) + '.pgx_regions.vcf.gz')

    input_vcf_cyvcf2 = VCF(input_vcf)
    input_ref_pgx_pos_cyvcf2 = VCF(input_ref_pgx_vcf)

    # get pgx regions in each chromosome
    input_ref_pgx_pos_pandas = allel.vcf_to_dataframe(input_ref_pgx_vcf)
    input_ref_pgx_pos_pandas['CHROM'] = input_ref_pgx_pos_pandas[
        'CHROM'].replace({
            'chr': ''
        }, regex=True).astype(str).astype(int)
    ref_pgx_regions = input_ref_pgx_pos_pandas.groupby(
        ['CHROM'])['POS'].agg(get_vcf_pos_min_max).reset_index()
    # fix chr names
    chr_name_match = re.compile("^chr")
    if any(chr_name_match.match(line) for line in input_vcf_cyvcf2.seqnames):
        # chromosomes have leading 'chr' characters in the original VCF
        # pgx regions to be extracted
        ref_pgx_regions = ref_pgx_regions.apply(
            lambda row: ':'.join(row.values.astype(str)),
            axis=1).replace({'^': 'chr'}, regex=True)
    else:
        # chromosomes do not have leading 'chr' characters in the original VCF
        # add chromosome name with leading 'chr' to the VCF header
        for single_chr in input_vcf_cyvcf2.seqnames:
            input_vcf_cyvcf2.add_to_header('##contig=<ID=chr' + single_chr +
                                           '>')
        # pgx regions to be extracted
        ref_pgx_regions = ref_pgx_regions.apply(
            lambda row: ':'.join(row.values.astype(str)), axis=1)

    # write to a VCF output file
    # header
    output_vcf_cyvcf2 = Writer(path_output, input_vcf_cyvcf2, mode="wz")
    # content
    for single_region in ref_pgx_regions:
        for single_variant in input_vcf_cyvcf2(single_region):
            single_variant.CHROM = re.sub(r'^([0-9]+)', r'chr\1',
                                          single_variant.CHROM)
            output_vcf_cyvcf2.write_record(single_variant)

    # close pipe
    input_vcf_cyvcf2.close()
    input_ref_pgx_pos_cyvcf2.close()
    output_vcf_cyvcf2.close()

    tabix_index_vcf(tabix_executable_path, path_output)

    return path_output
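`get_vcf_pos_min_max` is defined elsewhere in that codebase; a minimal sketch of the behaviour the surrounding code assumes (one start-end span per chromosome, so the later ':'.join yields cyvcf2-style 'chrom:start-end' region strings):

def get_vcf_pos_min_max(pos_series):
    # hypothetical helper: collapse all PGx positions on one chromosome
    # into a single "min-max" span
    return str(pos_series.min()) + '-' + str(pos_series.max())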
Example #6
def get_max_num_ann(temp_out_name):
    num_ann_guess = 500
    callset = allel.vcf_to_dataframe(temp_out_name, fields='ANN', numbers={'ANN': num_ann_guess})
    num_ann = callset.apply(lambda x: sum(x != ''), axis=1)
    num_ann_max = num_ann.max()  # e.g. 175 for one observed file

    return num_ann_max
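A follow-up sketch ('annotated.vcf' is a hypothetical SnpEff-annotated file): once the true maximum is known, the file can be re-read with exactly that many ANN slots instead of the 500-entry guess:

import allel

num_ann_max = get_max_num_ann('annotated.vcf')
callset = allel.vcf_to_dataframe('annotated.vcf', fields='ANN',
                                 numbers={'ANN': num_ann_max})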
Example #7
 def alleles(self):
     import allel
     vcfInfo = allel.vcf_to_dataframe(
         self.vcf,
         ['variants/CHROM', 'variants/POS', 'variants/REF', 'variants/ALT'],
         alt_number=1)
     vcfList = vcfInfo.values.tolist()
     return vcfList
Example #8
    def parse_mutations(self):
        '''
        purpose: parse input mutation vcf file
        input: vcf file or gz vcf file, one alternate per line
        format: vcf 4.0 standard format
        output: mutation dataframe with mutation id and genomic position
        '''
        print("Parsing mutations file:{}".format(self.mutation_file))
        log.info("Parsing mutations file:{}".format(self.mutation_file))

        # check that file exists and is not empty
        self.file_check(self.mutation_file)

        # read in mutation file, truncate to only one mutation per line
        mutation_df = allel.vcf_to_dataframe(self.mutation_file,
                                             fields=[
                                                 'CHROM', 'POS', 'ID', 'REF',
                                                 'ALT', 'variants/STRAND',
                                                 'variants/svlen'
                                             ],
                                             alt_number=1,
                                             types={
                                                 'CHROM': 'object',
                                                 'POS': 'int32',
                                                 'ID': 'object',
                                                 'REF': 'object',
                                                 'ALT': 'object',
                                                 'STRAND': 'S1',
                                                 'variants/svlen': int
                                             },
                                             numbers={
                                                 "ALT": 1,
                                                 "STRAND": 1
                                             })

        # test that required columns are present
        self.test_cols(mutation_df, "mutation vcf",
                       ["CHROM", "POS", "ID", "REF", "ALT", "STRAND"])
        mutation_df.rename(columns={"CHROM":"chrom", "POS":"pos", "ID":"id", "REF":"ref", \
                             "ALT":"alt", "STRAND":"strand"}, inplace=True)

        # test that no two mutation ID's are the same
        assert mutation_df["id"].nunique() == mutation_df.shape[0]

        # drop any identical mutations
        mutation_df.drop_duplicates(["chrom", "pos", "ref", "alt", "strand"],
                                    inplace=True)

        # convert mutation lengths to mutation types
        mutation_df['mut_type'] = ""
        # use .loc to avoid pandas chained-assignment pitfalls
        mutation_df.loc[mutation_df["svlen"] == 0, 'mut_type'] = "SNV"
        mutation_df.loc[mutation_df["svlen"] < 0, 'mut_type'] = "DEL"
        mutation_df.loc[mutation_df["svlen"] > 0, 'mut_type'] = "INS"
        mutation_df["id"] = mutation_df.id.astype(
            str) + "_" + mutation_df.mut_type.astype(str)

        return mutation_df
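For intuition about the svlen thresholds (a comment-only illustration; scikit-allel computes svlen as the length difference between ALT and REF):

# REF="A",   ALT="T"   -> svlen  0 -> "SNV"
# REF="AGG", ALT="A"   -> svlen -2 -> "DEL"
# REF="A",   ALT="ATT" -> svlen  2 -> "INS"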
Example #9
def generate_var_id_for_exac(vcf_file, num_alt=3):
    # num_alt was a module-level constant in the original source; exposed here
    # as a parameter for the maximum number of ALT alleles per record
    callset = allel.vcf_to_dataframe(vcf_file, fields=['CHROM', 'POS', 'REF', 'ALT'], alt_number=num_alt)
    var_all = pd.DataFrame()
    for i in range(1, num_alt+1):
        ALT_i = 'ALT_' + str(i)
        var_i = callset[['CHROM', 'POS', 'REF', ALT_i]].apply(lambda x: "-".join(x.map(str)), axis=1)
        var_all = pd.concat([var_all, var_i], axis=1)

    return var_all
Example #10
    def parse_mutations(self):
        '''
        purpose: parse input mutation vcf file
        input: vcf file or gz vcf file, one alternate per line
        format: vcf 4.0 standard format
        output: mutation dataframe with mutation id and genomic position
        '''
        print("Parsing mutations file:{}".format(self.mutation_file))
        log.info("Parsing mutations file:{}".format(self.mutation_file))
        
        # check that file exists and is not empty 
        self.file_check(self.mutation_file)
        
        try:
            
            # read in mutation file, truncate to only one mutation per line
            mutation_df = allel.vcf_to_dataframe(self.mutation_file, 
                                                 fields=['CHROM', 'POS', 'ID', 
                                                         'REF', 'ALT', 
                                                         'variants/STRAND', 
                                                         'is_snp'], 
                                                 alt_number=1,
                                                 types={'CHROM':'object', 'POS':'int32',
                                                        'ID':'object', 'REF':'object',
                                                        'ALT':'object', 'STRAND':'S1',
                                                        'is_snp':'object'},
                                                 numbers={"ALT":1, "STRAND":1})

            # test that required columns are present
            self.test_cols(mutation_df, "mutation vcf", ["CHROM", "POS", "ID", "REF", "ALT", "STRAND"]) 
            mutation_df.rename(columns={"CHROM":"chrom", "POS":"pos", "ID":"id", "REF":"ref", \
                                 "ALT":"alt", "STRAND":"strand"}, inplace=True)

            # test that no two mutation ID's are the same
            assert mutation_df["id"].nunique() == mutation_df.shape[0]

            # drop any identical mutations
            mutation_df.drop_duplicates(["chrom", "pos", "ref", "alt", "strand"], inplace=True)
                                      
            # label mutation by length for bucketing
            conditions = [
                (mutation_df['ref'].str.len() <  mutation_df['alt'].str.len()),
                (mutation_df['ref'].str.len() >  mutation_df['alt'].str.len()),
                (mutation_df['is_snp'])
                ]
            choices = ['ins', 'del', 'snv']
             
            # add column with mutation type
            mutation_df['mut_type'] = np.select(conditions, choices, default='')      
            mutation_df.drop("is_snp", inplace=True, axis=1)      
            return mutation_df
        
        except Exception as e:
            error_msg = "Error parsing mutation vcf file: \n{}".format(e)
            log.error(error_msg)
            raise SystemExit(error_msg)
Example #11
def read10xlargeSVs(input, sv_type, qual_filter):

    df = allel.vcf_to_dataframe(input, fields=['variants/CHROM', 'variants/POS', 'variants/ID', 'variants/REF', 'variants/ALT', 'variants/QUAL', 'variants/FILTER_PASS', 'variants/END', 'variants/SVLEN'])
    if qual_filter:
        scores_cutoff = np.mean(df.QUAL) + 1*np.std(df.QUAL)
        df = df.loc[df['QUAL'] > scores_cutoff]
    df = df.loc[df['ALT_1']==sv_type]
    df.reset_index(inplace=True, drop=True)
    df['CHROM'] = df['CHROM'].map(lambda x: x.lstrip('chr'))

    return df
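A usage sketch (path and SV type illustrative; 10x-style SV VCFs encode the type in symbolic ALT alleles such as '<DEL>'):

dels = read10xlargeSVs('large_svs.vcf', sv_type='<DEL>', qual_filter=True)
print(dels[['CHROM', 'POS', 'QUAL']].head())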
Example #12
def fit_em_smm(
    variants_vcf: str,
    n_iterations: int,
    K: int,
    seed: int,
    logsum_approx: bool,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    variants = (
        allel.vcf_to_dataframe(variants_vcf, fields=[
            'POS', 'REF', 'ALT'
        ]).drop(['ALT_2', 'ALT_3'], axis=1)  # ALT_2, ALT_3 are always empty
    )
    genotypes = allel.read_vcf(variants_vcf, fields=['calldata/GT'])
    genotypes = genotypes['calldata/GT']
    # scikit-allel reads missing values as -1
    genotypes = np.where(genotypes == -1, 0, genotypes)
    haplo_1 = genotypes[:, :, 0]
    haplo_2 = genotypes[:, :, 1]
    haplos = np.hstack((haplo_1, haplo_2)).T

    # number of variants at each position, plus 1 because the reference
    # allele is always available
    n_variants_pos = variants.groupby('POS').count()['REF'].values + 1
    max_n_variants = n_variants_pos.max()
    n_loci = len(variants['POS'].unique())
    n_samples = haplos.shape[0]
    haplos = _encode_haplotypes(variants['POS'].values, haplos, n_samples,
                                n_loci)

    # em initialization
    rng = np.random.default_rng(seed)
    group_e_ini = rng.random(size=(n_samples, K))
    group_e = group_e_ini / np.sum(group_e_ini, axis=1, keepdims=True)
    group_probs = np.full(K, 1 / K)  # uniform probability vector over the K groups
    variant_ini = rng.random(size=(K, n_loci, max_n_variants))
    variant_probs = variant_ini / np.sum(variant_ini, axis=2, keepdims=True)
    # TODO: add step filtering this to correct number of variants

    return _em_loop(
        n_iterations,
        K,
        n_samples,
        n_loci,
        n_variants_pos,
        group_e,
        group_probs,
        variant_probs,
        haplos,
        logsum_approx,
    )
Example #13
    def merge_vcf_into_fasta(self, fasta_file, vcf_file):
        '''
        Apply VCF calls above the QUAL/MQ thresholds to a reference FASTA
        and write the resulting consensus sequence.
        '''
        fasta = FileName(fasta_file)
        vcf = FileName(vcf_file)

        output_file_name = vcf.name + "_merged_into_" + fasta.name + ".fasta"
        fasta_out = open(output_file_name, 'w')
        print(f'Threshold cutoff, not applying calls below {self.qual_threshold} QUAL')
        print(f'Threshold cutoff, not applying calls below {self.map_threshold} MQ')
        if self.ambiguity_NOT:
            print('AC=1 calls will NOT be reported with IUPAC ambiguity nucleotide codes.')
        else:
            print('AC=1 calls will be reported with IUPAC ambiguity nucleotide codes.')
        

        for seq_record in SeqIO.parse(fasta.file, "fasta"):
            qual_threshold = int(self.qual_threshold)
            map_threshold = int(self.map_threshold)
            ambiguity_NOT = self.ambiguity_NOT

            chrom_seq_dict = {}
            record = 0
            chrom = seq_record.id
            sequence = seq_record.seq
            for base in sequence:
                record += 1 
                chrom_seq_dict[chrom + "-" + str(record)] = base
            fasta_file_df = pd.DataFrame.from_dict(chrom_seq_dict, orient='index')
            fasta_file_df.columns = ["REF"]
            fasta_file_df = fasta_file_df.rename(columns={"REF": "ALT"})
            fasta_file_df.index.name = "Absolute_Pos"

            vcf_df = allel.vcf_to_dataframe(vcf.file, fields=['variants/CHROM', 'variants/POS', 'variants/QUAL', 'variants/REF', 'variants/ALT', 'variants/AC', 'variants/DP', 'variants/MQ'], alt_number=1)
            df1 = vcf_df[((vcf_df.QUAL >= qual_threshold) & (vcf_df.ALT.str.len() == 1) & (vcf_df.MQ >= map_threshold) | (vcf_df.REF == "N"))].copy()  # copy to avoid chained-assignment warnings below
            if not ambiguity_NOT:  #aka applying ambiguity
                df1.ALT = np.where(df1.AC.eq(1), df1.REF + df1.ALT, df1.ALT)  #cat to ALT column when AC=1
                df1.ALT = df1.ALT.replace({"AG": "R", "CT": "Y", "GC": "S", "AT": "W", "GT": "K", "AC": "M", "GA": "R", "TC": "Y", "CG": "S", "TA": "W", "TG": "K", "CA": "M"})
            df1.ALT = np.where(df1.REF.eq("N"), df1.REF, df1.ALT) #move the Ns to the ALT column
            df1["Absolute_Pos"] = df1["CHROM"].map(str) + "-" + df1["POS"].map(str)
            vcf_merge_read = df1[["Absolute_Pos", "ALT"]]
            vcf_merge_read = vcf_merge_read.set_index('Absolute_Pos')

            fasta_file_df.update(vcf_merge_read)  #merge vcf changes to fasta dataframe
            print(">{}".format(vcf.name + "_merged_into_" + fasta.name), file=fasta_out)
            print("{}".format(''.join(list(fasta_file_df.to_dict()['ALT'].values()))), file=fasta_out)

        print(f'\nFile written to: {output_file_name}\n')
        fasta_out.close()
        FastaLineBreaks(output_file_name)
Example #14
def read10xlargeSVs(input, sv_type):

    df = allel.vcf_to_dataframe(input,
                                fields=[
                                    'variants/CHROM', 'variants/POS',
                                    'variants/ID', 'variants/REF',
                                    'variants/ALT', 'variants/QUAL',
                                    'variants/FILTER_PASS', 'variants/END',
                                    'variants/SVLEN'
                                ])
    df = df[df['ALT_1'].notna()]
    df = df.loc[df['ALT_1'].str.contains("DUP")]
    df.reset_index(inplace=True, drop=True)
    df['CHROM'] = df['CHROM'].map(lambda x: x.lstrip('chr'))

    return df
Example #15
def read_vcf():

    global index, df, vcf, Chr
    df = allel.vcf_to_dataframe(vcf,
                                fields=['CHROM', 'POS', 'REF', 'ALT'],
                                alt_number=1)

    chromosome_vcf = df.CHROM
    for i, chr in enumerate(chromosome_vcf):
        if chr == Chr:
            index.append(i)
    print(index)
    for j in index:
        print(df.iloc[[j]])
Example #16
def read10x(input):

    df = allel.vcf_to_dataframe(input,
                                fields=[
                                    'variants/CHROM', 'variants/POS',
                                    'variants/ID', 'variants/REF',
                                    'variants/ALT', 'variants/QUAL',
                                    'variants/FILTER_PASS', 'variants/END',
                                    'variants/SVLEN'
                                ])
    # scores_cutoff = np.mean(df.QUAL) - 2*np.std(df.QUAL)
    # df = df.loc[df['FILTER_PASS']==True]
    # df = df.loc[df['QUAL']>scores_cutoff]
    df.reset_index(inplace=True, drop=True)
    df['CHROM'] = df['CHROM'].map(lambda x: x.lstrip('chr'))

    return df
Example #17
def extract_from_vcf(reference_fasta: str, variants_vcf: str, sample_ids: str,
                     output: str, start: int, end: int):
    reference = Bio.SeqIO.read(reference_fasta, 'fasta')
    reference_seq = str(reference.seq)
    reference_gene = reference_seq[start:end]

    ids = pd.read_csv(sample_ids, header=None)
    ids = ids[0].tolist()

    variants = (
        allel.vcf_to_dataframe(variants_vcf, fields=[
            'POS', 'REF', 'ALT'
        ]).drop(['ALT_2', 'ALT_3'], axis=1)  # ALT_2, ALT_3 are always empty
    )
    # shift POS to start from 0 in the gene's sequence
    variants['POS'] = variants['POS'] - start

    genotypes = allel.read_vcf(variants_vcf,
                               fields=['calldata/GT'],
                               samples=ids)
    genotypes = genotypes['calldata/GT']
    haplo_1 = pd.DataFrame(genotypes[:, :, 0])
    haplo_2 = pd.DataFrame(genotypes[:, :, 1])

    haplos = pd.concat([variants, haplo_1, haplo_2], axis=1)
    haplos = haplos[(haplos['REF'].str.len() == 1)
                    & (haplos['ALT_1'].str.len() == 1)]
    # reset the index after filtering; drop=True so no extra column is added,
    # keeping the three metadata columns (POS, REF, ALT_1) at positions 0-2
    haplos = haplos.reset_index(drop=True)

    # iterate over all haplotype columns, building the full sequence for each
    seqs = []
    for j in range(3, haplos.shape[1]):
        seq = list(reference_gene)  # cannot modify a string
        for i in range(len(haplos)):
            if haplos.iloc[i, j] == 1:
                alt = haplos.loc[i, 'ALT_1']
                seq[haplos.loc[i, 'POS']] = alt

        seq = pd.Series(seq, dtype='category')
        seqs.append(seq)
        print('Completed sample {}\n'.format(j))

    # feather is columnar, so prefer taking the transpose after loading
    seqs_df = pd.DataFrame(seqs)
    seqs_df.columns = seqs_df.columns.astype(str)
    seqs_df.to_feather(output)
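A companion sketch (hypothetical path) for the comment above: read the feather file back and take the transpose, so each column holds one haplotype's sequence:

import pandas as pd

seqs_df = pd.read_feather('haplotypes.feather').T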
Example #18
 def __init__(self, fileName):
     self.vcfFileName = fileName
     # AVAILABLE FIELDS
     # 'CHROM', 'POS', 'ID', 'REF', 'ALT_1', 'ALT_2', 'ALT_3', 'QUAL', 'PRO',
     #       'EPP_1', 'EPP_2', 'EPP_3', 'SRF', 'NS', 'AB_1', 'AB_2', 'AB_3',
     #       'NUMALT', 'SRR', 'RPPR', 'QA_1', 'QA_2', 'QA_3', 'RUN_1', 'RUN_2',
     #       'RUN_3', 'MQM_1', 'MQM_2', 'MQM_3', 'DPB', 'PAIREDR', 'SAR_1', 'SAR_2',
     #       'SAR_3', 'DPRA_1', 'DPRA_2', 'DPRA_3', 'BVAR', 'DP', 'RO', 'GTI',
     #       'ODDS', 'AC_1', 'AC_2', 'AC_3', 'AF_1', 'AF_2', 'AF_3', 'PAO_1',
     #       'PAO_2', 'PAO_3', 'PAIRED_1', 'PAIRED_2', 'PAIRED_3', 'CIGAR_1',
     #       'CIGAR_2', 'CIGAR_3', 'PQR', 'AO_1', 'AO_2', 'AO_3', 'LEN_1', 'LEN_2',
     #       'LEN_3', 'SRP', 'ABP_1', 'ABP_2', 'ABP_3', 'RPP_1', 'RPP_2', 'RPP_3',
     #       'MEANALT_1', 'MEANALT_2', 'MEANALT_3', 'AN', 'MQMR', 'QR', 'SAP_1',
     #       'SAP_2', 'SAP_3', 'PQA_1', 'PQA_2', 'PQA_3', 'TYPE_1', 'TYPE_2',
     #       'TYPE_3', 'EPPR', 'SAF_1', 'SAF_2', 'SAF_3', 'FILTER_PASS', 'numalt',
     #       'svlen_1', 'svlen_2', 'svlen_3', 'is_snp'
     self.dataFrame = allel.vcf_to_dataframe(fileName, fields='*')
Example #19
def sim_to_mhs(sim,
               vcf_path=os.getcwd() + '/vcf_mhs/',
               vcf_filename=datetime.now().strftime("%Y%m%d"),
               mhs_path=os.getcwd() + '/vcf_mhs/',
               mhs_filename=datetime.now().strftime("%Y%m%d") + '_mhs',
               suffix='',
               verbose=True):
    # function: given an msprime simulation, write a vcf for it and then call multihetsep() to write mhs
    # sim: msprime simulation object
    # vcf_path: path where you want to save vcf. Default is current directory
    # vcf_filename: the name of the vcf file that you want to write. Default is current date and time
    # vcf_filename = vcf_name
    # mhs_path: where do you want to save this mhs
    # mhs_filename: where do you want to save this file
    # verbose: Whether to print progress or not

    # get array of genotype for each variant. TODO this isn't necessary for PSMC's purposes
    # vars = [varient.genotypes for varient in sim.variants()]

    # check given directories are valid
    dir_check(vcf_path, mhs_path)

    #write vcf of genotypes
    if verbose: print('Writing vcf...')
    with open(vcf_path + vcf_filename + suffix + ".vcf", "w") as vcf_file:
        sim.write_vcf(vcf_file, 2)
    if verbose: print('vcf written to {}'.format(vcf_path))

    # write matrix of genotypes, maybe delete this
    gen_mat = sim.genotype_matrix()

    #read vcf as data frame
    sim_vcf = allel.vcf_to_dataframe(vcf_path + vcf_filename + suffix + ".vcf",
                                     fields='*')

    # generate and write mhs
    multihetsep(sim_vcf, mhs_path, mhs_filename, suffix, gen_mat, verbose)

    # nothing to return; the vcf and mhs files are written to disk
    return None
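A usage sketch under the legacy msprime API this example targets (sim.write_vcf(file, 2) is the 0.x calling convention); simulation parameters are illustrative:

import msprime

sim = msprime.simulate(sample_size=4, Ne=1e4, length=1e6,
                       recombination_rate=1e-8, mutation_rate=1e-8)
sim_to_mhs(sim, vcf_filename='demo', mhs_filename='demo_mhs')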
Example #20
def compare_cosmic(*args):

    patient = args[0]
    sample = args[1]

    path_to_rvboost = f"/media/emir/Storage/Cancer/mutect/rvboost/{patient}s{sample}"
    rvboost_name = f"intersect.{patient}_s{sample}_0.0.vcf"
    rvboost = allel.vcf_to_dataframe(os.path.join(path_to_rvboost,
                                                  rvboost_name),
                                     fields="*",
                                     alt_number=1)
    rvboost.query("SNPEFF_IMPACT == \"HIGH\" | SNPEFF_IMPACT == \"MODERATE\"",
                  inplace=True)
    merged_rvb = pd.merge(rvboost,
                          cosmic,
                          left_on=["CHROM", "POS"],
                          right_on=["CHROM", "POS"],
                          how="inner")

    path_to_mutect2 = f"/media/emir/Storage/Cancer/mutect/output/pat{patient}/s{sample}"
    mutect2_name = "mutect2.pkl"
    mutect2 = pd.read_pickle(os.path.join(path_to_mutect2, mutect2_name))
    mutect2.query(
        "ANN_Annotation_Impact == \"HIGH\" | ANN_Annotation_Impact == \"MODERATE\"",
        inplace=True)

    merged_m2 = pd.merge(mutect2,
                         cosmic,
                         left_on=["CHROM", "POS"],
                         right_on=["CHROM", "POS"],
                         how="inner")

    merged = pd.merge(merged_rvb,
                      merged_m2,
                      left_on=["CHROM", "POS"],
                      right_on=["CHROM", "POS"],
                      how="inner")
    venn2(subsets=(merged_m2.shape[0], merged_rvb.shape[0], merged.shape[0]),
          set_labels=(f"Mut2_pat{patient}s{sample}",
                      f"rvb_pat{patient}s{sample}"))
Example #21
def rvboost_s1_s2(patient):

    #     fig=plt.figure()
    d = {}
    for i in range(1, 3):
        path_to_rvb_files = f"/media/emir/Storage/Cancer/mutect/rvboost/{patient}s{i}"
        rvboost_name = f"intersect.{patient}_s{i}_0.0.vcf"
        d["s" + str(i)] = allel.vcf_to_dataframe(os.path.join(
            path_to_rvb_files, rvboost_name),
                                                 fields=["CHROM", "POS"],
                                                 alt_number=1)
    merged = pd.merge(d["s1"],
                      d["s2"],
                      left_on=["CHROM", "POS"],
                      right_on=["CHROM", "POS"],
                      how="inner")
    venn2(subsets=(d["s1"].shape[0], d["s2"].shape[0], merged.shape[0]),
          set_labels=(f"RVB_pat{patient}s1", f"RVB_pat{patient}s2"))
    plt.savefig(
        f"/media/emir/Storage/Cancer/mutect/output/pat{patient}/pics/rvb_s1_s2.png",
        dpi=250)
    plt.clf()
Example #22
def hapcall_s1_s2(patient):

    #     fig=plt.figure()
    d = {}
    for i in range(1, 3):
        path_to_vcfs = f"/media/emir/Storage/LINUX/gatk/gatk_source/Patient_{patient}/file_samples"
        #         vcf_name = f"{patient}_t_{i}.bam.g.vcf"
        vcf_name = f"gvcf_list_{i}.list.raw_snp.vcf"
        d["s" + str(i)] = allel.vcf_to_dataframe(os.path.join(
            path_to_vcfs, vcf_name),
                                                 fields="*",
                                                 alt_number=1)
    merged = pd.merge(d["s1"],
                      d["s2"],
                      left_on=["CHROM", "POS"],
                      right_on=["CHROM", "POS"],
                      how="inner")
    venn2(subsets=(d["s1"].shape[0], d["s2"].shape[0], merged.shape[0]),
          set_labels=(f"Mut2_pat{patient}s1", f"Mut2_pat{patient}s2"))
    plt.savefig(
        f"/media/emir/Storage/Cancer/mutect/output/pat{patient}/pics/mut2_s1_s2.png",
        dpi=250)
Example #23
def read_breakend(path, qual_filter):
    sample_frame = allel.vcf_to_dataframe(
        path,
        fields=[
            'variants/CHROM', 'variants/POS', 'variants/ID', 'variants/REF',
            'variants/ALT', 'variants/QUAL', 'variants/FILTER_PASS',
            'variants/MATEID', 'variants/SVTYPE'
        ])
    if qual_filter:
        scores_cutoff = np.mean(
            sample_frame.QUAL) + 1 * np.std(sample_frame.QUAL)
        sample_frame = sample_frame.loc[sample_frame['QUAL'] > scores_cutoff]
    df = sample_frame.loc[sample_frame['SVTYPE'].str.contains("BND", na=False)]
    df = pd.merge(df,
                  df[['ID', 'MATEID', 'CHROM', 'POS']],
                  left_on="ID",
                  right_on="MATEID")
    df = df[~df.MATEID_x.str.endswith("_1")]
    df = df[~df.MATEID_x.str.endswith("_3")]
    df = df.rename(columns={'ID_x': 'ID'})
    df['CHROM_x'] = df['CHROM_x'].map(lambda x: x.lstrip('chr'))
    df['CHROM_y'] = df['CHROM_y'].map(lambda x: x.lstrip('chr'))

    return df
Example #24
working_dir = "/home/nick_rose/nick/Downloads/vcf/"

#Create list of file names within directory
#(initialize the list outside the walk loop so files from every directory are kept)
file_list = []
for root, dirs, files in os.walk(working_dir):
    for filename in files:
        if filename.endswith('.vcf'):
            file_list.append(os.path.join(root, filename))

#Create master file
df_master = pd.DataFrame()

#Extract data from VCF files and Append to Master
for f in file_list:
    #Create header of loci
    df = allel.vcf_to_dataframe(f)
    df["LOCI"] = df["CHROM"] + "_" + df["POS"].astype(str)
    df = df[['LOCI']]
    df = df.T
    new_header = df.iloc[0]
    #Define Callset Data
    callset = allel.read_vcf(f, fields=['calldata/REPCN'])
    callset = callset['calldata/REPCN']
    #Create list of sample names (temporary naming scheme; only works for sample names 7 characters long)
    sam = f[len(working_dir):(len(working_dir) + 7)]
    df2 = pd.DataFrame(callset, columns=[sam])
    #Insert REPCN data
    df2 = df2.T
    df2.columns = (new_header)
    #Append file to master
    df_master = pd.concat([df_master, df2])
Example #25
    #Select only vcf files
    vcf_files = []
    for file in files:
        if file.split(".")[-1].casefold() == "vcf":
            vcf_files.append(file)
            #print(file)
    vcf_files.sort()
    #print(vcf_files)

    ### START THE MERGING PROCESS
    print("Started merging...")

    # Read the first two files
    vcf_1 = allel.vcf_to_dataframe(path + "/" + str(vcf_files[0]),
                                   fields=['CHROM', 'POS', 'REF', 'ALT'],
                                   alt_number=1)
    vcf_2 = allel.vcf_to_dataframe(path + "/" + str(vcf_files[1]),
                                   fields=['CHROM', 'POS', 'REF', 'ALT'],
                                   alt_number=1)

    #Dropping rows with insertions, e.g. ACCT, AT
    vcf_1 = vcf_1[vcf_1['ALT'].str.len().lt(2)]
    vcf_2 = vcf_2[vcf_2['ALT'].str.len().lt(2)]

    print("Merging " + str(vcf_files[0]))
    print("Merging " + str(vcf_files[1]))

    # Merge the first two vcf files
    merged_vcf = pd.merge(vcf_1,
                          vcf_2,
Example #26
    ax2 = fig.add_subplot(212, sharex=ax1)
    draw_motif(sm, ax2)
    return fig


### main ###
if False:
    db_f = '/home/sergio/tools/deepbind/db/db.tsv'

    db = pd.read_csv(db_f, sep='\t', comment="#", index_col=0)
    db = db[db['Labels'].isnull()]

    fn = '/home/sergio/media/NAS4/PFlab/TLX3_project/WES-seq/references/mouse_mm9_reference_genome.fa'
    vn = '/home/sergio/Res_CIML/TLX3_project/data/tracks/WGS-WES/Exomes/WES_TLX3_TAP.vcf'

    var = allel.vcf_to_dataframe(vn, fields='*', numbers={'ALT': 2})
    vnt = var.loc[1]

    fa = Fasta(fn)
    rg = fa['chr1'][6372606:6372646]

    if rg.name == vnt['CHROM']:
        pos = vnt['POS'] - rg.start
    else:
        print('This is not the correct variant')
        pos = np.nan

    ref = vnt['REF'].upper()
    alt = vnt['ALT_1'].upper()

    fs = rg.seq.upper()
Example #27
def chromosome_plotter():
    def onclick(event):
        global Chr, plotted

        i = 0
        for y in ax_y:

            if (y - 0.1) < event.ydata < (y + 0.3):
                Chr = chromosomes[i]
                print("The chromosome is :" + str(chromosomes[i]))

            i += 1

    global chromosomes, vcf

    root = tk.Tk()
    root.title("Variation Visualizer (CHROMOSOME VIEWER)")
    root.state('zoomed')
    df = allel.vcf_to_dataframe(vcf,
                                fields=['CHROM', 'POS', 'REF', 'ALT'],
                                alt_number=1)
    chromosome_vcf = df.CHROM

    fig = Figure(figsize=(10, 20))
    ax = fig.subplots()

    ax.set_ylim(0, 50)
    ax.set_xlim(1, 200)

    canvas = FigureCanvasTkAgg(fig, root)
    fig.set_canvas(canvas=canvas)

    ax_y = []
    ax_labels = []

    p = 0
    m = 48
    v = 47.5

    for chr in chromosomes:
        print(chr)
        chromosome = '{}.{}'.format(chr, 'fa')
        with open(chromosome) as fasta_file:  # will close the handle cleanly
            lengths = []
            # SeqIO.parse returns a generator; in this case the file is not
            # a multi-FASTA, so it yields a single record
            for record in SeqIO.parse(fasta_file, "fasta"):
                lengths.append(record.seq)

        seq = str(record.seq)
        print("Initial length :" + str(len(seq)))

        length = int(len(seq) / 2000000)
        print("Length is : " + str(length))
        start = 0
        p = 0

        for i in range(start, length):
            ax.broken_barh([(p, 1)], (m, 0.2), facecolors='#1a1a00')
            p += 1

        index = []
        counter = 0
        for i, chrm in enumerate(chromosome_vcf):
            if chrm == chr:
                counter += 1
                index.append(i)

        ax.annotate(str(counter) + " variations", (p + 2, m), fontsize=6)
        for j in index:
            pos = df.POS[j]
            pos = int(pos / 2000000)
            ax.broken_barh([(pos, 1)], (v, 1), facecolors='#b30000')

        ax_y.append(m + 0.1)
        ax_labels.append(chr)
        m -= 2
        v -= 2

    plotted = True
    ax.set_yticks(ax_y)
    ax.set_yticklabels(ax_labels, fontsize=12)

    fig.canvas.mpl_connect('button_press_event', onclick)
    canvas.draw()
    canvas.get_tk_widget().pack(side=tk.BOTTOM, fill=tk.BOTH, expand=True)
    root.mainloop()
Example #28
def extract_data_from_vcf(filepath):
    '''
    Extracts data from .vcf file, stores it in a pd.dataframe and
    returns the dataframe.
    '''
    return allel.vcf_to_dataframe(filepath, fields='*')
Example #29
def main(clinvar_url, vcf_file_name, output_directory):
    """
		Main function downloads vcf file, extract clinical variations and links between clinvar IDs (ID) and dbsnp IDs (RS) then write data in json format files.

		Args:
		vcf_file_name: <str> Name of vcf file in the website, this argument is optional.
		output_directory: <str> Path of the output directory, this argument is optional.

	"""
    # Check if output directory is provided
    if output_directory is not None:
        if os.path.exists(output_directory):
            # Remove the trailing / from the output directory in case the user included it
            if output_directory.endswith("/"):
                output_directory = output_directory[:-1]
        else:
            os.mkdir(output_directory)
    else:
        os.mkdir("output")
        output_directory = os.getcwd() + "/output"

    # Parse ncbi clinvar page
    page = requests.get(clinvar_url)
    page_parser = BeautifulSoup(page.content, "html.parser")

    # Get all vcf names in the web page
    vcf_files_list = [
        element["href"] for element in page_parser.find_all(href=True)
        if element["href"].endswith(".vcf.gz")
    ]

    # Case file name provided in the config
    if vcf_file_name is not None:
        # Verify if the file is in the ncbi clinvar ftp
        if vcf_file_name not in vcf_files_list:
            print("This file " + vcf_file_name + " is not in " + clinvar_url +
                  " website, please use vcf file available in the website.")
            exit()
    # Case file name is not provided, it downloads the first vcf file of the clinvar page
    else:
        vcf_file_name = vcf_files_list[0]

    # Download files in vcf, tbi and md5 formats
    print("Start to download " + vcf_file_name + ", " + vcf_file_name +
          ".tbi, " + vcf_file_name + ".md5  files :")
    vcf_file = complementTools.download_url(clinvar_url + vcf_file_name,
                                            output_directory)
    index_vcf_file = complementTools.download_url(
        clinvar_url + vcf_file_name + ".tbi", output_directory)
    md5_vcf_file = complementTools.download_url(
        clinvar_url + vcf_file_name + ".md5", output_directory)

    # Parse vcf file and extract clinical data
    print("Currently parsing vcf file.")
    vcf_data = allel.vcf_to_dataframe(vcf_file,
                                      fields=['variants/*', 'calldata/*'])
    nods_data, links_data = complementTools.extract_clinical_data(vcf_data)

    # Save data to json files
    print("Nodes and Links json files are saved in " + output_directory)
    nods_data.to_json(output_directory + "/nodes.json", orient="records")
    links_data.to_json(output_directory + "/links.json", orient="records")
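An illustrative call (the URL shown is ClinVar's public GRCh38 VCF directory at NCBI; adjust as needed):

main("https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/",
     vcf_file_name=None, output_directory=None)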
Example #30
# needed later to refer to each sample by position in the genotype array (calc Fst)
subpop1 = np.arange(len(subpopulation1))
subpop2 = np.arange(len(subpopulation1), len(my_samples))



###   data extraction

# needed for Genotype data
callset = allel.read_vcf(inputfile, fields='*', samples=my_samples, log=sys.stdout)
print(callset.keys())  # inspect which arrays were loaded


##  which fields exist in the vcf file and what they are named
df = allel.vcf_to_dataframe(inputfile, fields='*')



#####   filtering

print('filtering vcf file \n')

###   filter out unimportant vcf info (dependent on what you want to do later)

# set info_keep to user specifications, when given as ARGV
info_keep = ['ID', 'CHROM', 'POS', 'END', 'SVTYPE', 'QUAL', 'PRECISE', 'REF']

info_keep = info_keep + info_keep_opt
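A sketch of the filtering this sets up (assuming info_keep_opt holds the user-supplied extra fields): restrict the dataframe to the kept columns that actually exist in this VCF:

info_keep_present = [c for c in info_keep if c in df.columns]
df = df[info_keep_present]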