def sci_variant_bldr(self):
    """Build a combined variant/genotype table from the VCF file(s) in ``self.path``.

    When the directory holds more than one ``.vcf`` file, every file that has
    no ``.vcf.gz`` sibling yet is bgzip-compressed and tabix-indexed, all
    ``*.gz`` files in the directory are merged with
    ``bcftools merge --force-samples`` into ``INPUT.vcf``, and that merged file
    is loaded.  With a single ``.vcf`` file, it is loaded directly.

    Returns:
        pandas.DataFrame: the per-variant columns produced by
        ``allel.vcf_to_dataframe`` (QUAL, FILTER_PASS and ID excluded) joined
        column-wise with one alternate-allele-count column per sample.

    Raises:
        FileNotFoundError: if ``self.path`` contains no ``.vcf`` file.
        subprocess.CalledProcessError: if bgzip, tabix or bcftools fails.
    """
    import collections
    import glob
    import os
    import subprocess

    import allel
    import pandas as pd

    genotype_fields = [
        'samples', 'calldata/GT', 'variants/ALT', 'variants/REF',
        'variants/CHROM', 'variants/POS', 'variants/svlen'
    ]
    excluded_fields = ['QUAL', 'FILTER_PASS', 'ID']

    # List the directory once instead of re-scanning it for every check.
    vcf_names = [name for name in os.listdir(self.path) if name.endswith('.vcf')]
    if not vcf_names:
        raise FileNotFoundError("No .vcf file found in {}".format(self.path))

    if len(vcf_names) > 1:
        print("Multiple VCFs detected. Files will be merged")
        # NOTE(review): an INPUT.vcf left over from a previous run also ends in
        # '.vcf' and is therefore treated as an input here - confirm whether it
        # should be excluded.
        vcf_paths = [os.path.join(self.path, name) for name in vcf_names]
        uncompressed = [p for p in vcf_paths if not os.path.exists(p + ".gz")]
        if uncompressed:
            print("VCFs not compressed - compressing")
        for vcf in uncompressed:
            # 'bgzip -c' writes to stdout; redirect it through a managed file
            # handle rather than passing a literal '>' argument.
            with open(vcf + ".gz", "wb") as gz_out:
                subprocess.run(['bgzip', '-c', vcf], stdout=gz_out, check=True)
            # bcftools merge needs indexed inputs; index the .gz just written.
            subprocess.run(['tabix', '-p', 'vcf', vcf + ".gz"], check=True)

        merged_vcf = os.path.join(self.path, 'INPUT.vcf')
        # Same file set the former shell glob '<path>*.gz' expanded to, but
        # passed as an argument list so no shell is involved.
        gz_files = sorted(glob.glob(os.path.join(glob.escape(self.path), "*.gz")))
        subprocess.run(
            ['bcftools', 'merge', '--force-samples'] + gz_files + ['-o', merged_vcf],
            check=True)
        vcf_to_load = merged_vcf
    else:
        vcf_to_load = os.path.join(self.path, vcf_names[0])

    vcfdata = allel.read_vcf(vcf_to_load, fields=genotype_fields)
    vcfdf = allel.vcf_to_dataframe(vcf_to_load, exclude_fields=excluded_fields)

    # De-duplicate sample names while keeping their order in the VCF.
    sample_set = list(collections.OrderedDict.fromkeys(vcfdata['samples']))
    # to_n_alt() collapses each genotype call to its count of alternate
    # alleles, dropping the additional per-allele information.
    gt = allel.GenotypeArray(vcfdata['calldata/GT']).to_n_alt()
    gt_data = pd.DataFrame(gt, columns=sample_set)
    data = pd.concat([vcfdf, gt_data], axis=1, join='inner')
    return data
def save_mutect2(*args):
    """Load one sample's annotated Mutect2 VCF into a tidy DataFrame.

    Args:
        *args: ``args[0]`` is the patient identifier and ``args[1]`` the
            sample identifier; both are interpolated into the input path.

    Returns:
        pandas.DataFrame: the selected variant and ANN columns, plus one
        column per entry of ``alt_cols`` holding the per-allele values joined
        with ``","``.  Missing values are replaced by ``"-"``.
    """
    patient = args[0]
    sample = args[1]
    path_to_mutect2 = f"/media/emir/Storage/Cancer/mutect/output/pat{patient}/s{sample}"
    mutect2_name = "output_transcr_new_predicted_dbsnp.vcf"
    vcf_path = os.path.join(path_to_mutect2, mutect2_name)

    # First pass: only find the largest number of ALT alleles on any record so
    # the per-allele fields can be expanded to exactly that many columns.
    numalt_df = allel.vcf_to_dataframe(vcf_path, fields=["numalt"], alt_number=1)
    alt_len = max(numalt_df["numalt"])

    col_list = [
        "CHROM", "POS", "ID", "REF", "DP", "FILTER_PASS", "ANN_Annotation",
        "ANN_Annotation_Impact", "ANN_Gene_Name", "ANN_Gene_ID", "ANN_HGVS_c",
        "ANN_HGVS_p", "ANN_AA_pos"
    ]
    alt_cols = [
        "ALT", "dbNSFP_Polyphen2_HVAR_score", "dbNSFP_SIFT_score",
        "dbNSFP_MetaLR_score", "dbNSFP_Polyphen2_HDIV_score",
        "dbNSFP_Uniprot_acc", "dbNSFP_CADD_phred",
        "dbNSFP_Polyphen2_HDIV_pred", "dbNSFP_MutationTaster_score",
        "dbNSFP_SIFT_pred", "dbNSFP_MutationTaster_pred",
        "dbNSFP_Polyphen2_HVAR_pred", "dbNSFP_MetaLR_pred"
    ]

    # Map every per-allele field to the expanded column names it arrives under.
    # With a single ALT allele scikit-allel does not append a numeric suffix.
    # NOTE(review): assumes the dbNSFP_* fields are per-allele like ALT - confirm.
    if alt_len == 1:
        expanded = {name: [name] for name in alt_cols}
    else:
        expanded = {
            name: [f"{name}_{n}" for n in range(1, alt_len + 1)]
            for name in alt_cols
        }
    for name in alt_cols:
        col_list.extend(expanded[name])

    mutect2_wo_ann = allel.vcf_to_dataframe(
        vcf_path, fields="*", alt_number=alt_len, exclude_fields="ANN")
    mutect2_w_ann = allel.vcf_to_dataframe(
        vcf_path, fields="ANN", alt_number=alt_len,
        transformers=allel.ANNTransformer())
    mutect2 = pd.concat([mutect2_wo_ann, mutect2_w_ann], axis=1)
    # Copy so the in-place edits below act on an independent frame.
    mutect2 = mutect2[col_list].copy()
    mutect2.fillna("-", inplace=True)

    # Join all alt_len expanded columns; the count was previously fixed at 3,
    # which failed for fewer alleles and dropped data for more.
    for name in alt_cols:
        parts = [mutect2[col].map(str) for col in expanded[name]]
        joined = parts[0]
        for part in parts[1:]:
            joined = joined + "," + part
        mutect2[name] = joined

    triple_list = [
        col for name in alt_cols for col in expanded[name] if col != name
    ]
    mutect2.drop(triple_list, axis=1, inplace=True)
    return mutect2
    # NOTE(review): everything below is unreachable because of the return
    # above, so "mutect2.pkl" is never written by this function, although
    # compare_cosmic() reads a file of that name from the same directory.
    # Confirm whether the early return is a debugging leftover.
    mutect2 = pfam_annotate(mutect2)
    mutect2.to_pickle(os.path.join(path_to_mutect2, "mutect2.pkl"))
def compute_sample_freqs(
    variants_vcf: str,
    output: str,
    snp_only: bool,
):
    """
    Creates JSON file of form
    {
        <position>: {
            "REF": [ <ref_alt1>, <ref_alt2>, ...]
            "ALT": [ <alt1>, <alt2>, ...]
            "freq": [<f_alt1>, <f_alt2>, ... <f_ref>]
        }
    }

    Each frequency is the summed allele value of one VCF record divided by the
    number of haplotypes (2 x samples); the last entry is the remainder
    attributed to the reference.  Missing calls count as reference.

    Args:
        variants_vcf: path of the VCF to read.
        output: path of the JSON file to write.
        snp_only: keep only records whose REF and first ALT are one base long.
    """
    variants = (
        allel.vcf_to_dataframe(variants_vcf, fields=['POS', 'REF', 'ALT'])
        .drop(['ALT_2', 'ALT_3'], axis=1)  # ALT_2, ALT_3 are always empty
    )

    genotypes = allel.read_vcf(variants_vcf, fields=['calldata/GT'])['calldata/GT']
    # scikit-allel reads missing values as -1
    genotypes = np.where(genotypes == -1, 0, genotypes)

    # pd.concat only accepts pandas objects, so wrap the haplotype matrices
    # (as extract_from_vcf does) instead of passing raw ndarrays.
    haplo_1 = pd.DataFrame(genotypes[:, :, 0])
    haplo_2 = pd.DataFrame(genotypes[:, :, 1])
    num_samples = haplo_1.shape[1] * 2

    haplos = pd.concat([variants, haplo_1, haplo_2], axis=1)
    if snp_only:
        # subset to only keep SNPs
        haplos = haplos[(haplos['REF'].str.len() == 1)
                        & (haplos['ALT_1'].str.len() == 1)]

    var_freqs = {}
    # One pass over the groups instead of re-filtering the frame per position.
    for pos, pos_variants in tqdm(haplos.groupby('POS', sort=False)):
        sum_alt = 0
        refs = []
        alts = []
        freqs = []  # frequencies of each variant at this position
        for row in pos_variants.itertuples(index=False):
            # each tuple is of form (POS, REF, ALT_1, <haplotype values>...)
            num_alt = np.sum(row[3:])
            sum_alt += num_alt
            refs.append(row[1])
            alts.append(row[2])
            freqs.append(float(num_alt / num_samples))
        # frequency of the reference
        freqs.append(float((num_samples - sum_alt) / num_samples))
        # add result to main dictionary
        var_freqs[int(pos)] = {'REF': refs, 'ALT': alts, 'freq': freqs}

    with open(output, 'w') as fp:
        json.dump(var_freqs, fp, sort_keys=True)
def read_vcf(fp, all_columns):
    """Read a VCF into a DataFrame with 'CHROM'/'POS' renamed to 'Chr'/'Start'.

    Args:
        fp: path of the VCF file.
        all_columns: when truthy, load every field with a single ALT column;
            otherwise load scikit-allel's default fields.

    Returns:
        The DataFrame, or None when the file cannot be read or scikit-allel
        returns no table for it.
    """
    try:
        print('Reading file ' + fp + ' - size: ' +
              str(osutils.format_bytes(os.path.getsize(fp))))
        if all_columns:
            df = allel.vcf_to_dataframe(fp, fields='*', alt_number=1)
        else:
            df = allel.vcf_to_dataframe(fp)
    except (OSError, IOError) as e:
        # errno is None for OSErrors raised without one; os.strerror(None)
        # would itself raise a TypeError and mask the real problem.
        reason = os.strerror(e.errno) if e.errno is not None else str(e)
        print('>>> read_vcf: {0} - error: {1} <<<'.format(fp, reason))
        return None

    if df is None:
        # No table came back (e.g. a VCF without variant records).
        return None

    # Possible other similar fields are 'REF':'Ref', 'ALT':'Obs'
    df.rename(columns={'CHROM': 'Chr', 'POS': 'Start'}, inplace=True)
    return df
def extract_pharmcat_pgx_regions(tabix_executable_path, input_vcf, output_dir,
                                 input_ref_pgx_vcf):
    '''
    Copy the variants of input_vcf that fall inside the per-chromosome
    position ranges of input_ref_pgx_vcf into a bgzipped VCF in output_dir.

    Chromosome names of written records get a leading 'chr' when they start
    with digits.  The output file is tabix-indexed and its path is returned.
    '''
    print(
        'Modify chromosome names.\nExtract PGx regions based on the input reference PGx position file.'
    )

    output_name = obtain_vcf_file_prefix(input_vcf) + '.pgx_regions.vcf.gz'
    path_output = os.path.join(output_dir, output_name)

    source_vcf = VCF(input_vcf)
    reference_vcf = VCF(input_ref_pgx_vcf)

    # One row per chromosome with the aggregated POS range of the reference.
    ref_positions = allel.vcf_to_dataframe(input_ref_pgx_vcf)
    chrom_without_prefix = ref_positions['CHROM'].replace({'chr': ''}, regex=True)
    ref_positions['CHROM'] = chrom_without_prefix.astype(str).astype(int)
    per_chrom = ref_positions.groupby(
        ['CHROM'])['POS'].agg(get_vcf_pos_min_max).reset_index()
    # "<chrom>:<aggregated range>" strings used as cyvcf2 region queries.
    region_strings = per_chrom.apply(
        lambda row: ':'.join(row.values.astype(str)), axis=1)

    input_uses_chr = any(
        re.match("^chr", seqname) for seqname in source_vcf.seqnames)
    if input_uses_chr:
        # The input already names contigs 'chrN', so query with that prefix.
        ref_pgx_regions = region_strings.replace({'^': 'chr'}, regex=True)
    else:
        # The input uses bare names: declare the 'chr'-prefixed contigs in the
        # header, since written records are renamed below.
        for seqname in source_vcf.seqnames:
            source_vcf.add_to_header('##contig=<ID=chr' + seqname + '>')
        ref_pgx_regions = region_strings

    # The writer takes its header from the (possibly amended) input VCF.
    writer = Writer(path_output, source_vcf, mode="wz")
    for region in ref_pgx_regions:
        for variant in source_vcf(region):
            variant.CHROM = re.sub(r'^([0-9]+)', r'chr\1', variant.CHROM)
            writer.write_record(variant)

    source_vcf.close()
    reference_vcf.close()
    writer.close()

    tabix_index_vcf(tabix_executable_path, path_output)
    return path_output
def get_max_num_ann(temp_out_name, num_ann_guess=500):
    """Return the largest number of ANN entries found on any record of a VCF.

    Args:
        temp_out_name: path of the VCF to inspect.
        num_ann_guess: how many ANN columns scikit-allel is asked to expand
            the field into; the result can never exceed this value, so raise
            it if a file may carry more annotations per record.

    Returns:
        The maximum per-record count of non-empty ANN columns.
    """
    callset = allel.vcf_to_dataframe(temp_out_name,
                                     fields='ANN',
                                     numbers={'ANN': num_ann_guess})
    # Unused ANN slots are empty strings; count the filled ones per row in one
    # vectorised step rather than a Python-level apply over every row.
    num_ann = (callset != '').sum(axis=1)
    return num_ann.max()
def alleles(self):
    """Return [CHROM, POS, REF, ALT] for every record of ``self.vcf`` as a list of lists."""
    import allel

    wanted_fields = [
        'variants/CHROM', 'variants/POS', 'variants/REF', 'variants/ALT'
    ]
    # alt_number=1 keeps only the first alternate allele of each record.
    table = allel.vcf_to_dataframe(self.vcf, wanted_fields, alt_number=1)
    return table.values.tolist()
def parse_mutations(self):
    '''
    purpose: parse input mutation vcf file
    input: vcf file or gz vcf file, one alternate per line
    format: vcf 4.0 standard format
    output: mutation dataframe with mutation id and genomic position;
            columns are renamed to lower case, a mut_type column
            (SNV/DEL/INS, derived from svlen) is added and appended to the id
    raises: AssertionError if mutation IDs are not unique
    '''
    print("Parsing mutations file:{}".format(self.mutation_file))
    log.info("Parsing mutations file:{}".format(self.mutation_file))

    # check that file exists and is not empty
    self.file_check(self.mutation_file)

    # read in mutation file, truncate to only one mutation per line
    mutation_df = allel.vcf_to_dataframe(
        self.mutation_file,
        fields=[
            'CHROM', 'POS', 'ID', 'REF', 'ALT', 'variants/STRAND',
            'variants/svlen'
        ],
        alt_number=1,
        types={
            'CHROM': 'object',
            'POS': 'int32',
            'ID': 'object',
            'REF': 'object',
            'ALT': 'object',
            'STRAND': 'S1',
            'variants/svlen': int
        },
        numbers={
            "ALT": 1,
            "STRAND": 1
        })

    # test that required columns are present
    self.test_cols(mutation_df, "mutation vcf",
                   ["CHROM", "POS", "ID", "REF", "ALT", "STRAND"])

    mutation_df.rename(columns={"CHROM": "chrom", "POS": "pos", "ID": "id",
                                "REF": "ref", "ALT": "alt",
                                "STRAND": "strand"},
                       inplace=True)

    # test that no two mutation ID's are the same; raised explicitly so the
    # check survives "python -O", which strips assert statements
    if mutation_df["id"].nunique() != mutation_df.shape[0]:
        raise AssertionError(
            "Mutation IDs in {} are not unique".format(self.mutation_file))

    # drop any identical mutations
    mutation_df.drop_duplicates(["chrom", "pos", "ref", "alt", "strand"],
                                inplace=True)

    # convert mutation lengths to mutation types; .loc writes into the frame
    # itself, whereas chained df[col][mask] = ... may write to a temporary copy
    mutation_df['mut_type'] = ""
    mutation_df.loc[mutation_df["svlen"] == 0, 'mut_type'] = "SNV"
    mutation_df.loc[mutation_df["svlen"] < 0, 'mut_type'] = "DEL"
    mutation_df.loc[mutation_df["svlen"] > 0, 'mut_type'] = "INS"

    mutation_df["id"] = mutation_df.id.astype(
        str) + "_" + mutation_df.mut_type.astype(str)

    return mutation_df
def generate_var_id_for_exac(vcf_file):
    """Build "CHROM-POS-REF-ALT" identifiers for every ALT allele slot of a VCF.

    Reads the module-level ``num_alt`` to decide how many ALT columns are
    expanded.

    Args:
        vcf_file: path of the VCF to read.

    Returns:
        pandas.DataFrame with one row per VCF record and one column per ALT
        slot, each cell holding the dash-joined identifier.
    """
    callset = allel.vcf_to_dataframe(vcf_file,
                                     fields=['CHROM', 'POS', 'REF', 'ALT'],
                                     alt_number=num_alt)
    # Stringify the shared part once; it is identical for every ALT slot.
    prefix = (callset['CHROM'].map(str) + "-" + callset['POS'].map(str)
              + "-" + callset['REF'].map(str))

    var_all = pd.DataFrame()
    for i in range(1, num_alt + 1):
        # With a single ALT slot scikit-allel names the column 'ALT', not
        # 'ALT_1'; looking up 'ALT_1' would raise a KeyError.
        alt_col = 'ALT' if num_alt == 1 else 'ALT_' + str(i)
        # Column-wise concatenation replaces a slow row-by-row apply.
        var_i = prefix + "-" + callset[alt_col].map(str)
        var_i.name = None  # keep the result unnamed, as the row-wise apply did
        var_all = pd.concat([var_all, var_i], axis=1)
    return var_all
def parse_mutations(self):
    '''
    purpose: parse input mutation vcf file
    input: vcf file or gz vcf file, one alternate per line
    format: vcf 4.0 standard format
    output: mutation dataframe with mutation id and genomic position;
            columns are renamed to lower case and a mut_type column
            (ins/del/snv, '' when none applies) is added
    raises: SystemExit with the error text if anything fails while parsing
    '''
    print("Parsing mutations file:{}".format(self.mutation_file))
    log.info("Parsing mutations file:{}".format(self.mutation_file))

    # check that file exists and is not empty
    self.file_check(self.mutation_file)

    try:
        # read in mutation file, truncate to only one mutation per line
        mutation_df = allel.vcf_to_dataframe(
            self.mutation_file,
            fields=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'variants/STRAND',
                    'is_snp'],
            alt_number=1,
            types={'CHROM': 'object', 'POS': 'int32', 'ID': 'object',
                   'REF': 'object', 'ALT': 'object', 'STRAND': 'S1',
                   'is_snp': 'object'},
            numbers={"ALT": 1, "STRAND": 1})

        # test that required columns are present
        self.test_cols(mutation_df, "mutation vcf",
                       ["CHROM", "POS", "ID", "REF", "ALT", "STRAND"])

        mutation_df.rename(columns={"CHROM": "chrom", "POS": "pos",
                                    "ID": "id", "REF": "ref", "ALT": "alt",
                                    "STRAND": "strand"},
                           inplace=True)

        # test that no two mutation ID's are the same; raised explicitly so
        # the check survives "python -O", which strips assert statements
        if mutation_df["id"].nunique() != mutation_df.shape[0]:
            raise AssertionError(
                "Mutation IDs in {} are not unique".format(self.mutation_file))

        # drop any identical mutations
        mutation_df.drop_duplicates(["chrom", "pos", "ref", "alt", "strand"],
                                    inplace=True)

        # label mutation by length for bucketing; np.select takes the first
        # matching condition, so length differences win over is_snp
        ref_len = mutation_df['ref'].str.len()
        alt_len = mutation_df['alt'].str.len()
        conditions = [
            (ref_len < alt_len),
            (ref_len > alt_len),
            (mutation_df['is_snp'])
        ]
        choices = ['ins', 'del', 'snv']

        # add column with mutation type
        mutation_df['mut_type'] = np.select(conditions, choices, default='')
        mutation_df.drop("is_snp", inplace=True, axis=1)

        return mutation_df

    except Exception as e:
        error_msg = "Error parsing mutation vcf file: \n{}".format(e)
        log.error(error_msg)
        # chain the original exception so its traceback is not lost
        raise SystemExit(error_msg) from e
def read10xlargeSVs(input, sv_type, qual_filter):
    """Load structural variants of one type from a VCF.

    Args:
        input: path of the VCF to read.
        sv_type: value the first ALT allele (``ALT_1``) must equal.
        qual_filter: when truthy, keep only records whose QUAL is above the
            mean QUAL plus one standard deviation.

    Returns:
        pandas.DataFrame of the matching records, re-indexed from 0, with a
        leading "chr" removed from CHROM.
    """
    df = allel.vcf_to_dataframe(input, fields=[
        'variants/CHROM', 'variants/POS', 'variants/ID', 'variants/REF',
        'variants/ALT', 'variants/QUAL', 'variants/FILTER_PASS',
        'variants/END', 'variants/SVLEN'
    ])

    if qual_filter:
        scores_cutoff = np.mean(df.QUAL) + 1*np.std(df.QUAL)
        df = df.loc[df['QUAL'] > scores_cutoff]

    # reset_index() returns a fresh frame, so the column assignment below does
    # not write into a slice of the frame that was read.
    df = df.loc[df['ALT_1'] == sv_type].reset_index(drop=True)

    # Remove only a literal "chr" prefix.  str.lstrip('chr') strips any run of
    # the characters c/h/r and would also mangle names such as "hs37d5".
    df['CHROM'] = df['CHROM'].map(
        lambda name: name[3:] if name.startswith('chr') else name)
    return df
def fit_em_smm(
    variants_vcf: str,
    n_iterations: int,
    K: int,
    seed: int,
    logsum_approx: bool,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Initialise and run the EM loop on the haplotypes of a VCF.

    Args:
        variants_vcf: path of the VCF holding the variants and genotypes.
        n_iterations: forwarded to ``_em_loop``.
        K: number of groups; sizes the random initial group and variant
            probabilities.
        seed: seed for the random initialisation.
        logsum_approx: forwarded to ``_em_loop``.

    Returns:
        Whatever ``_em_loop`` returns for the initialised parameters.
    """
    variants = (
        allel.vcf_to_dataframe(variants_vcf, fields=['POS', 'REF', 'ALT'])
        .drop(['ALT_2', 'ALT_3'], axis=1)  # ALT_2, ALT_3 are always empty
    )

    genotypes = allel.read_vcf(variants_vcf, fields=['calldata/GT'])['calldata/GT']
    # scikit-allel reads missing values as -1
    genotypes = np.where(genotypes == -1, 0, genotypes)

    # Put both haplotypes of every sample side by side, then transpose so that
    # each row is one haplotype.
    haplo_1 = genotypes[:, :, 0]
    haplo_2 = genotypes[:, :, 1]
    haplos = np.hstack((haplo_1, haplo_2)).T

    # find number of variants by position; add 1 to account for the fact that
    # we always have a reference
    n_variants_pos = variants.groupby('POS').count()['REF'].values + 1
    max_n_variants = n_variants_pos.max()

    n_loci = len(variants['POS'].unique())
    n_samples = haplos.shape[0]

    haplos = _encode_haplotypes(variants['POS'].values, haplos, n_samples,
                                n_loci)

    # em initialization
    rng = np.random.default_rng(seed)

    group_e_ini = rng.random(size=(n_samples, K))
    group_e = group_e_ini / np.sum(group_e_ini, axis=1, keepdims=True)

    # Uniform probability vector over the K groups.  The length used to be
    # hard-coded to 6, which only sums to 1 when K == 6.
    group_probs = np.full(K, 1 / K)

    variant_ini = rng.random(size=(K, n_loci, max_n_variants))
    variant_probs = variant_ini / np.sum(variant_ini, axis=2, keepdims=True)
    # TODO: add step filtering this to correct number of variants

    return _em_loop(
        n_iterations,
        K,
        n_samples,
        n_loci,
        n_variants_pos,
        group_e,
        group_probs,
        variant_probs,
        haplos,
        logsum_approx,
    )
def merge_vcf_into_fasta(self, fasta_file, vcf_file):
    '''
    Apply the calls of a VCF to a FASTA and write the result to
    "<vcf name>_merged_into_<fasta name>.fasta" in the working directory.

    A call is applied when its QUAL and MQ reach self.qual_threshold and
    self.map_threshold and its ALT is a single base, or when its REF is "N".
    Unless self.ambiguity_NOT is set, calls with AC=1 are written as the IUPAC
    ambiguity code of REF+ALT.  Positions with REF "N" are written as "N".
    '''
    fasta = FileName(fasta_file)
    vcf = FileName(vcf_file)
    output_file_name = vcf.name + "_merged_into_" + fasta.name + ".fasta"

    print(f'Threshold cutoff, not applying calls below {self.qual_threshold} QUAL')
    print(f'Threshold cutoff, not applying calls below {self.map_threshold} MQ')
    if self.ambiguity_NOT:
        print(f'AC=1 calls will NOT be reported with IUPAC ambiguity nucleotide codes.')
    else:
        print(f'AC=1 calls will be reported with IUPAC ambiguity nucleotide codes')

    qual_threshold = int(self.qual_threshold)
    map_threshold = int(self.map_threshold)
    ambiguity_NOT = self.ambiguity_NOT

    # The VCF does not depend on the FASTA record, so read and filter it once
    # instead of once per sequence.
    vcf_df = allel.vcf_to_dataframe(
        vcf.file,
        fields=['variants/CHROM', 'variants/POS', 'variants/QUAL',
                'variants/REF', 'variants/ALT', 'variants/AC', 'variants/DP',
                'variants/MQ'],
        alt_number=1)
    # '&' binds tighter than '|': (QUAL and single-base ALT and MQ) or REF=="N".
    # .copy() makes the column edits below act on an independent frame.
    df1 = vcf_df[((vcf_df.QUAL >= qual_threshold)
                  & (vcf_df.ALT.str.len() == 1)
                  & (vcf_df.MQ >= map_threshold)
                  | (vcf_df.REF == "N"))].copy()
    if not ambiguity_NOT:  # aka applying ambiguity
        # cat to ALT column when AC=1
        df1["ALT"] = np.where(df1.AC.eq(1), df1.REF + df1.ALT, df1.ALT)
        df1["ALT"] = df1.ALT.replace({"AG": "R", "CT": "Y", "GC": "S",
                                      "AT": "W", "GT": "K", "AC": "M",
                                      "GA": "R", "TC": "Y", "CG": "S",
                                      "TA": "W", "TG": "K", "CA": "M"})
    # move the Ns to the ALT column
    df1["ALT"] = np.where(df1.REF.eq("N"), df1.REF, df1.ALT)
    # "<chrom>-<pos>" keys match the index built for the FASTA bases below.
    df1["Absolute_Pos"] = df1["CHROM"].map(str) + "-" + df1["POS"].map(str)
    vcf_merge_read = df1[["Absolute_Pos", "ALT"]].set_index('Absolute_Pos')

    # The context manager closes the output even if a record fails.
    with open(output_file_name, 'w') as fasta_out:
        for seq_record in SeqIO.parse(fasta.file, "fasta"):
            chrom = seq_record.id
            bases = list(str(seq_record.seq))
            # One row per base, keyed by its 1-based "<chrom>-<pos>".
            positions = pd.Index(
                [chrom + "-" + str(pos) for pos in range(1, len(bases) + 1)],
                name="Absolute_Pos")
            fasta_file_df = pd.DataFrame({"ALT": bases}, index=positions)
            # merge vcf changes to fasta dataframe
            fasta_file_df.update(vcf_merge_read)
            print(">{}".format(vcf.name + "_merged_into_" + fasta.name),
                  file=fasta_out)
            print("{}".format(''.join(fasta_file_df['ALT'])), file=fasta_out)

    print(f'\nFile written to: {output_file_name}\n')
    FastaLineBreaks(output_file_name)
def read10xlargeSVs(input, sv_type):
    """Load the records of a VCF whose first ALT allele contains "DUP".

    Args:
        input: path of the VCF to read.
        sv_type: currently unused.
            NOTE(review): the filter below is hard-coded to "DUP" regardless of
            this argument - confirm whether it should use sv_type.

    Returns:
        pandas.DataFrame of the matching records, re-indexed from 0, with a
        leading "chr" removed from CHROM.
    """
    df = allel.vcf_to_dataframe(input, fields=[
        'variants/CHROM', 'variants/POS', 'variants/ID', 'variants/REF',
        'variants/ALT', 'variants/QUAL', 'variants/FILTER_PASS',
        'variants/END', 'variants/SVLEN'
    ])
    # Drop records without a first ALT so .str.contains yields no missing values.
    df = df[df['ALT_1'].notna()]
    # reset_index() returns a fresh frame, so the column assignment below does
    # not write into a slice of the frame that was read.
    df = df.loc[df['ALT_1'].str.contains("DUP")].reset_index(drop=True)
    # Remove only a literal "chr" prefix.  str.lstrip('chr') strips any run of
    # the characters c/h/r and would also mangle names such as "hs37d5".
    df['CHROM'] = df['CHROM'].map(
        lambda name: name[3:] if name.startswith('chr') else name)
    return df
def read_vcf():
    """Load the global ``vcf`` and print the records on chromosome ``Chr``.

    Stores the table in the global ``df``, appends the row numbers of matching
    records to the global list ``index``, prints that list and then prints
    each row it refers to.
    """
    global index, df, vcf, Chr
    df = allel.vcf_to_dataframe(vcf,
                                fields=['CHROM', 'POS', 'REF', 'ALT'],
                                alt_number=1)
    # Row numbers whose CHROM equals the selected chromosome, in file order.
    matching_rows = [
        row_number for row_number, chrom_name in enumerate(df.CHROM)
        if chrom_name == Chr
    ]
    # extend() mutates the existing global list, exactly as repeated append did.
    index.extend(matching_rows)
    print(index)
    for row_number in index:
        print(df.iloc[[row_number]])
def read10x(input):
    """Load the structural-variant fields of a VCF into a DataFrame.

    Args:
        input: path of the VCF to read.

    Returns:
        pandas.DataFrame with a 0-based index and a leading "chr" removed from
        CHROM.  No quality or FILTER screening is applied.
    """
    df = allel.vcf_to_dataframe(input, fields=[
        'variants/CHROM', 'variants/POS', 'variants/ID', 'variants/REF',
        'variants/ALT', 'variants/QUAL', 'variants/FILTER_PASS',
        'variants/END', 'variants/SVLEN'
    ])
    df.reset_index(inplace=True, drop=True)
    # Remove only a literal "chr" prefix.  str.lstrip('chr') strips any run of
    # the characters c/h/r and would also mangle names such as "hs37d5".
    df['CHROM'] = df['CHROM'].map(
        lambda name: name[3:] if name.startswith('chr') else name)
    return df
def extract_from_vcf(reference_fasta: str, variants_vcf: str, sample_ids: str,
                     output: str, start: int, end: int):
    """Write one full gene sequence per haplotype to a feather file.

    For each haplotype of the selected samples, the reference slice
    ``[start:end]`` is copied and every single-base variant the haplotype
    carries (allele value 1) is substituted in.

    Args:
        reference_fasta: FASTA file holding exactly one reference record.
        variants_vcf: VCF with the variants and genotypes.
        sample_ids: headerless CSV whose first column lists the sample ids.
        output: path of the feather file to write (one row per haplotype).
        start: start of the gene slice; also subtracted from every POS.
        end: end of the gene slice.
    """
    reference = Bio.SeqIO.read(reference_fasta, 'fasta')
    reference_gene = str(reference.seq)[start:end]

    ids = pd.read_csv(sample_ids, header=None)[0].tolist()

    variants = (
        allel.vcf_to_dataframe(variants_vcf, fields=['POS', 'REF', 'ALT'])
        .drop(['ALT_2', 'ALT_3'], axis=1)  # ALT_2, ALT_3 are always empty
    )
    # shift POS to start from 0 in the gene's sequence
    # NOTE(review): shifted positions are used directly as indices into the
    # slice; confirm the convention of `start` relative to VCF POS.
    variants['POS'] = variants['POS'] - start

    genotypes = allel.read_vcf(variants_vcf,
                               fields=['calldata/GT'],
                               samples=ids)['calldata/GT']
    haplo_1 = pd.DataFrame(genotypes[:, :, 0])
    haplo_2 = pd.DataFrame(genotypes[:, :, 1])

    haplos = pd.concat([variants, haplo_1, haplo_2], axis=1)
    haplos = haplos[(haplos['REF'].str.len() == 1)
                    & (haplos['ALT_1'].str.len() == 1)]
    # drop=True keeps the layout (POS, REF, ALT_1, haplotypes...).  Without it
    # the old index becomes column 0 and column 3 is ALT_1, which was then
    # treated as a haplotype and produced one spurious sequence.
    haplos = haplos.reset_index(drop=True)

    positions = haplos['POS'].tolist()
    alt_bases = haplos['ALT_1'].tolist()

    # iterate over all haplotype columns, building the full sequence for each
    seqs = []
    for j in range(3, haplos.shape[1]):
        seq = list(reference_gene)  # cannot modify a string
        # Read the column once instead of one .iloc lookup per cell.
        carries_alt = (haplos.iloc[:, j] == 1).tolist()
        for pos, alt, has_alt in zip(positions, alt_bases, carries_alt):
            if has_alt:
                seq[pos] = alt
        seqs.append(pd.Series(seq, dtype='category'))
        print('Completed sample {}\n'.format(j))

    # as feather is columnar prefer taking transpose after loading
    seqs_df = pd.DataFrame(seqs)
    seqs_df.columns = seqs_df.columns.astype(str)
    seqs_df.to_feather(output)
def __init__(self, fileName):
    """Remember the VCF path and load all of its fields into ``self.dataFrame``."""
    # AVAILABLE FIELDS
    # 'CHROM', 'POS', 'ID', 'REF', 'ALT_1', 'ALT_2', 'ALT_3', 'QUAL', 'PRO',
    # 'EPP_1', 'EPP_2', 'EPP_3', 'SRF', 'NS', 'AB_1', 'AB_2', 'AB_3',
    # 'NUMALT', 'SRR', 'RPPR', 'QA_1', 'QA_2', 'QA_3', 'RUN_1', 'RUN_2',
    # 'RUN_3', 'MQM_1', 'MQM_2', 'MQM_3', 'DPB', 'PAIREDR', 'SAR_1', 'SAR_2',
    # 'SAR_3', 'DPRA_1', 'DPRA_2', 'DPRA_3', 'BVAR', 'DP', 'RO', 'GTI',
    # 'ODDS', 'AC_1', 'AC_2', 'AC_3', 'AF_1', 'AF_2', 'AF_3', 'PAO_1',
    # 'PAO_2', 'PAO_3', 'PAIRED_1', 'PAIRED_2', 'PAIRED_3', 'CIGAR_1',
    # 'CIGAR_2', 'CIGAR_3', 'PQR', 'AO_1', 'AO_2', 'AO_3', 'LEN_1', 'LEN_2',
    # 'LEN_3', 'SRP', 'ABP_1', 'ABP_2', 'ABP_3', 'RPP_1', 'RPP_2', 'RPP_3',
    # 'MEANALT_1', 'MEANALT_2', 'MEANALT_3', 'AN', 'MQMR', 'QR', 'SAP_1',
    # 'SAP_2', 'SAP_3', 'PQA_1', 'PQA_2', 'PQA_3', 'TYPE_1', 'TYPE_2',
    # 'TYPE_3', 'EPPR', 'SAF_1', 'SAF_2', 'SAF_3', 'FILTER_PASS', 'numalt',
    # 'svlen_1', 'svlen_2', 'svlen_3', 'is_snp'
    self.vcfFileName = fileName
    # fields='*' asks scikit-allel for every available field.
    loaded_table = allel.vcf_to_dataframe(fileName, fields='*')
    self.dataFrame = loaded_table
def sim_to_mhs(sim,
               vcf_path=None,
               vcf_filename=None,
               mhs_path=None,
               mhs_filename=None,
               suffix='',
               verbose="True"):
    """Write an msprime simulation to VCF and hand it to ``multihetsep()``.

    Args:
        sim: msprime simulation object.
        vcf_path: directory prefix for the VCF; defaults to
            ``<cwd>/vcf_mhs/``.  It is concatenated with the file name, so it
            must end with a path separator.
        vcf_filename: VCF file name without extension; defaults to today's
            date as YYYYMMDD.
        mhs_path: directory passed to ``multihetsep``; defaults to
            ``<cwd>/vcf_mhs/``.
        mhs_filename: file name passed to ``multihetsep``; defaults to
            ``<YYYYMMDD>_mhs``.
        suffix: appended to the VCF file name and passed to ``multihetsep``.
        verbose: print progress when truthy.
            NOTE(review): the default is the string "True", and any non-empty
            string (including "False") is truthy.

    Returns:
        None.
    """
    # Resolve defaults at call time.  As default-argument expressions they
    # were evaluated once at import, freezing the working directory and date.
    today = datetime.now().strftime("%Y%m%d")
    if vcf_path is None:
        vcf_path = os.getcwd() + '/vcf_mhs/'
    if vcf_filename is None:
        vcf_filename = today
    if mhs_path is None:
        mhs_path = os.getcwd() + '/vcf_mhs/'
    if mhs_filename is None:
        mhs_filename = today + '_mhs'

    # check given directories are valid
    dir_check(vcf_path, mhs_path)

    vcf_file_path = vcf_path + vcf_filename + suffix + ".vcf"

    # write vcf of genotypes
    if verbose:
        print('Writing vcf...')
    with open(vcf_file_path, "w") as vcf_file:
        sim.write_vcf(vcf_file, 2)
    if verbose:
        print('vcf written to {}'.format(vcf_path))

    # matrix of genotypes, passed on to multihetsep
    gen_mat = sim.genotype_matrix()

    # read vcf as data frame
    sim_vcf = allel.vcf_to_dataframe(vcf_file_path, fields='*')

    # generate and write mhs
    multihetsep(sim_vcf, mhs_path, mhs_filename, suffix, gen_mat, verbose)

    return None
def compare_cosmic(*args):
    """Draw a Venn diagram of COSMIC-matching variants from RVboost and Mutect2.

    ``args[0]`` is the patient id and ``args[1]`` the sample id.  Both call
    sets are restricted to HIGH/MODERATE impact, inner-joined with the global
    ``cosmic`` table on CHROM/POS, and the two results are intersected on the
    same keys.
    """
    patient, sample = args[0], args[1]
    locus_keys = ["CHROM", "POS"]

    # RVboost calls come straight from the VCF.
    rvboost_dir = f"/media/emir/Storage/Cancer/mutect/rvboost/{patient}s{sample}"
    rvboost_file = os.path.join(rvboost_dir,
                                f"intersect.{patient}_s{sample}_0.0.vcf")
    rvboost = allel.vcf_to_dataframe(rvboost_file, fields="*", alt_number=1)
    rvboost.query("SNPEFF_IMPACT == \"HIGH\" | SNPEFF_IMPACT == \"MODERATE\"",
                  inplace=True)
    merged_rvb = pd.merge(rvboost, cosmic,
                          left_on=locus_keys, right_on=locus_keys,
                          how="inner")

    # Mutect2 calls come from a previously pickled table.
    mutect2_dir = f"/media/emir/Storage/Cancer/mutect/output/pat{patient}/s{sample}"
    mutect2 = pd.read_pickle(os.path.join(mutect2_dir, "mutect2.pkl"))
    mutect2.query(
        "ANN_Annotation_Impact == \"HIGH\" | ANN_Annotation_Impact == \"MODERATE\"",
        inplace=True)
    merged_m2 = pd.merge(mutect2, cosmic,
                         left_on=locus_keys, right_on=locus_keys,
                         how="inner")

    # Loci reported by both callers.
    merged = pd.merge(merged_rvb, merged_m2,
                      left_on=locus_keys, right_on=locus_keys,
                      how="inner")

    subset_sizes = (merged_m2.shape[0], merged_rvb.shape[0], merged.shape[0])
    labels = (f"Mut2_pat{patient}s{sample}", f"rvb_pat{patient}s{sample}")
    venn2(subsets=subset_sizes, set_labels=labels)
def rvboost_s1_s2(patient):
    """Save a Venn diagram comparing a patient's RVboost calls in samples 1 and 2.

    The two VCFs are compared on CHROM/POS; the figure goes to the patient's
    ``pics/rvb_s1_s2.png`` and the current matplotlib figure is then cleared.
    """
    locus_keys = ["CHROM", "POS"]

    calls = {}
    for sample_no in (1, 2):
        sample_dir = f"/media/emir/Storage/Cancer/mutect/rvboost/{patient}s{sample_no}"
        sample_vcf = os.path.join(sample_dir,
                                  f"intersect.{patient}_s{sample_no}_0.0.vcf")
        calls[f"s{sample_no}"] = allel.vcf_to_dataframe(
            sample_vcf, fields=["CHROM", "POS"], alt_number=1)

    first, second = calls["s1"], calls["s2"]
    shared = pd.merge(first, second,
                      left_on=locus_keys, right_on=locus_keys,
                      how="inner")

    venn2(subsets=(first.shape[0], second.shape[0], shared.shape[0]),
          set_labels=(f"RVB_pat{patient}s1", f"RVB_pat{patient}s2"))
    plt.savefig(
        f"/media/emir/Storage/Cancer/mutect/output/pat{patient}/pics/rvb_s1_s2.png",
        dpi=250)
    plt.clf()
def hapcall_s1_s2(patient):
    """Save a Venn diagram comparing a patient's raw SNP calls in samples 1 and 2.

    The two VCFs are compared on CHROM/POS and the figure is written to the
    patient's ``pics/mut2_s1_s2.png``.

    NOTE(review): the set labels and the output file name say "Mut2"/"mut2"
    although the inputs are ``gvcf_list_<i>.list.raw_snp.vcf`` files - confirm
    the naming is intended.
    """
    d = {}
    for i in range(1, 3):
        path_to_vcfs = f"/media/emir/Storage/LINUX/gatk/gatk_source/Patient_{patient}/file_samples"
        # alternative input that was used before: f"{patient}_t_{i}.bam.g.vcf"
        vcf_name = f"gvcf_list_{i}.list.raw_snp.vcf"
        # Only row counts and the CHROM/POS join keys are used below, so load
        # just those two fields (as rvboost_s1_s2 does), not every field.
        d["s" + str(i)] = allel.vcf_to_dataframe(
            os.path.join(path_to_vcfs, vcf_name),
            fields=["CHROM", "POS"],
            alt_number=1)

    merged = pd.merge(d["s1"],
                      d["s2"],
                      left_on=["CHROM", "POS"],
                      right_on=["CHROM", "POS"],
                      how="inner")
    venn2(subsets=(d["s1"].shape[0], d["s2"].shape[0], merged.shape[0]),
          set_labels=(f"Mut2_pat{patient}s1", f"Mut2_pat{patient}s2"))
    plt.savefig(
        f"/media/emir/Storage/Cancer/mutect/output/pat{patient}/pics/mut2_s1_s2.png",
        dpi=250)
def read_breakend(path, qual_filter):
    """Load breakend (BND) records from a VCF and pair each with its mate.

    Args:
        path: path of the VCF to read.
        qual_filter: when truthy, keep only records whose QUAL is above the
            mean QUAL plus one standard deviation.

    Returns:
        pandas.DataFrame with one row per record/mate pair.  Columns of the
        record carry the ``_x`` suffix (its ID is renamed back to ``ID``) and
        the mate's ID, MATEID, CHROM and POS carry ``_y``.  Rows whose
        ``MATEID_x`` ends in "_1" or "_3" are dropped, and a leading "chr" is
        removed from both CHROM columns.
    """
    sample_frame = allel.vcf_to_dataframe(
        path,
        fields=[
            'variants/CHROM', 'variants/POS', 'variants/ID', 'variants/REF',
            'variants/ALT', 'variants/QUAL', 'variants/FILTER_PASS',
            'variants/MATEID', 'variants/SVTYPE'
        ])
    if qual_filter:
        scores_cutoff = np.mean(
            sample_frame.QUAL) + 1 * np.std(sample_frame.QUAL)
        sample_frame = sample_frame.loc[sample_frame['QUAL'] > scores_cutoff]

    df = sample_frame.loc[sample_frame['SVTYPE'].str.contains("BND", na=False)]
    # Self-join: attach to every record the row whose MATEID names it.
    df = pd.merge(df,
                  df[['ID', 'MATEID', 'CHROM', 'POS']],
                  left_on="ID",
                  right_on="MATEID")
    df = df[~df.MATEID_x.str.endswith("_1")]
    df = df[~df.MATEID_x.str.endswith("_3")]
    df = df.rename(columns={'ID_x': 'ID'})

    def _without_chr_prefix(name):
        # Remove only a literal "chr" prefix.  str.lstrip('chr') strips any
        # run of the characters c/h/r and would mangle names such as "hs37d5".
        return name[3:] if name.startswith('chr') else name

    df['CHROM_x'] = df['CHROM_x'].map(_without_chr_prefix)
    df['CHROM_y'] = df['CHROM_y'].map(_without_chr_prefix)
    return df
working_dir = "/home/nick_rose/nick/Downloads/vcf/"

# Create list of every .vcf file below working_dir.  The list is created once,
# before the walk: resetting it inside the loop kept only the files of the
# last directory visited.
file_list = []
for root, dirs, files in os.walk(working_dir):
    for filename in files:
        if filename.endswith('.vcf'):
            file_list.append(os.path.join(root, filename))

# Create master file: one row per VCF file, one column per locus.
df_master = pd.DataFrame()

# Extract data from VCF files and append to master
for f in file_list:
    # Create header of loci ("<CHROM>_<POS>" for every record)
    df = allel.vcf_to_dataframe(f)
    df["LOCI"] = df["CHROM"] + "_" + df["POS"].astype(str)
    df = df[['LOCI']]
    df = df.T
    new_header = df.iloc[0]

    # Define callset data
    callset = allel.read_vcf(f, fields=['calldata/REPCN'])
    callset = callset['calldata/REPCN']

    # Create sample name (temporary naming system: the first 7 characters of
    # the file name).  Taken from the base name so that files found in
    # sub-directories are named the same way as top-level ones.
    sam = os.path.basename(f)[:7]
    df2 = pd.DataFrame(callset, columns=[sam])

    # Insert REPCN data
    df2 = df2.T
    df2.columns = (new_header)

    # Append file to master
    df_master = pd.concat([df_master, df2])
#Select only vcf files vcf_files = [] for file in files: if file.split(".")[-1].casefold() == "vcf": vcf_files.append(file) #print(file) vcf_files.sort() #print(vcf_files) ### START THE MERGING PROCESS print("Started merging...") # Read the first two files vcf_1 = allel.vcf_to_dataframe(path + "/" + str(vcf_files[0]), fields=['CHROM', 'POS', 'REF', 'ALT'], alt_number=1) vcf_2 = allel.vcf_to_dataframe(path + "/" + str(vcf_files[1]), fields=['CHROM', 'POS', 'REF', 'ALT'], alt_number=1) #Dropping rows with insertions e.g ACCT, AT vcf_1 = vcf_1[vcf_1['ALT'].str.len().lt(2)] vcf_2 = vcf_2[vcf_2['ALT'].str.len().lt(2)] print("Merging " + str(vcf_files[0])) print("Merging " + str(vcf_files[1])) # Merge the first two vcf files merged_vcf = pd.merge(vcf_1, vcf_2,
ax2 = fig.add_subplot(212, sharex=ax1) draw_motif(sm, ax2) return fig ### main ### if False: db_f = '/home/sergio/tools/deepbind/db/db.tsv' db = pd.read_csv(db_f, sep='\t', comment="#", index_col=0) db = db[db['Labels'].isnull()] fn = '/home/sergio/media/NAS4/PFlab/TLX3_project/WES-seq/references/mouse_mm9_reference_genome.fa' vn = '/home/sergio/Res_CIML/TLX3_project/data/tracks/WGS-WES/Exomes/WES_TLX3_TAP.vcf' var = allel.vcf_to_dataframe(vn, fields='*', numbers={'ALT': 2}) vnt = var.loc[1] fa = Fasta(fn) rg = fa['chr1'][6372606:6372646] if rg.name == vnt['CHROM']: pos = vnt['POS'] - rg.start else: print('It is not correct variant') pos = np.nan ref = vnt['REF'].upper() alt = vnt['ALT_1'].upper() fs = rg.seq.upper()
# chromosome_plotter: open a Tk window titled "Variation Visualizer (CHROMOSOME VIEWER)"
# that draws one horizontal bar per chromosome and marks the VCF variants on it.
#
# Inputs are module globals: `chromosomes` (names; each is opened as "<name>.fa"
# relative to the working directory and parsed with SeqIO) and `vcf` (path read
# with scikit-allel, fields CHROM/POS/REF/ALT, first ALT only).
#
# Drawing: sequence length and variant positions are both divided by 2,000,000
# and truncated to int, so one bar unit covers 2,000,000 bases.  Each chromosome
# gets its own row (m and v start at 48 / 47.5 and step down by 2), an
# "<n> variations" annotation, and a y tick labelled with its name.
#
# The nested onclick handler is connected to 'button_press_event'; it sets the
# global `Chr` to the chromosome whose tick y satisfies
# y - 0.1 < event.ydata < y + 0.3 and prints it.
#
# The call blocks in root.mainloop() until the window is closed.
#
# NOTE(review): `plotted = True` is assigned in the outer function, but only
# the nested onclick declares `global plotted`; the outer assignment therefore
# looks like it binds a local name - confirm the intent.
# NOTE(review): the loop variable `chr` shadows the builtin of the same name,
# and the `lengths` list is filled but not read in the visible code.
def chromosome_plotter(): def onclick(event): global Chr, plotted i = 0 for y in ax_y: if event.ydata > (y - 0.1) and event.ydata < (y + 0.3): Chr = chromosomes[i] print("The chromosome is :" + str(chromosomes[i])) i += 1 global chromosomes, vcf root = tk.Tk() root.title("Variation Visualizer (CHROMOSOME VIEWER)") root.state('zoomed') df = allel.vcf_to_dataframe(vcf, fields=['CHROM', 'POS', 'REF', 'ALT'], alt_number=1) chromosome_vcf = df.CHROM fig = Figure(figsize=(10, 20)) ax = fig.subplots() ax.set_ylim(0, 50) ax.set_xlim(1, 200) canvas = FigureCanvasTkAgg(fig, root) fig.set_canvas(canvas=canvas) ax_y = [] ax_labels = [] p = 0 m = 48 v = 47.5 for chr in chromosomes: print(chr) chromosome = '{}.{}'.format(chr, 'fa') with open(chromosome) as fasta_file: # Will close handle cleanly lengths = [] for record in SeqIO.parse( fasta_file, "fasta" ): # (generator)...in this case it is not a multi fasta file lengths.append(record.seq) seq = str(record.seq) print("Initial length :" + str(len(seq))) length = int(len(seq) / 2000000) print("LEngth is : " + str(length)) start = 0 p = 0 for i in range(start, length): ax.broken_barh([(p, 1)], (m, 0.2), facecolors='#1a1a00') p += 1 i = 0 index = [] counter = 0 for chrm in chromosome_vcf: if chrm == chr: counter += 1 index.append(i) i += 1 ax.annotate(str(counter) + " variations", (p + 2, m), fontsize=6) for j in index: pos = df.POS[j] pos = int(pos / 2000000) ax.broken_barh([(pos, 1)], (v, 1), facecolors='#b30000') ax_y.append(m + 0.1) ax_labels.append(chr) m -= 2 v -= 2 plotted = True ax.set_yticks(ax_y) ax.set_yticklabels(ax_labels, fontsize=12) fig.canvas.mpl_connect('button_press_event', onclick) canvas.draw() canvas.get_tk_widget().pack(side=tk.BOTTOM, fill=tk.BOTH, expand=True) root.mainloop()
def extract_data_from_vcf(filepath):
    '''
    Load every available field of the .vcf file at filepath with scikit-allel
    and return the resulting pd.dataframe.
    '''
    vcf_table = allel.vcf_to_dataframe(filepath, fields='*')
    return vcf_table
def main(clinvar_url, vcf_file_name, output_directory):
    """
    Main function downloads vcf file, extract clinical variations and links between
    clinvar IDs (ID) and dbsnp IDs (RS) then write data in json format files.

    Args:
        clinvar_url: <str> URL of the clinvar page listing the vcf files; file
            names are appended to it directly to build the download URLs.
        vcf_file_name: <str> Name of vcf file in the website, this argument is
            optional (None selects the first ".vcf.gz" link of the page).
        output_directory: <str> Path of the output directory, this argument is
            optional (None selects "<current directory>/output").

    Raises:
        SystemExit: with status 1 if the requested file is not listed on the
            page, or if the page lists no ".vcf.gz" file at all.
    """
    # Check if output directory is provided
    if output_directory is not None:
        if os.path.exists(output_directory):
            # Remove / at the end of output directory in case if the user put
            # it in the path
            if output_directory.endswith("/"):
                output_directory = output_directory[:-1]
        else:
            # makedirs also creates missing parent directories
            os.makedirs(output_directory)
    else:
        output_directory = os.getcwd() + "/output"
        # Create the directory from Python instead of shelling out to mkdir;
        # an already existing directory is fine.
        os.makedirs(output_directory, exist_ok=True)

    # Parse ncbi clinvar page
    page = requests.get(clinvar_url)
    page_parser = BeautifulSoup(page.content, "html.parser")

    # Get all vcf names in the web page
    vcf_files_list = [
        element["href"] for element in page_parser.find_all(href=True)
        if element["href"].endswith(".vcf.gz")
    ]

    if vcf_file_name is not None:
        # Case file name provided in the config: verify that the file is in
        # the ncbi clinvar ftp
        if vcf_file_name not in vcf_files_list:
            print("This file " + vcf_file_name + " is not in " + clinvar_url +
                  " website, please use vcf file available in the website.")
            # Non-zero status so calling scripts can detect the failure.
            raise SystemExit(1)
    else:
        # Case file name is not provided, it downloads the first vcf file of
        # the clinvar page
        if not vcf_files_list:
            print("No .vcf.gz file found in " + clinvar_url + " website.")
            raise SystemExit(1)
        vcf_file_name = vcf_files_list[0]

    # Download files in vcf, tbi and md5 formats
    print("Start to download " + vcf_file_name + ", " + vcf_file_name +
          ".tbi, " + vcf_file_name + ".md5 files :")
    vcf_file = complementTools.download_url(clinvar_url + vcf_file_name,
                                            output_directory)
    # NOTE(review): the index and md5 files are downloaded but not used below;
    # in particular the md5 checksum is never verified.
    index_vcf_file = complementTools.download_url(
        clinvar_url + vcf_file_name + ".tbi", output_directory)
    md5_vcf_file = complementTools.download_url(
        clinvar_url + vcf_file_name + ".md5", output_directory)

    # Parse vcf file and extract clinical data
    print("Currently parsing vcf file.")
    vcf_data = allel.vcf_to_dataframe(vcf_file,
                                      fields=['variants/*', 'calldata/*'])
    nods_data, links_data = complementTools.extract_clinical_data(vcf_data)

    # Save data to json files
    print("Nodes and Links json files are saved in " + output_directory)
    nods_data.to_json(output_directory + "/nodes.json", orient="records")
    links_data.to_json(output_directory + "/links.json", orient="records")
# Script fragment: prepare sub-population indices, load the VCF and start the
# list of columns to keep.
#  - subpop1 / subpop2 are position ranges: the first len(subpopulation1)
#    positions and the remaining positions up to len(my_samples).
#  - `inputfile` is read twice with scikit-allel: as a callset restricted to
#    `my_samples` (progress logged to stdout) and as a DataFrame of all fields.
#  - info_keep is a fixed list of column names extended with `info_keep_opt`.
# `subpopulation1`, `my_samples`, `inputfile` and `info_keep_opt` are defined
# outside this fragment.
# NOTE(review): `callset.keys()` discards its result here.
# later needed to reffere to each sample by position in genome array (calc Fst) subpop1 = np.arange(len(subpopulation1)) subpop2 = np.arange(len(subpopulation1), len(my_samples)) ### data extraction # needed for Genotype data callset = allel.read_vcf(inputfile, fields='*', samples=my_samples, log=sys.stdout) callset.keys() ## what categories do exist in vcf file / what is their name df = allel.vcf_to_dataframe(inputfile, fields='*') # if X == True: print(df) ##### filtering print('filtering vcf file \n') ### filter out unimportant vcf info (dependent on what you want to do later) # set info_keep to user specifications, when given as ARGV info_keep = ['ID', 'CHROM', 'POS', 'END', 'SVTYPE', 'QUAL', 'PRECISE', 'REF'] info_keep = info_keep + info_keep_opt