def getUniqueVCF_entries(patient, cell):
    """Return the cell's variant records that are not shared with the bulk VCF.

    Both VCFs are loaded with ``VCF.dataframe`` and reduced to the columns
    CHROM, POS, ID, REF and ALT.  Rows that repeat in the concatenation of the
    two trimmed tables are treated as shared (germline); the cell table is then
    concatenated with those shared rows and every row occurring more than once
    is dropped.

    Args:
        patient: file name of the bulk VCF inside ``<cwd>/bulkVCF/``.
        cell: cell name; the file read is ``<cwd>/scVCF/<cell>.vcf``.

    Returns:
        A DataFrame (index reset) with the remaining rows, or ``None`` when
        one of the two files cannot be found.
    """
    basePATH = os.getcwd()
    patientPATH = basePATH + '/bulkVCF/' + patient
    cellPATH = basePATH + '/scVCF/' + cell + '.vcf'

    try:
        patient_df = VCF.dataframe(patientPATH)
        cell_df = VCF.dataframe(cellPATH)
    except FileNotFoundError as err:
        # Either file may be the missing one: report the path carried by the
        # exception and fall back to the cell path when it carries none.
        missingPATH = getattr(err, 'filename', None) or cellPATH
        print('FILE NOT FOUND: %s' % missingPATH)
        return

    patient_df_trimmed = patient_df[['CHROM', 'POS', 'ID', 'REF', 'ALT']]
    cell_df_trimmed = cell_df[['CHROM', 'POS', 'ID', 'REF', 'ALT']]

    # get whats SHARED between patient and cell
    # FIND GERMLINE MUTATIONS
    patient_cell_concat = pd.concat([patient_df_trimmed, cell_df_trimmed])
    rowsToKeep = patient_cell_concat.duplicated()
    patient_cell_shared = patient_cell_concat[rowsToKeep]
    patient_cell_shared = patient_cell_shared.reset_index(drop=True)

    # now go back to the original cell df, pull out whats UNIQUE
    # THIS IS THE GERMLINE FILTER!!
    cell_cell_concat = pd.concat([cell_df_trimmed, patient_cell_shared])
    cell_cell_concat_noDups = cell_cell_concat.drop_duplicates(keep=False)
    cell_cell_concat_noDups = cell_cell_concat_noDups.reset_index(drop=True)

    return (cell_cell_concat_noDups)
def runBatch(cellsList_file, outputDF_):
    """Process every cell listed in *cellsList_file* and append one row per cell.

    For each line of the list file the cell's ``.vcf`` and ``.g.vcf`` are
    fetched with ``get_s3_files``, loaded, restricted to the records selected
    by ``getGOI_record`` (using the module-level ``chrom_``, ``start_`` and
    ``end_``), summarised with ``getDepth_adv`` and combined into one row
    that is added to *outputDF_*.  Downloaded VCF files are deleted after each
    cell.

    Args:
        cellsList_file: path to a text file with one cell name per line.
        outputDF_: DataFrame the per-cell rows are appended to.

    Returns:
        The DataFrame with the new rows added.

    Side effects:
        Rebinds the global ``cellName`` for every cell and removes ``*.vcf*``
        files from the current working directory.
    """
    global cellName

    # Read the whole list up front so the handle is closed before the
    # (potentially long) per-cell work starts.
    with open(cellsList_file, "r") as cellsList_open:
        cells = cellsList_open.readlines()

    for cell in cells:
        cellName = cell.rstrip()

        # NOTE(review): the un-stripped line (with its trailing newline) is
        # what has always been passed here -- confirm get_s3_files expects it.
        get_s3_files(cell)

        cwd = os.getcwd()
        vcf_path = cwd + '/' + cell
        vcf_path_strip = vcf_path.rstrip() + '.vcf'
        gvcf_path = cwd + '/' + cell
        gvcf_path_strip = gvcf_path.rstrip() + '.g.vcf'

        vcf = VCF.dataframe(vcf_path_strip)
        gvcf = VCF.dataframe(gvcf_path_strip)

        # get a list of the records we actually care about
        toKeepList_v = vcf.apply(getGOI_record, axis=1, args=(chrom_, start_, end_))
        toKeepList_g = gvcf.apply(getGOI_record, axis=1, args=(chrom_, start_, end_))

        # subset by relevant records
        vcf_GOI = vcf[np.array(toKeepList_v, dtype=bool)]
        gvcf_GOI = gvcf[np.array(toKeepList_g, dtype=bool)]

        # get depth of coverage, for relevant records
        outputRow_v = getDepth_adv(vcf_GOI)
        outputRow_g = getDepth_adv(gvcf_GOI)

        # make the combined row, with both vcf and gvcf fields filled in
        outputRow_comb = pd.DataFrame(columns=colNames)  # colNames is a global
        outputRow_comb['cellName'] = outputRow_v['cellName']
        outputRow_comb['coverage_bool_vcf'] = outputRow_v['coverage_bool']
        outputRow_comb['depth_vcf'] = outputRow_v['depth']
        outputRow_comb['coverage_bool_gvcf'] = outputRow_g['coverage_bool']
        outputRow_comb['depth_gvcf'] = outputRow_g['depth']

        # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
        outputDF_ = pd.concat([outputDF_, outputRow_comb])

        # remove s3 files
        os.system('rm *.vcf > /dev/null 2>&1')  # remove, and mute errors
        os.system('rm *.vcf* > /dev/null 2>&1')  # remove, and mute errors

    return (outputDF_)
def check_vcf( input ):
    """Print the VCF metadata, then print each record, rewriting some indels.

    For a record whose REF is longer than one base and whose single ALT is
    also longer than one base, two deep copies (``line1``/``line2``) are
    rewritten with REF/ALT taken from the second base onwards and POS advanced
    by one, and those copies are printed.  A multi-allelic record with a
    multi-base REF raises ``Exception``.

    NOTE(review): this body was reconstructed from a whitespace-collapsed
    source; the pairing of the three trailing ``else`` branches with their
    ``if`` statements should be confirmed against the original file.
    """
    v = VCF( input )
    debug = False  # set to True to also print separators and the original record
    print "\n".join(v.metadata)
    for line in v.lines():
        r = line.ref
        a = line.alt_list
        if len(r) > 1:
            if len(a) > 1:
                raise Exception("WARNING: multi-allelic change not coded for")
            # work on copies so the record yielded by v.lines() stays untouched
            line1 = copy.deepcopy( line )
            line2 = copy.deepcopy( line )
            if len(a[0]) > 1:
                if len(r) < len(a[0]):  # insertion
                    if len(r) == 2:
                        line1.ref = r[1]
                        line1.alt = r[1]+a[0][2:]
                        line1.pos += 1
                        line2.ref = r[1]
                        line2.alt = a[0][1]
                        line2.pos += 1
                        if debug: print "===="
                        print line1
                        print line2
                        if debug: print line
                elif len(a[0]) < len(r):
                    # ALT shorter than REF
                    if len(a[0]) == 2:
                        line1.ref = r[1:]
                        line1.alt = r[1]
                        line1.pos += 1
                        line2.ref = r[1]
                        line2.alt = a[0][1]
                        line2.pos += 1
                        if debug: print "===="
                        print line1
                        print line2
                        if debug: print line
                else:
                    # REF and ALT have the same length
                    print line
            else:
                # single-base ALT
                print line
        else:
            # single-base REF
            print line
def test_genotypes(self):
    """Check the base VCF.process_snp_call reports for each genotype class."""
    # (sample field, ALT column, expected call); REF is 'A' in every case.
    cases = (
        ('0/0:10,9:19:99:254,0,337', 'T', 'A'),    # homozygous reference
        ('0/1:10,9:19:99:254,0,337', 'T', 'W'),    # heterozygote
        ('1/1:10,9:19:99:254,0,337', 'T', 'T'),    # homozygous alternate
        ('0/2:10,9:19:99:254,0,337', 'T,G', 'R'),  # reference + second alternate
        ('1/2:10,9:19:99:254,0,337', 'T,G', 'K'),  # both alternates
    )
    for sample_field, alt_column, expected in cases:
        called = VCF.process_snp_call(sample_field, 'A', alt_column,
                                      IUPAC_ambiguities=True)
        self.assertEqual(called, expected)
def test_make_slices_default_with_params_set(self):
    """Test slicing function with window_size set"""
    slices = VCF.get_slice_indicies(self.bgzip_path, regions=None,
                                    window_size=1008)
    # Advance to the slice at index 11 (or the last one, if there are fewer)
    # and compare its coordinates.
    for position, window in enumerate(slices):
        if position >= 11:
            break
    self.assertEqual(window, ('Chr01', 11089, 12096))
def test_make_slices_default_settings(self):
    """Test slicing function with default settings: 500 bp slices"""
    slices = VCF.get_slice_indicies(self.bgzip_path, regions=None,
                                    window_size=500)
    # Advance to the slice at index 11 (or the last one, if there are fewer)
    # and compare its coordinates.
    for position, window in enumerate(slices):
        if position >= 11:
            break
    self.assertEqual(window, ('Chr01', 5501, 6000))
def getGOIHits(fileNames, chrom, pos1, pos2):
    """Count, per cell, the LAUD-filtered mutations reported by hitSearchFunc.

    Each VCF in *fileNames* is loaded, its rows are mapped through
    ``getGenomePos``, intersected with the 'Mutation genome position' column
    of the module-level ``database_laud`` and the shared positions are passed
    to ``hitSearchFunc``; the per-position results are summed.

    Args:
        fileNames: iterable of VCF paths; ``"../vcf/"`` and ``".vcf"`` are
            removed from each path to form the cell name.
        chrom, pos1, pos2: query region, published through the globals
            ``queryChrom``, ``lPosQuery`` and ``rPosQuery`` (presumably read
            by ``hitSearchFunc`` -- confirm).

    Returns:
        dict mapping cell name to the summed hit count.
    """
    print('getting hits to GOI')
    global queryChrom, lPosQuery, rPosQuery  # dont like this

    genomePos_laud_db = pd.Series(database_laud['Mutation genome position'])
    # The database does not change between files: build the lookup set once
    # instead of once per VCF.
    laud_positions = set(genomePos_laud_db)
    cells_dict_GOI = {}
    queryChrom = chrom
    lPosQuery = pos1
    rPosQuery = pos2

    for f in fileNames:
        cell = f.replace("../vcf/", "")
        cell = cell.replace(".vcf", "")

        df = VCF.dataframe(f)
        genomePos_query = df.apply(getGenomePos, axis=1)  # apply function for every row in df

        shared = list(set(genomePos_query) & laud_positions)  # get the LAUD filter set
        shared1 = pd.Series(shared)  # what if i convert this guy to a pandas object?

        numMatches = shared1.apply(hitSearchFunc)  # another apply call

        cells_dict_GOI.update({cell: sum(numMatches)})

    return cells_dict_GOI
def generate_slices(args):
    """Build the list of (start, stop) windows for every contig of a VCF.

    Contig lengths come from ``vcf.chrm2length`` after ``set_chrms`` has read
    ``args.input``.  Window starts are spaced ``args.overlap`` apart and each
    window is ``args.window_size`` long.

    Args:
        args: object providing ``populations``, ``input``, ``window_size``
            and ``overlap``.

    Returns:
        A mapping (same type as ``vcf.chrm2length``) from contig name to a
        list of ``(start, stop)`` tuples; contigs no longer than one window
        keep the value ``None``.
    """
    vcf = VCF.VCF()
    vcf.populations = args.populations
    vcf.set_chrms(args.input)

    chrm_2_windows = vcf.chrm2length.fromkeys(vcf.chrm2length.keys(), None)
    window_size = args.window_size
    overlap = args.overlap

    for chrm in vcf.chrm2length.keys():
        length = vcf.chrm2length[chrm]

        # Skip contigs that are to short
        if length <= window_size:
            continue

        # Floor division keeps these integers on Python 3 as well, where "/"
        # would yield floats that range() rejects.
        remainder = length % window_size
        start = remainder // 2
        if remainder > overlap:
            # Fit windows into remaining space
            stop = (length - window_size) - overlap // 2
        else:
            # Prevent windows from invading remaining space
            stop = length - overlap * 2

        starts = range(start, stop, overlap)
        stops = [i + window_size for i in starts]
        # Materialise the pairs so callers always receive a list.
        chrm_2_windows[chrm] = list(zip(starts, stops))

    return chrm_2_windows
def summary_haplotype_block(vcf, haplotype, outfile):
    """Write a per-block summary of phased SNPs to *outfile*.

    Every record yielded by ``VCF.lines(vcf)`` is assigned to a block keyed
    ``<CHROM>_<block id>`` (the block id is the second ``:``-separated field
    of ``v['FCM'][1]``).  The first allele of the genotype is compared with
    ``haplotype['<CHROM>:<POS>']``: flag 0 if it equals the first entry,
    1 if it equals the second, -1 otherwise or when the SNP is not in
    *haplotype*.

    Args:
        vcf: path passed to ``VCF.lines``.
        haplotype: mapping ``'<CHROM>:<POS>'`` -> pair of bases.
        outfile: path of the tab-separated file to write.

    Output columns: block, SNP count, length, start, end, number of SNPs
    flagged 0, flagged 1 and flagged -1.
    """
    import VCF

    phased_block = defaultdict(list)      # block -> SNP positions
    phased_block_com = defaultdict(list)  # block -> haplotype flags

    for v in VCF.lines(vcf):
        block_id = re.split(r':', v['FCM'][1])[1]
        genotype = re.split(r':', v['FCM'][0])[0]
        block_idx = '{}_{}'.format(v['CHROM'], block_id)
        snp_idx = '{}:{}'.format(v['CHROM'], v['POS'])
        bases = [v['REF'], v['ALT']]
        hap1_10x = bases[int(genotype[0])]

        haplotype_flag = -1
        # "in" replaces dict.has_key, which no longer exists on Python 3.
        if snp_idx in haplotype:
            if hap1_10x == haplotype[snp_idx][0]:
                haplotype_flag = 0
            elif hap1_10x == haplotype[snp_idx][1]:
                haplotype_flag = 1

        phased_block[block_idx].append(int(v['POS']))
        phased_block_com[block_idx].append(haplotype_flag)

    # The context manager closes the file even if formatting a row fails.
    with open(outfile, 'w') as ofile:
        for blc in phased_block:
            flags = phased_block_com[blc]
            snps = len(phased_block[blc])
            start = np.min(phased_block[blc])
            end = np.max(phased_block[blc])
            length = int(end) - int(start) + 1
            hap1_n = len([i for i in flags if i == 0])
            hap2_n = len([i for i in flags if i == 1])
            hap0_n = len([i for i in flags if i == -1])
            ofile.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(
                blc, snps, length, start, end, hap1_n, hap2_n, hap0_n) + '\n')
def make_dadi_fs(args, region):
    """Build dadi-style site records for one region of a VCF.

    Args:
        args: object providing ``populations`` (mapping of population id to
            samples) and ``input`` (VCF path).
        region: positional arguments forwarded to ``vcf.slice_vcf``.

    Returns:
        ``None`` when the slice is ``None``; otherwise a tuple
        ``(final_dadi, pop_ids)`` where ``final_dadi`` maps
        ``'<CHROM>_<POS>'`` to a dict with the keys ``calls``, ``context``,
        ``outgroup_context``, ``outgroup_allele`` and ``segregating``.
    """
    vcf = VCF.VCF()
    vcf.populations = args.populations
    vcf.set_header(args.input)

    pop_ids = args.populations.keys()

    # Get slice and setup output dictionaries
    chunk = vcf.slice_vcf(args.input, *region)
    if chunk is None:
        return None

    g = count_alleles(chunk, args.populations)
    final_dadi = {}

    for row_count, row in enumerate(g):
        raw_calls = chunk[row_count]
        row['outgroups'] = {'ALT': 0, 'REF': 0}  # set empty outgroup

        # To Do: Need to create a function to fill outgroup if one is defined.
        # The heliconius dataset, for example, has this.

        if check_outgroup(row) == False:
            continue  # skip outgroup not fixed at one value

        if len(raw_calls['REF']) > 1 or len(raw_calls["ALT"]) > 1:
            continue  # skip multi allelic sites

        # CALL BASE FOR OUTGROUP
        outgroup_allele = get_outgroup_base(row, raw_calls)

        # CALL MAJOR ALLELE (BASE) FOR INGROUP
        major_allele = get_ingroup_major_allele(row, raw_calls, outgroup_allele)

        # POLORIZE REF AND ALT FOR INGROUP
        if major_allele != raw_calls['REF']:
            ref, alt = ('ALT', 'REF')
        else:
            ref, alt = ('REF', 'ALT')

        calls = {}
        for pop in pop_ids:
            calls[pop] = (row[pop][ref], row[pop][alt])

        row_id = "{0}_{1}".format(raw_calls['CHROM'], raw_calls['POS'])

        dadi_site = {'calls': calls,
                     'context': make_triplet(major_allele),
                     'outgroup_context': make_triplet(outgroup_allele),
                     'outgroup_allele': outgroup_allele,
                     'segregating': (raw_calls[ref], raw_calls[alt])
                     }

        final_dadi[row_id] = dadi_site

    return (final_dadi, pop_ids)
def test_header_vs_population_sample_ids(self):
    """Check that the sample IDs parsed from the population argument
       match those in the VCF file.

       NOTE: In practice the populations argument can contain fewer
       samples and populations than actually contained in the VCF file.
    """
    header = VCF.set_header(self.bgzip_path)

    # Header entries from index 9 onwards are taken as the sample IDs.
    header_sample_ids = []
    for position, field in enumerate(header):
        if position < 9:
            continue
        header_sample_ids.append(field)

    # Flatten the per-population sample lists into one list.
    populations_dict = VCF.parse_populations_list(self.populations_list)
    populations_sample_ids = []
    for members in populations_dict.values():
        populations_sample_ids.extend(members)

    # Check both unique IDs and equal length
    self.assertEqual(set(header_sample_ids), set(populations_sample_ids))
    self.assertEqual(len(header_sample_ids), len(populations_sample_ids))
def test_population_string_parsing(self):
    """The populations list must parse into the expected population -> samples mapping."""
    parsed = VCF.parse_populations_list(self.populations_list)

    expected = {
        'melpo': ['m523', 'm524', 'm525', 'm589', 'm675',
                  'm676', 'm682', 'm683', 'm687', 'm689'],
        'pachi': ['p516', 'p517', 'p518', 'p519', 'p520',
                  'p591', 'p596', 'p690', 'p694', 'p696'],
        'cydno': ['c511', 'c512', 'c513', 'c514', 'c515',
                  'c563', 'c614', 'c630', 'c639', 'c640'],
        'outgroups': ['h665', 'i02-210'],
    }

    self.assertEqual(parsed, expected)
def runBatch(cell):
    """Build the combined vcf/gvcf coverage row for one cell.

    The cell's ``.vcf`` and ``.g.vcf`` are read from ``<cwd>/vcf_files/``,
    restricted to the records selected by ``getGOI_record`` (using the
    module-level ``chrom_``, ``start_`` and ``end_``) and summarised with
    ``getDepth_adv``.

    Args:
        cell: cell name; trailing whitespace is ignored.

    Returns:
        A DataFrame with the columns in the global ``colNames``.  If any step
        fails, an empty DataFrame with those columns is returned instead.
    """
    try:
        cellName = cell.rstrip()

        #get_s3_files(cell)

        cwd = os.getcwd()
        vcf_path = cwd + '/vcf_files/' + cell
        vcf_path_strip = vcf_path.rstrip() + '.vcf'
        gvcf_path = cwd + '/vcf_files/' + cell
        gvcf_path_strip = gvcf_path.rstrip() + '.g.vcf'

        vcf = VCF.dataframe(vcf_path_strip)
        gvcf = VCF.dataframe(gvcf_path_strip)

        # get a list of the records we actually care about
        toKeepList_v = vcf.apply(getGOI_record, axis=1, args=(chrom_, start_, end_))
        toKeepList_g = gvcf.apply(getGOI_record, axis=1, args=(chrom_, start_, end_))

        # subset by relevant records
        vcf_GOI = vcf[np.array(toKeepList_v, dtype=bool)]
        gvcf_GOI = gvcf[np.array(toKeepList_g, dtype=bool)]

        # get depth of coverage, for relevant records
        outputRow_v = getDepth_adv(vcf_GOI, cellName)
        outputRow_g = getDepth_adv(gvcf_GOI, cellName)

        # make the combined row, with both vcf and gvcf fields filled in
        outputRow_comb = pd.DataFrame(columns=colNames)  # colNames is a global
        outputRow_comb['cellName'] = outputRow_v['cellName']
        outputRow_comb['coverage_bool_vcf'] = outputRow_v['coverage_bool']
        outputRow_comb['depth_vcf'] = outputRow_v['depth']
        outputRow_comb['coverage_bool_gvcf'] = outputRow_g['coverage_bool']
        outputRow_comb['depth_gvcf'] = outputRow_g['depth']

    except Exception:
        # Best effort: a cell that cannot be processed yields an empty row.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit still stop a batch run.
        outputRow_comb = pd.DataFrame(columns=colNames)  # just an empty row
        # fill in this row with something

    return (outputRow_comb)
def sort_vcf( input, reference, output ):
    """Rewrite the VCF *input* to *output* with records grouped by contig.

    Contigs are emitted in the order returned by ``get_contig_list(reference)``.
    The metadata lines are copied first; then, for each contig, the indexed
    VCF is positioned with ``seek`` and every record whose chromosome equals
    the contig is written.  Progress is printed to stdout.

    NOTE(review): this body was reconstructed from a whitespace-collapsed
    source; the final "wrote ..." print is assumed to belong to the ``else``
    branch -- confirm against the original file.
    """
    contig_list = get_contig_list( reference )
    print "read {} contigs".format(len(contig_list))

    v = VCF( input, True )  # request index of VCF upon open

    with open( output, 'w') as fd_out:
        # header/metadata lines go first, unchanged
        fd_out.writelines( [ line+"\n" for line in v.metadata ] )
        for contig in contig_list:
            print "writing entries for contig {}".format(contig)
            # filter lines from vcf by contig
            count = 0
            # a negative return from seek means the contig has no entry in the VCF
            if v.seek(contig) < 0:
                print "skipped {} because it's not in the VCF".format( contig )
            else:
                for line in v.lines( True, \
                        lambda raw_line: VCFLine(raw_line).chr == contig ):
                    fd_out.write( line.line+"\n" )
                    count += 1
                print "wrote {} entries for {}".format( count, contig )
def getRawCounts(fileNames):
    """Count the distinct POS values in each VCF.

    Args:
        fileNames: iterable of VCF paths; ``"../vcf/"`` and ``".vcf"`` are
            removed from each path to form the cell name.

    Returns:
        dict mapping cell name to the number of unique positions.
    """
    print('getting raw counts...')

    counts_by_cell = {}
    for vcf_file in fileNames:
        cell_id = vcf_file.replace("../vcf/", "").replace(".vcf", "")
        records = VCF.dataframe(vcf_file)
        counts_by_cell[cell_id] = len(np.unique(records.POS))

    print('finished!')
    return counts_by_cell
def callSNPs(current_base, numb_of_seqs):
    """Call the SNPs. Duh!

    Args:
        current_base: one VCF record exposing ``FILTER``, ``FORMAT``, ``REF``
            and ``ALT`` attributes, with the per-sample fields at positional
            index 9 and beyond.
        numb_of_seqs: length of the returned array.

    Returns:
        A NumPy byte-string array of length *numb_of_seqs*.  It is pre-filled
        with ``"-"`` when FILTER is ``'LowQual'`` or FORMAT is ``'GT'``, and
        entry *i* is then overwritten with the call for the *i*-th sample.
    """
    # np.bytes_ is the supported spelling of the removed np.string0 alias.
    blanks = np.zeros(numb_of_seqs, np.bytes_)

    if current_base.FILTER == 'LowQual':
        blanks.fill("-")

    if current_base.FORMAT == 'GT':
        blanks.fill("-")

    for count, snp_call in enumerate(current_base[9:]):
        base = VCF.process_snp_call(snp_call, current_base.REF, current_base.ALT)
        blanks[count] = base

    return blanks
def getGeneCellMutCounts(f):
    """Return ``[cell, gene_names]`` for the VCF at path *f*.

    The cell name is the file's base name with ``".vcf"`` removed.  Every row
    of the VCF is mapped through ``getGenomePos``; the distinct results are
    passed to ``getGeneName`` and all names it yields are collected in one
    flat list.
    """
    cell_id = os.path.basename(f).replace(".vcf", "")
    print(cell_id)  # to see where we are

    records = VCF.dataframe(f)
    positions = records.apply(getGenomePos, axis=1)  # one value per row

    # de-duplicate: the per-row positions can repeat
    distinct_positions = list(set(positions))

    gene_names = []
    for position in distinct_positions:
        for gene in getGeneName(position):
            gene_names.append(gene)

    return [cell_id, gene_names]
def getFilterCountsLAUD(fileNames):
    """Count, per cell, the positions shared with the LAUD database.

    Args:
        fileNames: iterable of VCF paths; ``"../vcf/"`` and ``".vcf"`` are
            removed from each path to form the cell name.

    Returns:
        dict mapping cell name to the number of distinct ``getGenomePos``
        values that also occur in ``database_laud['Mutation genome position']``.
    """
    print('getting filter counts LAUD...')
    cells_dict_laud = {}
    genomePos_laud_db = pd.Series(database_laud['Mutation genome position'])
    # The database does not change between files: build the lookup set once
    # instead of once per VCF.
    laud_positions = set(genomePos_laud_db)

    for f in fileNames:
        cell = f.replace("../vcf/", "")
        cell = cell.replace(".vcf", "")

        df = VCF.dataframe(f)
        genomePos_query = df.apply(getGenomePos, axis=1)  # apply function for every row in df

        shared = list(set(genomePos_query) & laud_positions)
        cells_dict_laud.update({cell: len(shared)})

    print('finished!')
    return cells_dict_laud
def getFilterCountsBasic(fileNames):
    """Count, per cell, the positions shared with the module-level database.

    Args:
        fileNames: iterable of VCF paths; ``"../vcf/"`` and ``".vcf"`` are
            removed from each path to form the cell name.

    Returns:
        dict mapping cell name to the number of distinct ``getGenomePos``
        values that also occur in ``database['Mutation genome position']``.
    """
    print('getting filter counts basic...')
    cells_dict_filter = {}
    genomePos_db = pd.Series(database['Mutation genome position'])
    # The database does not change between files: build the lookup set once
    # instead of once per VCF.
    db_positions = set(genomePos_db)

    for f in fileNames:
        cell = f.replace("../vcf/", "")
        cell = cell.replace(".vcf", "")
        print(cell)

        df = VCF.dataframe(f)
        genomePos_query = df.apply(getGenomePos, axis=1)

        shared = list(set(genomePos_query) & db_positions)
        cells_dict_filter.update({cell: len(shared)})

    print('finished!')
    return cells_dict_filter
def getGeneCellMutCounts(fileNames):
    """Map every cell to the gene names of its LAUD-filtered mutations.

    Args:
        fileNames: iterable of VCF paths; ``"../vcf_test/"`` and ``".vcf"``
            are removed from each path to form the cell name.

    Returns:
        dict mapping cell name to a pandas Series holding the result of
        ``getGeneName`` for each position shared with
        ``database_laud['Mutation genome position']``.
    """
    print('getting gene/cell mutation counts...')
    cells_dict = {}
    genomePos_laud_db = pd.Series(database_laud['Mutation genome position'])
    # The database does not change between files: build the lookup set once
    # instead of once per VCF.
    laud_positions = set(genomePos_laud_db)

    for f in fileNames:
        cell = f.replace("../vcf_test/", "")
        cell = cell.replace(".vcf", "")
        print(cell)  # to see where we are

        df = VCF.dataframe(f)
        genomePos_query = df.apply(getGenomePos, axis=1)  # apply function for every row in df

        shared = list(set(genomePos_query) & laud_positions)
        shared_series = pd.Series(shared)
        sharedGeneNames = shared_series.apply(getGeneName)

        cells_dict.update({cell: sharedGeneNames})

    return cells_dict
def getGOIHit_coords(fileNames, chrom, pos1, pos2):
    """Collect, per cell, the non-empty results of hitSearchFunc_coords.

    Each VCF is loaded, its rows are mapped through ``getGenomePos``, the
    resulting set is passed through ``expandSet`` and intersected with
    ``database_laud['Mutation genome position']``; the shared positions are
    handed to ``hitSearchFunc_coords`` and entries of length zero are dropped.

    Args:
        fileNames: iterable of VCF paths; ``"../vcf/"`` and ``".vcf"`` are
            removed from each path to form the cell name.
        chrom, pos1, pos2: query region, published through the globals
            ``queryChrom``, ``lPosQuery`` and ``rPosQuery`` (presumably read
            by ``hitSearchFunc_coords`` -- confirm).

    Returns:
        dict mapping cell name to the list of remaining match values.
    """
    print('getting coords to GOI hits')
    global queryChrom, lPosQuery, rPosQuery  # dont like this

    genomePos_laud_db = pd.Series(database_laud['Mutation genome position'])
    # The database does not change between files: build the lookup set once
    # instead of once per VCF.
    laud_positions = set(genomePos_laud_db)
    cells_dict_GOI_coords = {}
    queryChrom = chrom
    lPosQuery = pos1
    rPosQuery = pos2

    for f in fileNames:
        cell = f.replace("../vcf/", "")
        cell = cell.replace(".vcf", "")

        df = VCF.dataframe(f)
        genomePos_query = df.apply(getGenomePos, axis=1)  # apply function for every row in df

        # get the entries shared between curr cells VCF and the LAUD filter set
        # remember, these are general, and NOT gene specific
        genomePos_query_expand = expandSet(set(genomePos_query))

        shared = list(set(genomePos_query_expand) & laud_positions)  # problem is right here!!!
        shared1 = pd.Series(shared)  # convert to pandas obj
        matches = shared1.apply(hitSearchFunc_coords)  # another apply call

        # delete empty dict keys
        for k in matches.keys():
            try:
                if len(matches[k]) < 1:
                    del matches[k]
            except Exception:
                # Best effort: entries without a length are simply kept.
                # Exception (not a bare except) so Ctrl-C still interrupts.
                pass

        cells_dict_GOI_coords.update({cell: list(matches.values)})

    return cells_dict_GOI_coords
gvcfFilePrefix = sys.argv[5]

# Derive the cell name by dropping a trailing '.vcf'.  str.strip('.vcf') would
# remove any of the characters '.', 'v', 'c', 'f' from BOTH ends of the name
# and corrupt cell names that start or end with one of those characters.
cellName = str(vcfFilePrefix)
if cellName.endswith('.vcf'):
    cellName = cellName[:-len('.vcf')]

print(' ')
print('chromosome: %s' % chrom_)
print('start_position: %s' % start_)
print('end_position: %s' % end_)
print('cell name: %s' % cellName)
print(' ')

cwd = os.getcwd()
vcf_path = cwd + '/' + vcfFilePrefix
gvcf_path = cwd + '/' + gvcfFilePrefix

vcf = VCF.dataframe(vcf_path)
gvcf = VCF.dataframe(gvcf_path)

# get a list of the records we actually care about
toKeepList_v = vcf.apply(getGOI_record, axis=1, args=(chrom_, start_, end_))
toKeepList_g = gvcf.apply(getGOI_record, axis=1, args=(chrom_, start_, end_))

# subset by relevant records
vcf_GOI = vcf[np.array(toKeepList_v, dtype=bool)]
gvcf_GOI = gvcf[np.array(toKeepList_g, dtype=bool)]

# get depth of coverage, for relevant records
getDepth_adv(vcf_GOI)
getDepth_adv_g(gvcf_GOI)

#////////////////////////////////////////////////////////////////////
import VCF
import cyvcf2

try:
    ## MUST BE A UNCOMPRESSED VCF file
    finput = argv[1]
except IndexError as ie:
    exit("{}\nUSAGE: $0 $vcf_file ".format(ie))

if not path.exists(finput):
    msg = "ERROR: FNF {}".format(finput)
    raise IOError(msg)

# chromosome -> list of POS values, in file order
d = {}
for v in VCF.lines(finput):
    if v['CHROM'] in d:
        d[v['CHROM']].append(v['POS'])
    else:
        d[v['CHROM']] = [v['POS']]

with open("{}.consPos.txt".format(finput), 'wt') as of:
    for key, val in d.items():
        ## make sure all positions are integer ; if not raise error
        try:
            data = [int(i) for i in val]
        except ValueError as ve:
            # report the caught exception (the handler used to format an
            # undefined name and died with NameError instead)
            exit("ERROR: {}".format(ve))
        # Group runs of consecutive integers: within a run, index minus value
        # is constant.
        # https://stackoverflow.com/questions/2361945/detecting-consecutive-integers-in-a-list
        for k, g in groupby(enumerate(data), lambda ix: ix[0] - ix[1]):
            cn = list(map(itemgetter(1), g))
def main(args):
    """Write a dadi-formatted table for one region of a VCF.

    Args:
        args: object providing ``populations``, ``input`` (VCF path),
            ``region`` (positional arguments for ``vcf_chunk_2_dadi``) and
            ``output`` (path of the file to write).

    The output starts with the header from ``create_dadi_header(args)``;
    each following line holds the ingroup and outgroup triplets, the first
    allele with its per-population counts, the second allele with its
    per-population counts, CHROM and POS, separated by single spaces.
    """
    vcf = VCF.VCF()
    vcf.populations = args.populations
    vcf.set_header(args.input)

    pop_ids = args.populations.keys()

    # get slice and setup output dictionaries
    chunk = vcf.vcf_chunk_2_dadi(args.input, args.populations, *args.region)
    g = count_alleles(chunk, args.populations)

    # Create Header Row
    dadi_header = create_dadi_header(args)

    # The context manager guarantees the output is flushed and closed, also
    # when a row fails part-way through.
    with open(args.output, 'w') as fout:
        fout.write(dadi_header + "\n")

        for row_count, row in enumerate(g):
            raw_calls = chunk[row_count]
            row['outgroups'] = {'ALT': 0, 'REF': 0}  # set empty outgroup

            # To Do: Need to create a function to fill outgroup if one is defined.
            # The heliconius dataset, for example, has this.

            if check_outgroup(row) == False:
                continue  # skip outgroup not fixed at one value

            if len(raw_calls['REF']) > 1 or len(raw_calls["ALT"]) > 1:
                continue  # skip multi allelic sites

            # CALL BASE FOR OUTGROUP
            outgroup_allele = get_outgroup_base(row, raw_calls)

            # CALL MAJOR ALLELE (BASE) FOR INGROUP
            major_allele = get_ingroup_major_allele(row, raw_calls, outgroup_allele)

            # POLORIZE REF AND ALT FOR INGROUP
            if major_allele != raw_calls['REF']:
                ref, alt = ('ALT', 'REF')
            else:
                ref, alt = ('REF', 'ALT')

            # CREATE DADI ROW
            dadi_row = [make_triplet(major_allele), make_triplet(outgroup_allele)]

            # First the `ref` block, then the `alt` block: the allele itself
            # followed by its count in every population.
            for allele_key in (ref, alt):
                for count, pop in enumerate(pop_ids):
                    if count == 0:
                        dadi_row.append(chunk[row_count][allele_key])
                    dadi_row.append(row[pop][allele_key])

            dadi_row.append(raw_calls['CHROM'])
            dadi_row.append(raw_calls['POS'])

            dadi_row = " ".join([str(item) for item in dadi_row])
            fout.write(dadi_row + "\n")
def summary_haplotype_block(vcf, haplotype, outfile_up, outfile_down):
    """Summarise phased blocks and write coloured block records to two streams.

    Records from ``VCF.lines(vcf)`` are grouped into blocks keyed
    ``<CHROM>_<block id>``.  Each SNP is flagged 0/1 according to which entry
    of ``haplotype['<CHROM>:<POS>']`` matches the first genotype allele, or
    -1 when neither matches or the SNP is absent.  Blocks are numbered in
    order of first appearance; odd-ranked blocks are written to *outfile_up*
    and even-ranked ones to *outfile_down* (both are used as open file
    objects).  A summary line per block is also printed to stdout.

    NOTE(review): this body was reconstructed from a whitespace-collapsed
    source; everything after ``if ratio > 0.95:`` is assumed to be nested
    inside that branch -- confirm against the original file.
    """
    phased_block = defaultdict(lambda : list())      # block -> SNP positions
    phased_block_com = defaultdict(lambda : list())  # block -> haplotype flags
    phased_block_rank = defaultdict(lambda : int())  # block -> order of first appearance
    count = 0
    import VCF
    for v in VCF.lines(vcf):
        block_id = re.split(r':', v['FCM'][1])[1]
        genotype = re.split(r':', v['FCM'][0])[0]
        #print genotype
        #print '{}\t{}\t{}\t{}'.format(v['CHROM'], v['POS'], block_id, v['FCM'][0])
        block_idx = '{}_{}'.format(v['CHROM'], block_id)
        snp_idx = '{}:{}'.format(v['CHROM'], v['POS'])
        bases = [v['REF'], v['ALT']]
        # base carried by the first allele of the genotype
        hap1_10x = bases[int(genotype[0])]
        haplotype_flag = -1
        if haplotype.has_key(snp_idx):
            if hap1_10x == haplotype[snp_idx][0]:
                haplotype_flag = 0
            elif hap1_10x == haplotype[snp_idx][1]:
                haplotype_flag = 1
        # a block seen for the first time gets the next rank
        if not phased_block.has_key(block_idx):
            count += 1
        phased_block[block_idx].append(int(v['POS']))
        phased_block_com[block_idx].append(haplotype_flag)
        phased_block_rank[block_idx] = count
    for blc in phased_block.keys():
        chrs, blc_id = re.split(r'_', blc)
        snps = len(phased_block[blc])
        start = np.min(phased_block[blc])
        end = np.max(phased_block[blc])
        length= int(end) - int(start) + 1
        hap1_n = len([i for i in phased_block_com[blc] if i == 0])
        hap2_n = len([i for i in phased_block_com[blc] if i == 1])
        hap0_n = len([i for i in phased_block_com[blc] if i == -1])
        print '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(blc, snps, length, chrs, start, end, hap1_n, hap2_n, hap0_n)
        ratio = 0.0
        #try:
        #    ratio = np.max(float(hap1_n)/(float(hap1_n)+float(hap2_n)), float(hap2_n)/(float(hap1_n)+float(hap2_n)))
        #except:
        #    continue
        min_snp = 1
        # ratio = share of the majority flag among the SNPs flagged 0 or 1;
        # it stays 0.0 when neither flag has a strict majority of >= min_snp
        if hap1_n >= min_snp and hap1_n == snps - hap0_n:
            ratio = 1
        elif hap2_n >= min_snp and hap2_n == snps - hap0_n:
            ratio = 1
        elif hap1_n >= min_snp and hap1_n > hap2_n:
            ratio = float(hap1_n)/(float(hap1_n)+float(hap2_n))
        elif hap2_n >= min_snp and hap2_n > hap1_n:
            ratio = float(hap2_n)/(float(hap1_n)+float(hap2_n))
        print ratio
        if ratio > 0.95:
            # these defaults are always overwritten by the if/else below
            color = 'gray'
            hap = 3
            print hap1_n, hap2_n
            if hap1_n > hap2_n:
                color = 'orange'
                hap = 1
            else:
                color = 'blue'
                hap = 2
            print color
            # alternate blocks between the two output streams by rank parity
            if phased_block_rank[blc]%2 == 1:
                print >> outfile_up, '{}\t{}\t{}\t{}\t{}\t+'.format(chrs, start, end, hap, color)
            else:
                print >> outfile_down, '{}\t{}\t{}\t{}\t{}\t+'.format(chrs, start, end, hap, color)
#/////////////////////////////////////////////////////////////////////////
# script: convert_to_csv.py
# author: Lincoln
# date: 3.18.19
#
# want to convert any remaining vcfs to csv
#/////////////////////////////////////////////////////////////////////////
import pandas as pd
import VCF
import os
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

filterDir = '/home/ubuntu/code/SNP_calling_pipeline/bulkAnalysis/scVCF_filtered_all/'
filterDir_list = os.listdir(filterDir)

for f in filterDir_list:
    if '.vcf' in f:
        currPATH = filterDir + f
        df = VCF.dataframe(currPATH)
        df_trimmed = df[['CHROM', 'POS', 'ID', 'REF', 'ALT']]

        # Drop only a trailing '.vcf'.  str.strip('.vcf') would remove any of
        # the characters '.', 'v', 'c', 'f' from BOTH ends of the file name
        # and corrupt cell names that start or end with one of them.
        if f.endswith('.vcf'):
            cellName = f[:-len('.vcf')]
        else:
            cellName = f

        outStr = filterDir + cellName + '.csv'
        df_trimmed.to_csv(outStr, index=False)

#/////////////////////////////////////////////////////////////////////////
#/////////////////////////////////////////////////////////////////////////
def test_header_to_ordereddict_parsing(self):
    """VCF.set_header on the bgzipped VCF must equal the expected header dict."""
    parsed_header = VCF.set_header(self.bgzip_path)
    expected_header = self.header_dict
    self.assertEqual(parsed_header, expected_header)
def LoadDataSet(vcfInfile, traningSet, qFaLen):
    """Read a VCF and build one VariantDatum per usable variant line.

    Args:
        vcfInfile: path of the VCF; a name ending in '.gz' is read through
            ``gzip -dc``.
        traningSet: container of ``'<chrom>:<pos>'`` keys marking training
            sites; must not be empty.
        qFaLen: mapping ``sample id -> {query id -> length}`` used to
            rescale the query coordinates.

    Returns:
        ``(hInfo, data)``: the recorded VCF header and a NumPy array of the
        data points.  Each datum carries the per-sample median of
        ``[isBiallelic, inbCoeff, leg, n, alt, bot]`` as ``annotations``.

    Raises:
        ValueError: empty training set, missing FORMAT tag, unknown sample or
            query id, or a variant line without any usable sample.
    """
    if len(traningSet) == 0:
        raise ValueError('[ERROR] No Training Data found')

    if vcfInfile[-3:] == '.gz':
        I = os.popen('gzip -dc %s' % vcfInfile)
    else:
        I = open(vcfInfile)

    data, hInfo = [], VCF.VCFHeader()
    try:
        while 1:  # VCF format
            lines = I.readlines(100000)
            if not lines:
                break

            for line in lines:
                col = line.strip('\n').split()
                if re.search(r'^#CHROM', line):
                    # column index -> sample name, for the sample columns
                    col2sam = {i+9: sam for i, sam in enumerate(col[9:])}

                # Record the header information
                if re.search(r'^#', line):
                    hInfo.Record(line.strip('\n'))
                    continue

                # Get inbreeding coefficient. If fail then continue.
                # It's calculated like: 1.0 - hetCount/Expected_hetCount in VCF
                #inbCoeff = re.search(r';F=([^;]+)', col[7])
                inbCoeff = re.search(r';?InbCoeff=([^;]+)', col[7])
                if not inbCoeff:
                    continue
                    #print >> sys.stderr, '[ERROR] No inbreeding coefficient "InbCoeff=..." in INFO field in vcf:\n%s\n' % vcfInfile
                inbCoeff = float('%.2f' % float(inbCoeff.group(1)))

                fmat = {k: i for i, k in enumerate(col[8].split(':'))}  # Get Format
                if 'QR' not in fmat:
                    continue  # Cause by INTERGAP. But We'd better delete this statment, because the error is cause by the USER
                for tag in ['AA', 'QR', 'NR']:
                    if tag not in fmat:
                        raise ValueError('[ERROR] The "Format" fields did not contian "%s" in VCF: %s\nAT: %s\n' % (tag, vcfInfile, line))

                isBiallelic = True
                if len(col[4].split(',')) > 1:
                    isBiallelic = False

                annotations = []
                atleastOne = False
                for i, sample in enumerate(col[9:]):
                    sampleId = col2sam[9+i]
                    if sample == './.':
                        continue
                    field = sample.split(':')
                    if len(field[fmat['AA']].split(',')) != 4:
                        continue

                    if len(field) < fmat['QR'] + 1:
                        continue
                    qr = field[fmat['QR']].split(',')[-1]
                    if qr == '.':
                        continue
                    atleastOne = True

                    # query region is "<id>-<start>-<end>"; the id itself may
                    # contain one '-'
                    qregion = np.array(qr.split('-'))
                    if len(qregion) > 3:
                        qId = qregion[0] + '-' + qregion[1]
                    else:
                        qId = qregion[0]
                    qSta = int(qregion[-2])
                    qEnd = int(qregion[-1])

                    if sampleId not in qFaLen:
                        # "%s" (the old "$s" placeholder made the formatting
                        # itself fail with TypeError)
                        raise ValueError('[ERROR] The sample name %s(in vcf) is not in the name of Fa list.' % sampleId)
                    if qId not in qFaLen[sampleId]:
                        raise ValueError('[ERROR]', qId, 'is not been found in fa file\n')
                    qSta = int(qSta * 100 / qFaLen[sampleId][qId] + 0.5)
                    qEnd = int(qEnd * 100 / qFaLen[sampleId][qId] + 0.5)
                    if qSta > 100: qSta = 100  # Bug!!! Should delete
                    if qEnd > 100: qEnd = 100  # Bug!!! Should delete
                    if qSta > 100 or qEnd > 100:
                        raise ValueError('[ERROR] Query size Overflow! sample: %s; scaffold: %s\n%s\n%s' % (sampleId, qId, sample, line))
                    leg = min(qSta, 100 - qEnd)

                    nn = float(sample.split(':')[fmat['NR']])
                    n = int(1000 * nn + 0.5) / 10.0  # n ratio range: [0, 100]
                    alt = int(sample.split(':')[fmat['AA']].split(',')[1])  # Alternate perfect
                    bot = int(sample.split(':')[fmat['AA']].split(',')[3])  # Both imperfect
                    annotations.append([isBiallelic, inbCoeff, leg, n, alt, bot])

                if not atleastOne:
                    raise ValueError('[ERROR] All the samples don\'t contain this variant.', col)

                datum = vd.VariantDatum()
                datum.annotations = np.median(annotations, axis = 0)
                pos = col[0] + ':' + col[1]
                datum.variantOrder = pos
                if pos in traningSet:
                    datum.atTrainingSite = True
                data.append(datum)
    finally:
        # release the file / gzip pipe even when a line is rejected
        I.close()

    return hInfo, np.array(data)
# Square form of the distance matrix (not used again within this excerpt).
distSquareMatrix = dist.squareform(distMatrix)
# Ward-linkage hierarchical clustering computed from distMatrix.
linkageMatrix = hier.linkage(distMatrix,method='ward')
dendro = hier.dendrogram(linkageMatrix)
# Leaf order of the dendrogram, used to reorder the columns of the data.
leaves2 = dendro['leaves']
transformedData = transformedData[:,leaves2]
##### leaves1 for the mutations sites
##### leaves2 for the taxa
# Draw the reordered matrix as a heat map and save it as a PNG in out_path.
fig_ = plt.figure(figsize=(6,6))
ax_ = fig_.add_subplot(111)
cax_ = ax_.matshow(transformedData,cmap='Blues',aspect="auto")
ax_.set_ylabel('Cells')
ax_.set_xlabel('Genomic Positions')
# fig_.colorbar(cax_)
fig_.savefig(out_path+"hier_clust_heatmap.png", dpi=1200)
###########################################################################
######################## Generate the VCF output ##########################
###########################################################################
VCF.gen_VCF(out_dir=out_path, genotype_mat=mat_, read_count_mat_=read_counts, chrs=chroms, posits=positions, alt_counts=alts, rfs=refs, ids=names, dps=depths)
###################################################################################
######################## Generate Perfect Phylogeny Newick ########################
###################################################################################
# The Newick tree is only generated when K_ equals 0.
if K_==0:
    Phylo_module.gen_Newick(genotype=mat_, PerfectPhy_path=PerfectPhy_path_, out_dir_path=out_path, names_=names)
def nr_sensitivity( input, sample, truth, minqual=0, misses=None, debug=False ):
    """Measure how many non-reference truth sites are found in a call set.

    The truth VCF and the evaluated VCF are walked in parallel.  For every
    truth record whose genotype for *sample* is not ``0/0``, ``0|0`` or ``.``
    the evaluated VCF is advanced to the same chromosome and position.  A
    site counts as a hit when the evaluated genotype has at least one
    non-reference allele, and as concordant when every non-reference truth
    allele is present in the evaluated record's ALT list.  Totals and
    fractions are printed to stdout.

    Args:
        input: path of the VCF being evaluated.
        sample: sample name looked up in both VCFs.
        truth: path of the truth VCF.
        minqual: evaluated records with a numeric QUAL below this are misses.
        misses: optional path; missed truth records are written there as a VCF.
        debug: when true, progress details are printed to stderr.

    Raises:
        Exception: no ``#CHROM`` line in the truth metadata, QUAL of ``.``
            while filtering on quality, REF mismatch at a shared site, or
            differing chromosome order between the two files.

    NOTE(review): this body was reconstructed from a whitespace-collapsed
    source; nesting (notably which statements sit under ``if debug:``) and
    the line breaks inside the triple-quoted messages should be confirmed
    against the original file.
    """
    print >>sys.stderr,"input={}\ntruth={}\nminqual={}".format(input,truth,minqual)
    truth_vcf = VCF( truth )
    eval_vcf = VCF( input )

    if misses:
        # copy the truth header into the misses file, inserting a provenance
        # line just before #CHROM
        misses_fd = open( misses, 'w' )
        found = False
        for line in truth_vcf.metadata:
            if line[:len("#CHROM")]=="#CHROM":
                misses_fd.write( '##nr_concordance="comment={subset of '\
                    'missed sites created by DISCOVAR release bundle Python program '\
                    'nr_concordance.py}"\n')
                found = True
            misses_fd.write(line+"\n")
        if not found:
            raise Exception("program bug? made it this far without #CHROM in truth VCF?")
    else:
        misses_fd = None

    eval_sample_index = eval_vcf.sample_names.index(sample)
    truth_sample_index = truth_vcf.sample_names.index(sample)

    # chromosomes in the order each file presents them
    eval_chr=[]
    truth_chr=[]

    eval_gen = eval_vcf.lines()
    eval_line = eval_gen.next()
    eval_chr.append( eval_line.chr )
    eval_done = False

    n_truth_lines = 0
    n_site_hits = 0
    n_site_concords = 0

    last_truth=(None,None)
    last_eval=(None,None)

    for truth_line in truth_vcf.lines():
        check_sort_order( truth, last_truth, truth_line.chr, truth_line.pos )
        truth_genotype=truth_line.get_sample_dict(truth_sample_index)["GT"]
        # only non-reference, called truth genotypes are evaluated
        if truth_genotype == "0/0" or truth_genotype=="0|0" or truth_genotype == ".": continue

        if debug: print >>sys.stderr,"seeking {}:{}".format( truth_line.chr, truth_line.pos )

        n_truth_lines += 1
        if n_truth_lines % 1000 == 0: print n_truth_lines

        # skip to the correct chromosome
        if len(truth_chr) == 0 or truth_chr[-1] != truth_line.chr:
            truth_chr.append( truth_line.chr )

        # if we've already passed this chr in the eval file, then spin
        if not eval_done and truth_line.chr != eval_line.chr and truth_line.chr in eval_chr:
            continue

        # if we've not already passed this chr, then find it
        while not eval_done and eval_line.chr != truth_line.chr:
            try:
                eval_line = eval_gen.next()
                check_sort_order( input, last_eval, eval_line.chr, eval_line.pos )
                if debug: print >>sys.stderr,"...next chr={}".format( eval_line.chr )
                if eval_chr[-1] != eval_line.chr:
                    eval_chr.append(eval_line.chr)
                    print eval_line.chr
            except StopIteration:
                eval_done=True

        # try to find the correct position
        while not eval_done and eval_line.pos < truth_line.pos \
                and eval_line.chr == truth_line.chr:
            try:
                eval_line = eval_gen.next()
                check_sort_order( input, last_eval, eval_line.chr, eval_line.pos )
                if debug: print >>sys.stderr,"...next chr:pos={}:{}".format( eval_line.chr, eval_line.pos )
                if eval_chr[-1] != eval_line.chr:
                    eval_chr.append(eval_line.chr)
                    print eval_line.chr
            except StopIteration:
                eval_done=True

        if minqual > 0 and eval_line.qual == '.':
            raise Exception("not sure what to do here, we're filtering on qual, but qual is '.'")

        # a miss: eval file exhausted, different site, or QUAL below threshold
        if eval_done or eval_line.pos != truth_line.pos \
                or eval_line.chr != truth_line.chr \
                or ( eval_line.qual != '.' and float(eval_line.qual) < minqual ):
            if misses_fd: misses_fd.write(truth_line.line+"\n")
        else:
            if truth_line.ref != eval_line.ref:
                raise Exception(""" Your truth set does not seem to be called on the same reference as your call set. We're done here. 
truth={} truth_pos={}:{} truth_ref={} input={} input_pos={}:{} input_ref={} """.format( truth, truth_line.chr, truth_line.pos, truth_line.ref, input, eval_line.chr, eval_line.pos, eval_line.ref ) )

            if debug:
                print >>sys.stderr,""" Evaluating: truth_pos={}:{} truth_ref={} truth_alt={} input_pos={}:{} input_ref={} input_alt={} """.format( truth_line.chr, truth_line.pos, truth_line.ref, truth_line.alt, eval_line.chr, eval_line.pos, eval_line.ref, eval_line.alt )

            # grab truth NR bases and eval NR bases
            eval_genotype=eval_line.get_sample_dict( eval_sample_index)["GT"]
            if eval_genotype != ".":
                # split on "/" first; if that did not split, the genotype is
                # phased and uses "|"
                eval_calls_idx = eval_genotype.split("/")
                if eval_calls_idx[0] == eval_genotype:
                    eval_calls_idx = eval_genotype.split("|")
                if '0' in eval_calls_idx: eval_calls_idx.remove('0')
                eval_calls_idx = map(int, eval_calls_idx )

                truth_calls_idx = truth_genotype.split("/")
                if truth_calls_idx[0] == truth_genotype:
                    truth_calls_idx = truth_genotype.split("|")
                if '0' in truth_calls_idx: truth_calls_idx.remove('0')
                truth_calls_idx = map(int, truth_calls_idx )

                if len(eval_calls_idx) > 0:
                    n_site_hits += 1
                    if debug: print >>sys.stderr, "accepting site hit at {}:{}".format(eval_line.chr, eval_line.pos)
                elif debug:
                    print >>sys.stderr, "no NR calls"

                for truth_nr in [ truth_line.alt_list[i-1] for i in truth_calls_idx ]:
                    # if any truth non-reference call is not found on
                    # the eval line, then we break without counting the
                    # concordance
                    if not truth_nr in eval_line.alt_list:
                        if debug:
                            print >>sys.stderr,""" non-concordant: pos={}:{} truth_ref={} eval_ref={} truth_alt={} eval_alt={} """.format( truth_line.chr, truth_line.pos, truth_line.ref, eval_line.ref, truth_line.alt_list, eval_line.alt_list )
                        break
                else:
                    # normal termination of the for loop, so count the
                    # concordance
                    if debug:
                        print >>sys.stderr,""" CONCORDANT: pos={}:{} truth_ref={} eval_ref={} truth_alt={} eval_alt={} """.format( truth_line.chr, truth_line.pos, truth_line.ref, eval_line.ref, truth_line.alt_list, eval_line.alt_list )
                    n_site_concords += 1

    print "n_truth_lines={}, n_site_hits={}, n_site_concords={}, site_hit_frac={}, site_concord_frac={}".format( n_truth_lines, n_site_hits, n_site_concords, n_site_hits/float(n_truth_lines), n_site_concords/float(n_truth_lines) )

    # check that for the eval chromosomes that are also in the truth
    # set, that they come in the same order
    # first form intersection set
    overlap_chr = set(truth_chr).intersection(set(eval_chr))
    # reversed so the next expected chromosome sits at the end and can be popped
    truth_chr_rev = [ chr for chr in truth_chr if chr in overlap_chr ]
    truth_chr_rev.reverse()
    for chr in eval_chr:
        if chr not in overlap_chr: continue
        if chr != truth_chr_rev[-1]:
            raise Exception(""" input chromosome ordering doesn't match truth chromosome ordering: input={} truth={} """.format( eval_chr, truth_chr ) )
        truth_chr_rev.pop()

    if misses_fd: misses_fd.close()