def getUniqueVCF_entries(patient, cell):
    """Return the cell's VCF records that are NOT shared with the patient bulk VCF.

    Records present in both the patient (bulk) VCF and the cell VCF are
    treated as germline mutations and removed, leaving only the
    cell-unique (putative somatic) entries.

    Args:
        patient: bulk VCF file name, read from ./bulkVCF/<patient>
        cell: cell name; ./scVCF/<cell>.vcf is read

    Returns:
        pandas.DataFrame with columns CHROM/POS/ID/REF/ALT holding the
        records unique to the cell, or None when either VCF is missing.
    """
    basePATH = os.getcwd()
    patientPATH = basePATH + '/bulkVCF/' + patient
    cellPATH = basePATH + '/scVCF/' + cell + '.vcf'
    try:
        patient_df = VCF.dataframe(patientPATH)
        cell_df = VCF.dataframe(cellPATH)
    except FileNotFoundError as e:
        # BUGFIX: report the file that is actually missing — the original
        # always printed cellPATH, even when the patient bulk VCF was the
        # file not found.
        print('FILE NOT FOUND: %s' % e.filename)
        return

    patient_df_trimmed = patient_df[['CHROM', 'POS', 'ID', 'REF', 'ALT']]
    cell_df_trimmed = cell_df[['CHROM', 'POS', 'ID', 'REF', 'ALT']]

    # get whats SHARED between patient and cell
    # FIND GERMLINE MUTATIONS
    patient_cell_concat = pd.concat([patient_df_trimmed, cell_df_trimmed])
    rowsToKeep = patient_cell_concat.duplicated()
    patient_cell_shared = patient_cell_concat[rowsToKeep]
    patient_cell_shared = patient_cell_shared.reset_index(drop=True)

    # now go back to the original cell df, pull out whats UNIQUE
    # THIS IS THE GERMLINE FILTER!!
    # NOTE(review): drop_duplicates(keep=False) also discards records that
    # are duplicated WITHIN the cell VCF itself — presumably intended, but
    # worth confirming.
    cell_cell_concat = pd.concat([cell_df_trimmed, patient_cell_shared])
    cell_cell_concat_noDups = cell_cell_concat.drop_duplicates(keep=False)
    cell_cell_concat_noDups = cell_cell_concat_noDups.reset_index(drop=True)
    return (cell_cell_concat_noDups)
def runBatch(cellsList_file, outputDF_):
    # Batch driver: for every cell name listed (one per line) in
    # `cellsList_file`, pull that cell's .vcf / .g.vcf from s3, subset both
    # to the gene-of-interest window (module globals chrom_/start_/end_),
    # compute depth-of-coverage rows via getDepth_adv, and append one
    # combined row per cell to `outputDF_`.
    #
    # Returns the accumulated DataFrame. Relies on module-level names:
    # chrom_, start_, end_, colNames, get_s3_files, getGOI_record,
    # getDepth_adv, VCF, np, pd.
    cellsList_open = open(cellsList_file, "r")
    cells = cellsList_open.readlines()
    global cellName  # published for helpers elsewhere in the module — TODO confirm a reader exists
    for cell in cells:
        cellName = cell.rstrip()
        get_s3_files(cell)
        cwd = os.getcwd()
        # build <cwd>/<cell>.vcf and <cwd>/<cell>.g.vcf; rstrip() drops the
        # trailing newline carried over from readlines()
        vcf_path = cwd + '/' + cell
        vcf_path_strip = vcf_path.rstrip() + '.vcf'
        gvcf_path = cwd + '/' + cell
        gvcf_path_strip = gvcf_path.rstrip() + '.g.vcf'
        vcf = VCF.dataframe(vcf_path_strip)
        gvcf = VCF.dataframe(gvcf_path_strip)
        # get a list of the records we actually care about
        toKeepList_v = vcf.apply(getGOI_record, axis=1, args=(chrom_, start_, end_))
        toKeepList_g = gvcf.apply(getGOI_record, axis=1, args=(chrom_, start_, end_))
        # subset by relevant records
        vcf_GOI = vcf[np.array(toKeepList_v, dtype=bool)]
        gvcf_GOI = gvcf[np.array(toKeepList_g, dtype=bool)]
        # get depth of coverage, for relevant records
        outputRow_v = getDepth_adv(vcf_GOI)
        outputRow_g = getDepth_adv(gvcf_GOI)
        # make the combined row, with both vcf and gvcf fields filled in
        outputRow_comb = pd.DataFrame(columns=colNames)  # colNames is a global
        outputRow_comb['cellName'] = outputRow_v['cellName']
        outputRow_comb['coverage_bool_vcf'] = outputRow_v['coverage_bool']
        outputRow_comb['depth_vcf'] = outputRow_v['depth']
        outputRow_comb['coverage_bool_gvcf'] = outputRow_g['coverage_bool']
        outputRow_comb['depth_gvcf'] = outputRow_g['depth']
        outputDF_ = outputDF_.append(outputRow_comb)
        # remove s3 files
        os.system('rm *.vcf > /dev/null 2>&1')  # remove, and mute errors
        os.system('rm *.vcf* > /dev/null 2>&1')  # remove, and mute errors
    return (outputDF_)
def getGOIHits(fileNames, chrom, pos1, pos2):
    """Count, per cell, how many LAUD-filtered variant positions hit the
    gene-of-interest window.

    The query window is published through the module globals queryChrom /
    lPosQuery / rPosQuery so that hitSearchFunc can read it.

    Returns:
        dict mapping cell name -> total hit count.
    """
    print('getting hits to GOI')
    global queryChrom, lPosQuery, rPosQuery  # consumed by hitSearchFunc
    queryChrom, lPosQuery, rPosQuery = chrom, pos1, pos2

    laud_positions = set(pd.Series(database_laud['Mutation genome position']))
    hits_by_cell = {}

    for vcf_file in fileNames:
        cell_name = vcf_file.replace("../vcf/", "").replace(".vcf", "")
        records = VCF.dataframe(vcf_file)
        # one genome-position value per record in this cell's VCF
        query_positions = records.apply(getGenomePos, axis=1)
        # keep only positions present in the LAUD filter set
        overlap = pd.Series(list(set(query_positions) & laud_positions))
        match_counts = overlap.apply(hitSearchFunc)
        hits_by_cell[cell_name] = sum(match_counts)

    return hits_by_cell
def runBatch(cell):
    """Compute a combined depth-of-coverage row (vcf + g.vcf) for one cell.

    Reads <cwd>/vcf_files/<cell>.vcf and <cwd>/vcf_files/<cell>.g.vcf,
    subsets both to the gene-of-interest window (module globals
    chrom_/start_/end_), and fills one row with coverage/depth fields from
    each file. On any failure an empty row (just the colNames columns) is
    returned so batch callers can keep going.

    Returns:
        pandas.DataFrame with the colNames columns (one combined row, or
        empty on failure).
    """
    try:
        cellName = cell.rstrip()
        #get_s3_files(cell)
        cwd = os.getcwd()
        vcf_path = cwd + '/vcf_files/' + cell
        vcf_path_strip = vcf_path.rstrip() + '.vcf'
        gvcf_path = cwd + '/vcf_files/' + cell
        gvcf_path_strip = gvcf_path.rstrip() + '.g.vcf'
        vcf = VCF.dataframe(vcf_path_strip)
        gvcf = VCF.dataframe(gvcf_path_strip)
        # get a list of the records we actually care about
        toKeepList_v = vcf.apply(getGOI_record, axis=1, args=(chrom_, start_, end_))
        toKeepList_g = gvcf.apply(getGOI_record, axis=1, args=(chrom_, start_, end_))
        # subset by relevant records
        vcf_GOI = vcf[np.array(toKeepList_v, dtype=bool)]
        gvcf_GOI = gvcf[np.array(toKeepList_g, dtype=bool)]
        # get depth of coverage, for relevant records
        outputRow_v = getDepth_adv(vcf_GOI, cellName)
        outputRow_g = getDepth_adv(gvcf_GOI, cellName)
        # make the combined row, with both vcf and gvcf fields filled in
        outputRow_comb = pd.DataFrame(columns=colNames)  # colNames is a global
        outputRow_comb['cellName'] = outputRow_v['cellName']
        outputRow_comb['coverage_bool_vcf'] = outputRow_v['coverage_bool']
        outputRow_comb['depth_vcf'] = outputRow_v['depth']
        outputRow_comb['coverage_bool_gvcf'] = outputRow_g['coverage_bool']
        outputRow_comb['depth_gvcf'] = outputRow_g['depth']
    except Exception as e:
        # BUGFIX: was a bare `except:` that silently swallowed everything,
        # including KeyboardInterrupt/SystemExit. Keep the best-effort
        # empty-row contract, but narrow the catch and report the failure.
        print('runBatch failed for %s: %s' % (cell.rstrip(), e))
        outputRow_comb = pd.DataFrame(columns=colNames)  # just an empty row
    return (outputRow_comb)
def getRawCounts(fileNames):
    """Map each cell to its number of distinct variant positions (POS).

    Args:
        fileNames: iterable of VCF file paths (../vcf/<cell>.vcf).

    Returns:
        dict mapping cell name -> count of unique POS values in its VCF.
    """
    print('getting raw counts...')
    raw_counts = {}
    for path in fileNames:
        name = path.replace("../vcf/", "").replace(".vcf", "")
        records = VCF.dataframe(path)
        raw_counts[name] = len(np.unique(records.POS))
    print('finished!')
    return raw_counts
def getGeneCellMutCounts(f):
    """Return [cell_name, gene_names] for one VCF file.

    Collects the de-duplicated genome positions of the file's records and
    expands each position into the gene name(s) getGeneName yields for it.

    Args:
        f: path to a single-cell .vcf file.

    Returns:
        Two-element list: [cell name (basename minus '.vcf'),
        list of gene names (may repeat across positions)].
    """
    cell = os.path.basename(f)
    cell = cell.replace(".vcf", "")
    print(cell)  # to see where we are
    df = VCF.dataframe(f)
    genomePos_query = df.apply(getGenomePos, axis=1)  # apply function for every row in df
    shared = list(set(genomePos_query))  # genomePos_query (potentially) has dups
    # FIX: the original comprehension reused `f` as its loop variable,
    # shadowing the file-path parameter; renamed for clarity (same result).
    sharedGeneNames = [name for pos in shared for name in getGeneName(pos)]
    return [cell, sharedGeneNames]
def getFilterCountsLAUD(fileNames):
    """For each cell, count the variant positions shared with the LAUD set.

    Args:
        fileNames: iterable of VCF file paths (../vcf/<cell>.vcf).

    Returns:
        dict mapping cell name -> number of positions also present in
        database_laud['Mutation genome position'].
    """
    print('getting filter counts LAUD...')
    laud_positions = set(pd.Series(database_laud['Mutation genome position']))
    shared_counts = {}
    for path in fileNames:
        name = path.replace("../vcf/", "").replace(".vcf", "")
        records = VCF.dataframe(path)
        # one genome-position value per record in this cell's VCF
        positions = records.apply(getGenomePos, axis=1)
        shared_counts[name] = len(set(positions) & laud_positions)
    print('finished!')
    return shared_counts
def getFilterCountsBasic(fileNames):
    """For each cell, count the variant positions shared with the basic set.

    Args:
        fileNames: iterable of VCF file paths (../vcf/<cell>.vcf).

    Returns:
        dict mapping cell name -> number of positions also present in
        database['Mutation genome position'].
    """
    print('getting filter counts basic...')
    db_positions = set(pd.Series(database['Mutation genome position']))
    filter_counts = {}
    for path in fileNames:
        name = path.replace("../vcf/", "").replace(".vcf", "")
        print(name)  # progress indicator
        records = VCF.dataframe(path)
        positions = records.apply(getGenomePos, axis=1)
        filter_counts[name] = len(set(positions) & db_positions)
        #print(cells_dict_filter)
    print('finished!')
    return filter_counts
def getGeneCellMutCounts(fileNames):
    """Map each cell to the gene names of its LAUD-shared variant positions.

    Args:
        fileNames: iterable of VCF file paths (../vcf_test/<cell>.vcf).

    Returns:
        dict mapping cell name -> pandas.Series of gene names (one entry per
        position shared with database_laud).
    """
    print('getting gene/cell mutation counts...')
    laud_positions = set(pd.Series(database_laud['Mutation genome position']))
    genes_by_cell = {}
    for path in fileNames:
        name = path.replace("../vcf_test/", "").replace(".vcf", "")
        print(name)  # to see where we are
        records = VCF.dataframe(path)
        # one genome-position value per record in this cell's VCF
        positions = records.apply(getGenomePos, axis=1)
        overlap = pd.Series(list(set(positions) & laud_positions))
        genes_by_cell[name] = overlap.apply(getGeneName)
    return genes_by_cell
def getGOIHit_coords(fileNames, chrom, pos1, pos2):
    """For each cell, collect coordinates of variants hitting the GOI window.

    Publishes the query window through the module globals queryChrom /
    lPosQuery / rPosQuery (read by hitSearchFunc_coords).

    Returns:
        dict mapping cell name -> list of coordinate hits (empty hits removed).
    """
    print('getting coords to GOI hits')
    global queryChrom, lPosQuery, rPosQuery  # dont like this
    genomePos_laud_db = pd.Series(database_laud['Mutation genome position'])
    cells_dict_GOI_coords = {}
    queryChrom = chrom
    lPosQuery = pos1
    rPosQuery = pos2

    for f in fileNames:
        cell = f.replace("../vcf/", "")
        cell = cell.replace(".vcf", "")
        df = VCF.dataframe(f)
        genomePos_query = df.apply(getGenomePos, axis=1)  # apply function for every row in df
        # get the entries shared between curr cells VCF and the LAUD filter set
        # remember, these are general, and NOT gene specific
        genomePos_query_expand = expandSet(set(genomePos_query))
        shared = list(set(genomePos_query_expand) & set(genomePos_laud_db))
        shared1 = pd.Series(shared)  # convert to pandas obj
        matches = shared1.apply(hitSearchFunc_coords)  # another apply call
        # delete empty dict keys
        # BUGFIX: iterate a snapshot of the keys — the original deleted from
        # `matches` while iterating matches.keys(), i.e. mutation during
        # iteration, which can skip entries or raise mid-loop.
        for k in list(matches.keys()):
            try:
                if len(matches[k]) < 1:
                    del matches[k]
            except Exception:
                # entries without len() are kept, as before (was a bare except)
                pass
        cells_dict_GOI_coords.update({cell: list(matches.values)})
    return cells_dict_GOI_coords
# --- script section: subset one cell's vcf/g.vcf to the GOI window and
# --- compute depth of coverage.
# NOTE(review): this chunk appears to start mid-script; vcfFilePrefix,
# chrom_, start_ and end_ are presumably parsed from earlier sys.argv
# entries not visible here — confirm against the full file.
gvcfFilePrefix = sys.argv[5]
# NOTE(review): str.strip('.vcf') strips any of the characters '.', 'v',
# 'c', 'f' from BOTH ends, so names beginning/ending with those letters get
# mangled (e.g. 'cv123.vcf' -> '123'); os.path.splitext would be safer.
cellName = str(vcfFilePrefix).strip('.vcf')
print(' ')
print('chromosome: %s' % chrom_)
print('start_position: %s' % start_)
print('end_position: %s' % end_)
print('cell name: %s' % cellName)
print(' ')
cwd = os.getcwd()
vcf_path = cwd + '/' + vcfFilePrefix
gvcf_path = cwd + '/' + gvcfFilePrefix
vcf = VCF.dataframe(vcf_path)
gvcf = VCF.dataframe(gvcf_path)
# get a list of the records we actually care about
toKeepList_v = vcf.apply(getGOI_record, axis=1, args=(chrom_, start_, end_))
toKeepList_g = gvcf.apply(getGOI_record, axis=1, args=(chrom_, start_, end_))
# subset by relevant records
vcf_GOI = vcf[np.array(toKeepList_v, dtype=bool)]
gvcf_GOI = gvcf[np.array(toKeepList_g, dtype=bool)]
# get depth of coverage, for relevant records
getDepth_adv(vcf_GOI)
getDepth_adv_g(gvcf_GOI)
#////////////////////////////////////////////////////////////////////
#/////////////////////////////////////////////////////////////////////////
# script: convert_to_csv.py
# author: Lincoln
# date: 3.18.19
#
# want to convert any remaining vcfs to csv
#/////////////////////////////////////////////////////////////////////////
import pandas as pd
import VCF
import os
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

filterDir = '/home/ubuntu/code/SNP_calling_pipeline/bulkAnalysis/scVCF_filtered_all/'
filterDir_list = os.listdir(filterDir)

# For every .vcf in the filtered directory, keep the five core columns and
# write them out as <cellName>.csv alongside the source file.
for f in filterDir_list:
    if '.vcf' in f:
        currPATH = filterDir + f
        df = VCF.dataframe(currPATH)
        df_trimmed = df[['CHROM', 'POS', 'ID', 'REF', 'ALT']]
        # BUGFIX: f.strip('.vcf') stripped any of the characters '.', 'v',
        # 'c', 'f' from BOTH ends of the name, mangling cells such as
        # 'cv123.vcf' -> '123'. splitext removes exactly the final extension.
        cellName = os.path.splitext(f)[0]
        outStr = filterDir + cellName + '.csv'
        df_trimmed.to_csv(outStr, index=False)
#/////////////////////////////////////////////////////////////////////////
#/////////////////////////////////////////////////////////////////////////