Example #1
def getUniqueVCF_entries(patient, cell):
    basePATH = os.getcwd()
    patientPATH = os.path.join(basePATH, 'bulkVCF', patient)
    cellPATH = os.path.join(basePATH, 'scVCF', cell + '.vcf')

    try:
        patient_df = VCF.dataframe(patientPATH)
        cell_df = VCF.dataframe(cellPATH)
    except FileNotFoundError as e:
        print('FILE NOT FOUND: %s' % e.filename)  # either file may be missing
        return None

    patient_df_trimmed = patient_df[['CHROM', 'POS', 'ID', 'REF', 'ALT']]
    cell_df_trimmed = cell_df[['CHROM', 'POS', 'ID', 'REF', 'ALT']]

    # get what's SHARED between patient and cell
    #    (these are the germline mutations)
    patient_cell_concat = pd.concat([patient_df_trimmed, cell_df_trimmed])
    rowsToKeep = patient_cell_concat.duplicated()
    patient_cell_shared = patient_cell_concat[rowsToKeep]
    patient_cell_shared = patient_cell_shared.reset_index(drop=True)

    # now go back to the original cell df and pull out what's UNIQUE
    #     -- this is the germline filter
    cell_cell_concat = pd.concat([cell_df_trimmed, patient_cell_shared])
    cell_cell_concat_noDups = cell_cell_concat.drop_duplicates(keep=False)
    cell_cell_concat_noDups = cell_cell_concat_noDups.reset_index(drop=True)

    return cell_cell_concat_noDups
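A quick toy check of the concat/duplicated trick used above (a minimal sketch; the two-column frames stand in for the trimmed VCF dataframes):

import pandas as pd

bulk = pd.DataFrame({'CHROM': ['1', '1'], 'POS': [100, 200]})
cell = pd.DataFrame({'CHROM': ['1', '1'], 'POS': [100, 300]})

# rows present in BOTH frames -- the 'germline' set
shared = pd.concat([bulk, cell])
shared = shared[shared.duplicated()]

# rows unique to the cell: concat with the shared set, then drop every
# row that appears more than once
unique = pd.concat([cell, shared]).drop_duplicates(keep=False)
print(unique)  # only CHROM '1' / POS 300 survives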
Example #2

def runBatch(cellsList_file, outputDF_):
    with open(cellsList_file, "r") as cellsList_open:
        cells = cellsList_open.readlines()

    global cellName

    for cell in cells:
        cellName = cell.rstrip()
        get_s3_files(cell)

        cwd = os.getcwd()
        vcf_path_strip = cwd + '/' + cellName + '.vcf'
        gvcf_path_strip = cwd + '/' + cellName + '.g.vcf'

        vcf = VCF.dataframe(vcf_path_strip)
        gvcf = VCF.dataframe(gvcf_path_strip)

        # get a list of the records we actually care about
        toKeepList_v = vcf.apply(getGOI_record,
                                 axis=1,
                                 args=(chrom_, start_, end_))
        toKeepList_g = gvcf.apply(getGOI_record,
                                  axis=1,
                                  args=(chrom_, start_, end_))

        # subset by relevant records
        vcf_GOI = vcf[np.array(toKeepList_v, dtype=bool)]
        gvcf_GOI = gvcf[np.array(toKeepList_g, dtype=bool)]

        # get depth of coverage, for relevant records
        outputRow_v = getDepth_adv(vcf_GOI)
        outputRow_g = getDepth_adv(gvcf_GOI)

        # make the combined row, with both vcf and gvcf fields filled in
        outputRow_comb = pd.DataFrame(columns=colNames)  # colNames is a global
        outputRow_comb['cellName'] = outputRow_v['cellName']
        outputRow_comb['coverage_bool_vcf'] = outputRow_v['coverage_bool']
        outputRow_comb['depth_vcf'] = outputRow_v['depth']
        outputRow_comb['coverage_bool_gvcf'] = outputRow_g['coverage_bool']
        outputRow_comb['depth_gvcf'] = outputRow_g['depth']

        outputDF_ = pd.concat([outputDF_, outputRow_comb])  # DataFrame.append was removed in pandas 2.0

        # remove the downloaded s3 files, muting errors; the *.vcf* glob
        # also matches plain .vcf files
        os.system('rm *.vcf* > /dev/null 2>&1')

    return outputDF_
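A hypothetical call site for this batch runner. colNames is a module-level global in the source; the exact column list and file names below are assumptions based on the fields filled in above:

colNames = ['cellName', 'coverage_bool_vcf', 'depth_vcf',
            'coverage_bool_gvcf', 'depth_gvcf']         # assumed
outputDF = pd.DataFrame(columns=colNames)
outputDF = runBatch('cells_list.txt', outputDF)         # hypothetical file
outputDF.to_csv('coverage_by_cell.csv', index=False)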
Example #3
def getGOIHits(fileNames, chrom, pos1, pos2):
    print('getting hits to GOI')

    global queryChrom, lPosQuery, rPosQuery  # don't like this
    genomePos_laud_db = pd.Series(database_laud['Mutation genome position'])

    cells_dict_GOI = {}
    queryChrom = chrom
    lPosQuery = pos1
    rPosQuery = pos2

    for f in fileNames:
        cell = f.replace("../vcf/", "")
        cell = cell.replace(".vcf", "")

        df = VCF.dataframe(f)
        genomePos_query = df.apply(
            getGenomePos, axis=1)  # apply function for every row in df

        shared = list(set(genomePos_query)
                      & set(genomePos_laud_db))  # get the LAUD filter set
        shared1 = pd.Series(shared)  # convert to a pandas Series so we can use apply()

        numMatches = shared1.apply(hitSearchFunc)  # another apply call

        cells_dict_GOI.update({cell: sum(numMatches)})

    return cells_dict_GOI
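getGenomePos is used throughout these examples but never shown. Judging from how its output is intersected with the COSMIC-style 'Mutation genome position' strings, a plausible sketch (an assumption, not the source's implementation) is:

def getGenomePos(sample):
    # hypothetical helper: build a 'chrom:start-end' key from one VCF row,
    # matching the guessed format of the 'Mutation genome position' column
    chrom = str(sample['CHROM']).replace('chr', '')
    pos = int(sample['POS'])
    end = pos + len(str(sample['REF'])) - 1  # end coordinate spans the REF allele
    return '%s:%d-%d' % (chrom, pos, end)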
Example #4

def runBatch(cell):
    try:
        cellName = cell.rstrip()
        #get_s3_files(cell)

        cwd = os.getcwd()
        vcf_path = cwd + '/vcf_files/' + cell
        vcf_path_strip = vcf_path.rstrip() + '.vcf'
        gvcf_path = cwd + '/vcf_files/' + cell
        gvcf_path_strip = gvcf_path.rstrip() + '.g.vcf'

        vcf = VCF.dataframe(vcf_path_strip)
        gvcf = VCF.dataframe(gvcf_path_strip)

        # get a list of the records we actually care about
        toKeepList_v = vcf.apply(getGOI_record,
                                 axis=1,
                                 args=(chrom_, start_, end_))
        toKeepList_g = gvcf.apply(getGOI_record,
                                  axis=1,
                                  args=(chrom_, start_, end_))

        # subset by relevant records
        vcf_GOI = vcf[np.array(toKeepList_v, dtype=bool)]
        gvcf_GOI = gvcf[np.array(toKeepList_g, dtype=bool)]

        # get depth of coverage, for relevant records
        outputRow_v = getDepth_adv(vcf_GOI, cellName)
        outputRow_g = getDepth_adv(gvcf_GOI, cellName)

        # make the combined row, with both vcf and gvcf fields filled in
        outputRow_comb = pd.DataFrame(columns=colNames)  # colNames is a global
        outputRow_comb['cellName'] = outputRow_v['cellName']
        outputRow_comb['coverage_bool_vcf'] = outputRow_v['coverage_bool']
        outputRow_comb['depth_vcf'] = outputRow_v['depth']
        outputRow_comb['coverage_bool_gvcf'] = outputRow_g['coverage_bool']
        outputRow_comb['depth_gvcf'] = outputRow_g['depth']

    except Exception as e:
        # keep the failure visible: return an otherwise-empty row that
        # still carries the cell name
        print('runBatch failed for %s: %s' % (cell.rstrip(), e))
        outputRow_comb = pd.DataFrame(columns=colNames)
        outputRow_comb.loc[0, 'cellName'] = cell.rstrip()
    return outputRow_comb
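Because this version of runBatch takes a single cell and returns one row, it parallelizes naturally. A minimal sketch (the cells-list file name is hypothetical, and the module-level globals the function relies on must already be defined when the pool forks):

import multiprocessing as mp

with open('cells_list.txt') as fh:  # hypothetical file name
    cells = [line.rstrip() for line in fh]

with mp.Pool() as pool:
    rows = pool.map(runBatch, cells)

outputDF = pd.concat(rows, ignore_index=True)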
Example #5
def getRawCounts(fileNames):
    print('getting raw counts...')
    cells_dict = {}

    for f in fileNames:
        cell = f.replace("../vcf/", "")
        cell = cell.replace(".vcf", "")

        df = VCF.dataframe(f)
        unique = len(np.unique(df.POS))

        cells_dict.update({cell: unique})
    print('finished!')
    return cells_dict
Example #6
def getGeneCellMutCounts(f):
    # Returns a [cell, geneNames] pair for one VCF file; the caller
    # assembles these pairs into a dict keyed by cell.
    cell = os.path.basename(f)
    cell = cell.replace(".vcf", "")
    print(cell)  # progress indicator

    df = VCF.dataframe(f)
    genomePos_query = df.apply(getGenomePos, axis=1)  # apply function for every row in df

    shared = list(set(genomePos_query))  # genomePos_query (potentially) has dups
    # 'g' as the loop variable, so the file-name parameter 'f' isn't shadowed
    sharedGeneNames = [g for e in shared for g in getGeneName(e)]

    return [cell, sharedGeneNames]
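The per-file pairs are presumably assembled into the dictionary the comment describes by the caller; a minimal sketch, with the glob pattern assumed:

import glob

fileNames = glob.glob('../vcf/*.vcf')  # path pattern assumed
pairs = [getGeneCellMutCounts(f) for f in fileNames]
cells_dict = {cell: genes for cell, genes in pairs}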
Example #7
def getFilterCountsLAUD(fileNames):
    print('getting filter counts LAUD...')
    cells_dict_laud = {}
    genomePos_laud_db = pd.Series(database_laud['Mutation genome position'])

    for f in fileNames:
        cell = f.replace("../vcf/", "")
        cell = cell.replace(".vcf", "")

        df = VCF.dataframe(f)
        genomePos_query = df.apply(
            getGenomePos, axis=1)  # apply function for every row in df

        shared = list(set(genomePos_query) & set(genomePos_laud_db))
        cells_dict_laud.update({cell: len(shared)})

    print('finished!')
    return cells_dict_laud
Example #8
def getFilterCountsBasic(fileNames):
    print('getting filter counts basic...')
    cells_dict_filter = {}
    genomePos_db = pd.Series(database['Mutation genome position'])

    for f in fileNames:
        cell = f.replace("../vcf/", "")
        cell = cell.replace(".vcf", "")
        print(cell)
        df = VCF.dataframe(f)

        genomePos_query = df.apply(getGenomePos, axis=1)

        shared = list(set(genomePos_query) & set(genomePos_db))
        cells_dict_filter.update({cell: len(shared)})

        #print(cells_dict_filter)
    print('finished!')
    return cells_dict_filter
Example #9
def getGeneCellMutCounts(fileNames):
    print('getting gene/cell mutation counts...')
    cells_dict = {}
    genomePos_laud_db = pd.Series(database_laud['Mutation genome position'])

    for f in fileNames:
        cell = f.replace("../vcf_test/", "")
        cell = cell.replace(".vcf", "")
        print(cell)  # to see where we are
        df = VCF.dataframe(f)
        genomePos_query = df.apply(
            getGenomePos, axis=1)  # apply function for every row in df

        shared = list(set(genomePos_query) & set(genomePos_laud_db))

        shared_series = pd.Series(shared)
        sharedGeneNames = shared_series.apply(getGeneName)
        cells_dict.update({cell: sharedGeneNames})

    return cells_dict
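Downstream, the per-cell Series of gene names can be flattened into a gene-by-cell count table; a minimal sketch, assuming getGeneName returns one hashable gene name per position:

import pandas as pd

counts = pd.DataFrame({cell: genes.value_counts()
                       for cell, genes in cells_dict.items()})
counts = counts.fillna(0).astype(int)  # genes x cells mutation counts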
Example #10
def getGOIHit_coords(fileNames, chrom, pos1, pos2):
    print('getting coords to GOI hits')

    global queryChrom, lPosQuery, rPosQuery  # don't like this
    genomePos_laud_db = pd.Series(database_laud['Mutation genome position'])
    cells_dict_GOI_coords = {}
    queryChrom = chrom
    lPosQuery = pos1
    rPosQuery = pos2

    for f in fileNames:
        cell = f.replace("../vcf/", "")
        cell = cell.replace(".vcf", "")

        df = VCF.dataframe(f)
        genomePos_query = df.apply(
            getGenomePos, axis=1)  # apply function for every row in df
        # get the entries shared between curr cells VCF and the LAUD filter set
        #	remember, these are general, and NOT gene specific
        genomePos_query_expand = expandSet(set(genomePos_query))

        shared = list(set(genomePos_query_expand)
                      & set(genomePos_laud_db))  # problem is right here!!!
        shared1 = pd.Series(shared)  # convert to pandas obj
        matches = shared1.apply(hitSearchFunc_coords)  # another apply call

        # drop empty entries; iterate over a copy of the keys, since
        # deleting from the Series while iterating over it is unsafe
        for k in list(matches.keys()):
            try:
                if len(matches[k]) < 1:
                    del matches[k]
            except TypeError:
                pass

        cells_dict_GOI_coords.update({cell: list(matches.values)})

    return cells_dict_GOI_coords
Example #11

gvcfFilePrefix = sys.argv[5]

cellName = str(vcfFilePrefix).replace('.vcf', '')  # str.strip('.vcf') strips characters, not the suffix

print('  ')
print('chromosome: %s' % chrom_)
print('start_position: %s' % start_)
print('end_position: %s' % end_)
print('cell name: %s' % cellName)
print(' ')

cwd = os.getcwd()
vcf_path = cwd + '/' + vcfFilePrefix
gvcf_path = cwd + '/' + gvcfFilePrefix

vcf = VCF.dataframe(vcf_path)
gvcf = VCF.dataframe(gvcf_path)

# get a list of the records we actually care about
toKeepList_v = vcf.apply(getGOI_record, axis=1, args=(chrom_, start_, end_))
toKeepList_g = gvcf.apply(getGOI_record, axis=1, args=(chrom_, start_, end_))

# subset by relevant records
vcf_GOI = vcf[np.array(toKeepList_v, dtype=bool)]
gvcf_GOI = gvcf[np.array(toKeepList_g, dtype=bool)]

# get depth of coverage, for relevant records
getDepth_adv(vcf_GOI)
getDepth_adv_g(gvcf_GOI)
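getGOI_record is another helper the source never shows; judging from how its boolean output is used to subset the dataframes above, a plausible sketch (an assumption, not the original implementation) is:

def getGOI_record(row, chrom, start, end):
    # hypothetical helper: True if this VCF record falls inside the
    # gene-of-interest window [start, end] on the query chromosome
    sameChrom = str(row['CHROM']).replace('chr', '') == str(chrom).replace('chr', '')
    return sameChrom and int(start) <= int(row['POS']) <= int(end)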

#////////////////////////////////////////////////////////////////////
Example #12
#/////////////////////////////////////////////////////////////////////////
# script: convert_to_csv.py
# author: Lincoln
# date: 3.18.19
#
# want to convert any remaining vcfs to csv
#/////////////////////////////////////////////////////////////////////////
import pandas as pd
import VCF
import os
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

filterDir = '/home/ubuntu/code/SNP_calling_pipeline/bulkAnalysis/scVCF_filtered_all/'
filterDir_list = os.listdir(filterDir)

for f in filterDir_list:
    if f.endswith('.vcf'):
        currPATH = filterDir + f
        df = VCF.dataframe(currPATH)
        df_trimmed = df[['CHROM', 'POS', 'ID', 'REF', 'ALT']]

        cellName = f.replace('.vcf', '')  # str.strip('.vcf') strips characters, not the suffix
        outStr = filterDir + cellName + '.csv'
        df_trimmed.to_csv(outStr, index=False)

#/////////////////////////////////////////////////////////////////////////
#/////////////////////////////////////////////////////////////////////////