Example #1
def getUniqueVCF_entries(patient, cell):
    basePATH = os.getcwd()
    patientPATH = basePATH + '/bulkVCF/' + patient
    cellPATH = basePATH + '/scVCF/' + cell + '.vcf'

    try:
        patient_df = VCF.dataframe(patientPATH)
        cell_df = VCF.dataframe(cellPATH)
    except FileNotFoundError:
        print('FILE NOT FOUND: %s' % cellPATH)
        return

    patient_df_trimmed = patient_df[['CHROM', 'POS', 'ID', 'REF', 'ALT']]
    cell_df_trimmed = cell_df[['CHROM', 'POS', 'ID', 'REF', 'ALT']]

    # get what's SHARED between patient and cell
    #    FIND GERMLINE MUTATIONS
    patient_cell_concat = pd.concat([patient_df_trimmed, cell_df_trimmed])
    rowsToKeep = patient_cell_concat.duplicated()
    patient_cell_shared = patient_cell_concat[rowsToKeep]
    patient_cell_shared = patient_cell_shared.reset_index(drop=True)

    # now go back to the original cell df, pull out what's UNIQUE
    #     THIS IS THE GERMLINE FILTER!!
    cell_cell_concat = pd.concat([cell_df_trimmed, patient_cell_shared])
    cell_cell_concat_noDups = cell_cell_concat.drop_duplicates(keep=False)
    cell_cell_concat_noDups = cell_cell_concat_noDups.reset_index(drop=True)

    return cell_cell_concat_noDups
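A minimal usage sketch for the function above; the patient ID and cell name are hypothetical, and it assumes the bulkVCF/ and scVCF/ directories exist under the current working directory:

unique_df = getUniqueVCF_entries('patient1', 'cell_001')  # hypothetical IDs
if unique_df is not None:  # None means the cell's VCF was not found
    print(unique_df.head())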
Example #2
def runBatch(cellsList_file, outputDF_):
    with open(cellsList_file, "r") as cellsList_open:
        cells = cellsList_open.readlines()

    global cellName

    for cell in cells:
        cellName = cell.rstrip()
        get_s3_files(cell)

        cwd = os.getcwd()
        vcf_path = cwd + '/' + cell
        vcf_path_strip = vcf_path.rstrip() + '.vcf'
        gvcf_path = cwd + '/' + cell
        gvcf_path_strip = gvcf_path.rstrip() + '.g.vcf'

        vcf = VCF.dataframe(vcf_path_strip)
        gvcf = VCF.dataframe(gvcf_path_strip)

        # get a list of the records we actually care about
        toKeepList_v = vcf.apply(getGOI_record,
                                 axis=1,
                                 args=(chrom_, start_, end_))
        toKeepList_g = gvcf.apply(getGOI_record,
                                  axis=1,
                                  args=(chrom_, start_, end_))

        # subset by relevant records
        vcf_GOI = vcf[np.array(toKeepList_v, dtype=bool)]
        gvcf_GOI = gvcf[np.array(toKeepList_g, dtype=bool)]

        # get depth of coverage, for relevant records
        outputRow_v = getDepth_adv(vcf_GOI)
        outputRow_g = getDepth_adv(gvcf_GOI)

        # make the combined row, with both vcf and gvcf fields filled in
        outputRow_comb = pd.DataFrame(columns=colNames)  # colNames is a global
        outputRow_comb['cellName'] = outputRow_v['cellName']
        outputRow_comb['coverage_bool_vcf'] = outputRow_v['coverage_bool']
        outputRow_comb['depth_vcf'] = outputRow_v['depth']
        outputRow_comb['coverage_bool_gvcf'] = outputRow_g['coverage_bool']
        outputRow_comb['depth_gvcf'] = outputRow_g['depth']

        # DataFrame.append was removed in pandas 2.0; use concat instead
        outputDF_ = pd.concat([outputDF_, outputRow_comb], ignore_index=True)

        # remove s3 files, muting errors ('*.vcf*' also matches the plain '*.vcf' files)
        os.system('rm *.vcf* > /dev/null 2>&1')

    return outputDF_
Example #3
def check_vcf( input ):
    v = VCF( input )

    debug = False

    print("\n".join(v.metadata))

    for line in v.lines():
        r = line.ref
        a = line.alt_list

        if len(r) > 1:
            if len(a) > 1:
                raise Exception("WARNING: multi-allelic change not coded for")

            line1 = copy.deepcopy( line )
            line2 = copy.deepcopy( line )

            if len(a[0]) > 1:
                if len(r) < len(a[0]):          # insertion
                    if len(r) == 2:
                        line1.ref = r[1]
                        line1.alt = r[1]+a[0][2:]
                        line1.pos += 1
                        line2.ref = r[1]
                        line2.alt = a[0][1]
                        line2.pos += 1
                    if debug: print("====")
                    print(line1)
                    print(line2)
                    if debug: print(line)
                elif len(a[0]) < len(r):        # deletion
                    if len(a[0]) == 2:
                        line1.ref = r[1:]
                        line1.alt = r[1]
                        line1.pos += 1
                        line2.ref = r[1]
                        line2.alt = a[0][1]
                        line2.pos += 1
                    if debug: print("====")
                    print(line1)
                    print(line2)
                    if debug: print(line)
                else:
                    print(line)
            else:
                print(line)
        else:
            print(line)
Example #4
    def test_genotypes(self):
        homo_ref = VCF.process_snp_call('0/0:10,9:19:99:254,0,337', 'A', 'T', IUPAC_ambiguities=True)
        self.assertEqual(homo_ref, 'A')

        heterozygote = VCF.process_snp_call('0/1:10,9:19:99:254,0,337', 'A', 'T', IUPAC_ambiguities=True)
        self.assertEqual(heterozygote, 'W')

        homo_alt = VCF.process_snp_call('1/1:10,9:19:99:254,0,337', 'A', 'T', IUPAC_ambiguities=True)
        self.assertEqual(homo_alt, 'T')

        second_alt = VCF.process_snp_call('0/2:10,9:19:99:254,0,337', 'A', 'T,G', IUPAC_ambiguities=True)
        self.assertEqual(second_alt, 'R')

        double_alt = VCF.process_snp_call('1/2:10,9:19:99:254,0,337', 'A', 'T,G', IUPAC_ambiguities=True)
        self.assertEqual(double_alt, 'K')
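The expected values above follow the standard IUPAC ambiguity codes for two-base mixtures (W = A/T, R = A/G, K = G/T). A minimal sketch of that mapping; ambiguity_code is a hypothetical helper, not part of VCF.process_snp_call:

IUPAC = {frozenset('AT'): 'W', frozenset('AG'): 'R', frozenset('GT'): 'K',
         frozenset('CT'): 'Y', frozenset('AC'): 'M', frozenset('CG'): 'S'}

def ambiguity_code(base1, base2):
    # e.g. ('A', 'T') -> 'W'; ('A', 'G') -> 'R'; ('T', 'G') -> 'K'
    return IUPAC[frozenset((base1, base2))]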
Example #5
    def test_make_slices_default_with_params_set(self):
        """Test slicing function with window_size set"""

        for count, i in enumerate(VCF.get_slice_indicies(self.bgzip_path, regions=None, window_size=1008)):
            if count > 10: break
        
        self.assertEqual(i, ('Chr01', 11089, 12096))
Example #6
    def test_make_slices_default_settings(self):
        """Test slicing function with default settings: 500 bp slices"""
 
        for count, i in enumerate(VCF.get_slice_indicies(self.bgzip_path, regions=None, window_size=500)):
            if count > 10: break
        
        self.assertEqual(i, ('Chr01', 5501, 6000))
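Both tests are consistent with fixed, non-overlapping, 1-based windows of window_size bp, so the twelfth slice (count == 11) starts at 11*window_size + 1. A sketch that reproduces the expected tuples; the real get_slice_indicies also reads chromosome names and lengths from the bgzipped VCF, for which chrom and chrom_length stand in here:

def make_slices(chrom, chrom_length, window_size=500):
    # yields ('Chr01', 1, 500), ('Chr01', 501, 1000), ...
    for start in range(1, chrom_length + 1, window_size):
        yield (chrom, start, start + window_size - 1)

for count, i in enumerate(make_slices('Chr01', 20000, window_size=1008)):
    if count > 10: break
assert i == ('Chr01', 11089, 12096)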
Example #7
def getGOIHits(fileNames, chrom, pos1, pos2):
    print('getting hits to GOI')

    global queryChrom, lPosQuery, rPosQuery  # don't like this
    genomePos_laud_db = pd.Series(database_laud['Mutation genome position'])

    cells_dict_GOI = {}
    queryChrom = chrom
    lPosQuery = pos1
    rPosQuery = pos2

    for f in fileNames:
        numMatches = 0
        cell = f.replace("../vcf/", "")
        cell = cell.replace(".vcf", "")

        df = VCF.dataframe(f)
        genomePos_query = df.apply(
            getGenomePos, axis=1)  # apply function for every row in df

        shared = list(set(genomePos_query)
                      & set(genomePos_laud_db))  # get the LAUD filter set
        shared1 = pd.Series(shared)  # convert to a Series so we can use apply

        numMatches = shared1.apply(hitSearchFunc)  # another apply call

        cells_dict_GOI.update({cell: sum(numMatches)})

    return cells_dict_GOI
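A hedged usage sketch for getGOIHits; the file paths and coordinate interval are hypothetical, and it assumes database_laud plus the helpers it leans on (getGenomePos, hitSearchFunc) are already defined at module level, as in the surrounding examples:

fileNames = ['../vcf/cell_001.vcf', '../vcf/cell_002.vcf']  # hypothetical paths
hits = getGOIHits(fileNames, '7', 55019017, 55211628)       # hypothetical GOI interval
for cell, n in sorted(hits.items()):
    print('%s\t%d' % (cell, n))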
Example #8
def generate_slices(args):

	vcf = VCF.VCF()
	vcf.populations = args.populations
	vcf.set_chrms(args.input)

	chrm_2_windows = vcf.chrm2length.fromkeys(vcf.chrm2length.keys(),None)
	
	for count, chrm in enumerate(vcf.chrm2length.keys()):

		length = vcf.chrm2length[chrm]
		window_size = args.window_size
		overlap = args.overlap

		# Skip contigs that are too short
		if length <= window_size: continue

		# Fit windows into remaining space
		if (length % window_size) > overlap:
			start = (length % window_size) // 2    # floor division: range() needs ints
			stop = (length - window_size) - overlap // 2

		# Prevent windows from invading remaining space
		if (length % window_size) <= overlap:
			start = (length % window_size) // 2
			stop = length - overlap*2

		starts = range(start, stop, overlap)
		stops = [i+window_size for i in starts]
		windows = list(zip(starts, stops))  # zip is lazy in Python 3; materialize it
		
		chrm_2_windows[chrm] = windows

	return chrm_2_windows
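To make the window arithmetic concrete: for a hypothetical contig of length 10,500 with window_size 1,000 and overlap 250, length % window_size is 500, which exceeds the overlap, so the first branch applies:

length, window_size, overlap = 10500, 1000, 250   # hypothetical values
start = (length % window_size) // 2               # 500 // 2 = 250
stop = (length - window_size) - overlap // 2      # 9500 - 125 = 9375
starts = range(start, stop, overlap)              # 250, 500, 750, ..., 9250
windows = list(zip(starts, [s + window_size for s in starts]))
print(windows[0], windows[-1])                    # (250, 1250) (9250, 10250)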
Example #9
def summary_haplotype_block(vcf, haplotype, outfile):
    phased_block = defaultdict(list)
    phased_block_com = defaultdict(list)
    import VCF
    for v in VCF.lines(vcf):
        block_id = re.split(r':', v['FCM'][1])[1]
        genotype = re.split(r':', v['FCM'][0])[0]
        #print genotype
        #print '{}\t{}\t{}'.format(v['CHROM'], v['POS'], block_id)
        block_idx = '{}_{}'.format(v['CHROM'], block_id)
        snp_idx   = '{}:{}'.format(v['CHROM'], v['POS'])
        bases     = [v['REF'], v['ALT']]
        hap1_10x  = bases[int(genotype[0])]
        haplotype_flag = -1
        if snp_idx in haplotype:  # dict.has_key was removed in Python 3
            if hap1_10x == haplotype[snp_idx][0]:
                haplotype_flag = 0
            elif hap1_10x == haplotype[snp_idx][1]:
                haplotype_flag = 1
        phased_block[block_idx].append(int(v['POS']))
        phased_block_com[block_idx].append(haplotype_flag)

    ofile = open(outfile, 'w')
    for blc in phased_block.keys():
        snps  = len(phased_block[blc])  
        start = np.min(phased_block[blc])
        end   = np.max(phased_block[blc])
        length= int(end) - int(start) + 1
        hap1_n = len([i for i in phased_block_com[blc] if i == 0])
        hap2_n = len([i for i in phased_block_com[blc] if i == 1])
        hap0_n = len([i for i in phased_block_com[blc] if i == -1])
        print('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(blc, snps, length, start, end, hap1_n, hap2_n, hap0_n), file=ofile)
    ofile.close()
Example #10
def make_dadi_fs(args, region):

	vcf = VCF.VCF()
	vcf.populations = args.populations
	vcf.set_header(args.input)

	pop_ids = args.populations.keys()

	# Get slice and setup output dictionaries
	chunk = vcf.slice_vcf(args.input, *region)
	if chunk is None:
		return None

	else:
		g = count_alleles(chunk, args.populations)

		final_dadi = {}
		population_level_dadis = {pop: {} for pop in pop_ids}  # dict.fromkeys would share one dict across all keys

		for row_count, row in enumerate(g):

			raw_calls = chunk[row_count] 
			row['outgroups'] = {'ALT': 0, 'REF': 0} # set empty outgroup

			# To Do: Need to create a function to fill outgroup if one is defined.
			# The heliconius dataset, for example, has this.

			if not check_outgroup(row): continue # skip if outgroup is not fixed at one value
			if len(raw_calls['REF']) > 1 or len(raw_calls["ALT"]) > 1: continue # skip multi allelic sites
			
			# CALL BASE FOR OUTGROUP
			outgroup_allele = get_outgroup_base(row, raw_calls)

			# CALL MAJOR ALLELE (BASE) FOR INGROUP
			major_allele = get_ingroup_major_allele(row, raw_calls, outgroup_allele)

			# POLARIZE REF AND ALT FOR INGROUP
			if major_allele != raw_calls['REF']:
				ref, alt = ('ALT','REF')
			else:
				ref, alt = ('REF','ALT')


			calls = {}
			for count, pop in enumerate(pop_ids):
				calls[pop] = (row[pop][ref], row[pop][alt])

			row_id = "{0}_{1}".format(raw_calls['CHROM'],raw_calls['POS'])
		
			dadi_site = {'calls': calls,
				   'context': make_triplet(major_allele),
				   'outgroup_context': make_triplet(outgroup_allele),
				   'outgroup_allele': outgroup_allele,
				   'segregating': (raw_calls[ref], raw_calls[alt])
				   }

			final_dadi[row_id] = dadi_site

		return (final_dadi, pop_ids)
Example #11
    def test_header_vs_population_sample_ids(self):
        """Check that the sample IDs parsed from the population arguement
            match those in the VCF file.

            NOTE: In practice the populations arguement can contain fewer 
            samples and populations than actually contained in the VCF file.
        """

        header = VCF.set_header(self.bgzip_path)
        header_sample_ids = [item for count, item in enumerate(header) if count >= 9]
        
        populations_dict  = VCF.parse_populations_list(self.populations_list)
        populations_sample_ids = [i for l in populations_dict.values() for i in l]

        # Check both unique IDs and equal length
        self.assertEqual(set(header_sample_ids), set(populations_sample_ids))
        self.assertEqual(len(header_sample_ids), len(populations_sample_ids))
Example #12
    def test_population_string_parsing(self):
        populations = VCF.parse_populations_list(self.populations_list)

        self.assertEqual(populations, {'melpo': ['m523', 'm524', 'm525',
            'm589', 'm675', 'm676', 'm682', 'm683', 'm687', 'm689'],
            'pachi': ['p516', 'p517', 'p518', 'p519', 'p520', 'p591',
            'p596', 'p690', 'p694', 'p696'], 'cydno': ['c511',
            'c512', 'c513', 'c514', 'c515', 'c563', 'c614', 'c630',
            'c639', 'c640'], 'outgroups': ['h665', 'i02-210']})
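The expected dictionary implies that parse_populations_list turns a population specification string into named lists of sample IDs. A standalone sketch under an assumed input format (the real delimiter syntax is not shown in these examples):

def parse_populations(spec):
    # assumed format: 'melpo:m523,m524 pachi:p516,p517' -- population name,
    # a colon, then comma-separated sample IDs; populations split on whitespace
    return {name: ids.split(',')
            for name, ids in (chunk.split(':') for chunk in spec.split())}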
Example #13
def runBatch(cell):
    try:
        cellName = cell.rstrip()
        #get_s3_files(cell)

        cwd = os.getcwd()
        vcf_path = cwd + '/vcf_files/' + cell
        vcf_path_strip = vcf_path.rstrip() + '.vcf'
        gvcf_path = cwd + '/vcf_files/' + cell
        gvcf_path_strip = gvcf_path.rstrip() + '.g.vcf'

        vcf = VCF.dataframe(vcf_path_strip)
        gvcf = VCF.dataframe(gvcf_path_strip)

        # get a list of the records we actually care about
        toKeepList_v = vcf.apply(getGOI_record,
                                 axis=1,
                                 args=(chrom_, start_, end_))
        toKeepList_g = gvcf.apply(getGOI_record,
                                  axis=1,
                                  args=(chrom_, start_, end_))

        # subset by relevant records
        vcf_GOI = vcf[np.array(toKeepList_v, dtype=bool)]
        gvcf_GOI = gvcf[np.array(toKeepList_g, dtype=bool)]

        # get depth of coverage, for relevant records
        outputRow_v = getDepth_adv(vcf_GOI, cellName)
        outputRow_g = getDepth_adv(gvcf_GOI, cellName)

        # make the combined row, with both vcf and gvcf fields filled in
        outputRow_comb = pd.DataFrame(columns=colNames)  # colNames is a global
        outputRow_comb['cellName'] = outputRow_v['cellName']
        outputRow_comb['coverage_bool_vcf'] = outputRow_v['coverage_bool']
        outputRow_comb['depth_vcf'] = outputRow_v['depth']
        outputRow_comb['coverage_bool_gvcf'] = outputRow_g['coverage_bool']
        outputRow_comb['depth_gvcf'] = outputRow_g['depth']

    except Exception:  # avoid a bare except, which would also swallow KeyboardInterrupt
        outputRow_comb = pd.DataFrame(columns=colNames)  # just an empty row
        # fill in this row with something
    return outputRow_comb
Example #14
def sort_vcf( input, reference, output ):
    contig_list = get_contig_list( reference )

    print("read {} contigs".format(len(contig_list)))

    v = VCF( input, True )      # request index of VCF upon open

    with open( output, 'w') as fd_out:
        fd_out.writelines( [ line+"\n" for line in v.metadata ] )

        for contig in contig_list:
            print("writing entries for contig {}".format(contig))
            # filter lines from vcf by contig
            count = 0
            if v.seek(contig) < 0:
                print("skipped {} because it's not in the VCF".format( contig ))
            else:
                for line in v.lines( True, \
                        lambda raw_line: VCFLine(raw_line).chr == contig ):
                    fd_out.write( line.line+"\n" )
                    count += 1
                print("wrote {} entries for {}".format( count, contig ))
Example #15
def getRawCounts(fileNames):
    print('getting raw counts...')
    cells_dict = {}

    for f in fileNames:
        cell = f.replace("../vcf/", "")
        cell = cell.replace(".vcf", "")

        df = VCF.dataframe(f)
        unique = len(np.unique(df.POS))

        cells_dict.update({cell: unique})
    print('finished!')
    return cells_dict
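Several of these examples recover the cell name by replacing a hard-coded '../vcf/' prefix. A sketch of a more path-robust variant using only the standard library; cell_name is a hypothetical helper, not part of the VCF module:

import os

def cell_name(vcf_path):
    # '../vcf/cell_001.vcf' -> 'cell_001', regardless of the directory
    return os.path.splitext(os.path.basename(vcf_path))[0]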
Example #16
def callSNPs(current_base, numb_of_seqs):
    """Call the SNPs. Duh!"""

    blanks = np.zeros(numb_of_seqs, dtype='U1')  # np.string0 is gone from modern NumPy

    if current_base.FILTER == 'LowQual':
        blanks.fill("-")

    if current_base.FORMAT == 'GT':
        blanks.fill("-")

    for count, snp_call in enumerate(current_base[9:]):
        base = VCF.process_snp_call(snp_call, current_base.REF, current_base.ALT)
        blanks[count] = base

    return blanks
Example #17
def getGeneCellMutCounts(f):
    # Build a [cell, geneNames] pair: the cell's name plus a list of the
    # genes we found mutations in for that cell.
    cell = os.path.basename(f)
    cell = cell.replace(".vcf", "")
    print(cell)  # to see where we are

    df = VCF.dataframe(f)
    genomePos_query = df.apply(getGenomePos, axis=1)  # apply function for every row in df

    shared = list(set(genomePos_query))  # genomePos_query (potentially) has dups
    sharedGeneNames = [name for e in shared for name in getGeneName(e)]
    tup = [cell, sharedGeneNames]  # not really a tuple, just a list

    return tup
Example #18
def getFilterCountsLAUD(fileNames):
    print('getting filter counts LAUD...')
    cells_dict_laud = {}
    genomePos_laud_db = pd.Series(database_laud['Mutation genome position'])

    for f in fileNames:
        cell = f.replace("../vcf/", "")
        cell = cell.replace(".vcf", "")

        df = VCF.dataframe(f)
        genomePos_query = df.apply(
            getGenomePos, axis=1)  # apply function for every row in df

        shared = list(set(genomePos_query) & set(genomePos_laud_db))
        cells_dict_laud.update({cell: len(shared)})

    print('finished!')
    return cells_dict_laud
Example #19
def getFilterCountsBasic(fileNames):
    print('getting filter counts basic...')
    cells_dict_filter = {}
    genomePos_db = pd.Series(database['Mutation genome position'])

    for f in fileNames:
        cell = f.replace("../vcf/", "")
        cell = cell.replace(".vcf", "")
        print(cell)
        df = VCF.dataframe(f)

        genomePos_query = df.apply(getGenomePos, axis=1)

        shared = list(set(genomePos_query) & set(genomePos_db))
        cells_dict_filter.update({cell: len(shared)})

        #print(cells_dict_filter)
    print('finished!')
    return cells_dict_filter
Example #20
def getGeneCellMutCounts(fileNames):
    print('getting gene/cell mutation counts...')
    cells_dict = {}
    genomePos_laud_db = pd.Series(database_laud['Mutation genome position'])

    for f in fileNames:
        cell = f.replace("../vcf_test/", "")
        cell = cell.replace(".vcf", "")
        print(cell)  # to see where we are
        df = VCF.dataframe(f)
        genomePos_query = df.apply(
            getGenomePos, axis=1)  # apply function for every row in df

        shared = list(set(genomePos_query) & set(genomePos_laud_db))

        shared_series = pd.Series(shared)
        sharedGeneNames = shared_series.apply(getGeneName)
        cells_dict.update({cell: sharedGeneNames})

    return cells_dict
Example #21
def getGOIHit_coords(fileNames, chrom, pos1, pos2):
    print('getting coords to GOI hits')

    global queryChrom, lPosQuery, rPosQuery  # don't like this
    genomePos_laud_db = pd.Series(database_laud['Mutation genome position'])
    cells_dict_GOI_coords = {}
    queryChrom = chrom
    lPosQuery = pos1
    rPosQuery = pos2

    for f in fileNames:
        numMatches = 0
        cell = f.replace("../vcf/", "")
        cell = cell.replace(".vcf", "")

        df = VCF.dataframe(f)
        genomePos_query = df.apply(
            getGenomePos, axis=1)  # apply function for every row in df
        # get the entries shared between curr cells VCF and the LAUD filter set
        #	remember, these are general, and NOT gene specific
        genomePos_query_expand = expandSet(set(genomePos_query))

        shared = list(set(genomePos_query_expand)
                      & set(genomePos_laud_db))  # problem is right here!!!
        shared1 = pd.Series(shared)  # convert to pandas obj
        matches = shared1.apply(hitSearchFunc_coords)  # another apply call

        # delete empty keys (iterate over a copy so deletion is safe)
        for k in list(matches.keys()):
            try:
                if len(matches[k]) < 1:
                    del matches[k]
            except Exception:
                pass

        cells_dict_GOI_coords.update({cell: list(matches.values)})

    return cells_dict_GOI_coords
Example #22
gvcfFilePrefix = sys.argv[5]

cellName = str(vcfFilePrefix).replace('.vcf', '')  # strip('.vcf') would remove characters, not the suffix

print('  ')
print('chromosome: %s' % chrom_)
print('start_position: %s' % start_)
print('end_position: %s' % end_)
print('cell name: %s' % cellName)
print(' ')

cwd = os.getcwd()
vcf_path = cwd + '/' + vcfFilePrefix
gvcf_path = cwd + '/' + gvcfFilePrefix

vcf = VCF.dataframe(vcf_path)
gvcf = VCF.dataframe(gvcf_path)

# get a list of the records we actually care about
toKeepList_v = vcf.apply(getGOI_record, axis=1, args=(chrom_, start_, end_))
toKeepList_g = gvcf.apply(getGOI_record, axis=1, args=(chrom_, start_, end_))

# subset by relevant records
vcf_GOI = vcf[np.array(toKeepList_v, dtype=bool)]
gvcf_GOI = gvcf[np.array(toKeepList_g, dtype=bool)]

# get depth of coverage, for relevant records
getDepth_adv(vcf_GOI)
getDepth_adv_g(gvcf_GOI)

#////////////////////////////////////////////////////////////////////
Example #23
from sys import argv
from os import path
from itertools import groupby
from operator import itemgetter

import VCF
import cyvcf2

try:
	## MUST BE A UNCOMPRESSED VCF file
	finput = argv[1]
except IndexError as ie:
	exit("{}\nUSAGE: $0 $vcf_file ".format(ie))

if not path.exists(finput):
	msg = "ERROR: FNF {}".format(finput)
	raise IOError(msg)

d = {}

for v in VCF.lines(finput):
	if v['CHROM'] in d:
		d[v['CHROM']].append(v['POS'])
	else:
		d[v['CHROM']] = [v['POS']]

with open("{}.consPos.txt".format(finput), 'wt') as of:
	for key,val in d.items():
		## make sure all positions are integer ; if not raise error
		try:
			data = [int(i) for i in val]
		except ValueError as ve:
			exit("ERROR: {}".format(e))
		# https://stackoverflow.com/questions/2361945/detecting-consecutive-integers-in-a-list
		for k, g in groupby(enumerate(data), lambda ix: ix[0] - ix[1]):
			cn = list(map(itemgetter(1), g))
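The groupby call above clusters consecutive positions: enumerate pairs each value with its index, and index minus value is constant exactly within a run of consecutive integers. A small worked example, independent of any VCF input:

from itertools import groupby
from operator import itemgetter

data = [1, 2, 3, 7, 8, 12]
for k, g in groupby(enumerate(data), lambda ix: ix[0] - ix[1]):
    run = list(map(itemgetter(1), g))
    print(run[0], run[-1])   # prints: 1 3, then 7 8, then 12 12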
Example #24
def main(args):

	vcf = VCF.VCF()
	vcf.populations = args.populations
	vcf.set_header(args.input)

	pop_ids = args.populations.keys()

	# get slice and setup output dictionaries
	chunk = vcf.vcf_chunk_2_dadi(args.input, args.populations, *args.region)
	g = count_alleles(chunk, args.populations)


	# Create Header Row
	dadi_header = create_dadi_header(args)

	fout = open(args.output,'w')
	fout.write(dadi_header + "\n")

	for row_count, row in enumerate(g):

		raw_calls = chunk[row_count] 
		row['outgroups'] = {'ALT': 0, 'REF': 0} # set empty outgroup

		# To Do: Need to create a function to fill outgroup if one is defined.
		# The heliconius dataset, for example, has this.

		if not check_outgroup(row): continue # skip if outgroup is not fixed at one value
		if len(raw_calls['REF']) > 1 or len(raw_calls["ALT"]) > 1: continue # skip multi allelic sites
		
		# CALL BASE FOR OUTGROUP
		outgroup_allele = get_outgroup_base(row, raw_calls)

		# CALL MAJOR ALLELE (BASE) FOR INGROUP
		major_allele = get_ingroup_major_allele(row, raw_calls, outgroup_allele)

		# POLARIZE REF AND ALT FOR INGROUP
		if major_allele != raw_calls['REF']:
			ref, alt = ('ALT','REF')
		else:
			ref, alt = ('REF','ALT')

		#  CREATE DADI ROW
		dadi_row = [make_triplet(major_allele), make_triplet(outgroup_allele)]

		for count, pop in enumerate(pop_ids):
			if count == 0:
				dadi_row.append(chunk[row_count][ref])
				dadi_row.append(row[pop][ref])
			else:
				dadi_row.append(row[pop][ref])


		for count, pop in enumerate(pop_ids):
			if count == 0:
				dadi_row.append(chunk[row_count][alt]) 
				dadi_row.append(row[pop][alt])
			else:
				dadi_row.append(row[pop][alt])

		dadi_row.append(raw_calls['CHROM'])
		dadi_row.append(raw_calls['POS'])

		dadi_row = " ".join([str(item) for item in dadi_row])
		fout.write(dadi_row + "\n")
Example #25
def summary_haplotype_block(vcf, haplotype, outfile_up, outfile_down):
    phased_block = defaultdict(list)
    phased_block_com  = defaultdict(list)
    phased_block_rank = defaultdict(int)
    count = 0
    import VCF
    for v in VCF.lines(vcf):
        block_id = re.split(r':', v['FCM'][1])[1]
        genotype = re.split(r':', v['FCM'][0])[0]
        #print genotype
        #print '{}\t{}\t{}\t{}'.format(v['CHROM'], v['POS'], block_id, v['FCM'][0])
        block_idx = '{}_{}'.format(v['CHROM'], block_id)
        snp_idx   = '{}:{}'.format(v['CHROM'], v['POS'])
        bases     = [v['REF'], v['ALT']]
        hap1_10x  = bases[int(genotype[0])]
        haplotype_flag = -1
        if snp_idx in haplotype:  # dict.has_key was removed in Python 3
            if hap1_10x == haplotype[snp_idx][0]:
                haplotype_flag = 0
            elif hap1_10x == haplotype[snp_idx][1]:
                haplotype_flag = 1
        if block_idx not in phased_block:
            count += 1
        phased_block[block_idx].append(int(v['POS']))
        phased_block_com[block_idx].append(haplotype_flag)
        phased_block_rank[block_idx] = count

    for blc in phased_block.keys():
        chrs, blc_id = re.split(r'_', blc)
        snps  = len(phased_block[blc])  
        start = np.min(phased_block[blc])
        end   = np.max(phased_block[blc])
        length= int(end) - int(start) + 1
        hap1_n = len([i for i in phased_block_com[blc] if i == 0])
        hap2_n = len([i for i in phased_block_com[blc] if i == 1])
        hap0_n = len([i for i in phased_block_com[blc] if i == -1])
        print('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(blc, snps, length, chrs, start, end, hap1_n, hap2_n, hap0_n))
        ratio = 0.0
        #try:
        #    ratio = np.max(float(hap1_n)/(float(hap1_n)+float(hap2_n)), float(hap2_n)/(float(hap1_n)+float(hap2_n)))
        #except:
        #    continue
        min_snp = 1
        if hap1_n >= min_snp and hap1_n == snps - hap0_n:
            ratio = 1
        elif hap2_n >= min_snp and hap2_n == snps - hap0_n:
            ratio = 1
        elif hap1_n >= min_snp and hap1_n > hap2_n:
            ratio = float(hap1_n)/(float(hap1_n)+float(hap2_n))
        elif hap2_n >= min_snp and hap2_n > hap1_n:
            ratio = float(hap2_n)/(float(hap1_n)+float(hap2_n)) 

        print(ratio)
        if ratio > 0.95: 
            color = 'gray'
            hap   = 3
            print(hap1_n, hap2_n)
            if hap1_n > hap2_n:
                color = 'orange'
                hap   = 1
            else:
                color = 'blue' 
                hap   = 2
            print(color)
            if phased_block_rank[blc]%2 == 1:
                print('{}\t{}\t{}\t{}\t{}\t+'.format(chrs, start, end, hap, color), file=outfile_up)
            else:
                print('{}\t{}\t{}\t{}\t{}\t+'.format(chrs, start, end, hap, color), file=outfile_down)
Example #26
#/////////////////////////////////////////////////////////////////////////
# script: convert_to_csv.py
# author: Lincoln
# date: 3.18.19
#
# want to convert any remaining vcfs to csv
#/////////////////////////////////////////////////////////////////////////
import pandas as pd
import VCF
import os
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

filterDir = '/home/ubuntu/code/SNP_calling_pipeline/bulkAnalysis/scVCF_filtered_all/'
filterDir_list = os.listdir(filterDir)

for f in filterDir_list:
	if '.vcf' in f:
		currPATH = filterDir + f
		df = VCF.dataframe(currPATH)
		df_trimmed = df[['CHROM', 'POS', 'ID', 'REF', 'ALT']]

		cellName = f.replace('.vcf', '')  # strip('.vcf') would remove characters, not the suffix
		outStr = filterDir + cellName + '.csv'
		df_trimmed.to_csv(outStr, index=False)

#/////////////////////////////////////////////////////////////////////////
#/////////////////////////////////////////////////////////////////////////
Example #27
    def test_header_to_ordereddict_parsing(self):
        header = VCF.set_header(self.bgzip_path)

        self.assertEqual(header, self.header_dict)
Example #28
def LoadDataSet(vcfInfile, trainingSet, qFaLen):

    if len(trainingSet) == 0: raise ValueError('[ERROR] No training data found')
    if vcfInfile[-3:] == '.gz':
        I = os.popen('gzip -dc %s' % vcfInfile)
    else:
        I = open(vcfInfile)

    data, hInfo = [], VCF.VCFHeader()
    while True:  # VCF format

        lines = I.readlines(100000)
        if not lines: break
        for line in lines:

            col = line.strip('\n').split()
            if re.search(r'^#CHROM', line): col2sam = {i+9:sam for i,sam in enumerate(col[9:])}

            # Record the header information
            if re.search(r'^#', line):
                hInfo.Record(line.strip('\n'))
                continue

            # Get inbreeding coefficient. If fail then continue.
            # It's calculated like: 1.0 - hetCount/Expected_hetCount in VCF
            #inbCoeff = re.search(r';F=([^;]+)', col[7])
            inbCoeff = re.search(r';?InbCoeff=([^;]+)', col[7])
            if not inbCoeff:
                continue
                #print >> sys.stderr, '[ERROR] No inbreeding coefficient "InbCoeff=..." in INFO field in vcf:\n%s\n' % vcfInfile
            inbCoeff = float('%.2f' % float(inbCoeff.group(1)))

            fmat = {k:i for i,k in enumerate(col[8].split(':'))} # Get Format
            if 'QR' not in fmat: continue # Caused by INTERGAP. We'd better delete this statement, because the error is caused by the USER

            for tag in ['AA', 'QR', 'NR']:
                if tag not in fmat: raise ValueError('[ERROR] The "Format" fields did not contain "%s" in VCF: %s\nAT: %s\n' % (tag, vcfInfile, line))

            isBiallelic = True
            if len(col[4].split(',')) > 1: isBiallelic = False

            annotations = []
            atleastOne  = False
            for i, sample in enumerate(col[9:]): 

                sampleId  = col2sam[9+i]
                if sample == './.': continue
                field = sample.split(':')
                if len(field[fmat['AA']].split(',')) != 4: continue

                if len(field) < fmat['QR'] + 1: continue
                qr    = field[fmat['QR']].split(',')[-1]
                if qr == '.': continue

                atleastOne = True
                qregion    = np.array(qr.split('-'))
                if len(qregion) > 3: qId = qregion[0] + '-' + qregion[1]
                else               : qId = qregion[0]
                qSta = int(qregion[-2])   # string.atoi was removed in Python 3
                qEnd = int(qregion[-1])

                if sampleId not in qFaLen          : raise ValueError('[ERROR] The sample name %s (in vcf) is not in the Fa list.' % sampleId)
                if      qId not in qFaLen[sampleId]: raise ValueError('[ERROR]', qId, 'was not found in the fa file\n')
                qSta = int(qSta * 100 / qFaLen[sampleId][qId] + 0.5)
                qEnd = int(qEnd * 100 / qFaLen[sampleId][qId] + 0.5)
                if qSta > 100: qSta = 100 # Bug!!! Should delete
                if qEnd > 100: qEnd = 100 # Bug!!! Should delete
                if qSta > 100 or qEnd > 100: 
                    raise ValueError('[ERROR] Query size Overflow! sample: %s; scaffold: %s\n%s\n%s' % (sampleId, qId, sample, line))

                leg = min(qSta, 100 - qEnd)
                nn  = float(sample.split(':')[fmat['NR']])
                n   = int(1000 * nn + 0.5) / 10.0 # n ratio range: [0, 100]
                alt = int(sample.split(':')[fmat['AA']].split(',')[1]) # Alternate perfect
                bot = int(sample.split(':')[fmat['AA']].split(',')[3]) # Both imperfect
                annotations.append([isBiallelic, inbCoeff, leg, n , alt, bot])

            if not atleastOne: raise ValueError('[ERROR] None of the samples contain this variant.', col)
            datum                = vd.VariantDatum()
            datum.annotations    = np.median(annotations, axis = 0)
            pos                  = col[0] + ':' + col[1]
            datum.variantOrder   = pos
            if pos in trainingSet: datum.atTrainingSite = True
            data.append(datum)

    I.close()

    return hInfo, np.array(data)
Example #29
	distSquareMatrix = dist.squareform(distMatrix)
	linkageMatrix = hier.linkage(distMatrix,method='ward')
	dendro = hier.dendrogram(linkageMatrix)
	leaves2 = dendro['leaves']
	transformedData = transformedData[:,leaves2]
	##### leaves1 for the mutation sites
	##### leaves2 for the taxa
	fig_ = plt.figure(figsize=(6,6))
	ax_ = fig_.add_subplot(111)
	cax_ = ax_.matshow(transformedData,cmap='Blues',aspect="auto")
	ax_.set_ylabel('Cells')
	ax_.set_xlabel('Genomic Positions')


	# fig_.colorbar(cax_)
	fig_.savefig(out_path+"hier_clust_heatmap.png", dpi=1200)

	###########################################################################
	######################## Generate the VCF output ##########################
	###########################################################################
	VCF.gen_VCF(out_dir=out_path, genotype_mat=mat_, read_count_mat_=read_counts, chrs=chroms, posits=positions, alt_counts=alts, rfs=refs, ids=names, dps=depths)

	###################################################################################
	######################## Generate Perfect Phylogeny Newick ########################
	###################################################################################

	if K_==0:
		Phylo_module.gen_Newick(genotype=mat_, PerfectPhy_path=PerfectPhy_path_, out_dir_path=out_path, names_=names)


Example #30
def nr_sensitivity( input, sample, truth, minqual=0, misses=None, debug=False ):
    print("input={}\ntruth={}\nminqual={}".format(input, truth, minqual), file=sys.stderr)

    truth_vcf = VCF( truth )
    eval_vcf = VCF( input )

    if misses:
        misses_fd = open( misses, 'w' )
        found = False
        for line in truth_vcf.metadata:
            if line[:len("#CHROM")]=="#CHROM":
                misses_fd.write( '##nr_concordance="comment={subset of '\
                    'missed sites created by DISCOVAR release bundle Python program '\
                    'nr_concordance.py}"\n')
                found = True
            misses_fd.write(line+"\n")
        if not found:
            raise Exception("program bug? made it this far without #CHROM in truth VCF?")

    else: misses_fd = None

    eval_sample_index = eval_vcf.sample_names.index(sample)
    truth_sample_index = truth_vcf.sample_names.index(sample)

    eval_chr=[]
    truth_chr=[]

    eval_gen = eval_vcf.lines()
    eval_line = next(eval_gen)  # generator.next() was removed in Python 3
    eval_chr.append( eval_line.chr )

    eval_done = False

    n_truth_lines = 0
    n_site_hits = 0
    n_site_concords = 0

    last_truth=(None,None)
    last_eval=(None,None)

    for truth_line in truth_vcf.lines():

        check_sort_order( truth, last_truth, truth_line.chr, truth_line.pos )

        truth_genotype=truth_line.get_sample_dict(truth_sample_index)["GT"]
        if truth_genotype == "0/0" or truth_genotype=="0|0" or truth_genotype == ".": continue
        if debug: print >>sys.stderr,"seeking {}:{}".format( truth_line.chr, truth_line.pos )
        n_truth_lines += 1
        if n_truth_lines % 1000 == 0: print n_truth_lines

        # skip to the correct chromosome
        if len(truth_chr) == 0 or truth_chr[-1] != truth_line.chr:
            truth_chr.append( truth_line.chr )

        # if we've already passed this chr in the eval file, then spin
        if not eval_done and truth_line.chr != eval_line.chr and truth_line.chr in eval_chr:
            continue

        # if we've not already passed this chr, then find it
        while not eval_done and eval_line.chr != truth_line.chr:
            try:
                eval_line = next(eval_gen)
                check_sort_order( input, last_eval, eval_line.chr, eval_line.pos )
                if debug: print("...next chr={}".format( eval_line.chr ), file=sys.stderr)
                if eval_chr[-1] != eval_line.chr:
                    eval_chr.append(eval_line.chr)
                    print(eval_line.chr)
            except StopIteration: eval_done=True

        # try to find the correct position
        while not eval_done and eval_line.pos < truth_line.pos \
                and eval_line.chr == truth_line.chr:
            try:
                eval_line = next(eval_gen)
                check_sort_order( input, last_eval, eval_line.chr, eval_line.pos )
                if debug: print("...next chr:pos={}:{}".format( eval_line.chr, eval_line.pos ), file=sys.stderr)
                if eval_chr[-1] != eval_line.chr:
                    eval_chr.append(eval_line.chr)
                    print(eval_line.chr)
            except StopIteration: eval_done=True

        if minqual > 0 and eval_line.qual == '.':
            raise Exception("not sure what to do here, we're filtering on qual, but qual is '.'")

        if eval_done or eval_line.pos != truth_line.pos \
                or  eval_line.chr != truth_line.chr \
                or  ( eval_line.qual != '.' and float(eval_line.qual) < minqual ):
            if misses_fd: misses_fd.write(truth_line.line+"\n")
        else:
            if truth_line.ref != eval_line.ref:
                raise Exception("""
                    Your truth set does not seem to be called on the
                    same reference as your call set.  We're done here.
                    truth={}
                    truth_pos={}:{}
                    truth_ref={}

                    input={}
                    input_pos={}:{}
                    input_ref={}
                    """.format( truth, truth_line.chr, truth_line.pos,
                        truth_line.ref, input, eval_line.chr,
                        eval_line.pos, eval_line.ref ) )

            if debug:
                print("""
                Evaluating:
                truth_pos={}:{}
                truth_ref={}
                truth_alt={}

                input_pos={}:{}
                input_ref={}
                input_alt={}

                """.format( truth_line.chr, truth_line.pos,
                        truth_line.ref, truth_line.alt, eval_line.chr,
                        eval_line.pos, eval_line.ref, eval_line.alt ), file=sys.stderr)

            # grab truth NR bases and eval NR bases
            eval_genotype=eval_line.get_sample_dict( eval_sample_index)["GT"]
            if eval_genotype != ".":
                eval_calls_idx = eval_genotype.split("/")
                if eval_calls_idx[0] == eval_genotype: eval_calls_idx = eval_genotype.split("|")
                if '0' in eval_calls_idx: eval_calls_idx.remove('0')
                eval_calls_idx = list(map(int, eval_calls_idx))  # map is lazy in Python 3
                truth_calls_idx = truth_genotype.split("/")
                if truth_calls_idx[0] == truth_genotype: truth_calls_idx = truth_genotype.split("|")
                if '0' in truth_calls_idx: truth_calls_idx.remove('0')
                truth_calls_idx = list(map(int, truth_calls_idx))

                if len(eval_calls_idx) > 0:
                    n_site_hits += 1
                    if debug: print >>sys.stderr, "accepting site hit at {}:{}".format(eval_line.chr, eval_line.pos)
                elif debug:
                    print >>sys.stderr, "no NR calls"

                for truth_nr in [ truth_line.alt_list[i-1] for i in truth_calls_idx ]:
                    # if any truth non-reference call is not found on
                    # the eval line, then we break without counting the
                    # concordance
                    if truth_nr not in eval_line.alt_list:
                        if debug: print("""
                            non-concordant:
                            pos={}:{}
                            truth_ref={}
                            eval_ref={}
                            truth_alt={}
                            eval_alt={}
                            """.format( truth_line.chr, truth_line.pos,
                                    truth_line.ref, eval_line.ref,
                                    truth_line.alt_list,
                                    eval_line.alt_list ), file=sys.stderr)
                        break

                else:
                    # normal termination of the for loop, so count the
                    # concordance
                    if debug: print >>sys.stderr,"""
                            CONCORDANT:
                            pos={}:{}
                            truth_ref={}
                            eval_ref={}
                            truth_alt={}
                            eval_alt={}
                            """.format( truth_line.chr, truth_line.pos,
                                    truth_line.ref, eval_line.ref,
                                    truth_line.alt_list,
                                    eval_line.alt_list )
                    n_site_concords += 1

    print "n_truth_lines={}, n_site_hits={}, n_site_concords={}, site_hit_frac={}, site_concord_frac={}".format(
            n_truth_lines, n_site_hits, n_site_concords,
            n_site_hits/float(n_truth_lines),
            n_site_concords/float(n_truth_lines)
            )

    # check that for the eval chromosomes that are also in the truth
    # set, that they come in the same order

    # first form intersection set
    overlap_chr = set(truth_chr).intersection(set(eval_chr))

    truth_chr_rev = [ chr for chr in truth_chr if chr in overlap_chr ]
    truth_chr_rev.reverse()

    for chr in eval_chr:
        if chr not in overlap_chr: continue
        if chr != truth_chr_rev[-1]:
            raise Exception("""
            input chromosome ordering doesn't match truth chromosome ordering:
            input={}
            truth={}
            """.format( eval_chr, truth_chr ) )
        truth_chr_rev.pop()

    if misses_fd: misses_fd.close()