Ejemplo n.º 1
0
def calc_lvafs(list_CLV_target_range_base_counts):
	"""Compute, per position, the fraction of the strongest "other" base.

	Each element of ``list_CLV_target_range_base_counts`` is unpacked as
	``(position, base_counts)``, where ``base_counts[0]`` is a base letter
	(one of A, C, G, T, N; presumably the reference base - confirm with the
	caller) and ``base_counts[1]`` holds per-base counts indexed in
	A, C, G, T order.

	Returns a list of ``(position, fraction)`` tuples, where ``fraction`` is
	``division_zero_tolerant(highest count among the four bases other than
	base_counts[0], sum of the first four counts)``.

	Raises ``KeyError`` if ``base_counts[0]`` is not one of A, C, G, T, N.
	"""
	base_index = {'A':0,'C':1,'G':2,'T':3,'N':4}

	lvafs = []
	for position, base_counts in list_CLV_target_range_base_counts:
		# 'N' maps to index 4, so for an 'N' entry none of the four
		# real bases is excluded from the maximum.
		own_index = base_index[base_counts[0]]
		other_counts = [base_counts[1][i] for i in range(4) if i != own_index]
		depth = sum(base_counts[1][:4])
		lvafs.append((position, division_zero_tolerant(max(other_counts), depth)))

	return lvafs
Ejemplo n.º 2
0
def calc_indel_score(list_CIS_target_position_base_counts,list_CIS_target_range_base_counts,int_CIS_target_position,list_CIS_target_base_change,float_CIS_LOD_stdev_test,float_CIS_LOD_stdev_cutoff_BE,int_CIS_NOP_target_range):
	"""Score an insertion/deletion at the target position.

	``list_CIS_target_base_change`` holds two allele strings. When the second
	is longer the change is treated as type 0 and the indel sequence is the
	second allele without its first character; otherwise it is type 1 and the
	indel sequence is the first allele without its first character.

	The supporting-read count is looked up by indel sequence in
	``list_CIS_target_position_base_counts[2][type]`` (0 if absent) and the
	read depth is the sum of the first four entries of
	``list_CIS_target_position_base_counts[1]``.

	Returns ``[Nsupp, Rdepth, VAF, [formal_test_result]]`` where the test
	result comes from ``formal_testing`` run against ``calc_lvafs`` of the
	target range.

	Raises ``ValueError`` if both alleles have the same length.
	"""
	first_allele = list_CIS_target_base_change[0]
	second_allele = list_CIS_target_base_change[1]

	if len(first_allele) == len(second_allele):
		raise ValueError('Indel not recognized')

	# The first character of the longer allele is dropped to obtain the indel sequence.
	if len(first_allele) < len(second_allele):
		indel_type, indel_seq = 0, second_allele[1:]
	else:
		indel_type, indel_seq = 1, first_allele[1:]

	# Supporting reads from the pileup; an unseen indel counts as zero.
	indel_counts = list_CIS_target_position_base_counts[2][indel_type]
	n_supp = indel_counts[indel_seq] if indel_seq in indel_counts else 0

	read_depth = sum(list_CIS_target_position_base_counts[1][:4])
	vaf = division_zero_tolerant(n_supp, read_depth)

	range_lvafs = calc_lvafs(list_CIS_target_range_base_counts)
	formal_test = formal_testing(
		vaf,
		range_lvafs,
		int_CIS_target_position,
		float_CIS_LOD_stdev_test,
		float_CIS_LOD_stdev_cutoff_BE,
		int_CIS_NOP_target_range,
	)

	return [n_supp, read_depth, vaf, [formal_test]]
Ejemplo n.º 3
0
def get_quality(dict_GQ_read, int_GQ_phred_cutoff):
    """Return the read's low-quality percentage, rounded up.

    Counts the characters of ``dict_GQ_read['quals']`` whose ``Chr2Phred``
    value is below ``int_GQ_phred_cutoff``, divides that count by the length
    of ``dict_GQ_read['read']`` via ``division_zero_tolerant``, and returns
    ``ceil`` of the fraction times 100.
    """
    low_quality_count = sum(
        1 for qual_chr in dict_GQ_read['quals']
        if Chr2Phred(qual_chr) < int_GQ_phred_cutoff)

    read_length = len(dict_GQ_read['read'])
    low_quality_fraction = division_zero_tolerant(low_quality_count,
                                                  read_length)

    return ceil(low_quality_fraction * 100)
Ejemplo n.º 4
0
def calc_tvafs(list_CTV_target_range_base_counts,*list_CTV_target_base_changes):
	"""Compute, per position, the fraction of the targeted base change(s).

	Every extra positional argument is a base change whose element 0 is the
	originating base and whose element 1 is the resulting base (one of
	A, C, G, T, N). Changes are grouped by originating base.

	Each element of ``list_CTV_target_range_base_counts`` is unpacked as
	``(position, base_counts)``. Positions whose ``base_counts[0]`` is not the
	originating base of any given change are skipped. For the others the
	result is ``division_zero_tolerant(highest count among the resulting bases
	registered for that originating base, sum of the first four counts)``.

	Returns a list of ``(position, fraction)`` tuples.

	Raises ``KeyError`` if a resulting base is not one of A, C, G, T, N.
	"""
	base_index = {'A':0,'C':1,'G':2,'T':3,'N':4}

	# originating base -> count indices of the bases it may change into
	target_indices = {}
	for base_change in list_CTV_target_base_changes:
		target_indices.setdefault(base_change[0], []).append(base_index[base_change[1]])

	tvafs = []
	for position, base_counts in list_CTV_target_range_base_counts:
		if base_counts[0] not in target_indices:
			continue
		changed_counts = [base_counts[1][i] for i in target_indices[base_counts[0]]]
		depth = sum(base_counts[1][:4])
		tvafs.append((position, division_zero_tolerant(max(changed_counts), depth)))

	return tvafs
Ejemplo n.º 5
0
def calc_snp_score(list_CSS_target_position_base_counts,list_CSS_target_range_base_counts,int_CSS_target_position,list_CSS_target_base_change,float_CSS_LOD_stdev_test,float_CSS_LOD_stdev_cutoff_BE,int_CSS_NOP_target_range):
	"""Score a single-base substitution at the target position.

	``list_CSS_target_base_change`` must hold two single-character alleles.
	The supporting-read count is the entry of
	``list_CSS_target_position_base_counts[1]`` for the second allele
	(counts indexed in A, C, G, T, N order) and the read depth is the sum of
	the first four entries.

	Returns ``[Nsupp, Rdepth, VAF, [test_vs_lvafs, test_vs_tvafs]]`` where the
	two test results come from ``formal_testing`` run against ``calc_lvafs``
	and ``calc_tvafs`` of the target range, in that order.

	Raises ``ValueError`` unless both alleles have length 1.
	"""
	allele_lengths = (len(list_CSS_target_base_change[0]), len(list_CSS_target_base_change[1]))
	if allele_lengths != (1, 1):
		raise ValueError('SNP base change not recognized')

	base_index = {'A':0,'C':1,'G':2,'T':3,'N':4}

	position_counts = list_CSS_target_position_base_counts[1]
	n_supp = position_counts[base_index[list_CSS_target_base_change[1]]]
	read_depth = sum(list_CSS_target_position_base_counts[1][:4])

	vaf = division_zero_tolerant(n_supp, read_depth)

	# Background distributions: first across all other bases, then
	# restricted to the same base change as the target.
	range_lvafs = calc_lvafs(list_CSS_target_range_base_counts)
	range_tvafs = calc_tvafs(list_CSS_target_range_base_counts, list_CSS_target_base_change)

	formal_tests = []
	for background in (range_lvafs, range_tvafs):
		formal_tests.append(formal_testing(
			vaf,
			background,
			int_CSS_target_position,
			float_CSS_LOD_stdev_test,
			float_CSS_LOD_stdev_cutoff_BE,
			int_CSS_NOP_target_range,
		))

	return [n_supp, read_depth, vaf, formal_tests]
Ejemplo n.º 6
0
def adjust_quality_distr(list_AQD_path_input_files, int_AQD_phred_cutoff,
                         int_AQD_input_max_low_q_percent,
                         int_AQD_max_read_filter_percent, int_AQD_read_chunks,
                         int_AQD_number_processes):
    """Pick the low-quality-percentage cutoff to use for read filtering.

    Reads the given FASTQ files in lockstep, hands batches of read chunks to
    ``quality_read_chunk`` in worker processes, and sums the returned
    per-chunk distributions element-wise into a 101-slot histogram
    (presumably indexed by low-quality percentage 0..100 - confirm against
    ``quality_read_chunk``).

    The histogram is then accumulated from slot 0 upwards until the
    cumulative share of reads reaches
    ``100 - int_AQD_max_read_filter_percent`` percent; the slot index where
    that happens is the minimum usable cutoff.

    Args:
        list_AQD_path_input_files: Paths of the FASTQ files to read together.
        int_AQD_phred_cutoff: Passed to ``quality_read_chunk`` for each chunk.
        int_AQD_input_max_low_q_percent: Cutoff requested by the caller.
        int_AQD_max_read_filter_percent: Used as ``100 - value`` to set the
            cumulative share at which accumulation stops.
        int_AQD_read_chunks: Passed to ``iter_double_chunked``.
        int_AQD_number_processes: Passed to ``iter_double_chunked``.

    Returns:
        The computed minimum cutoff if it exceeds the requested one (a notice
        is printed in that case), otherwise the requested cutoff.
    """
    list_AQD_quality_distr = [0 for _ in range(101)]

    with contextlib.ExitStack() as stack:
        list_AQD_input_files = [
            stack.enter_context(open(str_AQD_temp_path_input_file, 'r'))
            for str_AQD_temp_path_input_file in list_AQD_path_input_files
        ]
        it_AQD_input_fastq = fastq_iterator(
            fastq_zip_equal(*[
                FastqGeneralIterator(obj_AQD_temp_input_file)
                for obj_AQD_temp_input_file in list_AQD_input_files
            ]))

        # A single pool serves every batch; creating and tearing down a pool
        # per batch only adds process start-up cost.
        # NOTE(review): the pool uses the default worker count;
        # int_AQD_number_processes only shapes the batches - confirm intended.
        with concurrent.futures.ProcessPoolExecutor() as executor:
            for list_AQD_reads_for_processing in iter_double_chunked(
                    it_AQD_input_fastq, int_AQD_read_chunks,
                    int_AQD_number_processes):

                list_AQD_read_chunk_distr = executor.map(
                    quality_read_chunk, list_AQD_reads_for_processing,
                    itertools.repeat(int_AQD_phred_cutoff))

                # Fold each chunk's distribution into the running total.
                for list_AQD_temp_read_chunk_distr in list_AQD_read_chunk_distr:
                    list_AQD_quality_distr = [
                        int_AQD_temp_this_score + int_AQD_temp_total
                        for int_AQD_temp_this_score, int_AQD_temp_total in zip(
                            list_AQD_temp_read_chunk_distr,
                            list_AQD_quality_distr)
                    ]

    int_AQD_total_reads = sum(list_AQD_quality_distr)

    # Walk the histogram upwards until enough reads are covered; the slot
    # reached is the smallest cutoff that meets the stopping share.
    int_AQD_reads_pot_filter = 0
    for int_AQD_i in range(101):
        int_AQD_reads_pot_filter += list_AQD_quality_distr[int_AQD_i]
        int_AQD_min_low_q = int_AQD_i
        if division_zero_tolerant(
                int_AQD_reads_pot_filter, int_AQD_total_reads
        ) * 100 >= 100 - int_AQD_max_read_filter_percent:
            break

    if int_AQD_min_low_q > int_AQD_input_max_low_q_percent:
        print('Adjusted filter criterion to ' + str(int_AQD_min_low_q) + '%')
        return int_AQD_min_low_q
    return int_AQD_input_max_low_q_percent
Ejemplo n.º 7
0
def quality_filter(dict_QF_read, int_QF_phred_cutoff,
                   float_QF_max_low_q_fraction, int_QF_start_end_trim,
                   int_QF_end_end_trim, float_QF_end_trim_max_low_q_fraction):
    """Filter a read by base quality and trim a low-quality tail.

    A base is low quality when ``Chr2Phred`` of its quality character is
    below ``int_QF_phred_cutoff``. The read fails when its low-quality
    fraction (low-quality bases over the length of ``dict_QF_read['read']``)
    exceeds ``float_QF_max_low_q_fraction``.

    For a passing read, positions from ``int_QF_start_end_trim`` up to (not
    including) ``len(read) - int_QF_end_end_trim`` are scanned; at the first
    position whose remaining tail has a low-quality fraction above
    ``float_QF_end_trim_max_low_q_fraction`` the read and its qualities are
    cut there.

    Returns:
        ``[passed, end_cut, read_dict]``. For a failing read ``read_dict`` is
        the input dict itself; for a passing read it is a new dict with
        'read', 'quals' (both possibly truncated) and 'name'.
    """
    low_q_flags = [
        1 if Chr2Phred(qual_chr) < int_QF_phred_cutoff else 0
        for qual_chr in dict_QF_read['quals']
    ]
    read_length = len(dict_QF_read['read'])

    # Basic quality criterion: reject the read outright when it fails.
    overall_low_q_fraction = division_zero_tolerant(sum(low_q_flags),
                                                    read_length)
    if not overall_low_q_fraction <= float_QF_max_low_q_fraction:
        return [False, False, dict_QF_read]

    cut_point = read_length
    end_was_cut = False

    # Running count of low-quality bases from the scan position to the end.
    tail_low_q_bases = sum(low_q_flags[int_QF_start_end_trim:])
    scan_stop = read_length - int_QF_end_end_trim
    for position in range(int_QF_start_end_trim, scan_stop):
        tail_length = read_length - position
        if tail_low_q_bases / tail_length > float_QF_end_trim_max_low_q_fraction:
            cut_point = position
            end_was_cut = True
            break
        # Moving one base right drops this base from the tail.
        tail_low_q_bases -= low_q_flags[position]

    trimmed_read = {
        'read': dict_QF_read['read'][:cut_point],
        'quals': dict_QF_read['quals'][:cut_point],
        'name': dict_QF_read['name'],
    }

    return [True, end_was_cut, trimmed_read]
Ejemplo n.º 8
0
def get_mrd_scores(list_GMS_samples,float_GMS_LOD_stdev_test,float_GMS_LOD_stdev_cutoff_BE,int_GMS_NOP_target_range,int_GMS_R1_Nsupp_cutoff):
	"""Build one output row of MRD test results per sample.

	Each sample is a dict with the keys 'number', 'sample', 'primer',
	'target' (with 'chr', 'position', 'base_change') and 'target_region'
	(with 'chr', 'bounds'); on-target samples additionally need
	'base_filename', 'fastq_file' and 'bam_file_RF'.

	If the target position lies outside the region bounds or on a different
	chromosome, the row is the sample identification followed by
	'Target not on primer region'.

	Otherwise ``CalcMRDScore.calc_mrd_score`` is run on the '_R1R2.tsv' and
	'_RF.tsv' files and on the '_R1.tsv' and '_R2.tsv' files; of the latter
	two, the one whose first result element is larger is kept (R1 on ties).
	The row then holds, in order: the two full formatted results, the first
	three fields of the kept R1/R2 result, the output of ``final_decision``,
	and additional read-number and ``get_RF_distr`` information.

	Returns the list of rows, in sample order.
	"""
	single_read_suffixes = ['_R1.tsv','_R2.tsv']
	combined_suffixes = ['_R1R2.tsv','_RF.tsv']

	rows = []

	for sample in list_GMS_samples:
		# general sample info
		ident = [
			sample['number'],
			sample['sample'],
			sample['primer'],
			f'{sample["target"]["chr"]}:{sample["target"]["position"]}:{">".join(sample["target"]["base_change"])}',
		]

		target = sample['target']
		region = sample['target_region']
		bounds = region['bounds']

		# check if primer on target region
		on_target = bounds[0] <= target['position'] <= bounds[1] and target['chr'] == region['chr']
		if not on_target:
			rows.append(ident + ['Target not on primer region'])
			continue

		# Arguments shared by every calc_mrd_score call for this sample;
		# only the file name differs between the tests.
		shared_args = (
			target['chr'],
			bounds[0],
			bounds[1] - bounds[0] + 1,
			target['position'],
			target['base_change'],
			float_GMS_LOD_stdev_test,
			float_GMS_LOD_stdev_cutoff_BE,
			int_GMS_NOP_target_range,
		)

		combined_results = [
			CalcMRDScore.calc_mrd_score(sample['base_filename'] + suffix, *shared_args)
			for suffix in combined_suffixes
		]
		single_results = [
			CalcMRDScore.calc_mrd_score(sample['base_filename'] + suffix, *shared_args)
			for suffix in single_read_suffixes
		]

		# keep whichever of R1 / R2 has more Nsupp (R1 on ties)
		if single_results[0][0] >= single_results[1][0]:
			kept_single, large_indel_test = single_results[0], 'R1'
		else:
			kept_single, large_indel_test = single_results[1], 'R2'

		test_results = combined_results + [kept_single]
		formatted = [format_mrd_output(result) for result in test_results]

		row = []
		# full results of the two combined tests
		for formatted_result in formatted[:2]:
			row.extend(ident + formatted_result + ['#'])
		# first three fields of the kept R1/R2 result
		row.extend(ident + formatted[2][:3] + ['#',None])

		# final test columns; indel length is the longest allele minus one
		indel_length = max(len(allele) for allele in target['base_change']) - 1
		decision = final_decision(test_results, indel_length, large_indel_test, int_GMS_R1_Nsupp_cutoff)
		row.extend(decision + ['#',None])

		# additional info: read number and RF distributions
		fastq_read_number = fastq_get_read_number(sample['fastq_file'])
		rf_distr = get_RF_distr(sample['bam_file_RF'],3,0,32,2)
		rf_distr_joined = " ".join([":".join(str(item) for item in entry) for entry in rf_distr[3]])

		row.extend([
			f'{region["chr"]}:{bounds[0]}-{bounds[1]}',
			fastq_read_number,
			division_zero_tolerant(test_results[0][1], fastq_read_number),
			rf_distr[0],
			rf_distr[1],
			division_zero_tolerant(rf_distr[0], rf_distr[1]),
			rf_distr[2],
			division_zero_tolerant(rf_distr[1], rf_distr[2]),
			rf_distr_joined,
		])

		rows.append(row)

	return rows