def load_manual_threshold_somatics(jsm_file_name, threshold):
    '''
    Load a list of rows containing somatics based on pre-specified probability
    threshold.

    Every row of every chromosome in the file is scored as
    row['p_aa_ab'] + row['p_aa_bb']; rows whose score is >= threshold are
    passed through format_rows() and kept.

    Arguments:
        jsm_file_name: path handed to JointSnvMixReader.
        threshold: minimum score (inclusive) a row needs to be kept.

    Returns:
        List of formatted rows in ascending score order. Rows with equal
        scores stay in the order they were read (chromosomes are visited in
        sorted name order).
    '''
    reader = JointSnvMixReader(jsm_file_name)

    scored_rows = []

    # try/finally so the reader is released even if reading or formatting a
    # row raises part-way through the file.
    try:
        for chr_name in sorted(reader.get_chr_list()):
            # Progress output; the parenthesised single-argument form prints
            # the same text under Python 2 and Python 3.
            print(chr_name)

            for row in reader.get_rows(chr_name):
                score = row['p_aa_ab'] + row['p_aa_bb']

                if score >= threshold:
                    scored_rows.append((score, format_rows(row, chr_name)))
    finally:
        reader.close()

    # One stable sort on the score alone replaces a bisect + list.insert per
    # row (quadratic overall). Stability reproduces the old tie order, because
    # bisect.bisect() always placed a new row after existing equal scores.
    scored_rows.sort(key=lambda pair: pair[0])

    return [row for _, row in scored_rows]
def load_auto_threshold_somatics(jsm_file_name):
    '''
    Load a list of rows containing somatics based on automatically determined
    probability threshold. Threshold is determined based on inflection point
    method.

    Every row is scored as row['p_aa_ab'] + row['p_aa_bb']. The (up to) n
    highest scores above a small floor are kept, sorted ascending, together
    with their formatted rows. The largest gap between two neighbouring scores
    is then located and everything from the lower side of that gap upwards is
    returned.

    Arguments:
        jsm_file_name: path handed to JointSnvMixReader.

    Returns:
        List of formatted rows in ascending score order, starting at the row
        just below the largest score gap. All kept rows are returned when no
        positive gap exists (fewer than two rows, or all scores equal).
    '''
    n = int(1e5)        # maximum number of candidate rows held in memory
    threshold = 1e-6    # scores at or below this floor are never candidates

    reader = JointSnvMixReader(jsm_file_name)

    scores = []   # ascending; scores[i] belongs to rows[i]
    rows = []

    # try/finally so the reader is released even if reading or formatting a
    # row raises part-way through the file.
    try:
        for chr_name in sorted(reader.get_chr_list()):
            # Progress output; the parenthesised single-argument form prints
            # the same text under Python 2 and Python 3.
            print(chr_name)

            for row in reader.get_rows(chr_name):
                score = row['p_aa_ab'] + row['p_aa_bb']

                if score <= threshold:
                    continue

                # Only reject a score for being below the current minimum once
                # the buffer is full. Rejecting it earlier (as the previous
                # code did) silently dropped every score lower than the first
                # candidate seen, making the result depend on read order.
                if len(scores) >= n and score < scores[0]:
                    continue

                insert_position = bisect.bisect(scores, score)

                scores.insert(insert_position, score)
                rows.insert(insert_position, format_rows(row, chr_name))

                # Keep at most n candidates by discarding the lowest one.
                if len(scores) > n:
                    scores.pop(0)
                    rows.pop(0)
    finally:
        reader.close()

    # Find the first occurrence of the largest jump between adjacent scores.
    max_diff = 0
    index = 0

    for i in range(len(scores) - 1):
        diff = scores[i + 1] - scores[i]

        if diff > max_diff:
            max_diff = diff
            index = i

    # NOTE(review): index is the lower side of the gap, so the row just below
    # the jump is included in the result - confirm this is intended.
    return rows[index:]
def load_somatics( jsm_file_name ):
    '''
    Load the somatic scores that lie above an automatically determined cut-off.

    Every row of every chromosome not listed in the module-level
    excluded_chrom is scored as row['p_aa_ab'] + row['p_aa_bb']. The (up to) n
    highest scores above a small floor are kept in ascending order. The
    largest gap between two neighbouring scores is then located and every
    score from the lower side of that gap upwards is returned.

    Arguments:
        jsm_file_name: path handed to JointSnvMixReader.

    Returns:
        List of scores in ascending order, starting at the score just below
        the largest gap. All kept scores are returned when no positive gap
        exists (fewer than two scores, or all scores equal).
    '''
    n = int( 1e5 )      # maximum number of candidate scores held in memory
    threshold = 1e-6    # scores at or below this floor are never candidates

    reader = JointSnvMixReader( jsm_file_name )

    scores = []   # kept sorted ascending

    # try/finally so the reader is released even if reading a row raises
    # part-way through the file.
    try:
        for chr_name in sorted( reader.get_chr_list() ):
            if chr_name in excluded_chrom:
                continue

            # Progress output; the parenthesised single-argument form prints
            # the same text under Python 2 and Python 3.
            print( chr_name )

            for row in reader.get_rows( chr_name ):
                score = row['p_aa_ab'] + row['p_aa_bb']

                if score <= threshold:
                    continue

                # Only reject a score for being below the current minimum once
                # the buffer is full. Rejecting it earlier (as the previous
                # code did) silently dropped every score lower than the first
                # candidate seen, making the result depend on read order.
                if len( scores ) >= n and score < scores[0]:
                    continue

                scores.insert( bisect.bisect( scores, score ), score )

                # Keep at most n candidates by discarding the lowest one.
                if len( scores ) > n:
                    scores.pop( 0 )
    finally:
        reader.close()

    # Find the first occurrence of the largest jump between adjacent scores.
    max_diff = 0
    index = 0

    for i in range( len( scores ) - 1 ):
        diff = scores[i + 1] - scores[i]

        if diff > max_diff:
            max_diff = diff
            index = i

    # NOTE(review): index is the lower side of the gap, so the score just
    # below the jump is included in the result - confirm this is intended.
    return scores[index:]