def generate_feature_matrix(input_dir, input_files, input_files_id, format_,
                            caller, svtype_col_name, as_breakpoint,
                            definitions, output):
    """
    Generate feature matrix from VCF or BEDPE files.

    Exactly one of ``input_dir`` and ``input_files`` must be given. When
    both or neither are given the function returns ``None`` without
    reading or writing anything.

    Parameters
    ----------
    input_dir:
        Directory handed to ``viola.read_bedpe_multi`` /
        ``viola.read_vcf_multi``, or None.
    input_files: str or None
        Comma-separated list of file paths, or None.
    input_files_id: str or None
        Comma-separated list of sample names matching ``input_files``.
        When None, ``range(len(files))`` is used as the names.
    format_: str
        ``'bedpe'`` selects the BEDPE readers; any other value selects the
        VCF readers.
    caller:
        Forwarded as ``variant_caller`` to the VCF readers.
    svtype_col_name:
        Forwarded as ``svtype_col_name`` to the BEDPE readers.
    as_breakpoint: bool
        For VCF input, convert breakends to breakpoints
        (``breakend2breakpoint``) before classification.
    definitions:
        Forwarded to ``classify_manual_svtype(definitions=...)``.
    output:
        Destination passed to ``DataFrame.to_csv``; written tab-separated.
    """
    # Guard clause: both sources given, or none given -> nothing to do.
    if (input_dir is None) == (input_files is None):
        return

    is_bedpe = format_ == 'bedpe'

    if input_files is None:
        # Directory mode: viola builds the multi-sample object itself.
        if is_bedpe:
            data = viola.read_bedpe_multi(input_dir,
                                          svtype_col_name=svtype_col_name)
        else:
            data = viola.read_vcf_multi(input_dir, variant_caller=caller,
                                        as_breakpoint=as_breakpoint)
    else:
        # File-list mode: read every file, then bundle them together.
        ls_input = input_files.split(',')
        if is_bedpe:
            ls_sv = [
                viola.read_bedpe(path, svtype_col_name=svtype_col_name)
                for path in ls_input
            ]
        else:
            ls_sv = [
                viola.read_vcf(path, variant_caller=caller)
                for path in ls_input
            ]
            if as_breakpoint:
                ls_sv = [vcf.breakend2breakpoint() for vcf in ls_sv]

        if input_files_id is None:
            ls_names = range(len(ls_sv))
        else:
            ls_names = input_files_id.split(',')

        # VCF objects go into the VCF container; MultiBedpe is reserved for
        # objects produced by the BEDPE readers.
        if is_bedpe:
            data = viola.MultiBedpe(ls_sv, ls_names)
        else:
            data = viola.MultiVcf(ls_sv, ls_names)

    result = data.classify_manual_svtype(definitions=definitions)
    result.to_csv(output, sep='\t')
def test_classify_manual_svtype_from_file():
    """
    Classify a two-sample MultiBedpe using a definition file on disk.

    Checks both the per-SV ``manual_sv_type`` table stored on the object
    and the per-patient count matrix returned by
    ``classify_manual_svtype``.
    """
    bedpe1 = viola.read_bedpe(StringIO(data))
    bedpe2 = viola.read_bedpe(StringIO(data))
    # Class names expected in the result columns (before the trailing
    # 'others' column appended below).
    ls_names = [
        'small_del', 'large_del', 'small_dup',
        'large_dup', 'small_inv', 'tra',
    ]
    multibedpe = viola.MultiBedpe([bedpe1, bedpe2], ['bedpe1', 'bedpe2'])

    # The classification rules come from a file rather than from
    # in-memory condition objects.
    path = os.path.join(HERE, '../bedpe/data/example_definition.txt')
    result = multibedpe.classify_manual_svtype(definitions=path)

    # Per-SV table registered on the object by the classification call.
    manual_sv_type = multibedpe.manual_sv_type
    manual_sv_type.set_index('id', inplace=True)
    manual_sv_type_expected = pd.read_csv(
        StringIO(data_expected), sep='\t',
        names=('id', 'value_idx', 'manual_sv_type'))
    manual_sv_type_expected.set_index('id', inplace=True)
    # check_like=True: ignore row/column ordering differences.
    pd.testing.assert_frame_equal(manual_sv_type, manual_sv_type_expected,
                                  check_like=True)

    # Per-patient count matrix: both samples share the same input, so the
    # two rows are identical.
    result_expected = pd.DataFrame([[2, 3, 1, 0, 2, 2, 2],
                                    [2, 3, 1, 0, 2, 2, 2]])
    result_expected.columns = ls_names + ['others']
    result_expected.columns.name = 'manual_sv_type'
    result_expected.index = ['bedpe1', 'bedpe2']
    result_expected.index.name = 'patients'
    pd.testing.assert_frame_equal(result, result_expected)
def test_classify_manual_svtype_exclude_empty():
    """
    Classify a MultiBedpe that mixes populated and empty samples with
    ``exclude_empty_cases=True`` and verify that only the populated
    samples appear in the returned count matrix.
    """
    class_names = [
        'small_del', 'large_del', 'small_dup',
        'large_dup', 'small_inv', 'tra',
    ]
    conditions = [small_del, large_del, small_dup,
                  large_dup, small_inv, tra]

    # Sample name -> raw BEDPE text, in the order handed to MultiBedpe.
    sample_sources = [
        ('bedpe1', data),
        ('empty1', data_empty),
        ('bedpe2', data),
        ('empty2', data_empty),
    ]
    sample_names = [name for name, _ in sample_sources]
    bedpe_objects = [
        viola.read_bedpe(StringIO(text)) for _, text in sample_sources
    ]
    cohort = viola.MultiBedpe(bedpe_objects, sample_names)

    counts = cohort.classify_manual_svtype(
        ls_conditions=conditions,
        ls_names=class_names,
        exclude_empty_cases=True,
    )

    # Per-SV table stored on the cohort, compared irrespective of ordering.
    observed_types = cohort.manual_sv_type
    observed_types.set_index('id', inplace=True)
    expected_types = pd.read_csv(
        StringIO(data_expected),
        sep='\t',
        names=('id', 'value_idx', 'manual_sv_type'),
    )
    expected_types.set_index('id', inplace=True)
    pd.testing.assert_frame_equal(observed_types, expected_types,
                                  check_like=True)

    # Only the two populated samples are expected as rows.
    row_counts = [2, 3, 1, 0, 2, 2, 2]
    expected_counts = pd.DataFrame([list(row_counts), list(row_counts)])
    expected_counts.columns = class_names + ['others']
    expected_counts.columns.name = 'manual_sv_type'
    expected_counts.index = ['bedpe1', 'bedpe2']
    expected_counts.index.name = 'patients'
    pd.testing.assert_frame_equal(counts, expected_counts)
def merge(self, ls_bedpe=None, ls_caller_names=None, threshold=100, linkage="complete", str_missing=True):
    """
    merge(ls_bedpe:list, ls_caller_names:list, threshold:float, linkage = "complete", str_missing=True)
    Return a merged bedpe object from multiple callers' bedpe objects in ls_bedpe

    Parameters
    ----------
    ls_bedpe:list
        A list of bedpe objects to be merged, in the same order as
        ls_caller_names. If self is not in the list it is prepended.
        Defaults to an empty list.
    ls_caller_names:list
        A list of names of bedpe objects to be merged, which should have
        self's name as the first element
    threshold:float
        Two SVs whose difference of positions is under this threshold are
        considered to be identical.
    linkage:{'complete', 'average', 'single'}, default='complete'
        The linkage of hierarchical clustering.
        To keep the mutual distance of all SVs in each cluster below the
        threshold, "complete" is recommended.
    str_missing:boolean, default=True
        If True, all the missing strands are considered to be identical to
        the others.

    Returns
    ----------
    A merged bedpe object carrying three extra info tables:
    "mergedid", "originalid" and "caller".
    """
    # None instead of a mutable default; copy so the caller's list is
    # never aliased.
    ls_bedpe = [] if ls_bedpe is None else list(ls_bedpe)
    if self not in ls_bedpe:
        ls_bedpe = [self] + ls_bedpe

    multibedpe = viola.MultiBedpe(ls_bedpe, ls_caller_names)
    distance_matrix = self._generate_distance_matrix_by_distance(
        multibedpe, penalty_length=3e9, str_missing=str_missing)

    # scikit-learn renamed `affinity` to `metric` (deprecated in 1.2,
    # removed in 1.4). Try the new keyword first and fall back for older
    # releases that do not know it.
    clustering_kwargs = dict(n_clusters=None, linkage=linkage,
                             distance_threshold=threshold)
    try:
        hcl_clustering_model = AgglomerativeClustering(
            metric="precomputed", **clustering_kwargs)
    except TypeError:
        hcl_clustering_model = AgglomerativeClustering(
            affinity="precomputed", **clustering_kwargs)
    labels = hcl_clustering_model.fit_predict(X=distance_matrix)

    positions_table = multibedpe.get_table("positions")

    # Renumber cluster labels by order of first appearance so merged ids
    # start at 0 and increase along the table.
    mergedid_dict = {}
    ls_mergedid = [
        mergedid_dict.setdefault(label, len(mergedid_dict))
        for label in labels
    ]

    N = len(positions_table)
    value_idx = pd.Series(np.zeros(N, dtype=int))
    df_mergedid = pd.DataFrame({
        "id": positions_table["id"],
        "value_idx": value_idx,
        "mergedid": pd.Series(ls_mergedid)
    })

    df_id = multibedpe.get_table("global_id")
    df_originalid = pd.DataFrame({
        "id": positions_table["id"],
        "value_idx": value_idx,
        "originalid": df_id["id"]
    })

    # Resolve each SV's patient_id to the name given in ls_caller_names.
    df_patients = multibedpe.get_table("patients")
    df_id_patients = df_id.merge(df_patients, left_on="patient_id", right_on="id")
    df_caller = pd.DataFrame({
        "id": positions_table["id"],
        "value_idx": value_idx,
        "caller": df_id_patients["patients"]
    })

    merged_bedpe = viola.Bedpe(df_svpos=multibedpe._df_svpos,
                               odict_df_info=multibedpe._odict_df_info)
    merged_bedpe.add_info_table(table_name="mergedid", df=df_mergedid)
    merged_bedpe.add_info_table(table_name="originalid", df=df_originalid)
    merged_bedpe.add_info_table(table_name="caller", df=df_caller)
    return merged_bedpe
def test_read_bedpe_with_empty():
    """
    Smoke test: a MultiBedpe can be built from two populated Bedpe
    objects plus one read from empty input. Passing means no exception
    was raised; nothing is asserted about the resulting object.
    """
    populated_first = viola.read_bedpe(StringIO(data))
    populated_second = viola.read_bedpe(StringIO(data))
    without_records = viola.read_bedpe(StringIO(data_empty))

    members = [populated_first, populated_second, without_records]
    member_names = ['bedpe1', 'bedpe2', 'empty']
    viola.MultiBedpe(members, member_names)