def intersect_fusions_by_breakpoints(cff_file=None):
    """Pair up fusion calls whose two breakpoints both lie near each other.

    The CFF file is loaded into a DataFrame, turned into a seven-column
    paired-interval table (chromosome, position, position + 1 for each
    breakpoint, followed by the fusion id) and intersected with itself via
    ``pair_to_pair`` using ``slop=100``.

    Args:
        cff_file: Path of the tab-separated CFF file.  When omitted, the
            module-level ``cff`` path is used, which is what this function
            always read before the parameter existed.

    Returns:
        A DataFrame with two columns, ``fusion_id`` and ``fusion_id_lst``,
        one row per reported pair.  Because ``rdn=False`` is passed, every
        fusion is also paired with itself.

    Raises:
        IndexError: If the CFF file contains no lines.
    """
    if cff_file is None:
        cff_file = cff

    # Only the first record is needed to learn the column names, so read a
    # single line and let the context manager close the handle.
    with open(cff_file, "r") as handle:
        first_line = handle.readline()
    if not first_line:
        raise IndexError("CFF file is empty: %s" % cff_file)

    fusion = pygeneann.CffFusion(first_line)
    header = (fusion.zone1_attrs + fusion.zone2_attrs +
              fusion.zone3_attrs + fusion.zone4_attrs)
    df_cff = pd.read_csv(cff_file, sep='\t', keep_default_na=False,
                         index_col=False, names=header)

    # Create BedTools object with appropriate column names.
    sys.stderr.write("create BedTools object with appropriate column names\n")
    # Each position is selected twice so it can serve as both interval start
    # and end; the explicit copy keeps the in-place edit below away from
    # df_cff and avoids pandas' chained-assignment warning.
    df_bed = df_cff[['chr1', 'pos1', 'pos1',
                     'chr2', 'pos2', 'pos2', 'fusion_id']].copy()
    df_bed.columns = ['chr1', 'pos1', 'pos1_2',
                      'chr2', 'pos2', 'pos2_2', 'fusion_id']
    # End coordinate = position + 1, giving each breakpoint a 1-bp interval.
    df_bed.loc[:, ['pos1_2', 'pos2_2']] += 1
    df_bed = bedtools.BedTool.from_dataframe(df_bed)

    # rdn=True would drop pairs whose names are identical (self-hits);
    # rdn=False keeps them so that every fusion appears in the output.
    sys.stderr.write(
        "Intersect fusions: NOTE: rdn=False, keeps self-intersections\n")
    df_intersect = df_bed.pair_to_pair(df_bed, slop=100, rdn=False)

    # The first 14 columns are the seven fields of each side of the pair.
    df = df_intersect.to_dataframe(header=None).iloc[:, 0:14]
    df.columns = [
        'chr1', 'pos1', 'pos1_2', 'chr2', 'pos2', 'pos2_2', 'fusion_id',
        'chr1_1', 'pos1_1', 'pos1_2_1', 'chr2_1', 'pos2_1', 'pos2_2_1',
        'fusion_id_lst',
    ]
    return df[['fusion_id', 'fusion_id_lst']]
def intersect_fusions_by_genes(cff_file):
    """Group the fusion ids of a CFF file by their gene/chromosome pair.

    Lines starting with ``#`` are skipped, as are fusions whose ``t_gene1``
    or ``t_gene2`` is ``"NA"``.

    Args:
        cff_file: Path of the CFF file to read.

    Returns:
        A dict mapping a key of the form ``"geneA|chrA,geneB|chrB"`` to the
        list of ``fusion_id`` values sharing that key, in file order.  The
        two ``gene|chr`` parts are sorted, so the key does not depend on
        which partner is listed first.
    """
    fusion_dict = {}
    # The context manager closes the file even if a record fails to parse.
    with open(cff_file, "r") as handle:
        for line in handle:
            if line.startswith("#"):
                continue
            fusion = pygeneann.CffFusion(line)
            if fusion.t_gene1 == "NA" or fusion.t_gene2 == "NA":
                continue
            key = ",".join(sorted([
                fusion.t_gene1 + "|" + fusion.chr1,
                fusion.t_gene2 + "|" + fusion.chr2,
            ]))
            fusion_dict.setdefault(key, []).append(fusion.fusion_id)
    return fusion_dict
def cluster_fusions_by_genes(cff_file):
    """Cluster the fusions of a CFF file by gene pair.

    Every non-comment line is parsed with ``pygeneann.CffFusion``.  Records
    where either ``t_gene1`` or ``t_gene2`` equals ``"NA"`` are left out of
    the clustering.

    Args:
        cff_file: Path of the CFF file to read.

    Returns:
        A dict whose keys are the two ``gene|chromosome`` strings of a
        fusion, sorted and joined with a comma, and whose values are lists
        of the ``fusion_id`` values that produced that key.
    """
    fusion_dict = {}
    # Open through a context manager so the handle is released promptly
    # instead of waiting for garbage collection.
    with open(cff_file, "r") as handle:
        for line in handle:
            if line.startswith("#"):
                continue
            fusion = pygeneann.CffFusion(line)
            if fusion.t_gene1 == "NA" or fusion.t_gene2 == "NA":
                continue
            # Sorting makes A--B and B--A fall into the same cluster.
            gene_pair = sorted([
                fusion.t_gene1 + "|" + fusion.chr1,
                fusion.t_gene2 + "|" + fusion.chr2,
            ])
            fusion_dict.setdefault(",".join(gene_pair), []).append(
                fusion.fusion_id)
    return fusion_dict
parser.add_argument('ref_fa', action='store', help='Reference genome file') args = parser.parse_args() gene_ann = pygeneann.GeneAnnotation(args.ensbed) ref = pysam.FastaFile(args.ref_fa) def remove_underscores(gene): return gene.replace("_", ".") seq_dict = {} for line in open(args.cff_file, "r"): fusion = pygeneann.CffFusion(line) # in a downstream script, "_" is used as a field separator. Need to remove "_" from gene names gene1 = remove_underscores(fusion.reann_gene1) gene2 = remove_underscores(fusion.reann_gene2) lib = fusion.library fusion_id = fusion.fusion_id #print fusion.tostring() #fusion.check_codon(gene_ann, ref_fa) head_seqs = pygeneann.build_transcript_and_fusion_seq( gene_ann, fusion, ref, fusion.pos1, "head") #print(set(head_seqs)) #continue tail_seqs = pygeneann.build_transcript_and_fusion_seq(
list(set(gene2_list))), max_split_cnt, max_span_cnt, ",".join(list(set(sample_type_list))), ",".join( list(set(disease_list))), ",".join(list(set(tool_list))), ",".join(list(set(category_list))), gene1_on_bndry, gene1_close_to_bndry, gene2_on_bndry, gene2_close_to_bndry, dna_supp_cluster_num, ",".join(list(set(sample_list))), ",".join( list(set(chr1_list))), ",".join(list(set(breakpoint_1_list))), ",".join(list(set(chr2_list))), ",".join( list(set(breakpoint_2_list))), captured_reads_tumor_mean, captured_reads_normal_mean, ",".join(list(set(fusion_IDs))) ])) # Load cff file lines = [line for line in open(cff, "r")] fusion = pygeneann.CffFusion(lines[0]) header = fusion.zone1_attrs + fusion.zone2_attrs + fusion.zone3_attrs + fusion.zone4_attrs df_cff = pd.read_csv(cff, sep='\t', keep_default_na=False, index_col=False, names=header) # load FIDs file FID_clusters = [line for line in open(FIDs, "r")] #FID_clusters = FID_clusters[1] for cluster in FID_clusters: if cluster.startswith('FIDs'): continue FID_lst = cluster.rstrip().split(",") #print(FID_lst) #print(df_cff[df_cff['fusion_id'].isin(FID_lst)][['t_gene1', 't_gene2', 'pos1', 'pos2']])