def intersect_fusions_by_breakpoints(cff_file=None, slop=100):
    """Pair up fusions whose two breakpoints both lie close to each other.

    The cff file is loaded into a DataFrame, every fusion is turned into a
    BEDPE-like record whose two ends are the 1-bp intervals
    ``[pos1, pos1 + 1)`` and ``[pos2, pos2 + 1)``, and the record set is
    intersected with itself via ``pair_to_pair``.

    Args:
        cff_file: Path to the tab-separated cff file. When omitted, the
            module-level ``cff`` path is used (the original behaviour).
        slop: Value forwarded to ``pair_to_pair`` as ``slop``. Defaults to
            100, the value that was previously hard-coded.

    Returns:
        A DataFrame with two columns, ``fusion_id`` and ``fusion_id_lst``:
        one row per intersecting pair of records. Because ``rdn=False`` is
        used, a fusion is also paired with itself.

    Raises:
        IndexError: If the cff file is empty (same exception type the
            previous ``lines[0]`` lookup raised).
    """
    if cff_file is None:
        # Backward-compatible default: fall back to the module-level path.
        cff_file = cff

    # Only the first record is needed to recover the column names, so read a
    # single line and close the handle right away.
    with open(cff_file, "r") as cff_handle:
        first_line = cff_handle.readline()
    if not first_line:
        raise IndexError("cff file is empty: %s" % cff_file)

    fusion = pygeneann.CffFusion(first_line)
    header = (fusion.zone1_attrs + fusion.zone2_attrs +
              fusion.zone3_attrs + fusion.zone4_attrs)
    df_cff = pd.read_csv(cff_file,
                         sep='\t',
                         keep_default_na=False,
                         index_col=False,
                         names=header)

    # Create BedTools object with appropriate column names.
    # sys.stderr.write behaves the same on Python 2 and Python 3.
    sys.stderr.write("create BedTools object with appropriate column names\n")
    # Each position is selected twice (start and end of the interval); the
    # explicit copy keeps the in-place edit below off a slice of df_cff.
    df_bed = df_cff[[
        'chr1', 'pos1', 'pos1', 'chr2', 'pos2', 'pos2', 'fusion_id'
    ]].copy()
    df_bed.columns = [
        'chr1', 'pos1', 'pos1_2', 'chr2', 'pos2', 'pos2_2', 'fusion_id'
    ]
    # End coordinate = start + 1, giving a 1-bp interval per breakpoint.
    df_bed.loc[:, ['pos1_2', 'pos2_2']] += 1
    bed_pairs = bedtools.BedTool.from_dataframe(df_bed)

    # Intersect the record set with itself. rdn=False keeps
    # self-intersections, so every fusion shows up at least once.
    sys.stderr.write(
        "Intersect fusions: NOTE: rdn=False, keeps self-intersections\n")
    intersected = bed_pairs.pair_to_pair(bed_pairs, slop=slop, rdn=False)

    # The first 14 columns are the 7 fields of each of the two paired records.
    df = intersected.to_dataframe(header=None).iloc[:, 0:14]
    df.columns = [
        'chr1', 'pos1', 'pos1_2', 'chr2', 'pos2', 'pos2_2', 'fusion_id',
        'chr1_1', 'pos1_1', 'pos1_2_1', 'chr2_1', 'pos2_1', 'pos2_2_1',
        'fusion_id_lst'
    ]
    # Only the pair of fusion IDs is of interest to the caller.
    return df[['fusion_id', 'fusion_id_lst']]
def intersect_fusions_by_genes(cff_file):
    """Group the fusion IDs of a cff file by their gene pair.

    Lines starting with ``#`` are skipped, as are records whose ``t_gene1``
    or ``t_gene2`` is the string ``"NA"``.

    Args:
        cff_file: Path to the cff file to read.

    Returns:
        A dict mapping a gene-pair key to the list of ``fusion_id`` values
        that share it. The key is the two ``"<gene>|<chr>"`` strings sorted
        and joined with a comma, so it does not depend on which gene is
        listed first in the record.
    """
    fusion_dict = {}
    # The context manager closes the file even if parsing a record fails.
    with open(cff_file, "r") as cff_handle:
        for line in cff_handle:
            if line.startswith("#"):
                continue
            fusion = pygeneann.CffFusion(line)
            if fusion.t_gene1 == "NA" or fusion.t_gene2 == "NA":
                continue
            # Sorting makes the key independent of gene order.
            key = ",".join(
                sorted([
                    fusion.t_gene1 + "|" + fusion.chr1,
                    fusion.t_gene2 + "|" + fusion.chr2
                ]))
            fusion_dict.setdefault(key, []).append(fusion.fusion_id)
    return fusion_dict
# Example #3
# 0
def cluster_fusions_by_genes(cff_file):
    """Cluster the fusions of a cff file by gene pair.

    Comment lines (starting with ``#``) and records whose ``t_gene1`` or
    ``t_gene2`` equals ``"NA"`` are ignored.

    Args:
        cff_file: Path to the cff file to read.

    Returns:
        A dict whose keys are ``"<gene>|<chr>,<gene>|<chr>"`` strings (the
        two parts sorted, so gene order in the record does not matter) and
        whose values are lists of the ``fusion_id`` values in that cluster.
    """
    fusion_dict = {}
    # Use a context manager so the file handle is released deterministically.
    with open(cff_file, "r") as cff_handle:
        for line in cff_handle:
            if line.startswith("#"):
                continue
            fusion = pygeneann.CffFusion(line)
            if fusion.t_gene1 == "NA" or fusion.t_gene2 == "NA":
                continue
            gene_chr_pair = sorted([
                fusion.t_gene1 + "|" + fusion.chr1,
                fusion.t_gene2 + "|" + fusion.chr2
            ])
            key = ",".join(gene_chr_pair)
            fusion_dict.setdefault(key, []).append(fusion.fusion_id)
    return fusion_dict
# Positional argument: the reference genome file, opened below with pysam.
parser.add_argument('ref_fa', action='store', help='Reference genome file')

# Parse the command line. `parser` is not created in this part of the file;
# the `ensbed` and `cff_file` attributes read below are presumably registered
# on it earlier -- confirm against the top of the script.
args = parser.parse_args()

# Gene annotation object built from the `ensbed` argument.
gene_ann = pygeneann.GeneAnnotation(args.ensbed)

# Handle on the reference FASTA given by `ref_fa`.
ref = pysam.FastaFile(args.ref_fa)


def remove_underscores(gene):
    """Return *gene* with every underscore replaced by a dot."""
    pieces = gene.split("_")
    return ".".join(pieces)


seq_dict = {}
for line in open(args.cff_file, "r"):
    fusion = pygeneann.CffFusion(line)

    # in a downstream script, "_" is used as a field separator. Need to remove "_" from gene names
    gene1 = remove_underscores(fusion.reann_gene1)
    gene2 = remove_underscores(fusion.reann_gene2)
    lib = fusion.library
    fusion_id = fusion.fusion_id

    #print fusion.tostring()

    #fusion.check_codon(gene_ann, ref_fa)
    head_seqs = pygeneann.build_transcript_and_fusion_seq(
        gene_ann, fusion, ref, fusion.pos1, "head")
    #print(set(head_seqs))
    #continue
    tail_seqs = pygeneann.build_transcript_and_fusion_seq(
# Example #5
# 0
                list(set(gene2_list))), max_split_cnt, max_span_cnt,
            ",".join(list(set(sample_type_list))), ",".join(
                list(set(disease_list))), ",".join(list(set(tool_list))),
            ",".join(list(set(category_list))), gene1_on_bndry,
            gene1_close_to_bndry, gene2_on_bndry, gene2_close_to_bndry,
            dna_supp_cluster_num, ",".join(list(set(sample_list))), ",".join(
                list(set(chr1_list))), ",".join(list(set(breakpoint_1_list))),
            ",".join(list(set(chr2_list))), ",".join(
                list(set(breakpoint_2_list))), captured_reads_tumor_mean,
            captured_reads_normal_mean, ",".join(list(set(fusion_IDs)))
        ]))


# Load cff file
# Read inside a context manager so the handle is closed promptly; `lines`
# is still bound to the full list of raw lines, exactly as before.
with open(cff, "r") as cff_handle:
    lines = cff_handle.readlines()
# The first record is parsed only to recover the column names, which are the
# concatenation of its four zone attribute lists.
fusion = pygeneann.CffFusion(lines[0])
header = fusion.zone1_attrs + fusion.zone2_attrs + fusion.zone3_attrs + fusion.zone4_attrs
# keep_default_na=False keeps strings such as "NA" as text instead of
# converting them to NaN.
df_cff = pd.read_csv(cff,
                     sep='\t',
                     keep_default_na=False,
                     index_col=False,
                     names=header)

# load FIDs file
# One raw line per entry; the context manager closes the handle once read.
with open(FIDs, "r") as fid_handle:
    FID_clusters = fid_handle.readlines()
# Walk the FIDs file one line at a time.
for cluster in FID_clusters:
    # Skip any line that begins with 'FIDs' (presumably the header row).
    if cluster.startswith('FIDs'): continue
    # Strip trailing whitespace and split the line on commas into a list of IDs.
    FID_lst = cluster.rstrip().split(",")
    #print(FID_lst)
    #print(df_cff[df_cff['fusion_id'].isin(FID_lst)][['t_gene1', 't_gene2', 'pos1', 'pos2']])