def get_gene_overlap(chr, pos, ori, bp):
    """Classify protein-coding genes overlapping one breakend as fusion partners.

    Queries the GRCh37 Ensembl REST ``/overlap/region`` endpoint for ``gene``
    features at the single position ``chr:pos-pos`` and sorts every
    protein-coding hit into a ``'donor'`` or ``'acceptor'`` role based on the
    combination of ``ori`` and the gene's strand.

    Args:
        chr: Chromosome / sequence-region name (converted with ``str``).
        pos: Breakend position (converted with ``str``).
        ori: Breakend orientation flag; only its truthiness is used.
        bp: Breakpoint label appended to the gene id in the result keys.
            Must be a string because it is concatenated, not formatted.

    Returns:
        A dict that contains a ``'donor'`` and/or ``'acceptor'`` key only when
        at least one gene was assigned to that role.  Each role maps
        ``"<gene id>\\t<bp>"`` to the gene's ``start`` (``ori`` falsy) or
        ``end`` (``ori`` truthy) coordinate.  Empty when nothing matched.
    """
    SERVER = 'http://grch37.rest.ensembl.org'
    SPECIES = "human"
    ENDPOINT = ("/overlap/region/" + SPECIES + "/" + str(chr) + ":"
                + str(pos) + "-" + str(pos))
    HEADERS = {"Content-Type": "application/json"}
    PARAMS = {"feature": "gene"}
    genes_data = EnsemblRestClient.perform_rest_action(
        SERVER, ENDPOINT, HEADERS, PARAMS)

    fusions = dict()
    for gene in genes_data:
        if gene["biotype"] != "protein_coding":
            continue

        strand = gene['strand']
        # Genes whose strand is neither 1 nor -1 are ignored.
        if strand not in (1, -1):
            continue

        # Donor:    (ori falsy, strand 1)  or (ori truthy, strand -1)
        # Acceptor: (ori falsy, strand -1) or (ori truthy, strand 1)
        role = 'donor' if bool(ori) == (strand == -1) else 'acceptor'
        # The recorded coordinate depends on the orientation only.
        coordinate = gene['end'] if ori else gene['start']

        fusions.setdefault(role, dict())[gene['id'] + "\t" + bp] = coordinate

    return fusions
def mask_seq(chr, start, end, strand, seq):
    """Replace the bases at known variant positions in *seq* with ``'n'``.

    Queries the Ensembl REST ``/overlap/region`` endpoint (on the module-level
    ``server``) for ``variation`` features in ``chr:start-end:strand`` and
    masks the base at each variant's ``start`` coordinate.

    Args:
        chr: Chromosome / sequence-region name.
        start: Genomic start coordinate of the region covered by *seq*; used
            as the origin when converting variant coordinates to indices.
        end: Genomic end coordinate of the region (only used in the query).
        strand: Strand of the region.  When ``str(strand) == '-1'`` the index
            is mirrored, presumably because *seq* was fetched on the reverse
            strand -- confirm against the caller.
        seq: The sequence string to mask.

    Returns:
        A string of the same length as *seq* with masked positions set to
        ``'n'``.
    """
    ext = "/overlap/region/human/" + str(chr) + ":" + str(start) + "-" + str(
        end) + ":" + str(strand) + "?feature=variation"
    headers = {"Content-Type": "application/json"}
    data = EnsemblRestClient.perform_rest_action(server, ext, headers)

    # Edit a list in place instead of re-slicing the whole string per variant.
    bases = list(seq)
    seq_length = len(bases)
    on_reverse_strand = str(strand) == '-1'

    for snp in data:
        offset = snp['start'] - start
        # A feature can overlap the region while its start lies outside it.
        # Without this check a negative offset would mask the wrong base
        # (negative indexing) and an offset >= len(seq) would append an
        # extra 'n', changing the sequence length.
        if not 0 <= offset < seq_length:
            continue
        if on_reverse_strand:
            offset = seq_length - offset - 1
        bases[offset] = 'n'

    return ''.join(bases)
def get_seq(chr, start, end, strand):
    """Fetch the sequence of ``chr:start-end:strand`` from Ensembl.

    The chromosome length is requested first from ``/info/assembly`` so the
    requested window can be validated.  Uses the module-level ``server`` for
    both requests and the module-level ``mask`` switch to decide whether the
    fetched sequence is passed through ``mask_seq``.

    Args:
        chr: Chromosome / sequence-region name.
        start: Start coordinate of the window.
        end: End coordinate of the window.
        strand: Strand to request.

    Returns:
        The sequence string (masked when ``mask`` is truthy), or ``False``
        when ``start < 1`` or ``end`` exceeds the chromosome length; in those
        cases a message is written to stderr and no sequence is requested.
    """
    json_headers = {"Content-Type": "application/json"}

    assembly = EnsemblRestClient.perform_rest_action(
        server, "/info/assembly/homo_sapiens/" + str(chr), json_headers)
    chrlength = assembly['length']

    if start < 1:
        sys.stderr.write(
            "\tFailed to get seq, because POS is too close to START of the chr\n"
        )
        return False

    if end > chrlength:
        sys.stderr.write(
            "\tFailed to get seq, because ENDPOS is too close to END of the chr\n"
        )
        return False

    region = "/sequence/region/human/{}:{}-{}:{}".format(
        chr, start, end, strand)
    sequence = EnsemblRestClient.perform_rest_action(
        server, region, json_headers)['seq']

    # Masking is only applied to a successfully fetched sequence.
    if mask:
        return mask_seq(chr, start, end, strand, sequence)
    return sequence
bamfile = pysam.AlignmentFile(args.bam, "rb" ) fasta = open(args.output_dir+"/"+svid+".fasta", 'a+') for read in bamfile.fetch(chr, start, end): #### Breakpoints that only have supplementary reads and not a primary read spanning the breakpoint will be excluded #### No effect when testing on the truthset in recall, but see if it ever happens in real sets if read.query_name in exclude or read.seq == None or read.is_supplementary: continue fasta.write( ">"+read.query_name+"\n") fasta.write(read.seq+"\n") #exclude.append(read.query_name) fasta.close() bamfile.close() print("Start:", datetime.datetime.now()) EnsemblRestClient=EnsemblRestClient() vcf_reader = pyvcf.Reader(open(args.vcf, 'r')) for record in vcf_reader: if not isinstance(record.ALT[0], pyvcf.model._Breakend): continue fusions={'donor':{}, 'acceptor':{}} bnd1_fusions = get_gene_overlap(record.CHROM, record.POS, record.ALT[0].orientation, '1') ### Skip next request if the first BND already falls outside of a gene if not fusions: continue bnd2_fusions=get_gene_overlap(record.ALT[0].chr, record.ALT[0].pos, record.ALT[0].remoteOrientation, '2') fusions.update(bnd1_fusions) if 'donor' in bnd2_fusions: fusions['donor'].update(bnd2_fusions['donor'])
type=str, help='Output bed file', required=True) parser.add_argument( '-r', '--region', type=str, help='List of genes or regions to select from the bam-file', required=True) args = parser.parse_args() bed = args.bed selection = args.region selection = selection.split(",") EnsemblRestClient = EnsemblRestClient() ### Create a bed-file that contains all the regions that need to be selected from the full bam-file with open(bed, "w") as bed: for item in selection: ### If a region is given in a chr:pos1-pos2 format, this format is directly used if re.match(".+:\d+-\d+", item): chrom = item.split(":")[0] pos1 = item.split(":")[1].split("-")[0] pos2 = item.split(":")[1].split("-")[1] else: server = 'http://grch37.rest.ensembl.org' ### If a desired region is given by a ensembl identifier (e.g. ENSG00000141510) or the gene name, the positions are taken from ensembl
def EnsemblAnnotation(CHROM, POS):
    """Annotate the protein-coding genes overlapping a single position.

    Two kinds of GRCh37 Ensembl REST requests are made:

    1. ``/overlap/region`` with ``feature=transcript`` at ``CHROM:POS-POS`` to
       collect the protein-coding transcripts (with their ``ccdsid`` when
       present) and the ids of their parent genes.
    2. ``/lookup/id/<gene>`` with ``expand=1`` for every parent gene, to
       obtain its transcripts, exons and translation.

    For each protein-coding gene, the canonical transcript (if it was among
    the overlapping protein-coding transcripts) is walked exon by exon in the
    order Ensembl returns them.

    Args:
        CHROM: Chromosome / sequence-region name; also stored in the result.
        POS: Position to query.

    Returns:
        A list of dicts, one per annotated gene, with the keys ``Gene_id``,
        ``Gene_name``, ``Strand``, ``Gene_start``, ``Gene_end``,
        ``Chromosome``, ``Flags``, ``Transcript_id``, ``Biotype``,
        ``Transcript_start``, ``Transcript_end``, ``Total_exons``,
        ``Original_CDS_length``, ``CDS_start``, ``CDS_end`` and ``Exons``.
        ``Exons`` is a list that interleaves exon entries (``Type`` ==
        ``"exon"``) with intron entries (``Type`` == ``"intron"``).  For genes
        on a strand other than 1, start/end pairs are swapped so that
        ``Start`` is the larger coordinate.
    """
    server_url = 'http://grch37.rest.ensembl.org'
    json_headers = {"Content-Type": "application/json"}

    overlapping = EnsemblRestClient.perform_rest_action(
        server_url,
        "/overlap/region/human/" + str(CHROM) + ":" + str(POS) + "-" + str(POS),
        json_headers,
        {"feature": "transcript"})

    # transcript id -> CCDS id (None when the hit carries no "ccdsid"), and
    # the parent gene ids in first-seen order.
    transcript_ccds = {}
    parent_gene_ids = []
    for hit in overlapping:
        if hit["biotype"] != "protein_coding":
            continue
        transcript_ccds[hit["id"]] = hit.get("ccdsid")
        if hit["Parent"] not in parent_gene_ids:
            parent_gene_ids.append(hit["Parent"])

    annotated_genes = []
    for gene_id in parent_gene_ids:
        gene_info = EnsemblRestClient.perform_rest_action(
            server_url, "/lookup/id/" + str(gene_id), json_headers,
            {"expand": "1"})
        if gene_info["biotype"] != "protein_coding":
            continue

        flags = []
        gene_entry = {
            "Gene_id": gene_info["id"],
            "Gene_name": gene_info["display_name"],
            "Strand": gene_info["strand"],
            "Gene_start": gene_info["start"],
            "Gene_end": gene_info["end"],
            "Chromosome": CHROM,
            "Flags": flags,
        }
        if ("description" in gene_info
                and "readthrough" in gene_info["description"]):
            flags.append("fusion-with-readthrough")
        # TODO: flag CTC-... and RP..... proteins (often not well
        # characterized or readthrough genes).

        forward = gene_entry["Strand"] == 1

        for transcript in gene_info["Transcript"]:
            # NOTE(review): nesting reconstructed from a whitespace-collapsed
            # source -- everything below is assumed to apply only to a
            # canonical transcript that was among the overlapping
            # protein-coding transcripts. Confirm.
            if (transcript["is_canonical"] != 1
                    or transcript["id"] not in transcript_ccds):
                continue

            if transcript_ccds[transcript["id"]] is None:
                flags.append("No-CCDS")

            translation = transcript["Translation"]
            exons = transcript["Exon"]

            gene_entry["Transcript_id"] = transcript["id"]
            gene_entry["Biotype"] = transcript["biotype"]
            gene_entry["Transcript_start"] = transcript["start"]
            gene_entry["Transcript_end"] = transcript["end"]
            gene_entry["Total_exons"] = len(exons)
            # Three bases per translated residue plus three more (presumably
            # the stop codon).
            gene_entry["Original_CDS_length"] = (translation["length"] * 3) + 3
            if forward:
                cds_start, cds_end = translation["start"], translation["end"]
            else:
                cds_start, cds_end = translation["end"], translation["start"]
            gene_entry["CDS_start"] = cds_start
            gene_entry["CDS_end"] = cds_end
            features = gene_entry["Exons"] = []

            cds_total = 0
            # True from the exon after the one holding the CDS start until
            # the exon holding the CDS end has been processed.
            in_cds = False
            # `phase` carries the End_phase of the previous exon. It is only
            # read while in_cds is True, i.e. after an earlier iteration has
            # assigned it.

            ### EXONS
            for rank, exon in enumerate(exons):
                exon_low, exon_high = exon["start"], exon["end"]
                entry = {
                    "Rank": rank + 1,
                    "Type": "exon",
                    "Start": exon_low if forward else exon_high,
                    "End": exon_high if forward else exon_low,
                    "Contains_start_CDS": False,
                    "Contains_end_CDS": False,
                }
                holds_cds_end = exon_low <= cds_end <= exon_high

                if in_cds:
                    entry["CDS"] = True
                    entry["Start_phase"] = phase
                    if holds_cds_end:
                        entry["Contains_end_CDS"] = True
                        entry["End_phase"] = 0
                        entry["CDS_length"] = abs(cds_end - entry["Start"]) + 1
                        in_cds = False
                    else:
                        exon_length = abs(entry["End"] - entry["Start"]) + 1
                        entry["End_phase"] = (exon_length + phase) % 3
                        entry["CDS_length"] = exon_length
                elif exon_low <= cds_start <= exon_high:
                    entry["Contains_start_CDS"] = True
                    entry["CDS"] = True
                    entry["Start_phase"] = 0
                    if holds_cds_end:
                        # The whole CDS lies inside this single exon.
                        entry["Contains_end_CDS"] = True
                        entry["End_phase"] = 0
                        entry["CDS_length"] = abs(cds_end - cds_start) + 1
                    else:
                        coding_length = abs(entry["End"] - cds_start) + 1
                        entry["End_phase"] = coding_length % 3
                        entry["CDS_length"] = coding_length
                        # NOTE(review): assumed to belong to this branch
                        # only, so that a CDS contained in one exon does not
                        # mark the following exons as coding. Confirm.
                        in_cds = True
                else:
                    entry["CDS"] = False
                    # Placeholder strings, not ints; the original author
                    # intended to replace them with something that raises
                    # an error if used.
                    entry["Start_phase"] = "-1"
                    entry["End_phase"] = "-1"
                    entry["CDS_length"] = 0

                phase = entry["End_phase"]
                cds_total += entry["CDS_length"]
                features.append(entry)

                ### INTRONS
                if rank < len(exons) - 1:
                    following = exons[rank + 1]
                    intron = {
                        "Type": "intron",
                        "Rank": rank + 1,
                        "Phase": entry["End_phase"],
                    }
                    if forward:
                        intron["Start"] = exon["end"] + 1
                        intron["End"] = following["start"] - 1
                    else:
                        intron["Start"] = exon["start"] - 1
                        intron["End"] = following["end"] + 1
                    intron["CDS"] = in_cds
                    features.append(intron)

            # The Ensembl completeness flags could not be requested, so the
            # summed CDS length is compared against the translation length
            # instead. Biased towards "incomplete" but bases % 3 == 0.
            if cds_total - 3 != translation["length"] * 3:
                flags.append("Possible-incomplete-CDS")

            # NOTE(review): assumed that a gene is reported only when its
            # canonical transcript was annotated above. Confirm.
            annotated_genes.append(gene_entry)

    return annotated_genes