コード例 #1
0
def get_gene_overlap( chr, pos, ori, bp ):
    """Sort protein-coding genes overlapping one breakpoint into fusion roles.

    Requests the gene features at ``chr:pos-pos`` from the Ensembl REST
    overlap endpoint (GRCh37 server) through the module-level
    ``EnsemblRestClient`` and keeps only hits whose biotype is
    ``protein_coding``.

    Args:
        chr: Chromosome name; stringified into the request path.
        pos: Breakpoint position, used as both region start and region end.
        ori: Breakend orientation flag; only its truthiness is used.
        bp: Breakend label (a string) appended to the gene id in result keys.

    Returns:
        A dict that may hold the keys ``'donor'`` and/or ``'acceptor'``. Each
        maps "<gene id>, a tab, <bp>" to the gene's ``start`` (falsy ``ori``)
        or ``end`` (truthy ``ori``). The dict is empty when no protein-coding
        gene on strand 1 or -1 overlaps the position.
    """
    server = 'http://grch37.rest.ensembl.org'
    species = "human"
    region = str(chr) + ":" + str(pos) + "-" + str(pos)
    endpoint = "/overlap/region/" + species + "/" + region
    headers = {"Content-Type" : "application/json"}
    params = {"feature": "gene"}

    overlapping = EnsemblRestClient.perform_rest_action(server, endpoint, headers, params)

    fusions = {}
    for gene in overlapping:
        if gene["biotype"] != "protein_coding":
            continue

        strand = gene['strand']
        if strand != 1 and strand != -1:
            # Any other strand value is ignored.
            continue

        # Falsy ori on the forward strand, or truthy ori on the reverse
        # strand, marks a donor; the two remaining combinations an acceptor.
        on_forward = strand == 1
        role = 'donor' if bool(ori) != on_forward else 'acceptor'

        # The stored coordinate depends on the orientation flag only.
        coordinate = gene['end'] if ori else gene['start']

        fusions.setdefault(role, {})[gene['id'] + "\t" + bp] = coordinate

    return fusions
コード例 #2
0
def mask_seq(chr, start, end, strand, seq):
    """Mask the start position of every overlapping variant in *seq* with 'n'.

    Requests variation features for ``chr:start-end:strand`` from the Ensembl
    REST overlap endpoint through the module-level ``EnsemblRestClient`` and
    ``server``, then replaces the base at each variant's ``start`` with ``'n'``.

    Args:
        chr: Chromosome name; stringified into the request path.
        start: Region start coordinate; index 0 of *seq* on the forward strand.
        end: Region end coordinate.
        strand: Strand of *seq*; when ``str(strand) == '-1'`` the offset is
            mirrored so it counts from the other end of *seq*.
        seq: Sequence string to mask.

    Returns:
        A string of the same length as *seq* with the variant positions
        replaced by ``'n'``.
    """
    ext = "/overlap/region/human/" + str(chr) + ":" + str(start) + "-" + str(
        end) + ":" + str(strand) + "?feature=variation"
    headers = {"Content-Type": "application/json"}
    data = EnsemblRestClient.perform_rest_action(server, ext, headers)

    # Work on a list so each mask is an O(1) assignment instead of a rebuild
    # of the whole string.
    bases = list(seq)
    seq_len = len(bases)
    reverse = str(strand) == '-1'

    for snp in data:
        snp_pos = snp['start'] - start
        if reverse:
            snp_pos = seq_len - snp_pos - 1
        # An overlapping feature may start outside the requested window. A
        # negative or too-large index would mask the wrong base or change the
        # sequence length, so such positions are skipped.
        if 0 <= snp_pos < seq_len:
            bases[snp_pos] = 'n'

    return "".join(bases)
コード例 #3
0
def get_seq(chr, start, end, strand):
    """Fetch the sequence of a region, or False when it leaves the chromosome.

    Looks up the chromosome length via the Ensembl REST ``/info/assembly``
    endpoint, validates the requested window against it, and then requests
    the sequence from ``/sequence/region``. Uses the module-level
    ``EnsemblRestClient``, ``server`` and ``mask`` names.

    Args:
        chr: Chromosome name; stringified into the request paths.
        start: Region start; must be at least 1.
        end: Region end; must not exceed the chromosome length.
        strand: Strand, stringified into the request path.

    Returns:
        The sequence string (passed through ``mask_seq`` when ``mask`` is
        truthy), or ``False`` after writing a message to stderr when the
        window extends past either end of the chromosome.
    """
    json_headers = {"Content-Type": "application/json"}

    assembly_ext = "/info/assembly/homo_sapiens/" + str(chr)
    assembly = EnsemblRestClient.perform_rest_action(server, assembly_ext,
                                                     json_headers)
    chrlength = assembly['length']

    # Reject windows that run off either end before requesting the sequence.
    if start < 1:
        sys.stderr.write(
            "\tFailed to get seq, because POS is too close to START of the chr\n"
        )
        return False
    if end > chrlength:
        sys.stderr.write(
            "\tFailed to get seq, because ENDPOS is too close to END of the chr\n"
        )
        return False

    region_ext = "".join([
        "/sequence/region/human/",
        str(chr), ":", str(start), "-", str(end), ":", str(strand),
    ])
    reply = EnsemblRestClient.perform_rest_action(server, region_ext,
                                                  json_headers)
    sequence = reply['seq']
    if mask:
        sequence = mask_seq(chr, start, end, strand, sequence)
    return sequence
コード例 #4
0
    bamfile = pysam.AlignmentFile(args.bam, "rb" )
    fasta = open(args.output_dir+"/"+svid+".fasta", 'a+')
    for read in bamfile.fetch(chr, start, end):
        #### Breakpoints that only have supplementary reads and not a primary read spanning the breakpoint will be excluded
        #### No effect when testing on the truthset in recall, but see if it ever happens in real sets
        if read.query_name in exclude or read.seq == None or read.is_supplementary:
            continue
        fasta.write( ">"+read.query_name+"\n")
        fasta.write(read.seq+"\n")
        #exclude.append(read.query_name)
    fasta.close()
    bamfile.close()

# Log the wall-clock time at which processing starts.
print("Start:", datetime.datetime.now())

# Rebinds the class name to one shared instance; the functions in this file
# call perform_rest_action on this module-level name.
EnsemblRestClient=EnsemblRestClient()
vcf_reader = pyvcf.Reader(open(args.vcf, 'r'))
for record in vcf_reader:
    # Only records whose first ALT allele is a breakend are processed.
    if not isinstance(record.ALT[0], pyvcf.model._Breakend):
        continue
    fusions={'donor':{}, 'acceptor':{}}
    bnd1_fusions = get_gene_overlap(record.CHROM, record.POS, record.ALT[0].orientation, '1')

    ### Skip next request if the first BND already falls outside of a gene
    # Test the lookup result rather than `fusions`: the literal above always
    # holds two keys, so it is never falsy and the skip would never trigger.
    if not bnd1_fusions:
        continue
    bnd2_fusions=get_gene_overlap(record.ALT[0].chr, record.ALT[0].pos, record.ALT[0].remoteOrientation, '2')

    # Merge the roles found for both breakends into one donor/acceptor table.
    fusions.update(bnd1_fusions)
    if 'donor' in bnd2_fusions:
        fusions['donor'].update(bnd2_fusions['donor'])
コード例 #5
0
                    type=str,
                    help='Output bed file',
                    required=True)
parser.add_argument(
    '-r',
    '--region',
    type=str,
    help='List of genes or regions to select from the bam-file',
    required=True)

args = parser.parse_args()

bed = args.bed
# The region argument is a comma-separated list of genes and/or regions.
selection = args.region
selection = selection.split(",")
# Rebinds the class name to one shared client instance.
EnsemblRestClient = EnsemblRestClient()

### Create a bed-file that contains all the regions that need to be selected from the full bam-file
# NOTE: from here on `bed` is the open file handle, no longer the path.
with open(bed, "w") as bed:
    for item in selection:

        ### If a region is given in a chr:pos1-pos2 format, this format is directly used
        # Raw-string pattern avoids the invalid "\d" escape warning. The
        # capture groups give chromosome and positions directly, so extra
        # colons or trailing text no longer break the parsing.
        region_match = re.match(r"(.+):(\d+)-(\d+)", item)
        if region_match:
            chrom, pos1, pos2 = region_match.groups()

        else:
            server = 'http://grch37.rest.ensembl.org'

            ### If a desired region is given by a ensembl identifier (e.g. ENSG00000141510) or the gene name, the positions are taken from ensembl
コード例 #6
0
def EnsemblAnnotation(CHROM, POS):
    """Describe the protein-coding genes overlapping a single position.

    Queries the Ensembl REST overlap endpoint (GRCh37 server) for transcript
    features at ``CHROM:POS-POS``, collects the parent gene of every
    protein-coding hit, and looks each gene up with ``expand=1``. For every
    gene whose biotype is ``protein_coding`` the transcript flagged
    ``is_canonical == 1`` is summarised exon by exon.

    Relies on the module-level ``EnsemblRestClient`` object, which is called
    as ``perform_rest_action(server, endpoint, headers, params)``.

    Args:
        CHROM: Chromosome name; stringified into the request path and stored
            unchanged under "Chromosome".
        POS: Position, used as both start and end of the queried region.

    Returns:
        A list with one dict per protein-coding gene. Every dict holds the
        gene-level keys "Gene_id", "Gene_name", "Strand", "Gene_start",
        "Gene_end", "Chromosome" and "Flags". When a canonical transcript is
        present it also holds "Transcript_id", "Biotype", "Transcript_start",
        "Transcript_end", "Total_exons", "Original_CDS_length", "CDS_start",
        "CDS_end" and "Exons". "Exons" is a list of exon dicts and intron
        dicts interleaved in the order of ``transcript["Exon"]``.
    """
    SERVER = 'http://grch37.rest.ensembl.org'
    ENDPOINT = "/overlap/region/human/" + str(CHROM) + ":" + str(
        POS) + "-" + str(POS)
    HEADERS = {"Content-Type": "application/json"}
    PARAMS = {"feature": "transcript"}
    genes_data = EnsemblRestClient.perform_rest_action(SERVER, ENDPOINT,
                                                       HEADERS, PARAMS)

    # Transcript id -> CCDS id, or None when the hit carries no "ccdsid".
    transcript_ccds = {}
    # Parent gene ids of protein-coding transcripts, each once, in first-seen order.
    UNIQUE_GENES = []
    for hit in genes_data:
        if hit["biotype"] == "protein_coding":
            if "ccdsid" in hit:
                transcript_ccds[hit["id"]] = hit["ccdsid"]
            else:
                transcript_ccds[hit["id"]] = None

            if hit["Parent"] not in UNIQUE_GENES:
                UNIQUE_GENES.append(hit["Parent"])

    HITS = []
    for GENE_ID in UNIQUE_GENES:
        #GENE_ID=hit
        # One lookup request per gene; expand=1 is what this code relies on
        # for the nested "Transcript", "Exon" and "Translation" entries.
        ENDPOINT = "/lookup/id/" + str(GENE_ID)
        PARAMS = {"expand": "1"}
        gene_info = EnsemblRestClient.perform_rest_action(
            SERVER, ENDPOINT, HEADERS, PARAMS)
        if gene_info["biotype"] == "protein_coding":
            INFO = {}
            INFO["Gene_id"] = gene_info["id"]
            INFO["Gene_name"] = gene_info["display_name"]
            INFO["Strand"] = gene_info["strand"]
            INFO["Gene_start"] = gene_info["start"]
            INFO["Gene_end"] = gene_info["end"]
            INFO["Chromosome"] = CHROM
            INFO["Flags"] = []

            # Flag genes whose description mentions "readthrough".
            if "description" in gene_info:
                if "readthrough" in gene_info["description"]:
                    INFO["Flags"].append("fusion-with-readthrough")

            ##### FLAG for CTC-...  and RP..... proteins (Often not well characterized or readthrough genes)

            for transcript in gene_info["Transcript"]:
                # Only the transcript marked canonical is summarised.
                if transcript["is_canonical"] == 1:
                    # "No-CCDS" is added only when the transcript was among the
                    # overlap hits and that hit had no "ccdsid".
                    if transcript["id"] in transcript_ccds:
                        if transcript_ccds[transcript["id"]] is None:
                            INFO["Flags"].append("No-CCDS")
                    # Running sum of the per-exon "CDS_length" values.
                    LENGTH_CDS = 0
                    # True after the exon containing CDS_start has been seen and
                    # until the exon containing CDS_end has been processed.
                    CDS = False
                    INFO["Transcript_id"] = transcript["id"]
                    INFO["Biotype"] = transcript["biotype"]
                    INFO["Transcript_start"] = transcript["start"]
                    INFO["Transcript_end"] = transcript["end"]
                    INFO["Total_exons"] = len(transcript["Exon"])
                    # Presumably the translation length in amino acids times three,
                    # plus three bases for the stop codon - TODO confirm.
                    INFO["Original_CDS_length"] = (
                        transcript["Translation"]["length"] * 3) + 3
                    # On any strand other than 1 the translation start/end are
                    # swapped, so CDS_start holds the larger coordinate.
                    if INFO["Strand"] == 1:
                        INFO["CDS_start"] = transcript["Translation"]["start"]
                        INFO["CDS_end"] = transcript["Translation"]["end"]
                    else:
                        INFO["CDS_start"] = transcript["Translation"]["end"]
                        INFO["CDS_end"] = transcript["Translation"]["start"]
                    INFO["Exons"] = []

                    ### EXONS
                    for rank, exon in enumerate(transcript["Exon"]):
                        EXON_INFO = {}
                        EXON_INFO["Rank"] = rank + 1
                        EXON_INFO["Type"] = "exon"
                        # Unswapped exon coordinates, used for the containment
                        # tests below regardless of strand.
                        CHRON_START = exon["start"]
                        CHRON_END = exon["end"]
                        # Stored Start/End are swapped on a strand other than 1,
                        # mirroring CDS_start/CDS_end above.
                        if INFO["Strand"] == 1:
                            EXON_INFO["Start"] = exon["start"]
                            EXON_INFO["End"] = exon["end"]
                        else:
                            EXON_INFO["Start"] = exon["end"]
                            EXON_INFO["End"] = exon["start"]
                        EXON_INFO["Contains_start_CDS"] = False
                        EXON_INFO["Contains_end_CDS"] = False

                        if CDS:
                            # Already inside the coding region: this exon is either
                            # the one containing CDS_end or fully coding.
                            if INFO["CDS_end"] >= CHRON_START and INFO[
                                    "CDS_end"] <= CHRON_END:
                                EXON_INFO["Contains_end_CDS"] = True
                                EXON_INFO["CDS"] = True
                                EXON_INFO["Start_phase"] = PHASE
                                #EXON_INFO["End_phase"]=-1
                                EXON_INFO["End_phase"] = 0
                                # Coding part runs from the exon start to CDS_end.
                                EXON_INFO["CDS_length"] = abs(
                                    INFO["CDS_end"] - EXON_INFO["Start"]) + 1
                                CDS = False
                            else:
                                EXON_INFO["CDS"] = True
                                EXON_INFO["Start_phase"] = PHASE
                                # End phase = (exon length + incoming phase) mod 3.
                                EXON_INFO["End_phase"] = (
                                    abs(EXON_INFO["End"] - EXON_INFO["Start"])
                                    + 1 + PHASE) % 3
                                EXON_INFO["CDS_length"] = abs(
                                    EXON_INFO["End"] - EXON_INFO["Start"]) + 1
                        elif INFO["CDS_start"] >= CHRON_START and INFO[
                                "CDS_start"] <= CHRON_END:
                            # First coding exon: CDS_start lies within this exon.
                            EXON_INFO["Contains_start_CDS"] = True
                            EXON_INFO["CDS"] = True
                            #EXON_INFO["Start_phase"]=-1
                            EXON_INFO["Start_phase"] = 0
                            if INFO["CDS_end"] >= CHRON_START and INFO[
                                    "CDS_end"] <= CHRON_END:
                                # The same exon also contains CDS_end.
                                EXON_INFO["Contains_end_CDS"] = True
                                EXON_INFO["End_phase"] = 0
                                EXON_INFO["CDS_length"] = abs(
                                    INFO["CDS_end"] - INFO["CDS_start"]) + 1
                            else:
                                # Coding part runs from CDS_start to the exon end.
                                EXON_INFO["End_phase"] = (
                                    abs(EXON_INFO["End"] - INFO["CDS_start"]) +
                                    1) % 3
                                EXON_INFO["CDS_length"] = abs(
                                    EXON_INFO["End"] - INFO["CDS_start"]) + 1
                            # NOTE: CDS is set to True here even when this exon
                            # also contained CDS_end.
                            CDS = True

                        else:
                            # Exon outside the coding stretch tracked by the CDS
                            # flag; phases are stored as the string "-1".
                            EXON_INFO["CDS"] = False
                            EXON_INFO[
                                "Start_phase"] = "-1"  #ADD SOMETHING ELSE SO AN ERROR IS GIVEN IF USED
                            EXON_INFO["End_phase"] = "-1"
                            EXON_INFO["CDS_length"] = 0
                        # Carry this exon's end phase into the next exon's
                        # Start_phase (read only while CDS is True).
                        PHASE = EXON_INFO["End_phase"]
                        LENGTH_CDS += EXON_INFO["CDS_length"]
                        INFO["Exons"].append(EXON_INFO)

                        ### INTRONS
                        # An intron entry follows every exon except the last one.
                        if rank < len(transcript["Exon"]) - 1:
                            INTRON_INFO = {}
                            INTRON_INFO["Type"] = "intron"
                            INTRON_INFO["Rank"] = rank + 1
                            INTRON_INFO["Phase"] = EXON_INFO["End_phase"]
                            # Strand 1: from one past this exon's end to one before
                            # the next exon's start. Otherwise: from one before this
                            # exon's start to one past the next exon's end.
                            if INFO["Strand"] == 1:
                                INTRON_INFO["Start"] = transcript["Exon"][
                                    rank]["end"] + 1
                                INTRON_INFO["End"] = transcript["Exon"][
                                    rank + 1]["start"] - 1
                            else:
                                INTRON_INFO["Start"] = transcript["Exon"][
                                    rank]["start"] - 1
                                INTRON_INFO["End"] = transcript["Exon"][
                                    rank + 1]["end"] + 1
                            # INTRON_INFO["Start"]=transcript["Exon"][rank]["end"]
                            # INTRON_INFO["End"]=transcript["Exon"][rank+1]["start"]
                            # Value of the CDS flag after the preceding exon.
                            INTRON_INFO["CDS"] = CDS
                            INFO["Exons"].append(INTRON_INFO)
                    #print(INFO["Gene_id"],LENGTH_CDS, transcript["Translation"]["length"]*3)
                    # Flag the gene when the summed exon CDS lengths minus three
                    # bases differ from the translation length times three.
                    if LENGTH_CDS - 3 != transcript["Translation"][
                            "length"] * 3:
                        INFO["Flags"].append(
                            "Possible-incomplete-CDS"
                        )  #as currently I am unable to request the given ENSEMBL flags. Bias towards incomplete but bases%3=0
            # Appended once per protein-coding gene, whether or not a
            # canonical transcript was found.
            HITS.append(INFO)
    return HITS