Exemple #1
0
def SNP_MIP_Gap(chrome, start, end, min_hom_length, max_hom_length, tm_min, tm_max,
                gc_threshold_min, gc_threshold_max, ref, alt):
    """Design MIP homology-arm pairs that place the SNP inside the gap fill.

    For each gap-fill length ``n`` from 1 up to the module-level ``gap_num``,
    and for each of the ``n`` possible positions of the SNP inside that gap
    (the original note writes "+" for the SNP base: a 2-base gap yields
    ``-+`` then ``+-``; a 3-base gap yields ``--+``, ``-+-``, ``+--``), the
    flanking sequences are fetched with ``nibFragger``, candidate arms are
    selected with ``get_seq`` and pairs are produced with ``make_Hom_pairs``
    for the "+" and the "-" strand.

    After every ``make_Hom_pairs`` call the trailing newlines of the text
    accumulated so far are stripped and three tab-separated fields are
    appended: the upstream arm position, the downstream arm position and
    ``getName(n, j)``.

    Args:
        chrome: chromosome name; a "chr" prefix is removed before calling
            ``nibFragger`` but kept in the reported positions.
        start, end: coordinates of the SNP.
        min_hom_length, max_hom_length: arm length limits; ``max_hom_length``
            is also the length of each fetched flank.
        tm_min, tm_max: melting-temperature limits passed to ``get_seq``.
        gc_threshold_min, gc_threshold_max: GC limits passed to ``get_seq``.
        ref: reference allele (not used by this function).
        alt: alternate allele, written into the mutant gap fill.

    Returns:
        The accumulated text of all designs.

    Raises:
        IndexError: if ``get_seq`` returns an empty list, because the first
            candidate of each list is read to report the arm coordinates.
    """
    contig = chrome.replace("chr", "")
    arm_limits = dict(MIN_LENGTH=min_hom_length, MIN_TM=tm_min, MAX_TM=tm_max,
                      GC_MIN=gc_threshold_min, GC_MAX=gc_threshold_max)
    returnStr = ""

    for n in range(1, gap_num + 1):
        for j in range(n):
            # Index of the SNP inside the n-base gap fill; also the number of
            # gap bases lying to the left of the SNP.
            snp_index = n - j - 1
            gap_first = start - snp_index
            down_first = end + 1 + j

            up_flank = nibFragger(contig, start - max_hom_length - snp_index, max_hom_length).lower()
            down_flank = nibFragger(contig, down_first, max_hom_length).lower()
            upstream_list = get_seq(up_flank, right2left=False, **arm_limits)
            downstream_list = get_seq(down_flank, right2left=True, **arm_limits)

            # Wild-type gap fill from the reference; the mutant copy carries
            # ``alt`` at the SNP index.
            gapfill_W = nibFragger(contig, gap_first, n)
            gap_bases = list(gapfill_W)
            gap_bases[snp_index] = alt
            gapfill_M = "".join(gap_bases)

            # Element 3 of the first candidate is used as that arm's length
            # (presumably; confirm against get_seq).
            upstream_pos = chrome + ":" + str(gap_first - upstream_list[0][3]) + "-" + str(gap_first - 1)
            downstream_pos = chrome + ":" + str(down_first) + "-" + str(down_first + downstream_list[0][3] - 1)

            for strand in ("+", "-"):
                if strand == "+":
                    mutant_fill, wild_fill = gapfill_M, gapfill_W
                else:
                    mutant_fill, wild_fill = revcomp(gapfill_M), revcomp(gapfill_W)
                returnStr += make_Hom_pairs(upstream_list, downstream_list, hom_strand=strand,
                                            MIP_Mut_Alignment="SNP_on_GF",
                                            GapFillBase_M=mutant_fill, GapFillBase_W=wild_fill)
                # Strip on the whole accumulated text, exactly as before, so
                # the annotation lands on the last emitted line.
                returnStr = returnStr.rstrip("\n")
                returnStr += "\t".join(("", upstream_pos, downstream_pos, getName(n, j))) + "\n"
    return returnStr
Exemple #2
0
        continue
    else:
        #tmp = line.rstrip().split("\t")
        mipName, upstream,gapfill,downstream = line.rstrip().split("\t")[0:4]
    
    strand = "+"
    mm, count = mipName.split("-")
    count=int(count)
    if count % 2 == 0:
        strand = "-"

    ##based on upstream coordinate to extract seq
    upStart,upStop = upstream.split("-")
    downStart,downStop = downstream.split("-")

    myUpSeq = nibFragger(chrom,upStart,int(upStop)-int(upStart)+1)
    myDownSeq = nibFragger(chrom,downStart,int(downStop)-int(downStart)+1)

    h2pos=""
    h1pos=""
    
    H1Seq=""
    H2Seq=""

    if strand == "+":
        h2pos=upstream
        h1pos=downstream
        H2Seq = myUpSeq
        H1Seq = myDownSeq
    else:
        h1pos=upstream
Exemple #3
0
def MNP_MIP(chrome, start, end, min_hom_length, max_hom_length, tm_min, tm_max,
            gc_threshold_min, gc_threshold_max, ref, alt):
    """Design MIP homology-arm pairs around an MNP.

    (MNP presumably stands for multi-nucleotide polymorphism; confirm.)

    Three groups of designs are emitted, in this order:

    1. ``alt`` written onto the end of the upstream flank that finishes at
       ``end``; the gap base is the reference base at ``end + 1``.
    2. ``alt`` written onto the start of the downstream flank that begins at
       ``start``; the gap base is the reference base at ``start - 1``.
    3. For each position ``i`` of ``ref``: that base is the gap fill
       (``ref[i-1]`` wild type, ``alt[i-1]`` mutant), the part of ``alt``
       before it is written onto the end of the upstream flank and the part
       after it onto the start of the downstream flank.

    Every group is emitted for both strands via ``make_Hom_pairs``; the
    minus-strand call receives reverse-complemented gap bases.

    Args:
        chrome: chromosome name; a "chr" prefix is removed for ``nibFragger``.
        start, end: first and last coordinate of the variant.
        min_hom_length, max_hom_length: arm length limits; ``max_hom_length``
            is also the length of every fetched flank.
        tm_min, tm_max: melting-temperature limits passed to ``get_seq``.
        gc_threshold_min, gc_threshold_max: GC limits passed to ``get_seq``.
        ref: reference allele.
        alt: alternate allele; indexed with the positions of ``ref``.

    Returns:
        The concatenated text returned by all ``make_Hom_pairs`` calls.
    """
    contig = chrome.replace("chr", "")
    limits = {
        "MIN_LENGTH": min_hom_length,
        "MIN_TM": tm_min,
        "MAX_TM": tm_max,
        "GC_MIN": gc_threshold_min,
        "GC_MAX": gc_threshold_max,
    }

    def flank(first_base):
        # Lower-cased reference flank of max_hom_length bases.
        return nibFragger(contig, first_base, max_hom_length).lower()

    def arms(up_seq, down_seq):
        # Candidate arms for both flanks, upstream evaluated first.
        up_candidates = get_seq(up_seq, right2left=False, **limits)
        down_candidates = get_seq(down_seq, right2left=True, **limits)
        return up_candidates, down_candidates

    def pairs(up_candidates, down_candidates, strand, label, mutant_fill, wild_fill):
        return make_Hom_pairs(
            up_candidates,
            down_candidates,
            hom_strand=strand,
            MIP_Mut_Alignment=label,
            GapFillBase_M=mutant_fill,
            GapFillBase_W=wild_fill)

    chunks = []

    # 1) MNP on the H2 forward and H1 reverse
    up_seq = flank(end - max_hom_length + 1)
    down_seq = flank(end + 2)
    gap_base = nibFragger(contig, end + 1, 1)
    up_seq = replaceString(up_seq, alt, first=False)
    up_arms, down_arms = arms(up_seq, down_seq)
    chunks.append(pairs(up_arms, down_arms, "+", "MNP_on_H2", gap_base, gap_base))
    gap_base_rc = revcomp(gap_base)
    chunks.append(pairs(up_arms, down_arms, "-", "MNP_on_H1", gap_base_rc, gap_base_rc))

    # 2) MNP on the H2 reverse strand or H1 forward strand
    up_seq = flank(start - max_hom_length - 1)
    down_seq = flank(start)
    gap_base = nibFragger(contig, start - 1, 1)
    # alt replaces the leading characters of the downstream flank
    down_seq = replaceString(down_seq, alt, first=True)
    up_arms, down_arms = arms(up_seq, down_seq)
    gap_base_rc = revcomp(gap_base)
    chunks.append(pairs(up_arms, down_arms, "-", "MNP_on_H2", gap_base_rc, gap_base_rc))
    chunks.append(pairs(up_arms, down_arms, "+", "MNP_on_H1", gap_base, gap_base))

    # 3) MNP_on_H2GF: walk the gap base through every position of the variant
    ref_length = len(ref)
    for idx in range(ref_length):
        up_seq = flank(start - max_hom_length + idx)
        down_seq = flank(start + idx + 1)
        mutant_base = alt[idx]
        wild_base = ref[idx]

        # Number of variant bases that fall on each arm.
        on_upstream = idx
        on_downstream = ref_length - idx - 1
        if on_upstream > 0:
            up_seq = replaceString(up_seq, alt[:idx], first=False)
        if on_downstream > 0:
            down_seq = replaceString(down_seq, alt[idx + 1:], first=True)

        up_arms, down_arms = arms(up_seq, down_seq)

        # Plus strand: upstream overlap is labelled H2, downstream overlap H1.
        plus_label = "MNP_on_" + ("H2" if on_upstream > 0 else "") + "Gap" + (
            "H1" if on_downstream > 0 else "")
        chunks.append(pairs(up_arms, down_arms, "+", plus_label, mutant_base, wild_base))

        # Minus strand: the H2/H1 roles are swapped.
        minus_label = "MNP_on_" + ("H2" if on_downstream > 0 else "") + "Gap" + (
            "H1" if on_upstream > 0 else "")
        chunks.append(pairs(up_arms, down_arms, "-", minus_label,
                            revcomp(mutant_base), revcomp(wild_base)))

    return "".join(chunks)
Exemple #4
0
# Walk the tab-separated mutation table and design MIPs for every SNP row.
# The file is opened in a ``with`` block so the handle is closed as soon as
# the loop ends (or an exception escapes) instead of being left open.
with open(input_file) as mutation_table:
    for line in mutation_table:
        parts = line.rstrip().split("\t")
        # Skip the header row.
        if line.startswith("Gene"):
            continue

        info("Doing " + "|".join(parts[:3]))

        # The first eleven columns are read positionally; start and end are
        # converted to integers.
        (gene, mutation_AA, mutation_cDNA, chrome, start, end, ref, alt,
         gene_strand, cosmic, tumor_type) = (
            parts[0], parts[1], parts[2], parts[3], int(parts[4]),
            int(parts[5]), parts[6], parts[7], parts[8], parts[9], parts[10])

        ## Only works for SNP
        ## Situation 1: SNP_on_GF (snp in gap fill)

        # 50 bases on each side of the variant, shown around a "[ref/alt]"
        # marker for the forward and the reverse-complement orientation.
        flank5 = nibFragger(chrome.replace("chr", ""), start - 50, 50)
        flank3 = nibFragger(chrome.replace("chr", ""), end + 1, 50)
        forwardSeq = flank5.lower() + "[" + ref + "/" + alt + "]" + flank3.lower()
        reverseSeq = revcomp(flank3).lower() + "[" + revcomp(ref) + "/" + revcomp(
            alt) + "]" + revcomp(flank5).lower()
        resultStr = ""

        mType = determineMutationType(ref, alt)
        if mType == "SNP":
            resultStr = SNP_MIP_Gap(chrome, start, end, min_hom_length,
                                    max_hom_length, tm_min, tm_max,
                                    gc_threshold_min, gc_threshold_max, ref, alt)
        else:
            print("Right now, it only supports SNP")
            continue
Exemple #5
0
def SNP_MIP_original(chrome,start,end,min_hom_length,max_hom_length,tm_min,tm_max,gc_threshold_min,gc_threshold_max,ref,alt):
    """Design MIP homology-arm pairs for a SNP in several arm/gap layouts.

    Designs are emitted in this order, each for both strands:

    1. ``SNP_on_GF`` - the SNP is the gap-fill base. Only produced when one
       allele matches ``[AT]`` and the other matches ``[CG]``.
    2. ``SNP_on_H2_M`` / ``SNP_on_H1_M`` - ``alt`` written onto the end of the
       upstream flank; gap base is the reference base at ``end + 1``.
    3. ``SNP_on_H2_W`` / ``SNP_on_H1_W`` - same layout with ``ref``.
    4. ``SNP_on_H2_M`` / ``SNP_on_H1_M`` - ``alt`` written onto the start of
       the downstream flank; gap base is the reference base at ``end - 1``.
    5. ``SNP_on_H2_W`` / ``SNP_on_H1_W`` - same layout with ``ref``.

    Args:
        chrome: chromosome name; a "chr" prefix is removed for ``nibFragger``.
        start, end: coordinates of the SNP.
        min_hom_length, max_hom_length: arm length limits; ``max_hom_length``
            is also the length of every fetched flank.
        tm_min, tm_max: melting-temperature limits passed to ``get_seq``.
        gc_threshold_min, gc_threshold_max: GC limits passed to ``get_seq``.
        ref: reference allele.
        alt: alternate allele.

    Returns:
        The concatenated text returned by all ``make_Hom_pairs`` calls.
    """
    contig = chrome.replace("chr", "")
    arm_limits = dict(MIN_LENGTH=min_hom_length, MIN_TM=tm_min, MAX_TM=tm_max,
                      GC_MIN=gc_threshold_min, GC_MAX=gc_threshold_max)

    def pick_arms(up_seq, down_seq):
        # Candidate arms for both flanks, upstream evaluated first.
        up_candidates = get_seq(up_seq, right2left=False, **arm_limits)
        down_candidates = get_seq(down_seq, right2left=True, **arm_limits)
        return up_candidates, down_candidates

    pieces = []

    # SNP on the gap fill
    if (re.search(r"[AT]",ref) and re.search(r"[CG]",alt)) or (re.search(r"[AT]",alt) and re.search(r"[CG]",ref)):
        upstream_seq = nibFragger(contig, start - max_hom_length, max_hom_length).lower()
        downstream_seq = nibFragger(contig, end + 1, max_hom_length).lower()
        upstream_list, downstream_list = pick_arms(upstream_seq, downstream_seq)

        pieces.append(make_Hom_pairs(upstream_list, downstream_list, hom_strand="+", MIP_Mut_Alignment="SNP_on_GF", GapFillBase_M=alt, GapFillBase_W=ref))
        pieces.append(make_Hom_pairs(upstream_list, downstream_list, hom_strand="-", MIP_Mut_Alignment="SNP_on_GF", GapFillBase_M=revcomp(alt), GapFillBase_W=revcomp(ref)))

    ## SNP on the H2 forward strand and H1 reverse.
    # The flanks and the gap base are the same for the mutant and the
    # wild-type design, so they are fetched once; only the allele written
    # onto the end of the upstream flank differs.
    upstream_seq = nibFragger(contig, start - max_hom_length + 1, max_hom_length).lower()
    downstream_seq = nibFragger(contig, end + 2, max_hom_length).lower()
    gapfill = nibFragger(contig, end + 1, 1)
    for allele, tag in ((alt, "M"), (ref, "W")):
        ### replace the last character
        upstream_seq_replaced = replaceString(upstream_seq, allele, first=False)
        upstream_list, downstream_list = pick_arms(upstream_seq_replaced, downstream_seq)

        pieces.append(make_Hom_pairs(upstream_list, downstream_list, hom_strand="+", MIP_Mut_Alignment="SNP_on_H2_" + tag, GapFillBase_M=gapfill, GapFillBase_W=gapfill))
        pieces.append(make_Hom_pairs(upstream_list, downstream_list, hom_strand="-", MIP_Mut_Alignment="SNP_on_H1_" + tag, GapFillBase_M=revcomp(gapfill), GapFillBase_W=revcomp(gapfill)))

    ## SNP on the H2 reverse strand or H1 forward strand.
    # In this layout the upstream flank ends at start-2 and the downstream
    # flank begins at `end`, so the gap is the base at end-1. The earlier
    # code fetched that base but then passed the stale end+1 base from the
    # previous layout to make_Hom_pairs; the freshly fetched base is used now.
    upstream_seq = nibFragger(contig, start - max_hom_length - 1, max_hom_length).lower()
    downstream_seq = nibFragger(contig, end, max_hom_length).lower()
    gapfill = nibFragger(contig, end - 1, 1)
    for allele, tag in ((alt, "M"), (ref, "W")):
        ### replace the first character
        downstream_seq_replaced = replaceString(downstream_seq, allele, first=True)
        upstream_list, downstream_list = pick_arms(upstream_seq, downstream_seq_replaced)

        pieces.append(make_Hom_pairs(upstream_list, downstream_list, hom_strand="-", MIP_Mut_Alignment="SNP_on_H2_" + tag, GapFillBase_M=revcomp(gapfill), GapFillBase_W=revcomp(gapfill)))
        pieces.append(make_Hom_pairs(upstream_list, downstream_list, hom_strand="+", MIP_Mut_Alignment="SNP_on_H1_" + tag, GapFillBase_M=gapfill, GapFillBase_W=gapfill))

    return "".join(pieces)
Exemple #6
0
# Result files: the tab-separated hom table (header written immediately),
# plus a FASTA and a BED file.
output = open(output_file, "w")
output.write("hom_name\tregion_index\tchrom\tregion_start\tregion_stop\thom_start\thom_stop\tseq\tseq_tm\tgc_count\tgc_pct\tdust_scoreH1\tdust_scoreH2\tdust_pct_H1\tdust_pct_H2\thp_run\tSNPs\tSMs\n")

out_fasta = open(output_fasta, "w")
out_bed = open(output_bed, "w")

info("There are total of {} exons".format(len(target_regions)))

# One line per accepted hom; the first field is "<region_index>-<hom_position>".
HOM_ROW_TEMPLATE = "{}-{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{:.2f}\t{}\t{:.2f}\t{}\t{}\t{:.2f}\t{:.2f}\t{}\t{}\t{}\n"

for region_index, (chrom, start, stop) in enumerate(target_regions):
    # Scan every candidate start position around the region and try to
    # design a hom there.
    i = region_index + 1
    bp = str(abs(stop - start + 1))
    info("Finding homs on exon {} ({}-{}| {}bp)".format(i, start, stop, bp))

    for hom_position in range(start - mip_offset - 20, stop + mip_offset):
        # A 35-base window is fetched and handed to optimize_seq.
        hom_seq = nibFragger(chrom.replace("chr", ""), hom_position, 35)
        opt_seq, opt_tm = optimize_seq(hom_seq)
        opt_length = len(opt_seq)
        gc_count = opt_seq.count("C") + opt_seq.count("G")
        gc_content = gc_count / float(opt_length)
        (hp_run, dust_score_H1, dust_score_H2, dust_pct_H1, dust_pct_H2) = score_dust(opt_seq)

        # Reject homs that fall outside the length, Tm or GC thresholds.
        if not (min_hom_length <= opt_length <= max_hom_length):
            continue
        if opt_tm < tm_min or opt_tm > tm_max:
            continue
        if gc_content > gc_threshold_max or gc_content < gc_threshold_min:
            continue

        # Inclusive last base of the hom, used for the lookups and the report.
        end_position = hom_position + opt_length - 1
        SNPs = snp_finder.local_snps(hom_position, end_position)
        SMs = snp_finder.local_sms(hom_position, end_position)

        row = HOM_ROW_TEMPLATE.format(
            region_index, hom_position, region_index, chrom, start, stop,
            hom_position, end_position, opt_seq, opt_tm,
            gc_count, gc_content, dust_score_H1, dust_score_H2,
            dust_pct_H1, dust_pct_H2, hp_run, SNPs, SMs)
        output.write(row)