Beispiel #1
0
def main():
    """Run the bundled liftOver binary on the configured input file.

    The command line is assembled from ``p.liftover_input``, the
    ``hg38ToHg19.over.chain`` chain file, ``p.liftover_output`` and an error
    file path derived by appending ``.error`` to the output path, then handed
    to ``utils.run_command`` as a single string.
    """
    source_path = p.liftover_input
    lifted_path = p.liftover_output
    error_path = lifted_path + '.error'
    # Positional arguments in the order the command line expects them.
    command_parts = [
        '../software/liftOver',
        source_path,
        '../software/hg38ToHg19.over.chain',
        lifted_path,
        error_path,
    ]
    utils.run_command(' '.join(command_parts))
    utils.now_time("liftOver script was successfully finished!!")
Beispiel #2
0
def main():
    """Invoke ``../software/liftOver`` with the hg38-to-hg19 chain file.

    Input and output paths come from ``p.liftover_input`` and
    ``p.liftover_output``; the last argument is the output path with
    ``.error`` appended. The finished command string is executed through
    ``utils.run_command`` and completion is logged with ``utils.now_time``.
    """
    in_path = p.liftover_input
    out_path = p.liftover_output
    err_path = out_path + '.error'
    command_liftover = ''.join([
        '../software/liftOver ',
        in_path,
        ' ../software/hg38ToHg19.over.chain ',
        out_path,
        ' ',
        err_path,
    ])
    utils.run_command(command_liftover)
    utils.now_time("liftOver script was successfully finished!!")
Beispiel #3
0
def main():
    """Run ``bed12to3UTRbed.sh`` and redirect its stdout to the output file.

    The shell command reads ``p.bed_3UTR_input`` and uses ``>`` redirection
    to write ``p.bed_3UTR_output``. The command is echoed to stdout before
    being executed through ``utils.run_command``.
    """
    source_path = p.bed_3UTR_input
    target_path = p.bed_3UTR_output
    # "<script> <input> > <output>": the redirection is part of the command.
    command_bed_3UTR = ' > '.join(
        ['../software/bed12to3UTRbed.sh ' + source_path, target_path])
    print(command_bed_3UTR)
    utils.run_command(command_bed_3UTR)
    utils.now_time("bed_3UTR script was successfully finished!!")
Beispiel #4
0
def main():
    """Convert the miRBase GFF file into a 6-column, tab-separated BED file.

    Reads ``p.mirbase_gff2bed_input`` and writes ``p.mirbase_gff2bed_output``.
    Blank lines, lines starting with ``#`` and features whose type column is
    ``miRNA_primary_transcript`` are skipped. For every other feature the
    output row is ``chrom, start - 1, end, name, 0, strand`` where ``name`` is
    ``<Name>|<ID>|<copy>``; ``<copy>`` is the part of the ID after ``_`` or
    ``0`` when the ID has no ``_`` suffix.

    Raises:
        IndexError: if a feature line has fewer than nine columns or fewer
            than three ``;``-separated attributes.
        ValueError: if the start column is not an integer or the ID contains
            more than one ``_``.
    """
    utils.now_time("Input_file: " + p.mirbase_gff2bed_input)
    utils.now_time("Output_file: " + p.mirbase_gff2bed_output)

    # Context managers close both handles (and flush the output) even when a
    # malformed line raises part-way through the conversion.
    with open(p.mirbase_gff2bed_input, 'r') as mirbase_gff_file, \
            open(p.mirbase_gff2bed_output, 'w') as mirbase_bed_file:
        for line in mirbase_gff_file:
            line = line.rstrip()
            # Blank lines used to crash on the column lookups below.
            if not line or line.startswith('#'):
                continue
            data = line.split("\t")
            if data[2] == 'miRNA_primary_transcript':
                continue
            chrom = data[0]
            st = int(data[3]) - 1  # shift the start coordinate down by one
            ed = data[4]
            strand = data[6]
            name_infor = data[8].split(';')
            mir_id = re.sub(r'^ID=', '', name_infor[0])
            if '_' in mir_id:
                mir_id, mir_id_number = mir_id.split('_')
            else:
                # there is ONLY one miRNA coding site in your genome
                mir_id_number = 0
            # The Name attribute is expected in the third attribute slot.
            mir_name = re.sub(r'^Name=', '', name_infor[2])
            name = mir_name + '|' + mir_id + '|' + str(mir_id_number)
            print(chrom, st, ed, name, 0, strand,
                  file=mirbase_bed_file, sep="\t", end="\n")

    utils.now_time("mirbase_gff2bed script was successfully finished!!")
def main():
    """Write a BED file describing the non-primary features of a miRBase GFF.

    Input is ``p.mirbase_gff2bed_input``; output is
    ``p.mirbase_gff2bed_output``. Each kept feature becomes one tab-separated
    row: chromosome, start minus one, end, ``<Name>|<ID>|<copy>``, a constant
    ``0`` score and the strand. ``<copy>`` is the suffix following ``_`` in
    the ID attribute, or ``0`` when there is none. Blank lines, ``#`` lines
    and ``miRNA_primary_transcript`` features are ignored.

    Raises:
        IndexError: if a feature line lacks the expected columns/attributes.
        ValueError: if the start column is not numeric or the ID holds more
            than one ``_``.
    """
    utils.now_time("Input_file: " + p.mirbase_gff2bed_input)
    utils.now_time("Output_file: " + p.mirbase_gff2bed_output)

    # ``with`` guarantees the handles are released on every exit path; the
    # original only closed them when the whole loop succeeded.
    with open(p.mirbase_gff2bed_input, "r") as mirbase_gff_file, \
            open(p.mirbase_gff2bed_output, "w") as mirbase_bed_file:
        for raw_line in mirbase_gff_file:
            line = raw_line.rstrip()
            if not line or line.startswith("#"):
                # Empty lines previously raised IndexError on data[2].
                continue
            data = line.split("\t")
            status = data[2]
            if status == "miRNA_primary_transcript":
                continue
            chrom = data[0]
            st = int(data[3]) - 1
            ed = data[4]
            strand = data[6]
            name_infor = data[8].split(";")
            mir_id = re.sub(r"^ID=", "", name_infor[0])
            mir_id_number = 0  # there is ONLY one miRNA coding site in your genome
            if "_" in mir_id:
                mir_id, mir_id_number = mir_id.split("_")
            mir_name = re.sub(r"^Name=", "", name_infor[2])
            name = mir_name + "|" + mir_id + "|" + str(mir_id_number)
            print(chrom, st, ed, name, 0, strand,
                  file=mirbase_bed_file, sep="\t", end="\n")

    utils.now_time("mirbase_gff2bed script was successfully finished!!")
Beispiel #6
0
def main():
    """Flatten the miRBase FASTA file into one tab-separated line per record.

    Reads ``p.mirbase_pre_input`` and writes ``p.mirbase_pre_output``. Every
    output line is ``<id>|<symbol><TAB><sequence>``: ``<id>`` is the first
    whitespace-separated header token without ``>``, ``<symbol>`` is the
    second token, and ``<sequence>`` is the record's sequence lines joined
    together. Sequence lines that appear before the first header are dropped.

    Raises:
        IndexError: if a header line has fewer than two tokens.
    """
    utils.now_time("Input_file: " + p.mirbase_pre_input)
    utils.now_time("Output_file: " + p.mirbase_pre_output)

    # ``with`` closes both files even if a malformed header raises.
    with open(p.mirbase_pre_input, 'r') as input_file, \
            open(p.mirbase_pre_output, 'w') as output_file:
        seen_header = False
        seq_parts = []
        for line in input_file:
            line = line.rstrip()
            if line.startswith('>'):  # Header
                data = line.split()
                mir_id = data[0].replace('>', '')
                symbol = data[1]
                if seen_header:
                    # Finish the previous record before starting a new one.
                    print(''.join(seq_parts), file=output_file, end="\n")
                print(mir_id + '|' + symbol, file=output_file, end="\t")
                seen_header = True
                seq_parts = []
            else:  # Sequence
                seq_parts.append(line)
        # Only terminate a record that was actually started; otherwise a
        # header-less input would produce a stray line with no identifier.
        if seen_header:
            print(''.join(seq_parts), file=output_file, end="\n")

    utils.now_time("mirbase_pre script was successfully finished!!")
def main():
    """Dump the per-chromosome miRNA phyloP shelve files into one text table.

    For chr1-chr22, chrX, chrY and chrM the shelve file
    ``<p.phylop_score_R_input><chrom>.phyloP46way_miRBase_v21_hg38Tohg19_for_MIRAGE.db``
    is opened and each entry is appended to
    ``<p.phylop_score_R_output>phyloP46way_miRBase_v21_hg38Tohg19.txt`` as the
    key followed by its scores, tab separated. Score lists shorter than 28
    entries are right-padded with ``0.0``; longer lists are written unchanged.
    """
    from contextlib import closing  # local import keeps the block self-contained

    utils.now_time("Input_file: " + p.phylop_score_R_input)
    utils.now_time("Output_file: " + p.phylop_score_R_output)

    max_length = 28  # Max_length: 28nt(miRNA)
    chromosomes = ['chr' + str(c) for c in list(range(1, 23)) + ['X', 'Y', 'M']]

    output_s = p.phylop_score_R_output + 'phyloP46way_miRBase_v21_hg38Tohg19.txt'
    # The output file and every shelve are closed on all exit paths.
    with open(output_s, 'w') as output_file:
        for x in chromosomes:
            input_s = p.phylop_score_R_input + x + '.phyloP46way_miRBase_v21_hg38Tohg19_for_MIRAGE.db'
            # NOTE(review): shelve.open uses its default flag here, which
            # creates a missing database instead of failing - confirm that a
            # silently empty chromosome is acceptable.
            with closing(shelve.open(input_s)) as input_shelve:
                for key in input_shelve.keys():
                    values = input_shelve[key]
                    # A negative difference yields an empty padding list.
                    padding = [0.000] * (max_length - len(values))
                    value_string = "\t".join(map(str, values + padding))
                    print(key, value_string, file=output_file, sep="\t", end="\n")

    utils.now_time("phylop_score_R script was successfully finished!!")
Beispiel #8
0
def main():
    """Convert the miRBase FASTA file to a two-column tab-separated table.

    ``p.mirbase_pre_input`` is read line by line. A header line (starting with
    ``>``) opens a record labelled ``<id>|<symbol>`` built from its first two
    whitespace-separated tokens (``>`` removed from the first); the following
    non-header lines are concatenated into that record's sequence. Records
    are written to ``p.mirbase_pre_output`` as ``<label><TAB><sequence>``.
    Sequence lines preceding the first header are discarded.

    Raises:
        IndexError: if a header line carries fewer than two tokens.
    """
    utils.now_time("Input_file: " + p.mirbase_pre_input)
    utils.now_time("Output_file: " + p.mirbase_pre_output)

    # Both handles are managed so they are closed on errors as well.
    with open(p.mirbase_pre_input, 'r') as input_file, \
            open(p.mirbase_pre_output, 'w') as output_file:
        record_open = False
        chunks = []
        for raw_line in input_file:
            line = raw_line.rstrip()
            if not line.startswith('>'):  # Sequence
                chunks.append(line)
                continue
            # Header
            data = line.split()
            infor = data[0].replace('>', '') + '|' + data[1]
            if record_open:
                print(''.join(chunks), file=output_file, end="\n")
            print(infor, file=output_file, end="\t")
            record_open = True
            chunks = []
        # Close the last record only if one exists; an input without any
        # header used to emit a line lacking the identifier column.
        if record_open:
            print(''.join(chunks), file=output_file, end="\n")

    utils.now_time("mirbase_pre script was successfully finished!!")
Beispiel #9
0
def main():
    """Export miRNA phyloP score lists from shelve databases to a text file.

    Iterates chr1-chr22, chrX, chrY and chrM. For each chromosome the shelve
    ``<p.phylop_score_R_input><chrom>.phyloP46way_miRBase_v21_hg38Tohg19_for_MIRAGE.db``
    is read and every ``key -> score list`` entry is written as one
    tab-separated row to
    ``<p.phylop_score_R_output>phyloP46way_miRBase_v21_hg38Tohg19.txt``.
    Lists with fewer than 28 scores are padded at the end with ``0.0``.
    """
    from contextlib import closing  # local import keeps the block self-contained

    utils.now_time("Input_file: " + p.phylop_score_R_input)
    utils.now_time("Output_file: " + p.phylop_score_R_output)

    max_length = 28  # Max_length: 28nt(miRNA)
    chromosomes = ['chr%d' % i for i in range(1, 23)] + ['chrX', 'chrY', 'chrM']
    db_suffix = '.phyloP46way_miRBase_v21_hg38Tohg19_for_MIRAGE.db'

    output_s = p.phylop_score_R_output + 'phyloP46way_miRBase_v21_hg38Tohg19.txt'
    # Managed handles: the original leaked the output file and the current
    # shelve whenever an entry failed to load or format.
    with open(output_s, 'w') as output_file:
        for x in chromosomes:
            input_s = p.phylop_score_R_input + x + db_suffix
            # NOTE(review): the default shelve flag creates the database if
            # it does not exist - verify that is intended for read-only use.
            with closing(shelve.open(input_s)) as input_shelve:
                for key in input_shelve.keys():
                    values = input_shelve[key]
                    missing = max_length - len(values)
                    # ``[0.0] * n`` is empty for n <= 0, so long lists pass
                    # through untouched.
                    row = values + [0.000] * missing
                    print(key, "\t".join(map(str, row)),
                          file=output_file, sep="\t", end="\n")

    utils.now_time("phylop_score_R script was successfully finished!!")
def main():
    """Expand phastCons fixedStep wiggle files into position/score rows.

    For every chromosome in the list (currently only ``chrY``) the file
    ``<p.phastcons_prep_input><chrom>.phastCons46way.wigFix`` is read and a
    two-column, tab-separated file ``<...><chrom>.phastCons46way.bed`` is
    written. A ``fixedStep`` declaration line resets the running position to
    ``start - 1`` and sets ``step``; every other line is treated as a score
    and written ``step`` times at consecutive positions.

    Raises:
        AttributeError: if a ``fixedStep`` line does not match the expected
            ``chrom=... start=... step=...`` layout.
        ValueError: if ``start`` or ``step`` is not an integer.
    """
    utils.now_time("Input_file: " + p.phastcons_prep_input)
    utils.now_time("Output_file: " + p.phastcons_prep_output)

    header_re = re.compile(
        r'fixedStep chrom=(?P<chrom>.+) start=(?P<start>.+) step=(?P<step>.+)')

    # Full set: chr1-chr22, chrX, chrY, chrM. Only chrY is enabled here.
    for chrom_name in ['chrY']:
        input_s = p.phastcons_prep_input + chrom_name + '.phastCons46way.wigFix'
        # NOTE(review): the output path is built from phastcons_prep_input,
        # not phastcons_prep_output (which is only logged above) - confirm
        # which directory is intended before changing it.
        output_s = p.phastcons_prep_input + chrom_name + '.phastCons46way.bed'

        # ``with`` closes both files even when a line fails to parse.
        with open(input_s, 'r') as phastcons_prep_input_file, \
                open(output_s, 'w') as phastcons_prep_output_file:
            start_site = 0
            step = 1
            for line in phastcons_prep_input_file:
                line = line.rstrip()
                if line.startswith('fixedStep'):
                    seq = header_re.match(line)
                    start_site = int(seq.group('start')) - 1
                    step = int(seq.group('step'))
                    continue
                score = line
                # NOTE(review): the chromosome name is not written to the
                # output rows; only position and score are.
                # ``_`` avoids shadowing the chromosome loop variable, which
                # the original inner loop overwrote.
                for _ in range(step):
                    print(start_site, score,
                          file=phastcons_prep_output_file, sep="\t", end="\n")
                    start_site += 1

        utils.now_time("phastcons_prep script was successfully finished!!")
def main():
    """Turn fixedStep phastCons wiggle data into per-position score rows.

    The chromosome list currently contains only ``chrY``. For each entry,
    ``<p.phastcons_prep_input><chrom>.phastCons46way.wigFix`` is read and
    ``<p.phastcons_prep_input><chrom>.phastCons46way.bed`` is written with
    two tab-separated columns (position, score). ``fixedStep`` lines set the
    running position to ``start - 1`` and update ``step``; each score line is
    then emitted ``step`` times while the position advances by one.

    Raises:
        AttributeError: if a ``fixedStep`` line does not match the pattern.
        ValueError: if ``start`` or ``step`` cannot be parsed as integers.
    """
    utils.now_time("Input_file: " + p.phastcons_prep_input)
    utils.now_time("Output_file: " + p.phastcons_prep_output)

    declaration = re.compile(
        r'fixedStep chrom=(?P<chrom>.+) start=(?P<start>.+) step=(?P<step>.+)')

    # Other chromosomes (chr1-chr22, chrX, chrM) are disabled for now.
    for chrom_name in ['chrY']:
        input_s = p.phastcons_prep_input + chrom_name + '.phastCons46way.wigFix'
        # NOTE(review): written next to the input; phastcons_prep_output is
        # logged but never used to build a path - confirm intent.
        output_s = p.phastcons_prep_input + chrom_name + '.phastCons46way.bed'

        # Managed handles replace the explicit close() calls, which were
        # skipped whenever parsing raised.
        with open(input_s, 'r') as wig_file, open(output_s, 'w') as bed_file:
            start_site = 0
            step = 1
            for raw_line in wig_file:
                line = raw_line.rstrip()
                if line.startswith('fixedStep'):
                    found = declaration.match(line)
                    start_site = int(found.group('start')) - 1
                    step = int(found.group('step'))
                    continue
                # The inner counter no longer reuses the chromosome loop's
                # name, and the unused chromosome capture was dropped.
                for _ in range(step):
                    print(start_site, line, file=bed_file, sep="\t", end="\n")
                    start_site += 1

        utils.now_time("phastcons_prep script was successfully finished!!")
Beispiel #12
0
def main():
    """Flatten the RefSeq FASTA file into ``<id><TAB><sequence>`` lines.

    Reads ``p.refseq_pre_input`` and writes ``p.refseq_pre_output``. The
    record id is the first whitespace-separated token of each header line
    with the ``>hg19_refGene_`` prefix removed; the sequence is the record's
    lines concatenated. Sequence lines before the first header are dropped.
    """
    utils.now_time("Input_file: " + p.refseq_pre_input)
    utils.now_time("Output_file: " + p.refseq_pre_output)

    # ``with`` releases both handles on every exit path.
    with open(p.refseq_pre_input, 'r') as input_file, \
            open(p.refseq_pre_output, 'w') as output_file:
        seen_header = False
        seq_parts = []
        for line in input_file:
            line = line.rstrip()
            if line.startswith('>'):  # Header
                refseq_id = line.split()[0].replace('>hg19_refGene_', '')
                if seen_header:
                    # Terminate the previous record first.
                    print(''.join(seq_parts), file=output_file, end="\n")
                print(refseq_id, file=output_file, end="\t")
                seen_header = True
                seq_parts = []
            else:  # Sequence
                seq_parts.append(line)
        # Skip the final flush when no record was started, so a header-less
        # input no longer yields a line without an identifier.
        if seen_header:
            print(''.join(seq_parts), file=output_file, end="\n")

    utils.now_time("Refseq_pre script was successfully finished!!")
Beispiel #13
0
def main():
    """Rewrite the RefSeq FASTA input as one tab-separated line per record.

    Each header line (starting with ``>``) begins a record whose id is the
    header's first token minus the ``>hg19_refGene_`` prefix. Subsequent
    non-header lines are joined into the sequence. Output goes to
    ``p.refseq_pre_output`` as ``<id><TAB><sequence>``; input comes from
    ``p.refseq_pre_input``. Lines before the first header are ignored.
    """
    utils.now_time("Input_file: " + p.refseq_pre_input)
    utils.now_time("Output_file: " + p.refseq_pre_output)

    # Managed handles: the original never closed them if the loop raised.
    with open(p.refseq_pre_input, 'r') as input_file, \
            open(p.refseq_pre_output, 'w') as output_file:
        record_open = False
        chunks = []
        for raw_line in input_file:
            line = raw_line.rstrip()
            if not line.startswith('>'):  # Sequence
                chunks.append(line)
                continue
            # Header
            refseq_id = line.split()[0].replace('>hg19_refGene_', '')
            if record_open:
                print(''.join(chunks), file=output_file, end="\n")
            print(refseq_id, file=output_file, end="\t")
            record_open = True
            chunks = []
        # Emit the trailing sequence only for a started record; previously
        # an input without headers produced a line with no id column.
        if record_open:
            print(''.join(chunks), file=output_file, end="\n")

    utils.now_time("Refseq_pre script was successfully finished!!")
Beispiel #14
0
def main():
    """Join miRmark site positions with miRBase and RefSeq sequences.

    Inputs:
        * ``p.mirbase_pre_output`` - ``<id>|<symbol><TAB><seq>`` lines; only
          ids starting with ``hsa`` are kept.
        * ``p.refseq_pre_output`` - ``<id><TAB><seq>`` lines.
        * ``p.mirmark_pos`` - comma-separated rows; column 0 is the miRNA id,
          column 1 the RefSeq id and column 3 the site end. The header row
          (first field ``miR_ID``) is skipped.

    Outputs:
        * ``p.mirmark_output`` - one tab-separated row per resolvable pair:
          miRNA id, symbol, miRNA sequence, RefSeq id, window start, window
          end and the target window with ``T`` replaced by ``U``.
        * ``p.mirmark_mirna_fasta`` / ``p.mirmark_targetrna_fasta`` - FASTA
          files of the distinct miRNAs and full target sequences used.
        * ``p.mirmark_error`` - ``ERROR: <refseq>|<mirna>`` for pairs whose
          ids are missing from either lookup.

    miRNA ids found in ``convert_mirbase_id`` are translated before lookup.
    """
    utils.now_time("Input_file: " + p.mirmark_pos)
    utils.now_time("Output_file: " + p.mirmark_output)
    utils.now_time("miRNA_file: " + p.mirmark_mirna_fasta)
    utils.now_time("TargetRNA_file: " + p.mirmark_targetrna_fasta)
    utils.now_time("Refseq_data: " + p.refseq_pre_output)
    utils.now_time("miRBase_data: " + p.mirbase_pre_output)

    # mirbase_dict: id -> [miRNA symbol, sequence]
    mirbase_dict = {}
    with open(p.mirbase_pre_output, 'r') as mirbase_file:
        for line in mirbase_file:
            data = line.rstrip().split("\t")
            infor = data[0].split('|')
            mirbase_id = infor[0]
            symbol = infor[1]
            seq = data[1]
            if not re.match('hsa', mirbase_id):  # keep only ids starting with 'hsa'
                continue
            mirbase_dict[mirbase_id] = [symbol, seq]

    # refseq_dict: id -> sequence
    refseq_dict = {}
    with open(p.refseq_pre_output, 'r') as refseq_file:
        for line in refseq_file:
            data = line.rstrip().split("\t")
            refseq_dict[data[0]] = data[1]

    # main
    mirna_dist = {}
    targetrna_dist = {}
    # All five handles are managed; the original closed only the position
    # input and the main output, leaving the FASTA and error files open.
    with open(p.mirmark_pos, 'r') as input_file, \
            open(p.mirmark_output, 'w') as output_file, \
            open(p.mirmark_mirna_fasta, 'w') as mirna_file, \
            open(p.mirmark_targetrna_fasta, 'w') as targetrna_file, \
            open(p.mirmark_error, 'w') as error_file:
        for line in input_file:
            data = line.rstrip().split(",")
            if data[0] == 'miR_ID':
                continue
            mirbase_id = data[0]
            refseq_id = data[1]
            utr_ed = int(data[3])
            if mirbase_id in convert_mirbase_id:
                mirbase_id = convert_mirbase_id[mirbase_id]
            if refseq_id in refseq_dict and mirbase_id in mirbase_dict:
                symbol, mir_seq = mirbase_dict[mirbase_id]
                # Window = miRNA length + 5 nt, ending at the site end.
                # NOTE(review): a negative start is used as-is, so the slice
                # counts from the end of the sequence - confirm whether it
                # should be clamped to 0.
                utr_st = utr_ed - len(mir_seq) - 5
                ref_seq_raw = refseq_dict[refseq_id]
                ref_seq = ref_seq_raw[utr_st:utr_ed].replace("T", "U")
                print(mirbase_id, symbol, mir_seq, refseq_id, utr_st, utr_ed,
                      ref_seq, file=output_file, sep="\t", end="\n")
                # Dict keys de-duplicate the FASTA records written below.
                mirna_dist['>' + mirbase_id + '|' + symbol] = mir_seq
                targetrna_dist['>' + refseq_id] = ref_seq_raw
            else:
                print("ERROR: " + refseq_id + '|' + mirbase_id,
                      file=error_file, end="\n")

        for tag, sequence in mirna_dist.items():
            print(tag, file=mirna_file, end="\n")
            print(sequence, file=mirna_file, end="\n")
        for tag, sequence in targetrna_dist.items():
            print(tag, file=targetrna_file, end="\n")
            print(sequence, file=targetrna_file, end="\n")

    utils.now_time("mirmark_result script was successfully finished!!")
Beispiel #15
0
def main():
    """Join Cupid site positions with miRBase and RefSeq sequences.

    Inputs:
        * ``p.mirbase_pre_output`` - ``<id>|<symbol><TAB><seq>`` lines; only
          ids starting with ``hsa`` are kept.
        * ``p.refseq_pre_output`` - ``<id><TAB><seq>`` lines.
        * ``p.cupid_pos`` - tab-separated rows; column 3 is the RefSeq id,
          column 4 the miRNA id and column 5 a ``<start>-<end>`` range. The
          header row (first field ``AvgProb[0,1]``) and all rows for
          ``NM_000927`` are skipped.

    Outputs:
        * ``p.cupid_output`` - miRNA id, symbol, miRNA sequence, RefSeq id,
          window start, window end and the ``T``->``U`` converted window.
        * ``p.cupid_mirna_fasta`` / ``p.cupid_targetrna_fasta`` - FASTA files
          of the distinct miRNAs and full target sequences used.
        * ``p.cupid_error`` - ``ERROR: <refseq>|<mirna>`` for unresolved ids.

    miRNA ids found in ``convert_mirbase_id`` are translated before lookup.
    """
    utils.now_time("Input_file: " + p.cupid_pos)
    utils.now_time("Output_file: " + p.cupid_output)
    utils.now_time("miRNA_file: " + p.cupid_mirna_fasta)
    utils.now_time("targetRNA_file: " + p.cupid_targetrna_fasta)
    utils.now_time("Refseq_data: " + p.refseq_pre_output)
    utils.now_time("miRBase_data: " + p.mirbase_pre_output)

    # mirbase_dict: id -> [miRNA symbol, sequence]
    mirbase_dict = {}
    with open(p.mirbase_pre_output, 'r') as mirbase_file:
        for line in mirbase_file:
            data = line.rstrip().split("\t")
            infor = data[0].split('|')
            mirbase_id = infor[0]
            symbol = infor[1]
            seq = data[1]
            if not re.match('hsa', mirbase_id):
                continue
            mirbase_dict[mirbase_id] = [symbol, seq]

    # refseq_dict: id -> sequence
    refseq_dict = {}
    with open(p.refseq_pre_output, 'r') as refseq_file:
        for line in refseq_file:
            data = line.rstrip().split("\t")
            refseq_dict[data[0]] = data[1]

    # main
    mirna_dist = {}
    targetrna_dist = {}
    # Every handle is managed; previously only two of the seven opened files
    # were ever closed.
    with open(p.cupid_pos, 'r') as input_file, \
            open(p.cupid_output, 'w') as output_file, \
            open(p.cupid_mirna_fasta, 'w') as mirna_file, \
            open(p.cupid_targetrna_fasta, 'w') as targetrna_file, \
            open(p.cupid_error, 'w') as error_file:
        for line in input_file:
            data = line.rstrip().split("\t")
            if data[0] == 'AvgProb[0,1]':
                continue
            mirbase_id = data[4]
            refseq_id = data[3]
            if refseq_id == "NM_000927":
                continue
            # Only the range end is used: the window start is recomputed
            # from the miRNA length below, so the listed start is ignored.
            utr_ed = int(data[5].split('-')[1])
            if mirbase_id in convert_mirbase_id:
                mirbase_id = convert_mirbase_id[mirbase_id]
            if refseq_id in refseq_dict and mirbase_id in mirbase_dict:
                symbol, mir_seq = mirbase_dict[mirbase_id]
                # NOTE(review): a negative start is passed to the slice
                # unchanged and then counts from the sequence end - confirm
                # whether clamping to 0 is wanted.
                utr_st = utr_ed - len(mir_seq) - 5
                ref_seq_raw = refseq_dict[refseq_id]
                ref_seq = ref_seq_raw[utr_st:utr_ed].replace("T", "U")
                print(mirbase_id, symbol, mir_seq, refseq_id, utr_st, utr_ed,
                      ref_seq, file=output_file, sep="\t", end="\n")
                mirna_dist['>' + mirbase_id + '|' + symbol] = mir_seq
                targetrna_dist['>' + refseq_id] = ref_seq_raw
            else:
                print("ERROR: " + refseq_id + '|' + mirbase_id,
                      file=error_file, end="\n")

        for tag, sequence in mirna_dist.items():
            print(tag, file=mirna_file, end="\n")
            print(sequence, file=mirna_file, end="\n")
        for tag, sequence in targetrna_dist.items():
            print(tag, file=targetrna_file, end="\n")
            print(sequence, file=targetrna_file, end="\n")

    utils.now_time("cupid_result script was successfully finished!!")
Beispiel #16
0
def main():
    """Build the miRmark result table plus miRNA / target RNA FASTA files.

    The miRBase table (``p.mirbase_pre_output``, ids starting with ``hsa``
    only) and the RefSeq table (``p.refseq_pre_output``) are loaded into
    dictionaries. Each data row of the comma-separated ``p.mirmark_pos`` file
    (header row starts with ``miR_ID``) supplies a miRNA id (column 0), a
    RefSeq id (column 1) and a site end (column 3). When both ids resolve, a
    row is written to ``p.mirmark_output`` containing the miRNA id, symbol,
    miRNA sequence, RefSeq id, window start, window end and the target window
    with ``T`` replaced by ``U``; otherwise ``ERROR: <refseq>|<mirna>`` goes
    to ``p.mirmark_error``. Distinct miRNAs and full target sequences are
    finally written to ``p.mirmark_mirna_fasta`` and
    ``p.mirmark_targetrna_fasta``.

    miRNA ids present in ``convert_mirbase_id`` are mapped before lookup.
    """
    utils.now_time("Input_file: " + p.mirmark_pos)
    utils.now_time("Output_file: " + p.mirmark_output)
    utils.now_time("miRNA_file: " + p.mirmark_mirna_fasta)
    utils.now_time("TargetRNA_file: " + p.mirmark_targetrna_fasta)
    utils.now_time("Refseq_data: " + p.refseq_pre_output)
    utils.now_time("miRBase_data: " + p.mirbase_pre_output)

    # mirbase_dict: miRNA_symbol => [0] | seq => [1]
    mirbase_dict = {}
    with open(p.mirbase_pre_output, 'r') as mirbase_file:
        for raw_line in mirbase_file:
            data = raw_line.rstrip().split("\t")
            infor = data[0].split('|')
            mirbase_id, symbol = infor[0], infor[1]
            seq = data[1]
            if re.match('hsa', mirbase_id):  # keep only ids starting with 'hsa'
                mirbase_dict[mirbase_id] = [symbol, seq]

    # refseq_dict
    refseq_dict = {}
    with open(p.refseq_pre_output, 'r') as refseq_file:
        for raw_line in refseq_file:
            data = raw_line.rstrip().split("\t")
            refseq_dict[data[0]] = data[1]

    # main
    mirna_dist = {}
    targetrna_dist = {}
    # The lookup files above and all five handles here are context-managed;
    # the original left five of the seven files unclosed.
    with open(p.mirmark_pos, 'r') as input_file, \
            open(p.mirmark_output, 'w') as output_file, \
            open(p.mirmark_mirna_fasta, 'w') as mirna_file, \
            open(p.mirmark_targetrna_fasta, 'w') as targetrna_file, \
            open(p.mirmark_error, 'w') as error_file:
        for raw_line in input_file:
            data = raw_line.rstrip().split(",")
            if data[0] == 'miR_ID':
                continue
            mirbase_id = data[0]
            refseq_id = data[1]
            utr_ed = int(data[3])
            if mirbase_id in convert_mirbase_id:
                mirbase_id = convert_mirbase_id[mirbase_id]
            if not (refseq_id in refseq_dict and mirbase_id in mirbase_dict):
                print("ERROR: " + refseq_id + '|' + mirbase_id,
                      file=error_file, end="\n")
                continue
            symbol, mir_seq = mirbase_dict[mirbase_id]
            # The window covers the miRNA length plus 5 nt up to the site end.
            # NOTE(review): when this goes negative the slice below counts
            # from the end of the sequence - verify whether 0 is the
            # intended lower bound.
            utr_st = utr_ed - len(mir_seq) - 5
            ref_seq_raw = refseq_dict[refseq_id]
            ref_seq = ref_seq_raw[utr_st:utr_ed].replace("T", "U")
            print(mirbase_id, symbol, mir_seq, refseq_id, utr_st, utr_ed,
                  ref_seq, file=output_file, sep="\t", end="\n")
            mirna_dist['>' + mirbase_id + '|' + symbol] = mir_seq
            targetrna_dist['>' + refseq_id] = ref_seq_raw

        for tag, sequence in mirna_dist.items():
            print(tag, file=mirna_file, end="\n")
            print(sequence, file=mirna_file, end="\n")
        for tag, sequence in targetrna_dist.items():
            print(tag, file=targetrna_file, end="\n")
            print(sequence, file=targetrna_file, end="\n")

    utils.now_time("mirmark_result script was successfully finished!!")
Beispiel #17
0
def main():
    """Collect per-base phyloP scores for every feature of a BED reference.

    For chr1-chr22, chrX, chrY and chrM the reference BED file
    (``p.phylop_score_list_reference``) is scanned for rows on that
    chromosome. Positions covered by a row are looked up (as string keys) in
    the shelve ``<p.phylop_score_list_db_input><chrom>.phyloP46way_Refseq_CDS.db``
    and collected into a list keyed by the row's name column:

    * rows with 12 or more columns use the block sizes (column 10) and block
      starts (column 11) relative to the row start;
    * rows with 6 to 11 columns use the single ``start..end`` interval.

    Lists for ``-`` strand rows are reversed. The results are stored in a
    per-chromosome shelve
    ``<p.phylop_score_list_db_output><chrom>.phyloP46way_Refseq_for_MIRAGE_CDS.db``
    and in the merged shelve
    ``<p.phylop_score_list_db_output>phyloP46way_Refseq_for_MIRAGE_CDS.db``.

    Exits with status 1 if a row on the current chromosome has fewer than
    six columns, because the name (column 3) and strand (column 5) are
    required.

    Raises:
        KeyError: if a covered position is missing from the input shelve.
    """
    from contextlib import closing  # local import keeps the block self-contained

    utils.now_time("Input_file: " + p.phylop_score_list_db_input)
    utils.now_time("Reference_file: " + p.phylop_score_list_reference)
    utils.now_time("Output_file: " + p.phylop_score_list_db_output)

    chromosomes = ['chr' + str(c) for c in list(range(1, 23)) + ['X', 'Y', 'M']]
    ref_s = p.phylop_score_list_reference  # mirBase, Refseq etc...
    output_merge = p.phylop_score_list_db_output + 'phyloP46way_Refseq_for_MIRAGE_CDS.db'

    # Every shelve and the reference file are closed on all exit paths; the
    # original never closed the reference file it reopened per chromosome.
    with closing(shelve.open(output_merge)) as output_merge_shelve:
        for x in chromosomes:
            input_s = p.phylop_score_list_db_input + x + '.phyloP46way_Refseq_CDS.db'
            output_s = p.phylop_score_list_db_output + x + '.phyloP46way_Refseq_for_MIRAGE_CDS.db'

            score_list_dict = {}
            with open(ref_s, 'r') as ref_file, \
                    closing(shelve.open(input_s)) as input_shelve, \
                    closing(shelve.open(output_s)) as output_shelve:
                for line in ref_file:
                    data = line.rstrip().split("\t")
                    if data[0] != x:
                        continue

                    # Validate the column count BEFORE touching name/strand;
                    # the original read data[5] first, so short rows raised
                    # IndexError and the error branch was unreachable.
                    if len(data) >= 12:  # 12bed format
                        # Dropping empty items handles lists with or without
                        # a trailing comma; pop() lost a real exon when the
                        # trailing comma was absent.
                        exon_block = [int(s) for s in data[10].split(',') if s]
                        exon_st = [int(s) for s in data[11].split(',') if s]
                        base = int(data[1])
                        intervals = [
                            (base + exon_st[i], base + exon_st[i] + size)
                            for i, size in enumerate(exon_block)
                        ]
                    elif len(data) >= 6:  # 6bed format
                        intervals = [(int(data[1]), int(data[2]))]
                    else:
                        print('ERROR: Your BED format file have less than six column.')
                        print('BED format file need to have at least six column [chr, st, ed, name, score, strand]...')
                        sys.exit(1)

                    name = data[3]
                    strand = data[5]
                    scores = [
                        input_shelve[str(pos)]
                        for st, ed in intervals
                        for pos in range(st, ed)
                    ]
                    if strand == '-':
                        scores.reverse()
                    score_list_dict[name] = scores

                output_shelve.update(score_list_dict)
                output_merge_shelve.update(score_list_dict)

    utils.now_time("phylop_score_list script was successfully finished!!")
Beispiel #18
0
def main():
    """Build the Cupid result table plus miRNA / target RNA FASTA files.

    The miRBase table (``p.mirbase_pre_output``, ids starting with ``hsa``
    only) and the RefSeq table (``p.refseq_pre_output``) are loaded into
    dictionaries. Each data row of the tab-separated ``p.cupid_pos`` file
    (header row starts with ``AvgProb[0,1]``) supplies a RefSeq id
    (column 3), a miRNA id (column 4) and a ``<start>-<end>`` range
    (column 5); rows for ``NM_000927`` are skipped. When both ids resolve, a
    row is written to ``p.cupid_output`` containing the miRNA id, symbol,
    miRNA sequence, RefSeq id, window start, window end and the target window
    with ``T`` replaced by ``U``; otherwise ``ERROR: <refseq>|<mirna>`` goes
    to ``p.cupid_error``. Distinct miRNAs and full target sequences are
    finally written to ``p.cupid_mirna_fasta`` and
    ``p.cupid_targetrna_fasta``.

    miRNA ids present in ``convert_mirbase_id`` are mapped before lookup.
    """
    utils.now_time("Input_file: " + p.cupid_pos)
    utils.now_time("Output_file: " + p.cupid_output)
    utils.now_time("miRNA_file: " + p.cupid_mirna_fasta)
    utils.now_time("targetRNA_file: " + p.cupid_targetrna_fasta)
    utils.now_time("Refseq_data: " + p.refseq_pre_output)
    utils.now_time("miRBase_data: " + p.mirbase_pre_output)

    # mirbase_dict: miRNA_symbol => [0] | seq => [1]
    mirbase_dict = {}
    with open(p.mirbase_pre_output, 'r') as mirbase_file:
        for raw_line in mirbase_file:
            data = raw_line.rstrip().split("\t")
            infor = data[0].split('|')
            mirbase_id, symbol = infor[0], infor[1]
            seq = data[1]
            if re.match('hsa', mirbase_id):
                mirbase_dict[mirbase_id] = [symbol, seq]

    # refseq_dict
    refseq_dict = {}
    with open(p.refseq_pre_output, 'r') as refseq_file:
        for raw_line in refseq_file:
            data = raw_line.rstrip().split("\t")
            refseq_dict[data[0]] = data[1]

    # main
    mirna_dist = {}
    targetrna_dist = {}
    # All files are context-managed; the original explicitly closed only the
    # position input and the main output.
    with open(p.cupid_pos, 'r') as input_file, \
            open(p.cupid_output, 'w') as output_file, \
            open(p.cupid_mirna_fasta, 'w') as mirna_file, \
            open(p.cupid_targetrna_fasta, 'w') as targetrna_file, \
            open(p.cupid_error, 'w') as error_file:
        for raw_line in input_file:
            data = raw_line.rstrip().split("\t")
            if data[0] == 'AvgProb[0,1]':
                continue
            mirbase_id = data[4]
            refseq_id = data[3]
            if refseq_id == "NM_000927":
                continue
            # The listed range start is not used; the window start is
            # derived from the range end and the miRNA length instead.
            utr_ed = int(data[5].split('-')[1])
            if mirbase_id in convert_mirbase_id:
                mirbase_id = convert_mirbase_id[mirbase_id]
            if not (refseq_id in refseq_dict and mirbase_id in mirbase_dict):
                print("ERROR: " + refseq_id + '|' + mirbase_id,
                      file=error_file, end="\n")
                continue
            symbol, mir_seq = mirbase_dict[mirbase_id]
            # NOTE(review): a negative start makes the slice below count
            # from the end of the sequence - verify whether 0 is the
            # intended lower bound.
            utr_st = utr_ed - len(mir_seq) - 5
            ref_seq_raw = refseq_dict[refseq_id]
            ref_seq = ref_seq_raw[utr_st:utr_ed].replace("T", "U")
            print(mirbase_id, symbol, mir_seq, refseq_id, utr_st, utr_ed,
                  ref_seq, file=output_file, sep="\t", end="\n")
            mirna_dist['>' + mirbase_id + '|' + symbol] = mir_seq
            targetrna_dist['>' + refseq_id] = ref_seq_raw

        for tag, sequence in mirna_dist.items():
            print(tag, file=mirna_file, end="\n")
            print(sequence, file=mirna_file, end="\n")
        for tag, sequence in targetrna_dist.items():
            print(tag, file=targetrna_file, end="\n")
            print(sequence, file=targetrna_file, end="\n")

    utils.now_time("cupid_result script was successfully finished!!")
Beispiel #19
0
def main():
    """Parse the MIRAGE command line, validate inputs and start the analysis.

    Positional arguments are the analysis type (``estimation`` or
    ``prediction``) and the miRNA / target RNA FASTA paths. The two FASTA
    paths are merged into ``common_parameters`` and echoed, after which
    ``module.estimate`` or ``module.predict`` is executed as ``__main__``
    through ``runpy``.

    Exits with status 1 when either FASTA file does not exist or when the
    analysis type is not recognised.
    """
    parser = argparse.ArgumentParser(
        prog='mirage',
        description='MIRAGE - Comprehensive miRNA target prediction pipeline')
    parser.add_argument('analysis_type',
                        action='store',
                        help='Analysis_type: Choose estimation or prediction',
                        choices=['estimation', 'prediction'])
    parser.add_argument(
        'mirna_fasta',
        action='store',
        help='miRNA fasta file: Specify miRNA fasta file to use the analysis')
    parser.add_argument(
        'targetrna_fasta',
        action='store',
        help=
        'TargetRNA fasta file: Specify TargetRNA fasta file to use the analysis'
    )
    parser.add_argument(
        '-m',
        '--mirna-conservation-score-file',
        action='store',
        dest='mirna_conservation',
        help=
        'Conservation score file about miRNA: Specify your conservation score db file. MIRAGE preparetion toolkits enables you to make the score files about TargetRNA or miRNA bed files.'
    )
    parser.add_argument(
        '-t',
        '--targetrna-conservation-score-file',
        action='store',
        dest='targetrna_conservation',
        help=
        'Conservation score file about TargetRNA: Specify your conservation score db file. MIRAGE preparetion toolkits enables you to make the score files about TargetRNA or miRNA bed files.'
    )

    args = parser.parse_args()

    # Start analysis - logging
    greeting()
    utils.now_time("MIRAGE miRNA target prediction starting...")
    analysis_type = args.analysis_type
    mirna_fasta_path = args.mirna_fasta
    targetrna_fasta_path = args.targetrna_fasta
    # The -m / -t conservation score options are parsed above but are not
    # consumed anywhere in this function yet.

    # Check fasta files
    if not os.path.isfile(mirna_fasta_path):
        print("Error: miRNA fasta file does not exist...")
        sys.exit(1)
    if not os.path.isfile(targetrna_fasta_path):
        print("Error: TargetRNA fasta file does not exist...")
        # Abort here as well; previously execution continued with a missing
        # target RNA file, unlike the miRNA check above.
        sys.exit(1)

    # parameters
    param = dict(
        MIRNA_FASTA_PATH=mirna_fasta_path,
        TARGETRNA_FASTA_PATH=targetrna_fasta_path,
    )
    common_parameters.update(param)
    p = utils.Bunch(common_parameters)
    print('miRNA_Fasta_file: ' + p.MIRNA_FASTA_PATH, end="\n")
    print('TargetRNA_Fasta_file: ' + p.TARGETRNA_FASTA_PATH, end="\n")

    # runpy - choose analysis type
    if analysis_type == 'estimation':
        runpy.run_module('module.estimate',
                         run_name="__main__",
                         alter_sys=True)
    elif analysis_type == 'prediction':
        runpy.run_module('module.predict', run_name="__main__", alter_sys=True)
    else:
        # Defensive only: argparse ``choices`` already rejects other values.
        print('Error: Analysis type is wrong...')
        sys.exit(1)
Beispiel #20
0
def main():
    """Reduce per-chromosome phyloP score files to the reference BED regions.

    For every chromosome in the fixed list (chr1-chr22, chrX, chrY, chrM):

    1. Each base position covered by a record of
       ``p.phylop_sizedown_bed_input`` on that chromosome is registered in a
       dictionary (key: position as a string) with a default score of 0.
       Records with 12 or more columns are expanded block by block using
       columns 11 and 12; records with 3-11 columns use columns 2 and 3.
    2. ``<p.phylop_sizedown_score_input><chrom>.phyloP46way.bed`` is scanned
       and the score of every registered position is stored (as read, i.e. as
       a string).
    3. The dictionary is written to the shelve database
       ``<p.phylop_sizedown_score_output><chrom>.phyloP46way_Refseq_CDS.db``.

    Exits with status 1 when a reference record on the current chromosome has
    fewer than three columns.

    All files are handled through context managers so that the reference file
    (reopened once per chromosome), the score file and the shelve database are
    closed even when the function exits early.
    """
    utils.now_time("Input_file: " + p.phylop_sizedown_score_input)
    utils.now_time("Reference_file: " + p.phylop_sizedown_bed_input)
    utils.now_time("Output_file: " + p.phylop_sizedown_score_output)

    chromosomes = ['chr' + str(n) for n in range(1, 23)] + ['chrX', 'chrY', 'chrM']

    for x in chromosomes:
        ref_s = p.phylop_sizedown_bed_input  # mirBase, Refseq etc...
        input_s = p.phylop_sizedown_score_input + x + '.phyloP46way.bed'
        output_s = p.phylop_sizedown_score_output + x + '.phyloP46way_Refseq_CDS.db'

        score_dict = {}

        # Both inputs are opened up front so a missing score file is reported
        # before any time is spent parsing the reference file.
        with open(ref_s, 'r') as ref_file, open(input_s, 'r') as score_file:
            for line in ref_file:
                data = line.rstrip().split("\t")
                if data[0] != x:
                    continue
                if len(data) >= 12:  # 12bed format
                    block_sizes = data[10].split(',')
                    block_sizes.pop()  # Remove the last item ''
                    block_starts = data[11].split(',')
                    block_starts.pop()  # Remove the last item ''
                    for y in range(len(block_sizes)):
                        st = int(data[1]) + int(block_starts[y])
                        ed = st + int(block_sizes[y])
                        for pos in range(st, ed):
                            score_dict[str(pos)] = 0
                elif len(data) >= 3:  # 6bed format
                    for pos in range(int(data[1]), int(data[2])):
                        score_dict[str(pos)] = 0
                else:
                    print('ERROR: Your BED format file have less than three column.')
                    print ('BED format file need to have at least three column [chr, st, ed]...')
                    sys.exit(1)

            utils.now_time('Reference_file was loaded.')

            for line in score_file:
                data = line.rstrip().split("\t")
                # A leading chromosome column is optional in the score file.
                if data[0].startswith('chr'):
                    st_site, score = data[1], data[2]
                else:
                    st_site, score = data[0], data[1]
                if st_site in score_dict:
                    score_dict[st_site] = score

        with shelve.open(output_s) as shelve_db:
            shelve_db.update(score_dict)

        utils.now_time("phylop_sizedown script was successfully finished!!")
Beispiel #21
0
#!/usr/bin/env python

import re
from parameter.common_parameters import common_parameters
from parameter.convert_mirbase_id import convert_mirbase_id
import utils.setting_utils as utils

utils.now_time("cupid_result script starting...")
p = utils.Bunch(common_parameters)

def main():
    """Log the cupid_result settings and start scanning the miRBase table.

    The tab-separated file at ``p.mirbase_pre_output`` is read line by line:
    column 0 is split on ``|`` into an ID and a symbol, column 1 is taken as
    the sequence, and records whose ID does not start with ``hsa`` are skipped.

    NOTE(review): only the beginning of this function is visible in this
    snippet; how the parsed fields and the two dictionaries are used cannot
    be confirmed from here.
    """
    utils.now_time("Input_file: " + p.cupid_pos)
    utils.now_time("Output_file: " + p.cupid_output)
    utils.now_time("miRNA_file: " + p.cupid_mirna_fasta)
    utils.now_time("targetRNA_file: " + p.cupid_targetrna_fasta)
    utils.now_time("Refseq_data: " + p.refseq_pre_output)
    utils.now_time("miRBase_data: " + p.mirbase_pre_output)
    refseq_dict = {}
    mirbase_dict = {}

    #mirbase_dict
    mirbase_file = open(p.mirbase_pre_output,'r')
    for line in mirbase_file:
        line = line.rstrip()
        data = line.split("\t")
        infor = data[0].split('|')  # first column has the form "<id>|<symbol>"
        mirbase_id = infor[0]
        symbol = infor[1]
        seq = data[1]
        if not re.match('hsa',mirbase_id):  # re.match anchors at the start of the ID
            continue
Beispiel #22
0
#!/usr/bin/env python

import re
from parameter.common_parameters import common_parameters
import utils.setting_utils as utils

utils.now_time("mirbase_pre script starting...")
p = utils.Bunch(common_parameters)

def main():
    """Flatten a FASTA file into ``<id>|<symbol><TAB><sequence>`` lines.

    Reads ``p.mirbase_pre_input`` and writes to ``p.mirbase_pre_output``.
    For each header line the first two whitespace-separated tokens (with
    ``>`` removed from the first) are joined by ``|`` and written followed by
    a tab; the sequence lines that follow are concatenated and written when
    the next header is reached.

    NOTE(review): the snippet ends inside the loop, so the write of the last
    record's sequence and any file closing are not visible here.
    """
    utils.now_time("Input_file: " + p.mirbase_pre_input)
    utils.now_time("Output_file: " + p.mirbase_pre_output)
    input_file = open(p.mirbase_pre_input,'r')
    output_file = open(p.mirbase_pre_output,'w')
    flg = 0  # becomes 1 once the first header has been written
    seq = ""
    for line in input_file:
        line = line.rstrip()
        if re.match(r"^>",line): #Header
            data = line.split()
            mir_id = data[0]
            mir_id = mir_id.replace('>','')
            symbol = data[1]
            infor = mir_id + '|' + symbol
            # Finish the previous record's line before starting a new one.
            if flg == 1:
                print (seq,file=output_file,end="\n")
            print (infor,file=output_file,end="\t")
            flg = 1
            seq = ""
        else: #Sequence
            seq += line
#!/usr/bin/env python

import re
from parameter.common_parameters import common_parameters
import utils.setting_utils as utils

utils.now_time("mirbase_gff2bed script starting...")
p = utils.Bunch(common_parameters)


def main():
    """Start converting the miRBase GFF file to BED records.

    Opens ``p.mirbase_gff2bed_input`` for reading and
    ``p.mirbase_gff2bed_output`` for writing, then parses each tab-separated,
    non-comment line. Lines whose third column is
    ``miRNA_primary_transcript`` are skipped.

    NOTE(review): the snippet is cut off after the ID is extracted; the code
    that writes BED lines is not visible here.
    """
    utils.now_time("Input_file: " + p.mirbase_gff2bed_input)
    utils.now_time("Output_file: " + p.mirbase_gff2bed_output)

    mirbase_gff_file = open(p.mirbase_gff2bed_input, "r")
    mirbase_bed_file = open(p.mirbase_gff2bed_output, "w")

    for line in mirbase_gff_file:
        line = line.rstrip()
        data = line.split("\t")
        # Skip comment/header lines.
        if re.match(r"^#", line):
            continue
        chrom = data[0]
        status = data[2]
        st = int(data[3]) - 1  # start is decremented by one relative to the GFF value
        ed = data[4]
        strand = data[6]
        if status == "miRNA_primary_transcript":
            continue
        name_infor = data[8].split(";")
        # The first attribute is expected to be "ID=..."; strip the prefix.
        mir_id = re.sub(r"^ID=", "", name_infor[0])
Beispiel #24
0
#!/usr/bin/env python

import re
from parameter.common_parameters import common_parameters
import utils.setting_utils as utils

utils.now_time("liftOver script starting...")
p = utils.Bunch(common_parameters)

def main():
    """Run the bundled liftOver binary on the configured input file.

    The command is built from ``p.liftover_input``, the hg38-to-hg19 chain
    file and ``p.liftover_output``; the last argument is the output path with
    ``.error`` appended. The command string is handed to
    ``utils.run_command`` and a completion message is logged afterwards.
    """
    src = p.liftover_input
    dst = p.liftover_output
    pieces = [
        '../software/liftOver ',
        src,
        ' ../software/hg38ToHg19.over.chain ',
        dst,
        ' ',
        dst + '.error',
    ]
    utils.run_command(''.join(pieces))
    utils.now_time("liftOver script was successfully finished!!")

if __name__ == '__main__':
    main()
Beispiel #25
0
#!/usr/bin/env python

import re
from parameter.common_parameters import common_parameters
from parameter.convert_mirbase_id import convert_mirbase_id
import utils.setting_utils as utils

utils.now_time("mirmark_result script starting...")
p = utils.Bunch(common_parameters)


def main():
    utils.now_time("Input_file: " + p.mirmark_pos)
    utils.now_time("Output_file: " + p.mirmark_output)
    utils.now_time("miRNA_file: " + p.mirmark_mirna_fasta)
    utils.now_time("TargetRNA_file: " + p.mirmark_targetrna_fasta)
    utils.now_time("Refseq_data: " + p.refseq_pre_output)
    utils.now_time("miRBase_data: " + p.mirbase_pre_output)
    refseq_dict = {}
    mirbase_dict = {}

    #mirbase_dict
    mirbase_file = open(p.mirbase_pre_output, 'r')
    for line in mirbase_file:
        line = line.rstrip()
        data = line.split("\t")
        infor = data[0].split('|')
        mirbase_id = infor[0]
        symbol = infor[1]
        seq = data[1]
        if not re.match('hsa', mirbase_id):  #Only choose h**o sapiens miRNA
Beispiel #26
0
#!/usr/bin/env python

import sys
import re
import shelve
from parameter.common_parameters import common_parameters
import utils.setting_utils as utils

utils.now_time("phastcons_sizedown script starting...")
p = utils.Bunch(common_parameters)

def main():
    utils.now_time("Input_file: " + p.phastcons_sizedown_score_input)
    utils.now_time("Reference_file: " + p.phastcons_sizedown_bed_input)
    utils.now_time("Output_file: " + p.phastcons_sizedown_score_output)

    '''
    ref_s = p.phastcons_sizedown_bed_input #mirBase, Refseq etc...
    ref_file = open(ref_s,'r')
    ref_dict = {} #{NM_000XXXX: [st1,ed1],[st2,ed2]}
    for line in ref_file:
        line = line.rstrip()
        data = line.split("\t")

        if len(data) >= 12: #12bed format
            st = 0
            ed = 0
            exon_block = data[10].split(',')
            exon_block.pop()
            exon_st = data[11].split(',')
            exon_st.pop()
Beispiel #27
0
#!/usr/bin/env python

import re
from parameter.common_parameters import common_parameters
import utils.setting_utils as utils

utils.now_time("Refseq_pre script starting...")
p = utils.Bunch(common_parameters)

def main():
    """Flatten a RefSeq FASTA file into ``<id><TAB><sequence>`` lines.

    Reads ``p.refseq_pre_input`` and writes ``p.refseq_pre_output``. For each
    header line the first whitespace-separated token, with the
    ``>hg19_refGene_`` prefix removed, is written followed by a tab. The
    sequence lines that follow are concatenated and written on the same
    output line. Sequence lines that appear before the first header are
    discarded, and a final newline-terminated sequence is always written at
    the end, as in the original implementation.

    Both files are opened with context managers so they are flushed and closed
    even if parsing fails part-way through.
    """
    utils.now_time("Input_file: " + p.refseq_pre_input)
    utils.now_time("Output_file: " + p.refseq_pre_output)
    with open(p.refseq_pre_input, 'r') as input_file, \
            open(p.refseq_pre_output, 'w') as output_file:
        seen_header = False
        seq_parts = []  # joined once per record instead of repeated string +=
        for line in input_file:
            line = line.rstrip()
            if line.startswith('>'):  # Header
                refseq_id = line.split()[0].replace('>hg19_refGene_', '')
                # Finish the previous record's line before starting a new one.
                if seen_header:
                    print("".join(seq_parts), file=output_file, end="\n")
                print(refseq_id, file=output_file, end="\t")
                seen_header = True
                seq_parts = []
            else:  # Sequence
                seq_parts.append(line)
        print("".join(seq_parts), file=output_file, end="\n")
    utils.now_time("Refseq_pre script was successfully finished!!")
Beispiel #28
0
    #run_log("Calculating target site composition...", 4)
    result_dict = target_site_composition(targetrna_seq, tmp_dict)
    return run_result(result_dict)
    
def detect_rev_seed_match(mirna_id, targetrna_id):
    '''
    X1_seed_match_rev

    Look up both sequences with get_sequence, reverse-complement the target
    RNA sequence, search it with find_mirna_subtarget_candidates and return
    whatever run_result produces for the candidates found.
    '''
    sequences = get_sequence(mirna_id, targetrna_id)
    mirna_seq = sequences[0]
    target_revcomp = utils.reverse_complement(sequences[1])
    candidates = find_mirna_subtarget_candidates(
        mirna_id,
        mirna_seq,
        targetrna_id,
        target_revcomp,
    )
    return run_result(candidates)

###MAIN###
# Module-level driver: runs as soon as this module is executed.
utils.now_time("MIRAGE estimate is starting...")
# Load both FASTA inputs from the paths held on the shared parameter object p.
mirna_dict = utils.load_fasta(p.MIRNA_FASTA_PATH)
targetrna_dict = utils.load_fasta(p.TARGETRNA_FASTA_PATH)

# The triple-quoted block below is a bare string literal, i.e. disabled code:
# it would have removed and re-created a ./seed_match.db shelve file.
'''#shelve
#shelve_file
###Save_file
shelve_path = utils.get_absolute_path('./seed_match.db')
if os.path.isfile(shelve_path): #if shelve_file exists, it'll be removed.
    os.remove(shelve_path)
seed_match_db = shelve.open('./seed_match.db')
'''#shelve

###Conservation_files
# Resolve the miRNA conservation-score databases (PhastCons and PhyloP) via
# utils.get_absolute_path.
mirna_phastcons_path = utils.get_absolute_path('../data/PhastCons46Ways/phastCons46way_miRBase_v21_hg38Tohg19_for_MIRAGE.db')
mirna_phylop_path = utils.get_absolute_path('../data/PhyloP/phyloP46way_miRBase_v21_hg38Tohg19_for_MIRAGE.db')
Beispiel #29
0
#!/usr/bin/env python

import sys
import re
import shelve
from parameter.common_parameters import common_parameters
import utils.setting_utils as utils

utils.now_time("phylop_sizedown script starting...")
p = utils.Bunch(common_parameters)

def main():
    utils.now_time("Input_file: " + p.phylop_sizedown_score_input)
    utils.now_time("Reference_file: " + p.phylop_sizedown_bed_input)
    utils.now_time("Output_file: " + p.phylop_sizedown_score_output)

    '''
    ref_s = p.phastcons_sizedown_bed_input #mirBase, Refseq etc...
    ref_file = open(ref_s,'r')
    ref_dict = {} #{NM_000XXXX: [st1,ed1],[st2,ed2]}
    for line in ref_file:
        line = line.rstrip()
        data = line.split("\t")

        if len(data) >= 12: #12bed format
            st = 0
            ed = 0
            exon_block = data[10].split(',')
            exon_block.pop()
            exon_st = data[11].split(',')
            exon_st.pop()
Beispiel #30
0
def run_log(comment, step):
    """Log ``comment`` once, when the module-level step counter equals ``step``.

    The counter ``flg_find_mirna_target_candidates`` is advanced by one after
    a message is logged, so a later call with the same ``step`` is silent.
    """
    global flg_find_mirna_target_candidates
    if flg_find_mirna_target_candidates != step:
        return
    utils.now_time(comment)
    flg_find_mirna_target_candidates = flg_find_mirna_target_candidates + 1
#!/usr/bin/env python

import os
import re
from parameter.common_parameters import common_parameters
import utils.setting_utils as utils

utils.now_time("phastcons_prep script starting...")
p = utils.Bunch(common_parameters)


def main():
    """Start converting phastCons wigFix files to BED, one chromosome at a time.

    For each chromosome in the loop list (currently only ``chrY``; the full
    list is kept in the trailing comment) the file
    ``<p.phastcons_prep_input><chrom>.phastCons46way.wigFix`` is opened for
    reading and a ``.phastCons46way.bed`` file is opened for writing, then the
    input is scanned for ``fixedStep`` declaration lines.

    NOTE(review): the snippet is cut off inside the inner loop; how the
    declaration is parsed and what is written cannot be seen here.
    """
    utils.now_time("Input_file: " + p.phastcons_prep_input)
    utils.now_time("Output_file: " + p.phastcons_prep_output)

    for x in [
            'chrY'
    ]:  #['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY','chrM']:
        input_s = p.phastcons_prep_input + x + '.phastCons46way.wigFix'
        # NOTE(review): the output path is built from phastcons_prep_input,
        # not phastcons_prep_output (which is only logged above) - confirm
        # this is intended.
        output_s = p.phastcons_prep_input + x + '.phastCons46way.bed'
        phastcons_prep_input_file = open(input_s, 'r')
        phastcons_prep_output_file = open(output_s, 'w')

        # Defaults used until the first fixedStep declaration is seen.
        chrom = ''
        start_site = 0
        step = 1

        for line in phastcons_prep_input_file:
            line = line.rstrip()
            if re.match(r'^fixedStep', line):
                # Named groups capture the chrom/start/step fields of the declaration.
                regex = r'fixedStep chrom=(?P<chrom>.+) start=(?P<start>.+) step=(?P<step>.+)'
Beispiel #32
0
#!/usr/bin/env python

import re
from parameter.common_parameters import common_parameters
import utils.setting_utils as utils

utils.now_time("mirbase_pre script starting...")
p = utils.Bunch(common_parameters)


def main():
    utils.now_time("Input_file: " + p.mirbase_pre_input)
    utils.now_time("Output_file: " + p.mirbase_pre_output)
    input_file = open(p.mirbase_pre_input, 'r')
    output_file = open(p.mirbase_pre_output, 'w')
    flg = 0
    seq = ""
    for line in input_file:
        line = line.rstrip()
        if re.match(r"^>", line):  #Header
            data = line.split()
            mir_id = data[0]
            mir_id = mir_id.replace('>', '')
            symbol = data[1]
            infor = mir_id + '|' + symbol
            if flg == 1:
                print(seq, file=output_file, end="\n")
            print(infor, file=output_file, end="\t")
            flg = 1
            seq = ""
        else:  #Sequence
Beispiel #33
0
#!/usr/bin/env python

import shelve
from parameter.common_parameters import common_parameters
import utils.setting_utils as utils

utils.now_time("phastcons_score_R script starting...")
p = utils.Bunch(common_parameters)


def main():
    """Start exporting per-chromosome phyloP shelve databases to one text file.

    Opens ``<p.phylop_score_R_output>phyloP46way_miRBase_v21_hg38Tohg19.txt``
    for writing, then for each chromosome opens the shelve database
    ``<p.phylop_score_R_input><chrom>.phyloP46way_miRBase_v21_hg38Tohg19_for_MIRAGE.db``
    and computes, for every key, how many entries its value is short of
    ``max_length`` (28).

    NOTE(review): the snippet is cut off inside the inner loop; how the
    padding length is used and what is written cannot be seen here.
    """
    utils.now_time("Input_file: " + p.phylop_score_R_input)
    utils.now_time("Output_file: " + p.phylop_score_R_output)

    output_s = p.phylop_score_R_output + 'phyloP46way_miRBase_v21_hg38Tohg19.txt'
    output_file = open(output_s, 'w')

    #for x in ['chrY']:
    for x in [
            'chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8',
            'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15',
            'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22',
            'chrX', 'chrY', 'chrM'
    ]:
        input_s = p.phylop_score_R_input + x + '.phyloP46way_miRBase_v21_hg38Tohg19_for_MIRAGE.db'
        input_shelve = shelve.open(input_s)
        max_length = 28  #Max_length: 28nt(miRNA)
        for keys in input_shelve.keys():
            values = input_shelve[keys]
            value_length = len(values)
            # Number of entries missing to reach max_length.
            add_length = max_length - value_length
Beispiel #34
0
#!/usr/bin/env python

import re
from parameter.common_parameters import common_parameters
import utils.setting_utils as utils

utils.now_time("mirbase_gff2bed script starting...")
p = utils.Bunch(common_parameters)

def main():
    """Start converting the miRBase GFF file to BED records.

    Opens ``p.mirbase_gff2bed_input`` for reading and
    ``p.mirbase_gff2bed_output`` for writing, then parses each tab-separated,
    non-comment line. Lines whose third column is
    ``miRNA_primary_transcript`` are skipped.

    NOTE(review): the snippet is cut off after ``mir_id_number`` is
    initialised; the code that writes BED lines is not visible here.
    """
    utils.now_time("Input_file: " + p.mirbase_gff2bed_input)
    utils.now_time("Output_file: " + p.mirbase_gff2bed_output)

    mirbase_gff_file = open(p.mirbase_gff2bed_input,'r')
    mirbase_bed_file = open(p.mirbase_gff2bed_output,'w')

    for line in mirbase_gff_file:
        line = line.rstrip()
        data = line.split("\t")
        # Skip comment/header lines.
        if re.match(r'^#',line):
            continue
        chrom = data[0]
        status = data[2]
        st = int(data[3]) - 1  # start is decremented by one relative to the GFF value
        ed = data[4]
        strand = data[6]
        if status == 'miRNA_primary_transcript':
            continue
        name_infor = data[8].split(';')
        # The first attribute is expected to be "ID=..."; strip the prefix.
        mir_id = re.sub(r'^ID=','',name_infor[0])
        mir_id_number = ''
Beispiel #35
0
#!/usr/bin/env python

import sys
import re
import shelve
from parameter.common_parameters import common_parameters
import utils.setting_utils as utils

utils.now_time("phylop_score_list script starting...")
p = utils.Bunch(common_parameters)

def main():
    """Start building per-chromosome and merged phyloP score-list databases.

    Opens a merged shelve database
    ``<p.phylop_score_list_db_output>phyloP46way_Refseq_for_MIRAGE_CDS.db``,
    then for each chromosome opens the reference file
    ``p.phylop_score_list_reference``, the input shelve
    ``<p.phylop_score_list_db_input><chrom>.phyloP46way_Refseq_CDS.db`` and the
    output shelve
    ``<p.phylop_score_list_db_output><chrom>.phyloP46way_Refseq_for_MIRAGE_CDS.db``.

    NOTE(review): the snippet is cut off right after ``score_list_dict`` is
    created; how the opened resources are used or closed cannot be seen here.
    """
    utils.now_time("Input_file: " + p.phylop_score_list_db_input)
    utils.now_time("Reference_file: " + p.phylop_score_list_reference)
    utils.now_time("Output_file: " + p.phylop_score_list_db_output)

    # The trailing comment keeps the alternative (miRBase) file name.
    output_merge = p.phylop_score_list_db_output + 'phyloP46way_Refseq_for_MIRAGE_CDS.db' #'phyloP46way_miRBase_v21_hg38Tohg19_for_MIRAGE.db'
    output_merge_shelve = shelve.open(output_merge)

    #for x in ['chrY']:
    for x in ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY','chrM']:
        # The reference file is reopened for every chromosome.
        ref_s = p.phylop_score_list_reference #mirBase, Refseq etc...
        ref_file = open(ref_s,'r')

        input_s = p.phylop_score_list_db_input + x + '.phyloP46way_Refseq_CDS.db' #'.phyloP46way_Refseq.db'
        output_s = p.phylop_score_list_db_output +  x + '.phyloP46way_Refseq_for_MIRAGE_CDS.db' #'.phyloP46way_Refseq_for_MIRAGE.db'

        input_shelve = shelve.open(input_s)
        output_shelve = shelve.open(output_s)

        score_list_dict = {}
Beispiel #36
0
#!/usr/bin/env python

import re
from parameter.common_parameters import common_parameters
import utils.setting_utils as utils

utils.now_time("bed_3UTR script starting...")
p = utils.Bunch(common_parameters)

def main():
    """Run the bundled bed12to3UTRbed.sh script on the configured BED file.

    The command redirects the script's output for ``p.bed_3UTR_input`` into
    ``p.bed_3UTR_output``. The command string is printed, handed to
    ``utils.run_command`` and a completion message is logged afterwards.
    """
    pieces = [
        '../software/bed12to3UTRbed.sh ',
        p.bed_3UTR_input,
        ' > ',
        p.bed_3UTR_output,
    ]
    shell_line = ''.join(pieces)
    print(shell_line)
    utils.run_command(shell_line)
    utils.now_time("bed_3UTR script was successfully finished!!")

if __name__ == '__main__':
    main()
Beispiel #37
0
#!/usr/bin/env python

import re
from parameter.common_parameters import common_parameters
from parameter.convert_mirbase_id import convert_mirbase_id
import utils.setting_utils as utils

utils.now_time("mirmark_result script starting...")
p = utils.Bunch(common_parameters)

def main():
    """Log the mirmark_result settings and start scanning the miRBase table.

    The tab-separated file at ``p.mirbase_pre_output`` is read line by line:
    column 0 is split on ``|`` into an ID and a symbol, column 1 is taken as
    the sequence, and records whose ID does not start with ``hsa`` are skipped.

    NOTE(review): only the beginning of this function is visible in this
    snippet; how the parsed fields and the two dictionaries are used cannot
    be confirmed from here.
    """
    utils.now_time("Input_file: " + p.mirmark_pos)
    utils.now_time("Output_file: " + p.mirmark_output)
    utils.now_time("miRNA_file: " + p.mirmark_mirna_fasta)
    utils.now_time("TargetRNA_file: " + p.mirmark_targetrna_fasta)
    utils.now_time("Refseq_data: " + p.refseq_pre_output)
    utils.now_time("miRBase_data: " + p.mirbase_pre_output)
    refseq_dict = {}
    mirbase_dict = {}

    #mirbase_dict
    mirbase_file = open(p.mirbase_pre_output,'r')
    for line in mirbase_file:
        line = line.rstrip()
        data = line.split("\t")
        infor = data[0].split('|')  # first column has the form "<id>|<symbol>"
        mirbase_id = infor[0]
        symbol = infor[1]
        seq = data[1]
        if not re.match('hsa',mirbase_id): #Only choose Homo sapiens miRNA
            continue
Beispiel #38
0
#!/usr/bin/env python

import re
from parameter.common_parameters import common_parameters
import utils.setting_utils as utils

utils.now_time("Refseq_pre script starting...")
p = utils.Bunch(common_parameters)


def main():
    """Flatten a RefSeq FASTA file into ``<id><TAB><sequence>`` lines.

    Reads ``p.refseq_pre_input`` and writes ``p.refseq_pre_output``. For each
    header line the first whitespace-separated token, with the
    ``>hg19_refGene_`` prefix removed, is written followed by a tab. The
    sequence lines that follow are concatenated and written on the same
    output line. Sequence lines that appear before the first header are
    discarded, and a final newline-terminated sequence is always written at
    the end, as in the original implementation.

    Both files are opened with context managers so they are flushed and closed
    even if parsing fails part-way through.
    """
    utils.now_time("Input_file: " + p.refseq_pre_input)
    utils.now_time("Output_file: " + p.refseq_pre_output)
    with open(p.refseq_pre_input, 'r') as input_file, \
            open(p.refseq_pre_output, 'w') as output_file:
        seen_header = False
        seq_parts = []  # joined once per record instead of repeated string +=
        for line in input_file:
            line = line.rstrip()
            if line.startswith('>'):  # Header
                refseq_id = line.split()[0].replace('>hg19_refGene_', '')
                # Finish the previous record's line before starting a new one.
                if seen_header:
                    print("".join(seq_parts), file=output_file, end="\n")
                print(refseq_id, file=output_file, end="\t")
                seen_header = True
                seq_parts = []
            else:  # Sequence
                seq_parts.append(line)
        print("".join(seq_parts), file=output_file, end="\n")
Beispiel #39
0
#!/usr/bin/env python

import re
from parameter.common_parameters import common_parameters
from parameter.convert_mirbase_id import convert_mirbase_id
import utils.setting_utils as utils

utils.now_time("cupid_result script starting...")
p = utils.Bunch(common_parameters)


def main():
    utils.now_time("Input_file: " + p.cupid_pos)
    utils.now_time("Output_file: " + p.cupid_output)
    utils.now_time("miRNA_file: " + p.cupid_mirna_fasta)
    utils.now_time("targetRNA_file: " + p.cupid_targetrna_fasta)
    utils.now_time("Refseq_data: " + p.refseq_pre_output)
    utils.now_time("miRBase_data: " + p.mirbase_pre_output)
    refseq_dict = {}
    mirbase_dict = {}

    #mirbase_dict
    mirbase_file = open(p.mirbase_pre_output, 'r')
    for line in mirbase_file:
        line = line.rstrip()
        data = line.split("\t")
        infor = data[0].split('|')
        mirbase_id = infor[0]
        symbol = infor[1]
        seq = data[1]
        if not re.match('hsa', mirbase_id):
Beispiel #40
0
#!/usr/bin/env python

import shelve
from parameter.common_parameters import common_parameters
import utils.setting_utils as utils

utils.now_time("phastcons_score_R script starting...")
p = utils.Bunch(common_parameters)

def main():
    """Export per-chromosome phyloP shelve databases to one tab-separated file.

    For each chromosome (chr1-chr22, chrX, chrY, chrM) the shelve database
    ``<p.phylop_score_R_input><chrom>.phyloP46way_miRBase_v21_hg38Tohg19_for_MIRAGE.db``
    is opened and every entry is written to
    ``<p.phylop_score_R_output>phyloP46way_miRBase_v21_hg38Tohg19.txt`` as
    ``<key><TAB><v1><TAB>...``. Value lists shorter than 28 entries are
    right-padded with ``0.0``; longer lists are written unchanged.

    The output file and each shelve database are managed with ``with`` blocks
    so they are closed (and the text output flushed) even if an entry fails
    to serialise.
    """
    utils.now_time("Input_file: " + p.phylop_score_R_input)
    utils.now_time("Output_file: " + p.phylop_score_R_output)

    output_s = p.phylop_score_R_output + 'phyloP46way_miRBase_v21_hg38Tohg19.txt'
    max_length = 28  # Max_length: 28nt(miRNA)
    chromosomes = ['chr' + str(n) for n in range(1, 23)] + ['chrX', 'chrY', 'chrM']

    with open(output_s, 'w') as output_file:
        for x in chromosomes:
            input_s = p.phylop_score_R_input + x + '.phyloP46way_miRBase_v21_hg38Tohg19_for_MIRAGE.db'
            with shelve.open(input_s) as input_shelve:
                for keys in input_shelve.keys():
                    values = input_shelve[keys]
                    # A negative multiplier yields an empty list, so long
                    # value lists are left untouched.
                    null_value = [0.000] * (max_length - len(values))
                    value_string = "\t".join(map(str, values + null_value))
                    print(keys, value_string, file=output_file, sep="\t", end="\n")
Beispiel #41
0
def main():
    """Command-line entry point of the MIRAGE data-preparation toolkit.

    Parses the command line, copies the user-supplied file paths into the
    shared ``common_parameters`` dictionary under the keys the selected
    preparation module reads, and then executes that module with
    ``runpy.run_module`` as ``__main__``.

    Behaviour:

    * When neither ``-i`` nor ``-o`` is given, the module runs with the
      defaults already present in ``common_parameters``.
    * When either is given, ``-i`` must name an existing file and ``-o`` must
      be present; otherwise an error is logged and the process exits with
      status 1.
    * The sizedown / score_list preparation types additionally require
      ``-r``; ``mirmark_result`` and ``cupid_result`` require ``-a`` (three
      values: refseq_pre file, mirbase_pre file, error log).

    Fixes relative to the previous implementation:

    * ``bed_3UTR`` called the non-existent ``common_parameters.updata`` and
      crashed with ``AttributeError``.
    * ``mirbase_pre``, ``mirmark_result`` and ``cupid_result`` filled
      ``custom_params`` but never merged it into ``common_parameters``, so the
      command-line paths were silently ignored.
    * Passing ``-o`` without ``-i`` raised ``TypeError`` from
      ``os.path.isfile(None)`` instead of reporting the missing input file.
    """
    parser = argparse.ArgumentParser(
        prog="mirage_prepare",
        description="MIRAGE preparation toolkit - Data preparation for MIRAGE")
    parser.add_argument(
        'preparation_type',
        action='store',
        choices=[
            'bed_3UTR', 'mirbase_gff2bed', 'liftOver', 'phylop_score_prep',
            'phastcons_prep', 'phylop_sizedown', 'phastcons_sizedown',
            'phylop_score_list', 'phastcons_score_list', 'phastcons_score_R',
            'phylop_score_R', 'refseq_pre', 'mirbase_pre', 'mirmark_result',
            'cupid_result'
        ],
        help=
        'Preparation Type: refseq_pre|mirbase_pre|mirmark_result|cupid_result|'
    )
    parser.add_argument(
        '-i',
        '--input-file',
        action='store',
        dest='input_file',
        help='Input file: Specify a input file name and its path')
    parser.add_argument(
        '-r',
        '--reference-file',
        action='store',
        dest='reference_file',
        help='reference file: Specify a reference file name and its path')
    parser.add_argument(
        '-a',
        '--additional-file',
        action='store',
        dest='add_file',
        nargs=3,
        help=
        'Additional_file: Specify 1-refseq_pre file dir, 2-mirbase_pre file dir, 3-error log dir)'
    )
    # NOTE: the long option is spelled '--ouput-file' (sic); it is kept
    # unchanged so existing command lines continue to work.
    parser.add_argument(
        '-o',
        '--ouput-file',
        action='store',
        dest='output_file',
        help='Output file: Specify a output file name and its path')
    args = parser.parse_args()

    #Start analysis - logging
    greeting()
    utils.now_time('MIRAGE Data Preparation starting...')

    prep_type = args.preparation_type

    # preparation type -> (parameter key for -i, parameter key for -o)
    io_keys = {
        'bed_3UTR': ('bed_3UTR_input', 'bed_3UTR_output'),
        'mirbase_gff2bed': ('mirbase_gff2bed_input', 'mirbase_gff2bed_output'),
        'liftOver': ('liftover_input', 'liftover_output'),
        'phylop_score_prep': ('phylop_score_prep_input',
                              'phylop_score_prep_output'),
        'phastcons_prep': ('phastcons_prep_input', 'phastcons_prep_output'),
        'phylop_sizedown': ('phylop_sizedown_score_input',
                            'phylop_sizedown_score_output'),
        'phastcons_sizedown': ('phastcons_sizedown_score_input',
                               'phastcons_sizedown_score_output'),
        'phylop_score_list': ('phylop_score_list_db_input',
                              'phylop_score_list_db_output'),
        'phastcons_score_list': ('phastcons_score_list_db_input',
                                 'phastcons_score_list_db_output'),
        'phastcons_score_R': ('phastcons_score_R_input',
                              'phastcons_score_R_output'),
        'phylop_score_R': ('phylop_score_R_input', 'phylop_score_R_output'),
        'refseq_pre': ('refseq_pre_input', 'refseq_pre_output'),
        'mirbase_pre': ('mirbase_pre_input', 'mirbase_pre_output'),
        'mirmark_result': ('mirmark_pos', 'mirmark_output'),
        'cupid_result': ('cupid_pos', 'cupid_output'),
    }
    # preparation types that require -r -> parameter key for the reference file
    reference_keys = {
        'phylop_sizedown': 'phylop_sizedown_bed_input',
        'phastcons_sizedown': 'phastcons_sizedown_bed_input',
        'phylop_score_list': 'phylop_score_list_reference',
        'phastcons_score_list': 'phastcons_score_list_reference',
    }
    # preparation types that require -a -> parameter key for the error log
    error_log_keys = {
        'mirmark_result': 'mirmark_error',
        'cupid_result': 'cupid_error',
    }
    # Module executed for each preparation type. The module name matches the
    # type name except for 'phastcons_prep'.
    module_names = {name: 'module.preparation.' + name for name in io_keys}
    module_names['phastcons_prep'] = 'module.preparation.phastcons_score_prep'

    # argparse 'choices' already rejects unknown types; this guard keeps the
    # original error path should the two lists ever drift apart.
    if prep_type not in module_names:
        utils.now_time("ERROR: Wrong preparation type...")
        sys.exit(1)

    #Parameter preparation
    if args.input_file or args.output_file:
        if not (args.input_file and os.path.isfile(args.input_file)):
            utils.now_time("ERROR: InputFile does not exist...")
            sys.exit(1)
        if not args.output_file:
            utils.now_time("ERROR: -o option are required...")
            sys.exit(1)

        input_key, output_key = io_keys[prep_type]
        custom_params = {
            input_key: args.input_file,
            output_key: args.output_file,
        }
        if prep_type in reference_keys:
            if not args.reference_file:
                utils.now_time("ERROR: -r option is required...")
                sys.exit(1)
            custom_params[reference_keys[prep_type]] = args.reference_file
        if prep_type in error_log_keys:
            if not args.add_file:
                utils.now_time("ERROR: -a option is required...")
                sys.exit(1)
            custom_params['refseq_pre_output'] = args.add_file[0]
            custom_params['mirbase_pre_output'] = args.add_file[1]
            custom_params[error_log_keys[prep_type]] = args.add_file[2]

        # The preparation modules build their own Bunch from
        # common_parameters when they start, so merging here is what makes
        # the command-line paths visible to them.
        common_parameters.update(custom_params)

    #Preparation type
    runpy.run_module(module_names[prep_type],
                     run_name="__main__",
                     alter_sys=True)
#!/usr/bin/env python

import os
import re
from parameter.common_parameters import common_parameters
import utils.setting_utils as utils

# Runs at import time, before main(): announce that the script has started
# (utils.now_time presumably emits a timestamped message — verify in utils).
utils.now_time("phastcons_prep script starting...")
# Wrap the shared parameter dict; main() below reads its settings as
# attributes of `p` (e.g. p.phastcons_prep_input).
p = utils.Bunch(common_parameters)

def main():
    """Begin converting phastCons ``.wigFix`` files into ``.bed`` files.

    For each chromosome name in the loop list (currently only ``chrY``; the
    full list is kept in the trailing comment on the ``for`` line), the input
    path is ``p.phastcons_prep_input + <chrom> + '.phastCons46way.wigFix'``
    and the output path uses the same prefix with a ``.bed`` suffix.

    NOTE(review): only the beginning of this function is visible here; it
    stops inside the ``fixedStep`` header branch, so how records are written
    and whether the two file handles are ever closed cannot be confirmed.
    """
    # Report the configured input/output settings.
    utils.now_time("Input_file: " + p.phastcons_prep_input)
    utils.now_time("Output_file: " + p.phastcons_prep_output)

    for x in ['chrY']: #['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY','chrM']:
        input_s = p.phastcons_prep_input + x + '.phastCons46way.wigFix'
        # NOTE(review): the output path is built from phastcons_prep_input,
        # not from the phastcons_prep_output value logged above — confirm
        # that this is intended.
        output_s = p.phastcons_prep_input + x + '.phastCons46way.bed'
        # Both handles are opened without a `with` block.
        phastcons_prep_input_file = open(input_s,'r')
        phastcons_prep_output_file = open(output_s,'w')

        # Per-file parser state; chrom and start_site are overwritten each
        # time a fixedStep declaration line is encountered.
        chrom = ''
        start_site = 0
        step = 1

        for line in phastcons_prep_input_file:
            line = line.rstrip()
            # A 'fixedStep' declaration line: extract its chrom/start/step
            # fields with named groups.
            if re.match(r'^fixedStep',line):
                regex = r'fixedStep chrom=(?P<chrom>.+) start=(?P<start>.+) step=(?P<step>.+)'
                seq = re.match(regex,line)
                chrom = seq.group('chrom')
                # presumably shifts the 1-based wiggle start to a 0-based
                # BED start — TODO confirm
                start_site = int(seq.group('start')) - 1
Beispiel #43
0
def main():
    """Parse the MIRAGE command line, validate the inputs and run the analysis.

    Command-line arguments (read from ``sys.argv`` by argparse):
        analysis_type: ``estimation`` or ``prediction``.
        mirna_fasta: path to the miRNA FASTA file.
        targetrna_fasta: path to the TargetRNA FASTA file.
        -m / --mirna-conservation-score-file: optional conservation score file.
        -t / --targetrna-conservation-score-file: optional conservation score
            file.

    Side effects:
        Adds ``MIRNA_FASTA_PATH`` and ``TARGETRNA_FASTA_PATH`` to the shared
        ``common_parameters`` dict, then executes ``module.estimate`` or
        ``module.predict`` as ``__main__`` through ``runpy``.

    Exits with status 1 when either FASTA file does not exist. Previously a
    missing TargetRNA FASTA file only printed an error and the pipeline kept
    running with an invalid path.
    """
    parser = argparse.ArgumentParser(prog='mirage',description='MIRAGE - Comprehensive miRNA target prediction pipeline')
    parser.add_argument('analysis_type',action='store',help='Analysis_type: Choose estimation or prediction',choices=['estimation','prediction'])
    parser.add_argument('mirna_fasta',action='store',help='miRNA fasta file: Specify miRNA fasta file to use the analysis')
    parser.add_argument('targetrna_fasta',action='store',help='TargetRNA fasta file: Specify TargetRNA fasta file to use the analysis')
    parser.add_argument('-m','--mirna-conservation-score-file',action='store',dest='mirna_conservation',help='Conservation score file about miRNA: Specify your conservation score db file. MIRAGE preparetion toolkits enables you to make the score files about TargetRNA or miRNA bed files.')
    parser.add_argument('-t','--targetrna-conservation-score-file',action='store',dest='targetrna_conservation',help='Conservation score file about TargetRNA: Specify your conservation score db file. MIRAGE preparetion toolkits enables you to make the score files about TargetRNA or miRNA bed files.')

    args = parser.parse_args()

    # Start analysis - logging
    greeting()
    utils.now_time("MIRAGE miRNA target prediction starting...")
    analysis_type = args.analysis_type
    mirna_fasta_path = args.mirna_fasta
    targetrna_fasta_path = args.targetrna_fasta

    # Check fasta files: both are mandatory, so stop on the first missing one.
    if not os.path.isfile(mirna_fasta_path):
        print("Error: miRNA fasta file does not exist...")
        sys.exit(1)
    if not os.path.isfile(targetrna_fasta_path):
        print("Error: TargetRNA fasta file does not exist...")
        sys.exit(1)

    # TODO: args.mirna_conservation / args.targetrna_conservation are accepted
    # on the command line but are not yet validated or used here.

    # Publish the validated paths through the shared parameter dict so that
    # the analysis modules launched below can read them.
    param = dict(
        MIRNA_FASTA_PATH=mirna_fasta_path,
        TARGETRNA_FASTA_PATH=targetrna_fasta_path,
    )
    common_parameters.update(param)
    p = utils.Bunch(common_parameters)
    print('miRNA_Fasta_file: ' + p.MIRNA_FASTA_PATH)
    print('TargetRNA_Fasta_file: ' + p.TARGETRNA_FASTA_PATH)

    # runpy - choose analysis type
    if analysis_type == 'estimation':
        runpy.run_module('module.estimate', run_name="__main__", alter_sys=True)
    elif analysis_type == 'prediction':
        runpy.run_module('module.predict', run_name="__main__", alter_sys=True)
    else:
        # Defensive only: argparse `choices` already rejects other values.
        print('Error: Analysis type is wrong...')
        sys.exit(1)
Beispiel #44
0
#!/usr/bin/env python

import re
from parameter.common_parameters import common_parameters
import utils.setting_utils as utils

# Runs at import time, before main(): announce that the script has started
# (utils.now_time presumably emits a timestamped message — verify in utils).
utils.now_time("liftOver script starting...")
# Wrap the shared parameter dict; main() below reads p.liftover_input and
# p.liftover_output from it.
p = utils.Bunch(common_parameters)


def main():
    """Run the bundled liftOver binary with the hg38ToHg19 chain file.

    The input and output paths come from ``p.liftover_input`` and
    ``p.liftover_output``; the last command argument is the output path with
    ``.error`` appended. The assembled command line is handed to
    ``utils.run_command`` and a completion message is logged afterwards.
    """
    source_path = p.liftover_input
    lifted_path = p.liftover_output
    unmapped_path = lifted_path + '.error'

    # Space-separated command line: tool, input, chain file, output, error file.
    liftover_parts = [
        '../software/liftOver',
        source_path,
        '../software/hg38ToHg19.over.chain',
        lifted_path,
        unmapped_path,
    ]
    utils.run_command(' '.join(liftover_parts))

    utils.now_time("liftOver script was successfully finished!!")


# Run the liftOver step only when this module is executed as `__main__`
# (directly, or via a runner that sets run_name to '__main__'), not on import.
if __name__ == '__main__':
    main()