def main():
    """Run the bundled liftOver binary with the hg38ToHg19 chain file.

    Input and output paths come from the module-level parameter bunch ``p``.
    ``<output>.error`` is passed as the fourth liftOver argument.
    """
    source_path = p.liftover_input
    target_path = p.liftover_output
    unmapped_path = target_path + '.error'
    # Argument order: liftOver <input> <chain> <output> <error file>
    command_liftover = ' '.join([
        '../software/liftOver',
        source_path,
        '../software/hg38ToHg19.over.chain',
        target_path,
        unmapped_path,
    ])
    utils.run_command(command_liftover)
    utils.now_time("liftOver script was successfully finished!!")
def main():
    """Run ``bed12to3UTRbed.sh`` on the configured BED file.

    The command line contains a ``>`` redirection into ``p.bed_3UTR_output``
    and is echoed to stdout before being handed to ``utils.run_command``.
    """
    command_bed_3UTR = '{script} {src} > {dst}'.format(
        script='../software/bed12to3UTRbed.sh',
        src=p.bed_3UTR_input,
        dst=p.bed_3UTR_output,
    )
    print(command_bed_3UTR)
    utils.run_command(command_bed_3UTR)
    utils.now_time("bed_3UTR script was successfully finished!!")
def main():
    """Convert the miRBase GFF annotation into a 6-column BED file.

    Reads ``p.mirbase_gff2bed_input`` and writes ``p.mirbase_gff2bed_output``.
    Comment lines (``#``), blank lines and ``miRNA_primary_transcript``
    records are skipped.  Each remaining record becomes::

        chrom, start - 1, end, "<Name>|<ID>|<copy number>", 0, strand

    ``ID`` is taken from the first attribute and ``Name`` from the third
    attribute of column 9.  An ID of the form ``X_N`` is split into ``X`` and
    copy number ``N``; an ID without ``_`` gets copy number 0.
    """
    utils.now_time("Input_file: " + p.mirbase_gff2bed_input)
    utils.now_time("Output_file: " + p.mirbase_gff2bed_output)
    # Context managers guarantee both files are closed even if a record fails.
    with open(p.mirbase_gff2bed_input, 'r') as mirbase_gff_file, \
            open(p.mirbase_gff2bed_output, 'w') as mirbase_bed_file:
        for line in mirbase_gff_file:
            line = line.rstrip()
            # Blank lines used to raise IndexError on data[2]; skip them.
            if not line or line.startswith('#'):
                continue
            data = line.split("\t")
            chrom = data[0]
            status = data[2]
            if status == 'miRNA_primary_transcript':
                continue
            st = int(data[3]) - 1  # shift the start by one for BED output
            ed = data[4]
            strand = data[6]
            name_infor = data[8].split(';')
            mir_id = re.sub(r'^ID=', '', name_infor[0])
            if '_' in mir_id:
                # maxsplit=1 avoids an unpacking error on IDs with several '_'.
                mir_id, mir_id_number = mir_id.split('_', 1)
            else:
                mir_id_number = 0  # there is ONLY one miRNA coding site in your genome
            mir_name = re.sub(r'^Name=', '', name_infor[2])
            name = mir_name + '|' + mir_id + '|' + str(mir_id_number)
            print(chrom, st, ed, name, 0, strand,
                  file=mirbase_bed_file, sep="\t", end="\n")
    utils.now_time("mirbase_gff2bed script was successfully finished!!")
def main():
    """Convert the miRBase GFF annotation into a 6-column BED file.

    Reads ``p.mirbase_gff2bed_input`` and writes ``p.mirbase_gff2bed_output``.
    Comment lines (``#``), blank lines and ``miRNA_primary_transcript``
    records are skipped.  Each remaining record becomes::

        chrom, start - 1, end, "<Name>|<ID>|<copy number>", 0, strand

    ``ID`` is taken from the first attribute and ``Name`` from the third
    attribute of column 9.  An ID of the form ``X_N`` is split into ``X`` and
    copy number ``N``; an ID without ``_`` gets copy number 0.
    """
    utils.now_time("Input_file: " + p.mirbase_gff2bed_input)
    utils.now_time("Output_file: " + p.mirbase_gff2bed_output)
    # Context managers guarantee both files are closed even if a record fails.
    with open(p.mirbase_gff2bed_input, "r") as mirbase_gff_file, \
            open(p.mirbase_gff2bed_output, "w") as mirbase_bed_file:
        for line in mirbase_gff_file:
            line = line.rstrip()
            # Blank lines used to raise IndexError on data[2]; skip them.
            if not line or line.startswith("#"):
                continue
            data = line.split("\t")
            chrom = data[0]
            status = data[2]
            if status == "miRNA_primary_transcript":
                continue
            st = int(data[3]) - 1  # shift the start by one for BED output
            ed = data[4]
            strand = data[6]
            name_infor = data[8].split(";")
            mir_id = re.sub(r"^ID=", "", name_infor[0])
            if "_" in mir_id:
                # maxsplit=1 avoids an unpacking error on IDs with several '_'.
                mir_id, mir_id_number = mir_id.split("_", 1)
            else:
                mir_id_number = 0  # there is ONLY one miRNA coding site in your genome
            mir_name = re.sub(r"^Name=", "", name_infor[2])
            name = mir_name + "|" + mir_id + "|" + str(mir_id_number)
            print(chrom, st, ed, name, 0, strand,
                  file=mirbase_bed_file, sep="\t", end="\n")
    utils.now_time("mirbase_gff2bed script was successfully finished!!")
def main():
    """Flatten the miRBase FASTA file into one tab-separated record per line.

    Reads ``p.mirbase_pre_input`` and writes ``p.mirbase_pre_output``.  For
    every header line (starting with ``>``) the first two whitespace-separated
    fields are joined as ``<id>|<symbol>`` (with ``>`` removed from the id);
    the following sequence lines are concatenated.  Each record is written as
    ``<id>|<symbol><TAB><sequence>``.

    Sequence lines that appear before the first header are ignored, and no
    record is written when the input contains no header at all.
    """
    utils.now_time("Input_file: " + p.mirbase_pre_input)
    utils.now_time("Output_file: " + p.mirbase_pre_output)
    with open(p.mirbase_pre_input, 'r') as input_file, \
            open(p.mirbase_pre_output, 'w') as output_file:
        infor = None    # header of the record being accumulated
        seq_parts = []  # sequence lines of that record
        for line in input_file:
            line = line.rstrip()
            if line.startswith('>'):  # Header
                if infor is not None:
                    print(infor, ''.join(seq_parts),
                          file=output_file, sep="\t", end="\n")
                data = line.split()
                mir_id = data[0].replace('>', '')
                symbol = data[1]
                infor = mir_id + '|' + symbol
                seq_parts = []
            else:  # Sequence
                seq_parts.append(line)
        # Flush the last record; skipping this for header-less input avoids
        # emitting a line without the "<id>|<symbol><TAB>" prefix.
        if infor is not None:
            print(infor, ''.join(seq_parts),
                  file=output_file, sep="\t", end="\n")
    utils.now_time("mirbase_pre script was successfully finished!!")
def main():
    """Dump the per-chromosome miRNA phyloP shelves into one text table.

    For each chromosome (chr1-chr22, chrX, chrY, chrM) the shelve
    ``<p.phylop_score_R_input><chrom>.phyloP46way_miRBase_v21_hg38Tohg19_for_MIRAGE.db``
    is opened and every entry is written to
    ``<p.phylop_score_R_output>phyloP46way_miRBase_v21_hg38Tohg19.txt`` as
    ``key<TAB>score1<TAB>score2...``.  Score lists shorter than 28 values are
    right-padded with ``0.0``; longer lists are written unchanged.
    """
    utils.now_time("Input_file: " + p.phylop_score_R_input)
    utils.now_time("Output_file: " + p.phylop_score_R_output)
    output_s = p.phylop_score_R_output + 'phyloP46way_miRBase_v21_hg38Tohg19.txt'
    chromosomes = ['chr%d' % i for i in range(1, 23)] + ['chrX', 'chrY', 'chrM']
    max_length = 28  # Max_length: 28nt(miRNA)
    # Context managers close the text file and every shelve even on error.
    with open(output_s, 'w') as output_file:
        for x in chromosomes:
            input_s = p.phylop_score_R_input + x + '.phyloP46way_miRBase_v21_hg38Tohg19_for_MIRAGE.db'
            with shelve.open(input_s) as input_shelve:
                for key in input_shelve.keys():
                    values = list(input_shelve[key])
                    # A negative pad count yields an empty list (no padding).
                    values += [0.000] * (max_length - len(values))
                    value_string = "\t".join(map(str, values))
                    print(key, value_string,
                          file=output_file, sep="\t", end="\n")
    utils.now_time("phylop_score_R script was successfully finished!!")
def main():
    """Flatten the miRBase FASTA file into one tab-separated record per line.

    Reads ``p.mirbase_pre_input`` and writes ``p.mirbase_pre_output``.  For
    every header line (starting with ``>``) the first two whitespace-separated
    fields are joined as ``<id>|<symbol>`` (with ``>`` removed from the id);
    the following sequence lines are concatenated.  Each record is written as
    ``<id>|<symbol><TAB><sequence>``.

    Sequence lines that appear before the first header are ignored, and no
    record is written when the input contains no header at all.
    """
    utils.now_time("Input_file: " + p.mirbase_pre_input)
    utils.now_time("Output_file: " + p.mirbase_pre_output)
    with open(p.mirbase_pre_input, 'r') as input_file, \
            open(p.mirbase_pre_output, 'w') as output_file:
        infor = None    # header of the record being accumulated
        seq_parts = []  # sequence lines of that record
        for line in input_file:
            line = line.rstrip()
            if line.startswith('>'):  # Header
                if infor is not None:
                    print(infor, ''.join(seq_parts),
                          file=output_file, sep="\t", end="\n")
                data = line.split()
                mir_id = data[0].replace('>', '')
                symbol = data[1]
                infor = mir_id + '|' + symbol
                seq_parts = []
            else:  # Sequence
                seq_parts.append(line)
        # Flush the last record; skipping this for header-less input avoids
        # emitting a line without the "<id>|<symbol><TAB>" prefix.
        if infor is not None:
            print(infor, ''.join(seq_parts),
                  file=output_file, sep="\t", end="\n")
    utils.now_time("mirbase_pre script was successfully finished!!")
def main():
    """Dump the per-chromosome miRNA phyloP shelves into one text table.

    For each chromosome (chr1-chr22, chrX, chrY, chrM) the shelve
    ``<p.phylop_score_R_input><chrom>.phyloP46way_miRBase_v21_hg38Tohg19_for_MIRAGE.db``
    is opened and every entry is written to
    ``<p.phylop_score_R_output>phyloP46way_miRBase_v21_hg38Tohg19.txt`` as
    ``key<TAB>score1<TAB>score2...``.  Score lists shorter than 28 values are
    right-padded with ``0.0``; longer lists are written unchanged.
    """
    utils.now_time("Input_file: " + p.phylop_score_R_input)
    utils.now_time("Output_file: " + p.phylop_score_R_output)
    output_s = p.phylop_score_R_output + 'phyloP46way_miRBase_v21_hg38Tohg19.txt'
    chromosomes = ['chr%d' % i for i in range(1, 23)] + ['chrX', 'chrY', 'chrM']
    max_length = 28  # Max_length: 28nt(miRNA)
    # Context managers close the text file and every shelve even on error.
    with open(output_s, 'w') as output_file:
        for x in chromosomes:
            input_s = p.phylop_score_R_input + x + '.phyloP46way_miRBase_v21_hg38Tohg19_for_MIRAGE.db'
            with shelve.open(input_s) as input_shelve:
                for key in input_shelve.keys():
                    values = list(input_shelve[key])
                    # A negative pad count yields an empty list (no padding).
                    values += [0.000] * (max_length - len(values))
                    value_string = "\t".join(map(str, values))
                    print(key, value_string,
                          file=output_file, sep="\t", end="\n")
    utils.now_time("phylop_score_R script was successfully finished!!")
def main():
    """Expand phastCons ``fixedStep`` wigFix files into position/score tables.

    For each listed chromosome, reads
    ``<p.phastcons_prep_input><chrom>.phastCons46way.wigFix`` and writes
    ``<p.phastcons_prep_input><chrom>.phastCons46way.bed`` with one
    ``position<TAB>score`` line per base.  A ``fixedStep`` header resets the
    running position to ``start - 1`` and sets ``step``; every following
    score line is written ``step`` times at consecutive positions.

    Raises:
        ValueError: if a line starting with ``fixedStep`` does not carry the
            ``chrom=``, ``start=`` and ``step=`` fields in that order.
    """
    utils.now_time("Input_file: " + p.phastcons_prep_input)
    utils.now_time("Output_file: " + p.phastcons_prep_output)
    header_re = re.compile(
        r'fixedStep chrom=(?P<chrom>.+) start=(?P<start>.+) step=(?P<step>.+)')
    # Currently restricted to chrY; the full set would be chr1-chr22, chrX,
    # chrY and chrM.
    for chrom_name in ['chrY']:
        input_s = p.phastcons_prep_input + chrom_name + '.phastCons46way.wigFix'
        # NOTE(review): the output path is built from the *input* prefix even
        # though p.phastcons_prep_output is logged above -- confirm intended.
        output_s = p.phastcons_prep_input + chrom_name + '.phastCons46way.bed'
        # One with-block per chromosome so every file pair is closed.
        with open(input_s, 'r') as phastcons_prep_input_file, \
                open(output_s, 'w') as phastcons_prep_output_file:
            start_site = 0
            step = 1
            for line in phastcons_prep_input_file:
                line = line.rstrip()
                if not line:
                    # A blank line carries no score; do not emit a position.
                    continue
                if line.startswith('fixedStep'):
                    header = header_re.match(line)
                    if header is None:
                        raise ValueError(
                            'Malformed fixedStep header in ' + input_s + ': ' + line)
                    start_site = int(header.group('start')) - 1
                    step = int(header.group('step'))
                    continue
                score = line
                for _ in range(step):
                    print(start_site, score,
                          file=phastcons_prep_output_file, sep="\t", end="\n")
                    start_site += 1
    utils.now_time("phastcons_prep script was successfully finished!!")
def main():
    """Expand phastCons ``fixedStep`` wigFix files into position/score tables.

    For each listed chromosome, reads
    ``<p.phastcons_prep_input><chrom>.phastCons46way.wigFix`` and writes
    ``<p.phastcons_prep_input><chrom>.phastCons46way.bed`` with one
    ``position<TAB>score`` line per base.  A ``fixedStep`` header resets the
    running position to ``start - 1`` and sets ``step``; every following
    score line is written ``step`` times at consecutive positions.

    Raises:
        ValueError: if a line starting with ``fixedStep`` does not carry the
            ``chrom=``, ``start=`` and ``step=`` fields in that order.
    """
    utils.now_time("Input_file: " + p.phastcons_prep_input)
    utils.now_time("Output_file: " + p.phastcons_prep_output)
    header_re = re.compile(
        r'fixedStep chrom=(?P<chrom>.+) start=(?P<start>.+) step=(?P<step>.+)')
    # Currently restricted to chrY; the full set would be chr1-chr22, chrX,
    # chrY and chrM.
    for chrom_name in ['chrY']:
        input_s = p.phastcons_prep_input + chrom_name + '.phastCons46way.wigFix'
        # NOTE(review): the output path is built from the *input* prefix even
        # though p.phastcons_prep_output is logged above -- confirm intended.
        output_s = p.phastcons_prep_input + chrom_name + '.phastCons46way.bed'
        # One with-block per chromosome so every file pair is closed.
        with open(input_s, 'r') as phastcons_prep_input_file, \
                open(output_s, 'w') as phastcons_prep_output_file:
            start_site = 0
            step = 1
            for line in phastcons_prep_input_file:
                line = line.rstrip()
                if not line:
                    # A blank line carries no score; do not emit a position.
                    continue
                if line.startswith('fixedStep'):
                    header = header_re.match(line)
                    if header is None:
                        raise ValueError(
                            'Malformed fixedStep header in ' + input_s + ': ' + line)
                    start_site = int(header.group('start')) - 1
                    step = int(header.group('step'))
                    continue
                score = line
                for _ in range(step):
                    print(start_site, score,
                          file=phastcons_prep_output_file, sep="\t", end="\n")
                    start_site += 1
    utils.now_time("phastcons_prep script was successfully finished!!")
def main():
    """Flatten the RefSeq FASTA file into one tab-separated record per line.

    Reads ``p.refseq_pre_input`` and writes ``p.refseq_pre_output``.  The
    record id is the first whitespace-separated field of each header line
    with the ``>hg19_refGene_`` prefix removed; the following sequence lines
    are concatenated.  Each record is written as ``<id><TAB><sequence>``.

    Sequence lines that appear before the first header are ignored, and no
    record is written when the input contains no header at all.
    """
    utils.now_time("Input_file: " + p.refseq_pre_input)
    utils.now_time("Output_file: " + p.refseq_pre_output)
    with open(p.refseq_pre_input, 'r') as input_file, \
            open(p.refseq_pre_output, 'w') as output_file:
        refseq_id = None  # id of the record being accumulated
        seq_parts = []    # sequence lines of that record
        for line in input_file:
            line = line.rstrip()
            if line.startswith('>'):  # Header
                if refseq_id is not None:
                    print(refseq_id, ''.join(seq_parts),
                          file=output_file, sep="\t", end="\n")
                data = line.split()
                refseq_id = data[0].replace('>hg19_refGene_', '')
                seq_parts = []
            else:  # Sequence
                seq_parts.append(line)
        # Flush the last record; skipping this for header-less input avoids
        # emitting a line without the "<id><TAB>" prefix.
        if refseq_id is not None:
            print(refseq_id, ''.join(seq_parts),
                  file=output_file, sep="\t", end="\n")
    utils.now_time("Refseq_pre script was successfully finished!!")
def main():
    """Flatten the RefSeq FASTA file into one tab-separated record per line.

    Reads ``p.refseq_pre_input`` and writes ``p.refseq_pre_output``.  The
    record id is the first whitespace-separated field of each header line
    with the ``>hg19_refGene_`` prefix removed; the following sequence lines
    are concatenated.  Each record is written as ``<id><TAB><sequence>``.

    Sequence lines that appear before the first header are ignored, and no
    record is written when the input contains no header at all.
    """
    utils.now_time("Input_file: " + p.refseq_pre_input)
    utils.now_time("Output_file: " + p.refseq_pre_output)
    with open(p.refseq_pre_input, 'r') as input_file, \
            open(p.refseq_pre_output, 'w') as output_file:
        refseq_id = None  # id of the record being accumulated
        seq_parts = []    # sequence lines of that record
        for line in input_file:
            line = line.rstrip()
            if line.startswith('>'):  # Header
                if refseq_id is not None:
                    print(refseq_id, ''.join(seq_parts),
                          file=output_file, sep="\t", end="\n")
                data = line.split()
                refseq_id = data[0].replace('>hg19_refGene_', '')
                seq_parts = []
            else:  # Sequence
                seq_parts.append(line)
        # Flush the last record; skipping this for header-less input avoids
        # emitting a line without the "<id><TAB>" prefix.
        if refseq_id is not None:
            print(refseq_id, ''.join(seq_parts),
                  file=output_file, sep="\t", end="\n")
    utils.now_time("Refseq_pre script was successfully finished!!")
def main():
    """Build the miRNA / target tables and FASTA files for the miRMark set.

    Inputs (all paths from ``p``):
      * ``mirbase_pre_output`` -- ``<id>|<symbol><TAB><seq>`` lines; only ids
        starting with ``hsa`` are kept.
      * ``refseq_pre_output``  -- ``<refseq id><TAB><seq>`` lines.
      * ``mirmark_pos``        -- comma-separated; column 0 is the miRNA id,
        column 1 the RefSeq id and column 3 the site end.  The ``miR_ID``
        header row is skipped.

    Outputs:
      * ``mirmark_output`` -- one tab-separated row per resolved pair:
        miRNA id, symbol, miRNA seq, RefSeq id, site start, site end,
        site sequence (``T`` replaced by ``U``).
      * ``mirmark_error``  -- ``ERROR: <refseq>|<mirna>`` for unresolved pairs.
      * ``mirmark_mirna_fasta`` / ``mirmark_targetrna_fasta`` -- de-duplicated
        FASTA records of the miRNAs and full target sequences that were used.

    The site start is ``end - len(miRNA) - 5``, clamped at 0 so that a site
    near the beginning of the sequence is not sliced from the far end.
    """
    utils.now_time("Input_file: " + p.mirmark_pos)
    utils.now_time("Output_file: " + p.mirmark_output)
    utils.now_time("miRNA_file: " + p.mirmark_mirna_fasta)
    utils.now_time("TargetRNA_file: " + p.mirmark_targetrna_fasta)
    utils.now_time("Refseq_data: " + p.refseq_pre_output)
    utils.now_time("miRBase_data: " + p.mirbase_pre_output)

    # mirbase_dict: id -> [symbol, sequence]
    mirbase_dict = {}
    with open(p.mirbase_pre_output, 'r') as mirbase_file:
        for line in mirbase_file:
            data = line.rstrip().split("\t")
            infor = data[0].split('|')
            mirbase_id = infor[0]
            symbol = infor[1]
            seq = data[1]
            if not mirbase_id.startswith('hsa'):  # Only choose Homo sapiens miRNA
                continue
            mirbase_dict[mirbase_id] = [symbol, seq]

    # refseq_dict: id -> sequence
    refseq_dict = {}
    with open(p.refseq_pre_output, 'r') as refseq_file:
        for line in refseq_file:
            data = line.rstrip().split("\t")
            refseq_dict[data[0]] = data[1]

    # main
    mirna_dist = {}      # FASTA tag -> miRNA sequence
    targetrna_dist = {}  # FASTA tag -> full target sequence
    with open(p.mirmark_pos, 'r') as input_file, \
            open(p.mirmark_output, 'w') as output_file, \
            open(p.mirmark_error, 'w') as error_file:
        for line in input_file:
            line = line.rstrip()
            if not line:
                continue
            data = line.split(",")
            if data[0] == 'miR_ID':
                continue
            mirbase_id = data[0]
            refseq_id = data[1]
            utr_ed = int(data[3])
            if mirbase_id in convert_mirbase_id:
                mirbase_id = convert_mirbase_id[mirbase_id]
            if refseq_id in refseq_dict and mirbase_id in mirbase_dict:
                symbol, mir_seq = mirbase_dict[mirbase_id]
                # A negative start would make the slice wrap around from the
                # end of the sequence, so clamp it to 0.
                utr_st = max(0, utr_ed - len(mir_seq) - 5)
                ref_seq_raw = refseq_dict[refseq_id]
                ref_seq = ref_seq_raw[utr_st:utr_ed].replace("T", "U")
                print(mirbase_id, symbol, mir_seq, refseq_id, utr_st, utr_ed,
                      ref_seq, file=output_file, sep="\t", end="\n")
                mirna_dist['>' + mirbase_id + '|' + symbol] = mir_seq
                targetrna_dist['>' + refseq_id] = ref_seq_raw
            else:
                print("ERROR: " + refseq_id + '|' + mirbase_id,
                      file=error_file, end="\n")

    with open(p.mirmark_mirna_fasta, 'w') as mirna_file:
        for tag, seq in mirna_dist.items():
            print(tag, file=mirna_file, end="\n")
            print(seq, file=mirna_file, end="\n")
    with open(p.mirmark_targetrna_fasta, 'w') as targetrna_file:
        for tag, seq in targetrna_dist.items():
            print(tag, file=targetrna_file, end="\n")
            print(seq, file=targetrna_file, end="\n")
    utils.now_time("mirmark_result script was successfully finished!!")
def main():
    """Build the miRNA / target tables and FASTA files for the Cupid set.

    Inputs (all paths from ``p``):
      * ``mirbase_pre_output`` -- ``<id>|<symbol><TAB><seq>`` lines; only ids
        starting with ``hsa`` are kept.
      * ``refseq_pre_output``  -- ``<refseq id><TAB><seq>`` lines.
      * ``cupid_pos``          -- tab-separated; column 3 is the RefSeq id,
        column 4 the miRNA id and column 5 a ``start-end`` range.  The
        ``AvgProb[0,1]`` header row and RefSeq id ``NM_000927`` are skipped.

    Outputs:
      * ``cupid_output`` -- one tab-separated row per resolved pair:
        miRNA id, symbol, miRNA seq, RefSeq id, site start, site end,
        site sequence (``T`` replaced by ``U``).
      * ``cupid_error``  -- ``ERROR: <refseq>|<mirna>`` for unresolved pairs.
      * ``cupid_mirna_fasta`` / ``cupid_targetrna_fasta`` -- de-duplicated
        FASTA records of the miRNAs and full target sequences that were used.

    Only the range end is used: the site start is recomputed as
    ``end - len(miRNA) - 5``, clamped at 0 so that a site near the beginning
    of the sequence is not sliced from the far end.
    """
    utils.now_time("Input_file: " + p.cupid_pos)
    utils.now_time("Output_file: " + p.cupid_output)
    utils.now_time("miRNA_file: " + p.cupid_mirna_fasta)
    utils.now_time("targetRNA_file: " + p.cupid_targetrna_fasta)
    utils.now_time("Refseq_data: " + p.refseq_pre_output)
    utils.now_time("miRBase_data: " + p.mirbase_pre_output)

    # mirbase_dict: id -> [symbol, sequence]
    mirbase_dict = {}
    with open(p.mirbase_pre_output, 'r') as mirbase_file:
        for line in mirbase_file:
            data = line.rstrip().split("\t")
            infor = data[0].split('|')
            mirbase_id = infor[0]
            symbol = infor[1]
            seq = data[1]
            if not mirbase_id.startswith('hsa'):
                continue
            mirbase_dict[mirbase_id] = [symbol, seq]

    # refseq_dict: id -> sequence
    refseq_dict = {}
    with open(p.refseq_pre_output, 'r') as refseq_file:
        for line in refseq_file:
            data = line.rstrip().split("\t")
            refseq_dict[data[0]] = data[1]

    # main
    mirna_dist = {}      # FASTA tag -> miRNA sequence
    targetrna_dist = {}  # FASTA tag -> full target sequence
    with open(p.cupid_pos, 'r') as input_file, \
            open(p.cupid_output, 'w') as output_file, \
            open(p.cupid_error, 'w') as error_file:
        for line in input_file:
            line = line.rstrip()
            if not line:
                continue
            data = line.split("\t")
            if data[0] == 'AvgProb[0,1]':
                continue
            mirbase_id = data[4]
            refseq_id = data[3]
            if refseq_id == "NM_000927":
                continue
            utr_ed = int(data[5].split('-')[1])
            if mirbase_id in convert_mirbase_id:
                mirbase_id = convert_mirbase_id[mirbase_id]
            if refseq_id in refseq_dict and mirbase_id in mirbase_dict:
                symbol, mir_seq = mirbase_dict[mirbase_id]
                # A negative start would make the slice wrap around from the
                # end of the sequence, so clamp it to 0.
                utr_st = max(0, utr_ed - len(mir_seq) - 5)
                ref_seq_raw = refseq_dict[refseq_id]
                ref_seq = ref_seq_raw[utr_st:utr_ed].replace("T", "U")
                print(mirbase_id, symbol, mir_seq, refseq_id, utr_st, utr_ed,
                      ref_seq, file=output_file, sep="\t", end="\n")
                mirna_dist['>' + mirbase_id + '|' + symbol] = mir_seq
                targetrna_dist['>' + refseq_id] = ref_seq_raw
            else:
                print("ERROR: " + refseq_id + '|' + mirbase_id,
                      file=error_file, end="\n")

    with open(p.cupid_mirna_fasta, 'w') as mirna_file:
        for tag, seq in mirna_dist.items():
            print(tag, file=mirna_file, end="\n")
            print(seq, file=mirna_file, end="\n")
    with open(p.cupid_targetrna_fasta, 'w') as targetrna_file:
        for tag, seq in targetrna_dist.items():
            print(tag, file=targetrna_file, end="\n")
            print(seq, file=targetrna_file, end="\n")
    utils.now_time("cupid_result script was successfully finished!!")
def main():
    """Build the miRNA / target tables and FASTA files for the miRMark set.

    Inputs (all paths from ``p``):
      * ``mirbase_pre_output`` -- ``<id>|<symbol><TAB><seq>`` lines; only ids
        starting with ``hsa`` are kept.
      * ``refseq_pre_output``  -- ``<refseq id><TAB><seq>`` lines.
      * ``mirmark_pos``        -- comma-separated; column 0 is the miRNA id,
        column 1 the RefSeq id and column 3 the site end.  The ``miR_ID``
        header row is skipped.

    Outputs:
      * ``mirmark_output`` -- one tab-separated row per resolved pair:
        miRNA id, symbol, miRNA seq, RefSeq id, site start, site end,
        site sequence (``T`` replaced by ``U``).
      * ``mirmark_error``  -- ``ERROR: <refseq>|<mirna>`` for unresolved pairs.
      * ``mirmark_mirna_fasta`` / ``mirmark_targetrna_fasta`` -- de-duplicated
        FASTA records of the miRNAs and full target sequences that were used.

    The site start is ``end - len(miRNA) - 5``, clamped at 0 so that a site
    near the beginning of the sequence is not sliced from the far end.
    """
    utils.now_time("Input_file: " + p.mirmark_pos)
    utils.now_time("Output_file: " + p.mirmark_output)
    utils.now_time("miRNA_file: " + p.mirmark_mirna_fasta)
    utils.now_time("TargetRNA_file: " + p.mirmark_targetrna_fasta)
    utils.now_time("Refseq_data: " + p.refseq_pre_output)
    utils.now_time("miRBase_data: " + p.mirbase_pre_output)

    # mirbase_dict: id -> [symbol, sequence]
    mirbase_dict = {}
    with open(p.mirbase_pre_output, 'r') as mirbase_file:
        for line in mirbase_file:
            data = line.rstrip().split("\t")
            infor = data[0].split('|')
            mirbase_id = infor[0]
            symbol = infor[1]
            seq = data[1]
            if not mirbase_id.startswith('hsa'):  # Only choose Homo sapiens miRNA
                continue
            mirbase_dict[mirbase_id] = [symbol, seq]

    # refseq_dict: id -> sequence
    refseq_dict = {}
    with open(p.refseq_pre_output, 'r') as refseq_file:
        for line in refseq_file:
            data = line.rstrip().split("\t")
            refseq_dict[data[0]] = data[1]

    # main
    mirna_dist = {}      # FASTA tag -> miRNA sequence
    targetrna_dist = {}  # FASTA tag -> full target sequence
    with open(p.mirmark_pos, 'r') as input_file, \
            open(p.mirmark_output, 'w') as output_file, \
            open(p.mirmark_error, 'w') as error_file:
        for line in input_file:
            line = line.rstrip()
            if not line:
                continue
            data = line.split(",")
            if data[0] == 'miR_ID':
                continue
            mirbase_id = data[0]
            refseq_id = data[1]
            utr_ed = int(data[3])
            if mirbase_id in convert_mirbase_id:
                mirbase_id = convert_mirbase_id[mirbase_id]
            if refseq_id in refseq_dict and mirbase_id in mirbase_dict:
                symbol, mir_seq = mirbase_dict[mirbase_id]
                # A negative start would make the slice wrap around from the
                # end of the sequence, so clamp it to 0.
                utr_st = max(0, utr_ed - len(mir_seq) - 5)
                ref_seq_raw = refseq_dict[refseq_id]
                ref_seq = ref_seq_raw[utr_st:utr_ed].replace("T", "U")
                print(mirbase_id, symbol, mir_seq, refseq_id, utr_st, utr_ed,
                      ref_seq, file=output_file, sep="\t", end="\n")
                mirna_dist['>' + mirbase_id + '|' + symbol] = mir_seq
                targetrna_dist['>' + refseq_id] = ref_seq_raw
            else:
                print("ERROR: " + refseq_id + '|' + mirbase_id,
                      file=error_file, end="\n")

    with open(p.mirmark_mirna_fasta, 'w') as mirna_file:
        for tag, seq in mirna_dist.items():
            print(tag, file=mirna_file, end="\n")
            print(seq, file=mirna_file, end="\n")
    with open(p.mirmark_targetrna_fasta, 'w') as targetrna_file:
        for tag, seq in targetrna_dist.items():
            print(tag, file=targetrna_file, end="\n")
            print(seq, file=targetrna_file, end="\n")
    utils.now_time("mirmark_result script was successfully finished!!")
def main():
    """Collect per-transcript phyloP score lists from the per-base shelves.

    For each chromosome (chr1-chr22, chrX, chrY, chrM) the BED reference
    ``p.phylop_score_list_reference`` is scanned for records on that
    chromosome.  For every record the per-position scores are looked up in
    ``<p.phylop_score_list_db_input><chrom>.phyloP46way_Refseq_CDS.db`` (keys
    are positions as strings) and stored under the record name:

      * 12 or more columns: scores of all blocks, in block order, using
        column 11 (block sizes) and column 12 (block starts relative to the
        record start);
      * 6 to 11 columns: scores of ``[start, end)``.

    For ``-`` strand records the complete list is reversed once.  Results go
    to ``<output><chrom>.phyloP46way_Refseq_for_MIRAGE_CDS.db`` and are also
    merged into ``<output>phyloP46way_Refseq_for_MIRAGE_CDS.db``.

    Exits with status 1 if a record on the current chromosome has fewer than
    six columns (name and strand are required).
    """
    utils.now_time("Input_file: " + p.phylop_score_list_db_input)
    utils.now_time("Reference_file: " + p.phylop_score_list_reference)
    utils.now_time("Output_file: " + p.phylop_score_list_db_output)
    chromosomes = ['chr%d' % i for i in range(1, 23)] + ['chrX', 'chrY', 'chrM']
    output_merge = p.phylop_score_list_db_output + 'phyloP46way_Refseq_for_MIRAGE_CDS.db'
    with shelve.open(output_merge) as output_merge_shelve:
        for x in chromosomes:
            ref_s = p.phylop_score_list_reference  # mirBase, Refseq etc...
            input_s = p.phylop_score_list_db_input + x + '.phyloP46way_Refseq_CDS.db'
            output_s = p.phylop_score_list_db_output + x + '.phyloP46way_Refseq_for_MIRAGE_CDS.db'
            score_list_dict = {}
            # The reference is re-read for every chromosome; the with-block
            # closes it (and the score shelve) each time.
            with open(ref_s, 'r') as ref_file, \
                    shelve.open(input_s) as input_shelve:
                for line in ref_file:
                    data = line.rstrip().split("\t")
                    if data[0] != x:
                        continue
                    # Validate before indexing: name (col 4) and strand
                    # (col 6) are read below, so six columns are required.
                    if len(data) < 6:
                        print('ERROR: Your BED format file have less than six column.')
                        print('BED format file need to have at least six column [chr, st, ed, name, score, strand]...')
                        sys.exit(1)
                    record_start = int(data[1])
                    name = data[3]
                    strand = data[5]
                    if len(data) >= 12:  # 12bed format
                        # Drop the last item, expected to be '' from the
                        # trailing comma of the block lists.
                        exon_block = data[10].split(',')[:-1]
                        exon_st = data[11].split(',')[:-1]
                        intervals = [
                            (record_start + int(exon_st[y]),
                             record_start + int(exon_st[y]) + int(exon_block[y]))
                            for y in range(len(exon_block))
                        ]
                    else:  # 6bed format
                        intervals = [(record_start, int(data[2]))]
                    scores = [
                        input_shelve[str(pos)]
                        for st, ed in intervals
                        for pos in range(st, ed)
                    ]
                    if strand == '-':
                        scores.reverse()
                    score_list_dict[name] = scores
            with shelve.open(output_s) as output_shelve:
                output_shelve.update(score_list_dict)
            output_merge_shelve.update(score_list_dict)
    utils.now_time("phylop_score_list script was successfully finished!!")
def main():
    """Build the miRNA / target tables and FASTA files for the Cupid set.

    Inputs (all paths from ``p``):
      * ``mirbase_pre_output`` -- ``<id>|<symbol><TAB><seq>`` lines; only ids
        starting with ``hsa`` are kept.
      * ``refseq_pre_output``  -- ``<refseq id><TAB><seq>`` lines.
      * ``cupid_pos``          -- tab-separated; column 3 is the RefSeq id,
        column 4 the miRNA id and column 5 a ``start-end`` range.  The
        ``AvgProb[0,1]`` header row and RefSeq id ``NM_000927`` are skipped.

    Outputs:
      * ``cupid_output`` -- one tab-separated row per resolved pair:
        miRNA id, symbol, miRNA seq, RefSeq id, site start, site end,
        site sequence (``T`` replaced by ``U``).
      * ``cupid_error``  -- ``ERROR: <refseq>|<mirna>`` for unresolved pairs.
      * ``cupid_mirna_fasta`` / ``cupid_targetrna_fasta`` -- de-duplicated
        FASTA records of the miRNAs and full target sequences that were used.

    Only the range end is used: the site start is recomputed as
    ``end - len(miRNA) - 5``, clamped at 0 so that a site near the beginning
    of the sequence is not sliced from the far end.
    """
    utils.now_time("Input_file: " + p.cupid_pos)
    utils.now_time("Output_file: " + p.cupid_output)
    utils.now_time("miRNA_file: " + p.cupid_mirna_fasta)
    utils.now_time("targetRNA_file: " + p.cupid_targetrna_fasta)
    utils.now_time("Refseq_data: " + p.refseq_pre_output)
    utils.now_time("miRBase_data: " + p.mirbase_pre_output)

    # mirbase_dict: id -> [symbol, sequence]
    mirbase_dict = {}
    with open(p.mirbase_pre_output, 'r') as mirbase_file:
        for line in mirbase_file:
            data = line.rstrip().split("\t")
            infor = data[0].split('|')
            mirbase_id = infor[0]
            symbol = infor[1]
            seq = data[1]
            if not mirbase_id.startswith('hsa'):
                continue
            mirbase_dict[mirbase_id] = [symbol, seq]

    # refseq_dict: id -> sequence
    refseq_dict = {}
    with open(p.refseq_pre_output, 'r') as refseq_file:
        for line in refseq_file:
            data = line.rstrip().split("\t")
            refseq_dict[data[0]] = data[1]

    # main
    mirna_dist = {}      # FASTA tag -> miRNA sequence
    targetrna_dist = {}  # FASTA tag -> full target sequence
    with open(p.cupid_pos, 'r') as input_file, \
            open(p.cupid_output, 'w') as output_file, \
            open(p.cupid_error, 'w') as error_file:
        for line in input_file:
            line = line.rstrip()
            if not line:
                continue
            data = line.split("\t")
            if data[0] == 'AvgProb[0,1]':
                continue
            mirbase_id = data[4]
            refseq_id = data[3]
            if refseq_id == "NM_000927":
                continue
            utr_ed = int(data[5].split('-')[1])
            if mirbase_id in convert_mirbase_id:
                mirbase_id = convert_mirbase_id[mirbase_id]
            if refseq_id in refseq_dict and mirbase_id in mirbase_dict:
                symbol, mir_seq = mirbase_dict[mirbase_id]
                # A negative start would make the slice wrap around from the
                # end of the sequence, so clamp it to 0.
                utr_st = max(0, utr_ed - len(mir_seq) - 5)
                ref_seq_raw = refseq_dict[refseq_id]
                ref_seq = ref_seq_raw[utr_st:utr_ed].replace("T", "U")
                print(mirbase_id, symbol, mir_seq, refseq_id, utr_st, utr_ed,
                      ref_seq, file=output_file, sep="\t", end="\n")
                mirna_dist['>' + mirbase_id + '|' + symbol] = mir_seq
                targetrna_dist['>' + refseq_id] = ref_seq_raw
            else:
                print("ERROR: " + refseq_id + '|' + mirbase_id,
                      file=error_file, end="\n")

    with open(p.cupid_mirna_fasta, 'w') as mirna_file:
        for tag, seq in mirna_dist.items():
            print(tag, file=mirna_file, end="\n")
            print(seq, file=mirna_file, end="\n")
    with open(p.cupid_targetrna_fasta, 'w') as targetrna_file:
        for tag, seq in targetrna_dist.items():
            print(tag, file=targetrna_file, end="\n")
            print(seq, file=targetrna_file, end="\n")
    utils.now_time("cupid_result script was successfully finished!!")
def main():
    """Command-line entry point of the MIRAGE pipeline.

    Parses the analysis type (``estimation`` or ``prediction``) and the miRNA
    and TargetRNA FASTA paths, verifies that both FASTA files exist, stores
    the paths in ``common_parameters`` under ``MIRNA_FASTA_PATH`` and
    ``TARGETRNA_FASTA_PATH``, and then runs ``module.estimate`` or
    ``module.predict`` as ``__main__`` via ``runpy``.

    Exits with status 1 when either FASTA file is missing.
    """
    parser = argparse.ArgumentParser(
        prog='mirage',
        description='MIRAGE - Comprehensive miRNA target prediction pipeline')
    parser.add_argument(
        'analysis_type',
        action='store',
        help='Analysis_type: Choose estimation or prediction',
        choices=['estimation', 'prediction'])
    parser.add_argument(
        'mirna_fasta',
        action='store',
        help='miRNA fasta file: Specify miRNA fasta file to use the analysis')
    parser.add_argument(
        'targetrna_fasta',
        action='store',
        help='TargetRNA fasta file: Specify TargetRNA fasta file to use the analysis')
    parser.add_argument(
        '-m', '--mirna-conservation-score-file',
        action='store',
        dest='mirna_conservation',
        help='Conservation score file about miRNA: Specify your conservation score db file. MIRAGE preparetion toolkits enables you to make the score files about TargetRNA or miRNA bed files.')
    parser.add_argument(
        '-t', '--targetrna-conservation-score-file',
        action='store',
        dest='targetrna_conservation',
        help='Conservation score file about TargetRNA: Specify your conservation score db file. MIRAGE preparetion toolkits enables you to make the score files about TargetRNA or miRNA bed files.')
    args = parser.parse_args()

    # Start analysis - logging
    greeting()
    utils.now_time("MIRAGE miRNA target prediction starting...")
    analysis_type = args.analysis_type
    mirna_fasta_path = args.mirna_fasta
    targetrna_fasta_path = args.targetrna_fasta
    # TODO: args.mirna_conservation / args.targetrna_conservation are parsed
    # but not yet validated or used here.

    # Check fasta files
    if not os.path.isfile(mirna_fasta_path):
        print("Error: miRNA fasta file does not exist...")
        sys.exit(1)
    if not os.path.isfile(targetrna_fasta_path):
        print("Error: TargetRNA fasta file does not exist...")
        # Previously the run continued after this message; stop here like
        # the miRNA check above does.
        sys.exit(1)

    # parameters
    param = dict(
        MIRNA_FASTA_PATH=mirna_fasta_path,
        TARGETRNA_FASTA_PATH=targetrna_fasta_path,
    )
    common_parameters.update(param)
    p = utils.Bunch(common_parameters)
    print('miRNA_Fasta_file: ' + p.MIRNA_FASTA_PATH, end="\n")
    print('TargetRNA_Fasta_file: ' + p.TARGETRNA_FASTA_PATH, end="\n")

    # runpy - choose analysis type
    if analysis_type == 'estimation':
        runpy.run_module('module.estimate', run_name="__main__", alter_sys=True)
    elif analysis_type == 'prediction':
        runpy.run_module('module.predict', run_name="__main__", alter_sys=True)
    else:
        # Not reachable through argparse ``choices``; kept as a safety net.
        print('Error: Analysis type is wrong...')
        sys.exit(1)
def main():
    """Reduce the per-base phyloP tables to the positions covered by a BED file.

    For each chromosome (chr1-chr22, chrX, chrY, chrM):

      1. Every position covered by a record of ``p.phylop_sizedown_bed_input``
         on that chromosome is registered with score 0.  Records with 12 or
         more columns contribute their blocks (columns 11 and 12); records
         with 3 to 11 columns contribute ``[start, end)``.
      2. ``<p.phylop_sizedown_score_input><chrom>.phyloP46way.bed`` is read;
         lines are either ``chr..., position, score`` or ``position, score``.
         Scores of registered positions replace the 0 placeholder.
      3. The position -> score mapping (string keys) is stored in the shelve
         ``<p.phylop_sizedown_score_output><chrom>.phyloP46way_Refseq_CDS.db``.

    Exits with status 1 if a BED record on the current chromosome has fewer
    than three columns.
    """
    utils.now_time("Input_file: " + p.phylop_sizedown_score_input)
    utils.now_time("Reference_file: " + p.phylop_sizedown_bed_input)
    utils.now_time("Output_file: " + p.phylop_sizedown_score_output)
    chromosomes = ['chr%d' % i for i in range(1, 23)] + ['chrX', 'chrY', 'chrM']
    for x in chromosomes:
        ref_s = p.phylop_sizedown_bed_input  # mirBase, Refseq etc...
        input_s = p.phylop_sizedown_score_input + x + '.phyloP46way.bed'
        output_s = p.phylop_sizedown_score_output + x + '.phyloP46way_Refseq_CDS.db'
        score_dict = {}
        # The reference is re-read for every chromosome; each handle is
        # closed by its with-block instead of leaking until exit.
        with open(ref_s, 'r') as ref_file:
            for line in ref_file:
                data = line.rstrip().split("\t")
                if data[0] != x:
                    continue
                if len(data) >= 12:  # 12bed format
                    # Drop the last item, expected to be '' from the
                    # trailing comma of the block lists.
                    exon_block = data[10].split(',')[:-1]
                    exon_st = data[11].split(',')[:-1]
                    for y in range(len(exon_block)):
                        st = int(data[1]) + int(exon_st[y])
                        ed = st + int(exon_block[y])
                        for pos in range(st, ed):
                            score_dict[str(pos)] = 0
                elif len(data) >= 3:  # 6bed format
                    for pos in range(int(data[1]), int(data[2])):
                        score_dict[str(pos)] = 0
                else:
                    print('ERROR: Your BED format file have less than three column.')
                    print('BED format file need to have at least three column [chr, st, ed]...')
                    sys.exit(1)
        utils.now_time('Reference_file was loaded.')
        with open(input_s, 'r') as phylop_sizedown_input_file:
            for line in phylop_sizedown_input_file:
                line = line.rstrip()
                if not line:
                    continue
                data = line.split("\t")
                if data[0].startswith('chr'):
                    st_site, score = data[1], data[2]
                else:
                    st_site, score = data[0], data[1]
                if st_site in score_dict:
                    score_dict[st_site] = score
        with shelve.open(output_s) as shelve_db:
            shelve_db.update(score_dict)
    utils.now_time("phylop_sizedown script was successfully finished!!")
#!usr/bin/env python import re from parameter.common_parameters import common_parameters from parameter.convert_mirbase_id import convert_mirbase_id import utils.setting_utils as utils utils.now_time("cupid_result script starting...") p = utils.Bunch(common_parameters) def main(): utils.now_time("Input_file: " + p.cupid_pos) utils.now_time("Output_file: " + p.cupid_output) utils.now_time("miRNA_file: " + p.cupid_mirna_fasta) utils.now_time("targetRNA_file: " + p.cupid_targetrna_fasta) utils.now_time("Refseq_data: " + p.refseq_pre_output) utils.now_time("miRBase_data: " + p.mirbase_pre_output) refseq_dict = {} mirbase_dict = {} #mirbase_dict mirbase_file = open(p.mirbase_pre_output,'r') for line in mirbase_file: line = line.rstrip() data = line.split("\t") infor = data[0].split('|') mirbase_id = infor[0] symbol = infor[1] seq = data[1] if not re.match('hsa',mirbase_id): continue
#!usr/bin/env python import re from parameter.common_parameters import common_parameters import utils.setting_utils as utils utils.now_time("mirbase_pre script starting...") p = utils.Bunch(common_parameters) def main(): utils.now_time("Input_file: " + p.mirbase_pre_input) utils.now_time("Output_file: " + p.mirbase_pre_output) input_file = open(p.mirbase_pre_input,'r') output_file = open(p.mirbase_pre_output,'w') flg = 0 seq = "" for line in input_file: line = line.rstrip() if re.match(r"^>",line): #Header data = line.split() mir_id = data[0] mir_id = mir_id.replace('>','') symbol = data[1] infor = mir_id + '|' + symbol if flg == 1: print (seq,file=output_file,end="\n") print (infor,file=output_file,end="\t") flg = 1 seq = "" else: #Sequence seq += line
#!usr/bin/env python import re from parameter.common_parameters import common_parameters import utils.setting_utils as utils utils.now_time("mirbase_gff2bed script starting...") p = utils.Bunch(common_parameters) def main(): utils.now_time("Input_file: " + p.mirbase_gff2bed_input) utils.now_time("Output_file: " + p.mirbase_gff2bed_output) mirbase_gff_file = open(p.mirbase_gff2bed_input, "r") mirbase_bed_file = open(p.mirbase_gff2bed_output, "w") for line in mirbase_gff_file: line = line.rstrip() data = line.split("\t") if re.match(r"^#", line): continue chrom = data[0] status = data[2] st = int(data[3]) - 1 ed = data[4] strand = data[6] if status == "miRNA_primary_transcript": continue name_infor = data[8].split(";") mir_id = re.sub(r"^ID=", "", name_infor[0])
#!/usr/bin/env python
import re
from parameter.common_parameters import common_parameters
import utils.setting_utils as utils

utils.now_time("liftOver script starting...")
p = utils.Bunch(common_parameters)


def main():
    """Run the bundled liftOver binary with the hg38ToHg19 chain file.

    Input and output paths come from the parameter bunch ``p``;
    ``<output>.error`` is passed as the fourth liftOver argument.
    """
    input_file = p.liftover_input
    output_file = p.liftover_output
    error_file = output_file + '.error'
    command_liftover = '../software/liftOver ' + input_file + ' ../software/hg38ToHg19.over.chain ' + output_file + ' ' + error_file
    utils.run_command(command_liftover)
    utils.now_time("liftOver script was successfully finished!!")


if __name__ == '__main__':
    main()
#!usr/bin/env python import re from parameter.common_parameters import common_parameters from parameter.convert_mirbase_id import convert_mirbase_id import utils.setting_utils as utils utils.now_time("mirmark_result script starting...") p = utils.Bunch(common_parameters) def main(): utils.now_time("Input_file: " + p.mirmark_pos) utils.now_time("Output_file: " + p.mirmark_output) utils.now_time("miRNA_file: " + p.mirmark_mirna_fasta) utils.now_time("TargetRNA_file: " + p.mirmark_targetrna_fasta) utils.now_time("Refseq_data: " + p.refseq_pre_output) utils.now_time("miRBase_data: " + p.mirbase_pre_output) refseq_dict = {} mirbase_dict = {} #mirbase_dict mirbase_file = open(p.mirbase_pre_output, 'r') for line in mirbase_file: line = line.rstrip() data = line.split("\t") infor = data[0].split('|') mirbase_id = infor[0] symbol = infor[1] seq = data[1] if not re.match('hsa', mirbase_id): #Only choose h**o sapiens miRNA
#!usr/bin/env python import sys import re import shelve from parameter.common_parameters import common_parameters import utils.setting_utils as utils utils.now_time("phastcons_sizedown script starting...") p = utils.Bunch(common_parameters) def main(): utils.now_time("Input_file: " + p.phastcons_sizedown_score_input) utils.now_time("Reference_file: " + p.phastcons_sizedown_bed_input) utils.now_time("Output_file: " + p.phastcons_sizedown_score_output) ''' ref_s = p.phastcons_sizedown_bed_input #mirBase, Refseq etc... ref_file = open(ref_s,'r') ref_dict = {} #{NM_000XXXX: [st1,ed1],[st2,ed2]} for line in ref_file: line = line.rstrip() data = line.split("\t") if len(data) >= 12: #12bed format st = 0 ed = 0 exon_block = data[10].split(',') exon_block.pop() exon_st = data[11].split(',') exon_st.pop()
#!usr/bin/env python import re from parameter.common_parameters import common_parameters import utils.setting_utils as utils utils.now_time("Refseq_pre script starting...") p = utils.Bunch(common_parameters) def main(): utils.now_time("Input_file: " + p.refseq_pre_input) utils.now_time("Output_file: " + p.refseq_pre_output) input_file = open(p.refseq_pre_input,'r') output_file = open(p.refseq_pre_output,'w') flg = 0 seq = "" for line in input_file: line = line.rstrip() if re.match(r"^>",line): #Header data = line.split() refseq_id = data[0] refseq_id = refseq_id.replace('>hg19_refGene_','') if flg == 1: print (seq,file=output_file,end="\n") print (refseq_id,file=output_file,end="\t") flg = 1 seq = "" else: #Sequence seq += line print (seq,file=output_file,end="\n") utils.now_time("Refseq_pre script was successfully finished!!")
#run_log("Calculating target site composition...", 4) result_dict = target_site_composition(targetrna_seq, tmp_dict) return run_result(result_dict) def detect_rev_seed_match(mirna_id, targetrna_id): ''' X1_seed_match_rev ''' mirna_seq, targetrna_seq = get_sequence(mirna_id, targetrna_id) targetrna_seq_revcomp = utils.reverse_complement(targetrna_seq) tmp_dict = find_mirna_subtarget_candidates(mirna_id,mirna_seq,targetrna_id,targetrna_seq_revcomp) # => list() return run_result(tmp_dict) ###MAIN### utils.now_time("MIRAGE estimate is starting...") mirna_dict = utils.load_fasta(p.MIRNA_FASTA_PATH) targetrna_dict = utils.load_fasta(p.TARGETRNA_FASTA_PATH) '''#shelve #shelve_file ###Save_file shelve_path = utils.get_absolute_path('./seed_match.db') if os.path.isfile(shelve_path): #if shelve_file exists, it'll be removed. os.remove(shelve_path) seed_match_db = shelve.open('./seed_match.db') '''#shelve ###Conservation_files mirna_phastcons_path = utils.get_absolute_path('../data/PhastCons46Ways/phastCons46way_miRBase_v21_hg38Tohg19_for_MIRAGE.db') mirna_phylop_path = utils.get_absolute_path('../data/PhyloP/phyloP46way_miRBase_v21_hg38Tohg19_for_MIRAGE.db')
#!usr/bin/env python import sys import re import shelve from parameter.common_parameters import common_parameters import utils.setting_utils as utils utils.now_time("phylop_sizedown script starting...") p = utils.Bunch(common_parameters) def main(): utils.now_time("Input_file: " + p.phylop_sizedown_score_input) utils.now_time("Reference_file: " + p.phylop_sizedown_bed_input) utils.now_time("Output_file: " + p.phylop_sizedown_score_output) ''' ref_s = p.phastcons_sizedown_bed_input #mirBase, Refseq etc... ref_file = open(ref_s,'r') ref_dict = {} #{NM_000XXXX: [st1,ed1],[st2,ed2]} for line in ref_file: line = line.rstrip() data = line.split("\t") if len(data) >= 12: #12bed format st = 0 ed = 0 exon_block = data[10].split(',') exon_block.pop() exon_st = data[11].split(',') exon_st.pop()
def run_log(comment, step):
    """Log ``comment`` only when ``step`` is the next step awaiting a log entry.

    The module-level counter ``flg_find_mirna_target_candidates`` holds the
    number of the next step to report. When it equals ``step`` the comment is
    written through ``utils.now_time`` and the counter advances by one;
    otherwise the call does nothing.
    """
    global flg_find_mirna_target_candidates
    if flg_find_mirna_target_candidates != step:
        return
    utils.now_time(comment)
    flg_find_mirna_target_candidates = flg_find_mirna_target_candidates + 1
#!usr/bin/env python import os import re from parameter.common_parameters import common_parameters import utils.setting_utils as utils utils.now_time("phastcons_prep script starting...") p = utils.Bunch(common_parameters) def main(): utils.now_time("Input_file: " + p.phastcons_prep_input) utils.now_time("Output_file: " + p.phastcons_prep_output) for x in [ 'chrY' ]: #['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY','chrM']: input_s = p.phastcons_prep_input + x + '.phastCons46way.wigFix' output_s = p.phastcons_prep_input + x + '.phastCons46way.bed' phastcons_prep_input_file = open(input_s, 'r') phastcons_prep_output_file = open(output_s, 'w') chrom = '' start_site = 0 step = 1 for line in phastcons_prep_input_file: line = line.rstrip() if re.match(r'^fixedStep', line): regex = r'fixedStep chrom=(?P<chrom>.+) start=(?P<start>.+) step=(?P<step>.+)'
#!usr/bin/env python import re from parameter.common_parameters import common_parameters import utils.setting_utils as utils utils.now_time("mirbase_pre script starting...") p = utils.Bunch(common_parameters) def main(): utils.now_time("Input_file: " + p.mirbase_pre_input) utils.now_time("Output_file: " + p.mirbase_pre_output) input_file = open(p.mirbase_pre_input, 'r') output_file = open(p.mirbase_pre_output, 'w') flg = 0 seq = "" for line in input_file: line = line.rstrip() if re.match(r"^>", line): #Header data = line.split() mir_id = data[0] mir_id = mir_id.replace('>', '') symbol = data[1] infor = mir_id + '|' + symbol if flg == 1: print(seq, file=output_file, end="\n") print(infor, file=output_file, end="\t") flg = 1 seq = "" else: #Sequence
#!/usr/bin/env python import shelve from parameter.common_parameters import common_parameters import utils.setting_utils as utils utils.now_time("phastcons_score_R script starting...") p = utils.Bunch(common_parameters) def main(): utils.now_time("Input_file: " + p.phylop_score_R_input) utils.now_time("Output_file: " + p.phylop_score_R_output) output_s = p.phylop_score_R_output + 'phyloP46way_miRBase_v21_hg38Tohg19.txt' output_file = open(output_s, 'w') #for x in ['chrY']: for x in [ 'chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM' ]: input_s = p.phylop_score_R_input + x + '.phyloP46way_miRBase_v21_hg38Tohg19_for_MIRAGE.db' input_shelve = shelve.open(input_s) max_length = 28 #Max_length: 28nt(miRNA) for keys in input_shelve.keys(): values = input_shelve[keys] value_length = len(values) add_length = max_length - value_length
#!usr/bin/env python import re from parameter.common_parameters import common_parameters import utils.setting_utils as utils utils.now_time("mirbase_gff2bed script starting...") p = utils.Bunch(common_parameters) def main(): utils.now_time("Input_file: " + p.mirbase_gff2bed_input) utils.now_time("Output_file: " + p.mirbase_gff2bed_output) mirbase_gff_file = open(p.mirbase_gff2bed_input,'r') mirbase_bed_file = open(p.mirbase_gff2bed_output,'w') for line in mirbase_gff_file: line = line.rstrip() data = line.split("\t") if re.match(r'^#',line): continue chrom = data[0] status = data[2] st = int(data[3]) - 1 ed = data[4] strand = data[6] if status == 'miRNA_primary_transcript': continue name_infor = data[8].split(';') mir_id = re.sub(r'^ID=','',name_infor[0]) mir_id_number = ''
#!usr/bin/env python import sys import re import shelve from parameter.common_parameters import common_parameters import utils.setting_utils as utils utils.now_time("phylop_score_list script starting...") p = utils.Bunch(common_parameters) def main(): utils.now_time("Input_file: " + p.phylop_score_list_db_input) utils.now_time("Reference_file: " + p.phylop_score_list_reference) utils.now_time("Output_file: " + p.phylop_score_list_db_output) output_merge = p.phylop_score_list_db_output + 'phyloP46way_Refseq_for_MIRAGE_CDS.db' #'phyloP46way_miRBase_v21_hg38Tohg19_for_MIRAGE.db' output_merge_shelve = shelve.open(output_merge) #for x in ['chrY']: for x in ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY','chrM']: ref_s = p.phylop_score_list_reference #mirBase, Refseq etc... ref_file = open(ref_s,'r') input_s = p.phylop_score_list_db_input + x + '.phyloP46way_Refseq_CDS.db' #'.phyloP46way_Refseq.db' output_s = p.phylop_score_list_db_output + x + '.phyloP46way_Refseq_for_MIRAGE_CDS.db' #'.phyloP46way_Refseq_for_MIRAGE.db' input_shelve = shelve.open(input_s) output_shelve = shelve.open(output_s) score_list_dict = {}
#!/usr/bin/env python
"""Extract 3'UTR regions from a BED12 file via ``bed12to3UTRbed.sh``."""

import re

from parameter.common_parameters import common_parameters
import utils.setting_utils as utils

utils.now_time("bed_3UTR script starting...")
p = utils.Bunch(common_parameters)


def main():
    """Run ``../software/bed12to3UTRbed.sh`` on the configured BED file.

    Reads the input path from ``p.bed_3UTR_input`` and redirects the helper
    script's standard output to ``p.bed_3UTR_output``. The command line is
    echoed to stdout before being handed to ``utils.run_command``.
    """
    input_file = p.bed_3UTR_input
    output_file = p.bed_3UTR_output
    # The helper script writes to stdout, hence the shell redirection.
    command_bed_3UTR = '../software/bed12to3UTRbed.sh ' + input_file + ' > ' + output_file
    print (command_bed_3UTR)
    utils.run_command(command_bed_3UTR)
    utils.now_time("bed_3UTR script was successfully finished!!")


if __name__ == '__main__':
    main()
#!usr/bin/env python import re from parameter.common_parameters import common_parameters from parameter.convert_mirbase_id import convert_mirbase_id import utils.setting_utils as utils utils.now_time("mirmark_result script starting...") p = utils.Bunch(common_parameters) def main(): utils.now_time("Input_file: " + p.mirmark_pos) utils.now_time("Output_file: " + p.mirmark_output) utils.now_time("miRNA_file: " + p.mirmark_mirna_fasta) utils.now_time("TargetRNA_file: " + p.mirmark_targetrna_fasta) utils.now_time("Refseq_data: " + p.refseq_pre_output) utils.now_time("miRBase_data: " + p.mirbase_pre_output) refseq_dict = {} mirbase_dict = {} #mirbase_dict mirbase_file = open(p.mirbase_pre_output,'r') for line in mirbase_file: line = line.rstrip() data = line.split("\t") infor = data[0].split('|') mirbase_id = infor[0] symbol = infor[1] seq = data[1] if not re.match('hsa',mirbase_id): #Only choose h**o sapiens miRNA continue
#!usr/bin/env python import re from parameter.common_parameters import common_parameters import utils.setting_utils as utils utils.now_time("Refseq_pre script starting...") p = utils.Bunch(common_parameters) def main(): utils.now_time("Input_file: " + p.refseq_pre_input) utils.now_time("Output_file: " + p.refseq_pre_output) input_file = open(p.refseq_pre_input, 'r') output_file = open(p.refseq_pre_output, 'w') flg = 0 seq = "" for line in input_file: line = line.rstrip() if re.match(r"^>", line): #Header data = line.split() refseq_id = data[0] refseq_id = refseq_id.replace('>hg19_refGene_', '') if flg == 1: print(seq, file=output_file, end="\n") print(refseq_id, file=output_file, end="\t") flg = 1 seq = "" else: #Sequence seq += line print(seq, file=output_file, end="\n")
#!usr/bin/env python import re from parameter.common_parameters import common_parameters from parameter.convert_mirbase_id import convert_mirbase_id import utils.setting_utils as utils utils.now_time("cupid_result script starting...") p = utils.Bunch(common_parameters) def main(): utils.now_time("Input_file: " + p.cupid_pos) utils.now_time("Output_file: " + p.cupid_output) utils.now_time("miRNA_file: " + p.cupid_mirna_fasta) utils.now_time("targetRNA_file: " + p.cupid_targetrna_fasta) utils.now_time("Refseq_data: " + p.refseq_pre_output) utils.now_time("miRBase_data: " + p.mirbase_pre_output) refseq_dict = {} mirbase_dict = {} #mirbase_dict mirbase_file = open(p.mirbase_pre_output, 'r') for line in mirbase_file: line = line.rstrip() data = line.split("\t") infor = data[0].split('|') mirbase_id = infor[0] symbol = infor[1] seq = data[1] if not re.match('hsa', mirbase_id):
#!/usr/bin/env python import shelve from parameter.common_parameters import common_parameters import utils.setting_utils as utils utils.now_time("phastcons_score_R script starting...") p = utils.Bunch(common_parameters) def main(): utils.now_time("Input_file: " + p.phylop_score_R_input) utils.now_time("Output_file: " + p.phylop_score_R_output) output_s = p.phylop_score_R_output + 'phyloP46way_miRBase_v21_hg38Tohg19.txt' output_file = open(output_s,'w') #for x in ['chrY']: for x in ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY','chrM']: input_s = p.phylop_score_R_input + x + '.phyloP46way_miRBase_v21_hg38Tohg19_for_MIRAGE.db' input_shelve = shelve.open(input_s) max_length = 28 #Max_length: 28nt(miRNA) for keys in input_shelve.keys(): values = input_shelve[keys] value_length = len(values) add_length = max_length - value_length null_value = [0.000 for i in range(add_length)] values += null_value value_string = "\t".join(map(str, values)) print(keys,value_string, file=output_file, sep="\t", end="\n") input_shelve.close()
# Wiring for every preparation type:
#   'input' / 'output' -> common_parameters keys that receive -i / -o
#   'reference'        -> key that receives -r (its presence makes -r mandatory)
#   'additional'       -> keys that receive the three -a values, in order
#                         (its presence makes -a mandatory)
#   'module'           -> module executed through runpy for this type
_PREPARATION_SPECS = {
    'bed_3UTR': {
        'input': 'bed_3UTR_input',
        'output': 'bed_3UTR_output',
        'module': 'module.preparation.bed_3UTR',
    },
    'mirbase_gff2bed': {
        'input': 'mirbase_gff2bed_input',
        'output': 'mirbase_gff2bed_output',
        'module': 'module.preparation.mirbase_gff2bed',
    },
    'liftOver': {
        'input': 'liftover_input',
        'output': 'liftover_output',
        'module': 'module.preparation.liftOver',
    },
    'phylop_score_prep': {
        'input': 'phylop_score_prep_input',
        'output': 'phylop_score_prep_output',
        'module': 'module.preparation.phylop_score_prep',
    },
    'phastcons_prep': {
        'input': 'phastcons_prep_input',
        'output': 'phastcons_prep_output',
        'module': 'module.preparation.phastcons_score_prep',
    },
    'phylop_sizedown': {
        'input': 'phylop_sizedown_score_input',
        'output': 'phylop_sizedown_score_output',
        'reference': 'phylop_sizedown_bed_input',
        'module': 'module.preparation.phylop_sizedown',
    },
    'phastcons_sizedown': {
        'input': 'phastcons_sizedown_score_input',
        'output': 'phastcons_sizedown_score_output',
        'reference': 'phastcons_sizedown_bed_input',
        'module': 'module.preparation.phastcons_sizedown',
    },
    'phylop_score_list': {
        'input': 'phylop_score_list_db_input',
        'output': 'phylop_score_list_db_output',
        'reference': 'phylop_score_list_reference',
        'module': 'module.preparation.phylop_score_list',
    },
    'phastcons_score_list': {
        'input': 'phastcons_score_list_db_input',
        'output': 'phastcons_score_list_db_output',
        'reference': 'phastcons_score_list_reference',
        'module': 'module.preparation.phastcons_score_list',
    },
    'phastcons_score_R': {
        'input': 'phastcons_score_R_input',
        'output': 'phastcons_score_R_output',
        'module': 'module.preparation.phastcons_score_R',
    },
    'phylop_score_R': {
        'input': 'phylop_score_R_input',
        'output': 'phylop_score_R_output',
        'module': 'module.preparation.phylop_score_R',
    },
    'refseq_pre': {
        'input': 'refseq_pre_input',
        'output': 'refseq_pre_output',
        'module': 'module.preparation.refseq_pre',
    },
    'mirbase_pre': {
        'input': 'mirbase_pre_input',
        'output': 'mirbase_pre_output',
        'module': 'module.preparation.mirbase_pre',
    },
    'mirmark_result': {
        'input': 'mirmark_pos',
        'output': 'mirmark_output',
        'additional': ('refseq_pre_output', 'mirbase_pre_output',
                       'mirmark_error'),
        'module': 'module.preparation.mirmark_result',
    },
    'cupid_result': {
        'input': 'cupid_pos',
        'output': 'cupid_output',
        'additional': ('refseq_pre_output', 'mirbase_pre_output',
                       'cupid_error'),
        'module': 'module.preparation.cupid_result',
    },
}


def main():
    """Command-line entry point of the MIRAGE data-preparation toolkit.

    Parses the preparation type plus the optional ``-i``/``-o``/``-r``/``-a``
    arguments, stores the supplied paths in the shared ``common_parameters``
    dictionary under the keys listed in ``_PREPARATION_SPECS``, and then
    executes the matching ``module.preparation.*`` module as ``__main__``.

    When neither ``-i`` nor ``-o`` is given, the module runs with whatever
    ``common_parameters`` already contains. When at least one of them is
    given, the input must be an existing file and the output must be
    specified; types that declare a reference or additional files also
    require ``-r`` or ``-a`` respectively.

    Exits with status 1 (after logging an ``ERROR:`` line) on any invalid
    argument combination.
    """
    parser = argparse.ArgumentParser(
        prog="mirage_prepare",
        description="MIRAGE preparation toolkit - Data preparation for MIRAGE")
    parser.add_argument(
        'preparation_type',
        action='store',
        choices=list(_PREPARATION_SPECS),
        help='Preparation Type: refseq_pre|mirbase_pre|mirmark_result|cupid_result|')
    parser.add_argument(
        '-i', '--input-file',
        action='store',
        dest='input_file',
        help='Input file: Specify a input file name and its path')
    parser.add_argument(
        '-r', '--reference-file',
        action='store',
        dest='reference_file',
        help='reference file: Specify a reference file name and its path')
    parser.add_argument(
        '-a', '--additional-file',
        action='store',
        dest='add_file',
        nargs=3,
        help='Additional_file: Specify 1-refseq_pre file dir, 2-mirbase_pre file dir, 3-error log dir)')
    # '--ouput-file' is the historical (misspelled) flag and stays accepted;
    # '--output-file' is offered as the correctly spelled alias.
    parser.add_argument(
        '-o', '--output-file', '--ouput-file',
        action='store',
        dest='output_file',
        help='Output file: Specify a output file name and its path')
    args = parser.parse_args()

    #Start analysis - logging
    greeting()
    utils.now_time('MIRAGE Data Preparation starting...')

    #Parameter preparation
    prep_type = args.preparation_type
    spec = _PREPARATION_SPECS.get(prep_type)
    if spec is None:
        # argparse's `choices` normally prevents this; kept as a safety net.
        utils.now_time("ERROR: Wrong preparation type...")
        sys.exit(1)

    if args.input_file or args.output_file:
        # A missing -i is reported like a non-existent file instead of
        # crashing inside os.path.isfile(None).
        if not (args.input_file and os.path.isfile(args.input_file)):
            utils.now_time("ERROR: InputFile does not exist...")
            sys.exit(1)
        if not args.output_file:
            utils.now_time("ERROR: -o option are required...")
            sys.exit(1)

        custom_params = {
            spec['input']: args.input_file,
            spec['output']: args.output_file,
        }
        if 'reference' in spec:
            if not args.reference_file:
                utils.now_time("ERROR: -r option is required...")
                sys.exit(1)
            custom_params[spec['reference']] = args.reference_file
        if 'additional' in spec:
            if not args.add_file:
                utils.now_time("ERROR: -a option is required...")
                sys.exit(1)
            custom_params.update(zip(spec['additional'], args.add_file))

        # The preparation modules build their own parameter view from
        # common_parameters when they start, so the user-supplied paths must
        # be merged in for every preparation type before the module runs.
        common_parameters.update(custom_params)

    #Preparation type
    runpy.run_module(spec['module'], run_name="__main__", alter_sys=True)
#!usr/bin/env python import os import re from parameter.common_parameters import common_parameters import utils.setting_utils as utils utils.now_time("phastcons_prep script starting...") p = utils.Bunch(common_parameters) def main(): utils.now_time("Input_file: " + p.phastcons_prep_input) utils.now_time("Output_file: " + p.phastcons_prep_output) for x in ['chrY']: #['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY','chrM']: input_s = p.phastcons_prep_input + x + '.phastCons46way.wigFix' output_s = p.phastcons_prep_input + x + '.phastCons46way.bed' phastcons_prep_input_file = open(input_s,'r') phastcons_prep_output_file = open(output_s,'w') chrom = '' start_site = 0 step = 1 for line in phastcons_prep_input_file: line = line.rstrip() if re.match(r'^fixedStep',line): regex = r'fixedStep chrom=(?P<chrom>.+) start=(?P<start>.+) step=(?P<step>.+)' seq = re.match(regex,line) chrom = seq.group('chrom') start_site = int(seq.group('start')) - 1
def main():
    """Command-line entry point of the MIRAGE target-prediction pipeline.

    Parses the analysis type and the miRNA / TargetRNA FASTA paths, verifies
    that both FASTA files exist, publishes the paths through the shared
    ``common_parameters`` dictionary (keys ``MIRNA_FASTA_PATH`` and
    ``TARGETRNA_FASTA_PATH``) and finally executes ``module.estimate`` or
    ``module.predict`` as ``__main__``.

    Exits with status 1 when either FASTA file is missing or the analysis
    type is not recognised.
    """
    parser = argparse.ArgumentParser(
        prog='mirage',
        description='MIRAGE - Comprehensive miRNA target prediction pipeline')
    parser.add_argument(
        'analysis_type',
        action='store',
        help='Analysis_type: Choose estimation or prediction',
        choices=['estimation', 'prediction'])
    parser.add_argument(
        'mirna_fasta',
        action='store',
        help='miRNA fasta file: Specify miRNA fasta file to use the analysis')
    parser.add_argument(
        'targetrna_fasta',
        action='store',
        help='TargetRNA fasta file: Specify TargetRNA fasta file to use the analysis')
    parser.add_argument(
        '-m', '--mirna-conservation-score-file',
        action='store',
        dest='mirna_conservation',
        help='Conservation score file about miRNA: Specify your conservation score db file. '
             'MIRAGE preparetion toolkits enables you to make the score files about TargetRNA or miRNA bed files.')
    parser.add_argument(
        '-t', '--targetrna-conservation-score-file',
        action='store',
        dest='targetrna_conservation',
        help='Conservation score file about TargetRNA: Specify your conservation score db file. '
             'MIRAGE preparetion toolkits enables you to make the score files about TargetRNA or miRNA bed files.')
    args = parser.parse_args()

    #Start analysis - logging
    greeting()
    utils.now_time("MIRAGE miRNA target prediction starting...")
    analysis_type = args.analysis_type
    mirna_fasta_path = args.mirna_fasta
    targetrna_fasta_path = args.targetrna_fasta

    #Check fasta files
    if not os.path.isfile(mirna_fasta_path):
        print ("Error: miRNA fasta file does not exist...")
        sys.exit(1)
    if not os.path.isfile(targetrna_fasta_path):
        print ("Error: TargetRNA fasta file does not exist...")
        # Abort here as well; continuing would start the analysis with a
        # FASTA file that cannot be read.
        sys.exit(1)

    # TODO: the -m / -t conservation score files (args.mirna_conservation,
    # args.targetrna_conservation) are accepted but not yet validated or
    # passed on to the analysis modules.

    #parameters
    param = dict(
        MIRNA_FASTA_PATH=mirna_fasta_path,
        TARGETRNA_FASTA_PATH=targetrna_fasta_path,
    )
    common_parameters.update(param)
    p = utils.Bunch(common_parameters)
    print ('miRNA_Fasta_file: ' + p.MIRNA_FASTA_PATH, end="\n")
    print ('TargetRNA_Fasta_file: ' + p.TARGETRNA_FASTA_PATH, end="\n")

    #runpy - choose analysis type
    if analysis_type == 'estimation':
        runpy.run_module('module.estimate', run_name="__main__", alter_sys=True)
    elif analysis_type == 'prediction':
        runpy.run_module('module.predict', run_name="__main__", alter_sys=True)
    else:
        # argparse's `choices` normally prevents this; kept as a safety net.
        print ('Error: Analysis type is wrong...')
        sys.exit(1)