def hmmpred(obs_seq, na3, forw_rerv, hmmoptions, commonOptions):
    """Viterbi-decode a sequence with the repeat HMM and extract the repeat.

    Gap characters ('-') are removed from ``obs_seq``; every remaining symbol
    is translated to its index in the alphabet carried by ``hmmoptions`` and
    the resulting index column is decoded with the Viterbi algorithm.  The
    state path is handed to ``getPred`` together with the gap-free sequence.

    Parameters:
        obs_seq: observed sequence (string).  Each symbol left after gap
            removal has to be present in the alphabet, otherwise the index
            lookup raises IndexError.
        na3: repeat pattern; only used to obtain the repeat unit length.
        forw_rerv: accepted for interface compatibility, not read here.
        hmmoptions: 9-item sequence unpacked as (transition matrix, start
            probabilities, emission matrix, alphabet, state names, number of
            states, number of symbols, state classes, tolerance info).
        commonOptions: forwarded to ``printHMMmatrix.get_len_repPat``.

    Returns:
        ``[newstr, pre0, ststar]`` -- the three values produced by
        ``getPred``, with the last two swapped relative to its return order.
    """
    gapless = obs_seq.replace('-', '')

    # The result is not used below; the call is kept so that whatever
    # getBasePair() does still happens in the same place.
    bp = getBasePair()
    unit_len = printHMMmatrix.get_len_repPat(na3, commonOptions)

    (transitions, start_probs, emissions, alphabet, _state_names,
     n_states, n_symbols, state_classes, _tol_info) = hmmoptions

    model = hmm.MultinomialHMM(n_states)
    model.transmat_ = transitions
    model.startprob_ = start_probs
    model.emissionprob_ = emissions
    model.n_features = n_symbols

    # One alphabet index per observed symbol, shaped as a single column.
    symbol_ids = [np.where(alphabet == symbol)[0][0] for symbol in gapless]
    observations = np.array([symbol_ids]).T
    _logprob, state_path = model.decode(observations, algorithm="viterbi")

    newstr, ststar, pre0 = getPred(state_path, gapless, state_classes, unit_len)

    if cur_M_STAT <= M_DEBUG:
        rounded_units = int(len(newstr) / float(unit_len) + 0.5)
        print(' '.join(['hmmB:', str(gapless), str(rounded_units)]))
        print(' '.join(['hmmB:', ''.join(map(str, state_path))]))

    return [newstr, pre0, ststar]
# CIGAR tokenisers, compiled once instead of once per read.
_CIGAR_LENGTH_RE = re.compile(r'\d+')
_CIGAR_OP_RE = re.compile(r'[MIDNSHPX=]')
# SAM specification: M, = and X consume query and reference; I and S consume
# the query only; D and N consume the reference only; H and P consume neither.
_CIGAR_QUERY_OPS = frozenset('MIS=X')
_CIGAR_REF_OPS = frozenset('MDN=X')


def _remove_file_if_present(path):
    """Delete ``path`` when it is an existing file; otherwise do nothing."""
    if os.path.isfile(path):
        os.remove(path)


def _repeat_bounds_in_read(ref_pos, op_lengths, ops, repeat_start_end, flank):
    """Walk a CIGAR and locate the flanked repeat region inside the read.

    ``ref_pos`` is the alignment start taken from the SAM record and
    ``op_lengths`` / ``ops`` are the parallel CIGAR length strings and
    operation letters.  For every query base the reference coordinate
    reached so far (``ref_pos - 1`` after the base is consumed) is compared
    with the repeat coordinates.

    Returns ``(start_ind, end_ind)``:
        start_ind: 0-based query index of the last base whose coordinate is
            below ``repeat_start - flank``, or None if there is no such base.
        end_ind: 1 + the 0-based query index of the last base whose
            coordinate lies in ``[repeat_end, repeat_end + flank)``, or None.
    """
    lower = repeat_start_end[0] - flank
    repeat_end = repeat_start_end[1]
    upper = repeat_end + flank

    query_ind = 0
    start_ind = None
    end_ind = None
    for length_txt, op in zip(op_lengths, ops):
        length = int(length_txt)
        uses_ref = op in _CIGAR_REF_OPS
        if op not in _CIGAR_QUERY_OPS:
            # D/N only move along the reference, H/P move nowhere; no query
            # base is produced, so there is nothing to compare.
            if uses_ref:
                ref_pos += length
            continue
        for _ in range(length):
            if uses_ref:
                ref_pos += 1
            query_ind += 1
            covered = ref_pos - 1
            if covered < lower:
                start_ind = query_ind - 1
            if repeat_end <= covered < upper:
                end_ind = query_ind
    return start_ind, end_ind


def _format_repeat_counts(label, counts):
    """Render e.g. ``'true 3 [10,11,12]'`` for the debug log."""
    return (label + str(len(counts)) + ' ['
            + ','.join('%.0f' % count for count in counts) + ']')


def getRepeatForGivenGene(commonOptions, specifiedOptions, moreOptions):
    """Estimate repeat counts for one repeat locus from a BAM file.

    The reads overlapping the gene region are dumped with ``samtools view``
    into a temporary text file (retrying with the first three characters of
    the chromosome name removed when the first attempt yields nothing), the
    file is read and deleted, and for every read the part spanning the
    flanked repeat region is located by walking its CIGAR string.  Reads that
    span the region are passed to ``getUnsymAlignAndHMM``; the resulting
    repeat counts go to ``myGaussianMixtureModel.get2Peaks``.

    Parameters:
        commonOptions: dict; the keys read here are 'repeatFlankLength',
            'MinSup', 'outlog', 'hgfile' and 'MaxRep'.  The key
            'rep_flanking_data' is written.
        specifiedOptions: dict; the keys read here are 'bamfile',
            'unique_file_id', 'analysis_file_id' and 'align' (output folder
            prefix).  The presence of 'thread' silences some log messages.
        moreOptions: dict; the keys read here are 'chr', 'repeatName',
            'gene_start_end', 'repeat_start_end', 'repPat' and 'forw_rerv'.

    Returns:
        None when no alignment text could be produced, otherwise
        ``[repeatName, ref_repeat, p2, allocr, n_spanning, n_other]`` where
        ``p2`` and ``allocr`` come from ``get2Peaks``, ``n_spanning`` is the
        number of reads spanning the repeat and ``n_other`` is the number of
        remaining reads plus the reads starting after the left flank.
    """
    logging.info(moreOptions['chr'] + ' ' + str(moreOptions['repeat_start_end']))
    chrom = moreOptions['chr']  # local name chosen so builtin chr() is not shadowed
    repeatName = moreOptions['repeatName']
    gene_start_end = moreOptions['gene_start_end']
    repeat_start_end = moreOptions['repeat_start_end']
    repPat = moreOptions['repPat']
    forw_rerv = moreOptions['forw_rerv']

    bamfile = specifiedOptions['bamfile']
    unique_file_id = specifiedOptions['unique_file_id']
    analysis_file_id = specifiedOptions['analysis_file_id']

    repeatFlankLength = commonOptions['repeatFlankLength']
    MinSup = commonOptions['MinSup']

    len_repPat = printHMMmatrix.get_len_repPat(repPat, commonOptions)
    logging.info("len_repPat=" + str(len_repPat))

    alignfolder = specifiedOptions['align']
    if not os.path.isdir(alignfolder):
        os.makedirs(alignfolder)

    ref_repeat = (repeat_start_end[1] - repeat_start_end[0] + 1) / float(len_repPat)

    # ---- dump the alignments of the gene region to a text file ----------
    alignfile = alignfolder + repeatName + unique_file_id + '.alignment.txt'
    region = ':' + str(gene_start_end[0]) + '-' + str(gene_start_end[1])
    get_alg_cmd = 'samtools view ' + bamfile + ' ' + chrom + region + ' > ' + alignfile
    if 'thread' not in specifiedOptions:
        logging.info('Running ' + get_alg_cmd)
    os.system(get_alg_cmd)
    if os.path.getsize(alignfile) == 0:
        if commonOptions['outlog'] <= M_WARNING:
            logging.info(get_alg_cmd + '\n')
            logging.info('The file %s have zero size\nTry without chr' % alignfile)
        # Retry with the first three characters ("chr") stripped.
        get_alg_cmd = ('samtools view ' + bamfile + ' ' + chrom[3:] + region
                       + ' > ' + alignfile)
        if commonOptions['outlog'] <= M_INFO and ('thread' not in specifiedOptions):
            logging.info('Running ' + get_alg_cmd)
        os.system(get_alg_cmd)
    if commonOptions['outlog'] <= M_INFO:
        logging.info('Produced ' + alignfile + ' done!')

    if (not os.path.isfile(alignfile)) or os.path.getsize(alignfile) == 0:
        if commonOptions['outlog'] <= M_FATAL:
            logging.error('!!!!Cannot produce ' + alignfile + ' for ' + repeatName)
        _remove_file_if_present(alignfile)
        return None
    aligndata = myReadTxtFile(alignfile)
    _remove_file_if_present(alignfile)

    # The gene-level result is not used below; the call is kept because
    # getGene may have effects that are not visible from here.
    _predata, _mfadata, _sufdata = getGene(
        repeatName, chrom, gene_start_end, unique_file_id, analysis_file_id,
        commonOptions['hgfile'], repeatFlankLength, specifiedOptions)
    rep_predata, _rep_mfadata, rep_sufdata = getGene(
        repeatName, chrom, repeat_start_end, unique_file_id, analysis_file_id,
        commonOptions['hgfile'], repeatFlankLength, specifiedOptions)
    commonOptions['rep_flanking_data'] = rep_predata, rep_sufdata

    hmmoptions = getHMMOptions(repeatFlankLength, repPat, forw_rerv, commonOptions)

    # ---- locate the repeat region in every read --------------------------
    wrongalign = 0
    # read id -> [start index, end index, longest SEQ seen for that read]
    read_spans = {}
    for line in aligndata:
        fields = line.split('\t')
        readid = fields[0]
        cchr = fields[2]
        if not (chrom == cchr
                or (len(chrom) > 3 and chrom[3:] == cchr)
                or (len(cchr) > 3 and cchr[3:] == chrom)):
            continue

        pos = int(fields[3])
        aligninfo = fields[5]
        aainfo = fields[9]

        if pos > repeat_start_end[0] - repeatFlankLength:
            # Counted, but the read is still processed.
            wrongalign += 1
        if not (cchr == chrom or cchr == chrom[3:]):
            logging.error('Not same ' + cchr + ' ' + chrom)
            continue

        op_lengths = _CIGAR_LENGTH_RE.findall(aligninfo)
        ops = _CIGAR_OP_RE.findall(aligninfo)
        if len(op_lengths) != len(ops):
            logging.error('Num is not equal to mid' + str(len(op_lengths))
                          + ' ' + str(len(ops)))
            continue

        start_ind, end_ind = _repeat_bounds_in_read(
            pos, op_lengths, ops, repeat_start_end, repeatFlankLength)

        if readid not in read_spans:
            read_spans[readid] = [start_ind, end_ind, aainfo]
            continue
        # Several records for the same read: keep the widest span and the
        # longest sequence.
        span = read_spans[readid]
        if start_ind is not None and (span[0] is None or span[0] > start_ind):
            span[0] = start_ind
        if end_ind is not None and (span[1] is None or span[1] < end_ind):
            span[1] = end_ind
        if len(span[2]) < len(aainfo):
            span[2] = aainfo

    # ---- one entry per read: [spans repeat?, sequence or span text, id] --
    repeats = []
    for readid in read_spans:
        start_ind, end_ind, sequence = read_spans[readid]
        span_text = str(start_ind) + '-to-' + str(end_ind)
        if start_ind is None or end_ind is None:
            repeats.append([False, span_text, readid])
        elif end_ind - start_ind > 0:
            repeats.append([True, sequence[start_ind:(end_ind + 1)], readid])
        else:
            if commonOptions['outlog'] <= M_WARNING:
                print(' '.join(['Warning!!! negative ', str(readid),
                                str(read_spans[readid][:2])]))
            repeats.append([False, span_text, readid])

    def describe(entry):
        """Common tail of the 'partial' / 'too long' warnings."""
        return (str(len(entry[1])) + ' ' + chrom + ' ' + repeatName + ' '
                + repPat + ' ' + str(entry[0]) + ' reads name:' + entry[2]
                + " " + str(commonOptions['MaxRep'])
                + " " + str(commonOptions['MaxRep'] * len_repPat))

    # ---- run the HMM on the spanning reads -------------------------------
    rptrue = []
    rpfalse = []
    originals = []
    for currep in repeats:
        spans_repeat, raw_seq, read_name = currep
        newstr = raw_seq
        pre0 = 0
        predstats = ''
        if len(raw_seq) >= commonOptions['MaxRep'] * len_repPat:
            logging.warning('The sequence is too long: ' + describe(currep))
            logging.warning(str(read_spans[read_name][:2]))
        elif spans_repeat:
            newstr, pre0, predstats = getUnsymAlignAndHMM(
                repPat, forw_rerv, repeatFlankLength, hmmoptions, raw_seq,
                commonOptions, read_name)
        elif 'thread' not in specifiedOptions:
            logging.warning('The sequence is partial: ' + describe(currep))
            logging.warning(str(read_spans[read_name][:2]))

        originals.append([raw_seq, pre0, predstats])
        currep[1] = newstr
        unit_count = len(newstr) / float(len_repPat)
        if spans_repeat:
            rptrue.append(unit_count)
        else:
            rpfalse.append(unit_count)

    rptrue.sort()
    rpfalse.sort()
    logging.debug(_format_repeat_counts('true ', rptrue))

    p2, allocr = myGaussianMixtureModel.get2Peaks(rptrue, MinSup,
                                                  commonoptions=commonOptions)

    if len(rpfalse) > 0:
        logging.debug(_format_repeat_counts('fals ', rpfalse))

    logging.info('ref_repeat ' + ('%.0f' % ref_repeat) + '\t' + repPat + '\t' + forw_rerv)

    for currep, (raw_seq, pre0, _path) in zip(repeats, originals):
        # Spanning reads are shown without their last character, other
        # entries are cut at 300 characters (unchanged from the original).
        shown = -1 if currep[0] else 300
        logging.debug('\t' + str(currep[0]) + ' o:' + str(len(raw_seq))
                      + '\t' + raw_seq[:shown])
        logging.debug('\t' + str(currep[0]) + ' p:' + str(len(currep[1]))
                      + '\t' + ' ' * pre0 + currep[1][:shown])

    return [repeatName, ref_repeat, p2, allocr, len(rptrue), len(rpfalse) + wrongalign]
def _default_transition_matrix(n_rep, n_states, insert_rate, del_rate):
    """Build the built-in transition matrix for ``n_rep`` repeat positions.

    Row/column 0 is the 'N' state; columns 1..n_rep, n_rep+1..2*n_rep and
    2*n_rep+1..3*n_rep hold the states prefixed '', 'I' and 'D'.
    """
    background = 1e-9
    n_types = 3
    multi = not n_rep < 2

    trans = np.full((n_states, n_states), 1e-9)
    # N -> N and N -> repeat
    trans[0][0] = 0.96
    trans[0][1] = 0.02 if multi else 0.04
    if multi:
        trans[0][1 + n_rep * 2] = 0.02
    # repeat -> N
    trans[n_rep][0] = 0.02
    trans[n_rep * 2][0] = 0.02
    if multi:
        trans[n_rep * 3 - 1][0] = 0.02

    # background probability of jumping to any plain repeat state
    trans[1:, 1:n_rep + 1] = background

    # transitions into the insertion states
    add_index = n_rep + 1
    for typ_ind in range(n_types):
        for j in range(n_rep):
            if typ_ind < n_types - 1:
                jind = j
            else:
                jind = j + 1
                if jind > n_rep - 1:
                    jind = 0
            trans[n_rep * typ_ind + j + 1][jind + add_index] = insert_rate

    # transitions into the deletion states; skipping k positions costs
    # del_rate**k, never less than 1e-9
    add_index = n_rep * 2 + 1
    for typ_ind in range(n_types):
        for j in range(n_rep):
            row = n_rep * typ_ind + j + 1
            for k in range(1, n_rep):
                if typ_ind < n_types - 1:
                    jind = j + k
                else:
                    if k >= n_rep - 1:
                        continue
                    jind = j + k + 1
                if jind > n_rep - 1:
                    jind -= n_rep
                trans[row][jind + add_index] = del_rate ** k
                if trans[row][jind + add_index] < 1e-9:
                    trans[row][jind + add_index] = 1e-9

    # transition to the following plain repeat state takes whatever
    # probability is left in the row
    add_index = 1
    for typ_ind in range(n_types):
        for j in range(n_rep):
            if typ_ind < n_types - 1:
                jind = j + 1
                if jind > n_rep - 1:
                    jind = 0
            else:
                jind = j + 2
                if jind > n_rep - 1:
                    jind -= n_rep
            row = n_rep * typ_ind + j + 1
            restprob = 1
            for jstat in range(n_states):
                if jstat != jind + add_index:
                    restprob -= trans[row][jstat]
            trans[row][jind + add_index] = restprob
    return trans


def _default_emission_matrix(len_repPat, CompRepPat, sub_rate, obs_symbols):
    """Build the built-in emission matrix (3*len_repPat+1 rows, 5 columns).

    ``CompRepPat[i]`` maps symbols to weights for repeat position ``i``.
    """
    n_rows = len_repPat * 3 + 1
    emis = np.full((n_rows, 5), sub_rate / 4)

    # Rows emitting A/C/G/T uniformly: the N state, every insertion state
    # and, for a single-position repeat, the last row.
    uniform_rows = [0] + [j + len_repPat + 1 for j in range(len_repPat)]
    if len_repPat < 2:
        uniform_rows.append(len_repPat * 3)
    for row in uniform_rows:
        for col in range(4):
            emis[row][col] = 0.25 if row != 0 else 0.2
    # Column 4 ('N' symbol): 0.2 in the N state, 1e-9 elsewhere.
    for row in range(n_rows):
        emis[row][4] = 1e-9 if row != 0 else 0.2

    for naind in range(len_repPat):
        for symbol in CompRepPat[naind].keys():
            col = (np.where(obs_symbols == symbol))[0][0]
            emis[naind + 1][col] += (1 - sub_rate) * CompRepPat[naind][symbol]
        if len_repPat < 2:
            continue
        # Deletion-state rows use the profile of the next position (cyclic).
        afterd = naind + 1 if naind < len_repPat - 1 else 0
        for symbol in CompRepPat[afterd].keys():
            col = (np.where(obs_symbols == symbol))[0][0]
            emis[naind + 1 + len_repPat * 2][col] += \
                (1 - sub_rate) * CompRepPat[afterd][symbol]
    return emis


def getTransition_start_emission_prob_x(repPat, commonOptions, forprint=False):
    """Assemble transition, start and emission parameters of the repeat HMM.

    The state list is 'N' followed by r1..rL, Ir1..IrL and Dr1..DrL, where L
    is the repeat length reported by ``printHMMmatrix.get_len_repPat``.

    Parameters:
        repPat: repeat pattern string; surrounding whitespace is stripped.
        commonOptions: dict.  Read here: 'CompRep' ('0' means the per-position
            profile is derived from ``repPat``, anything else is used as the
            profile itself), 'outlog', 'hmm_insert_rate', 'hmm_del_rate',
            'hmm_sub_rate' and the optional 'transitionm' / 'emissionm'
            matrices.
        forprint: when true, the matrices are printed through
            ``printHMMmatrix.printHMMmatrix``.

    Returns:
        None when no pattern is available (empty ``repPat`` with
        CompRep == '0', or an empty CompRep); otherwise the list
        ``[transition matrix, start probabilities, emission matrix,
        observation symbols, state names (array), number of states,
        number of symbols, state3class, tol_info]``.
    """
    repPat = repPat.strip()

    # The original test joined these two cases with "and", which can never
    # be true (CompRep == '0' and CompRep != '0' at once).
    comp_rep = commonOptions['CompRep']
    if comp_rep == '0':
        if len(repPat) < 1:
            return None
    elif len(comp_rep) < 1:
        return None

    len_repPat = printHMMmatrix.get_len_repPat(repPat, commonOptions)
    if commonOptions['CompRep'] == '0':
        CompRepPat = printHMMmatrix.getCompRepFromSimple(repPat)
    else:
        CompRepPat = commonOptions['CompRep']

    tol_info = produce_tolerate_mismatch(repPat, commonOptions)
    if commonOptions['outlog'] <= M_INFO:
        print('tol_info ' + str(tol_info))
    logging.info('tol_info=' + str(tol_info))

    type_prefixes = ['', 'I', 'D']
    rep_elements = ['r' + str(idx + 1) for idx in range(len_repPat)]
    states = ['N'] + [prefix + element
                      for prefix in type_prefixes
                      for element in rep_elements]
    n_rep = len(rep_elements)

    obs_symbols = np.array(['A', 'C', 'G', 'T', 'N'])

    # NOTE(review): the matrices supplied through commonOptions are used
    # verbatim here; the source layout did not make it unambiguous whether
    # the original also post-processed them -- confirm against upstream.
    # "is not None" is required: comparing a NumPy array with "== None" is
    # elementwise and cannot be used as a truth value.
    if commonOptions.get('transitionm') is not None:
        trainsmat = commonOptions['transitionm']
    else:
        trainsmat = _default_transition_matrix(
            n_rep, len(states),
            commonOptions['hmm_insert_rate'], commonOptions['hmm_del_rate'])

    startprob = np.full(len(states), 1e-9)
    startprob[0] = 0.96
    if len_repPat < 2:
        startprob[1] = 0.04
    else:
        startprob[1] = 0.02
        startprob[1 + n_rep * 2] = 0.02

    if commonOptions.get('emissionm') is not None:
        emisionmat = commonOptions['emissionm']
    else:
        emisionmat = _default_emission_matrix(
            len_repPat, CompRepPat, commonOptions['hmm_sub_rate'], obs_symbols)

    if forprint:
        if commonOptions['outlog'] <= M_INFO:
            print('HMMmatrix1')
        printHMMmatrix.printHMMmatrix(states, obs_symbols, trainsmat,
                                      emisionmat, startprob)

    # State indices grouped by prefix ('', 'I', 'D').
    state3class = [list(range(1, len_repPat + 1)),
                   list(range(len_repPat + 1, 2 * len_repPat + 1)),
                   list(range(2 * len_repPat + 1, 3 * len_repPat + 1))]

    return [trainsmat, startprob, emisionmat, obs_symbols, np.array(states),
            len(states), len(obs_symbols), state3class, tol_info]
def getSCA3ForGivenGene(commonOptions, specifiedOptions, moreOptions):
    """Align the reads of a FASTA/FASTQ file and detect repeats for one gene.

    The reads in ``specifiedOptions['fastafile']`` are aligned with
    ``bwa mem`` against ``commonOptions['hgfile']``, sorted and indexed with
    samtools, and then analysed by up to two detectors selected through
    ``commonOptions['SplitAndReAlign']`` (0 or 2: BAM/HMM detector, 1 or 2:
    split-and-realign detector; the module-level ``testall`` flag enables
    both).  The temporary BAM file and its index are deleted afterwards.

    Parameters:
        commonOptions: dict; read here: 'hgfile', 'SplitAndReAlign', 'outlog'.
        specifiedOptions: dict; read here: 'fastafile', 'unique_file_id',
            'analysis_file_id'; 'bamfile' is written.  The presence of
            'thread' silences the progress messages.
        moreOptions: dict; read here: 'mgloc', 'repeatName',
            'gene_start_end', 'repeat_start_end', 'repPat'; 'fafqfile' and
            'fafqtype' are written before the split-and-realign detector.

    Returns:
        ``[myret, myretdetail]`` -- the two dicts filled by
        ``myBAMhandler.addSumForAGene``.

    The process exits with status 1 when the repeat region is empty.
    """
    mgloc = moreOptions['mgloc']
    repeatName = moreOptions['repeatName']
    gene_start_end = moreOptions['gene_start_end']
    repeat_start_end = moreOptions['repeat_start_end']
    repPat = moreOptions['repPat']

    fastafile = specifiedOptions['fastafile']
    unique_file_id = specifiedOptions['unique_file_id']
    analysis_file_id = specifiedOptions['analysis_file_id']
    hgfile = commonOptions['hgfile']

    myHMM.produce_for_repPat(commonOptions, moreOptions)
    # The pattern read before produce_for_repPat() is used on purpose: that
    # is what the original code passed here.
    len_repPat = printHMMmatrix.get_len_repPat(repPat, commonOptions)
    logging.info("len_repPat=" + str(len_repPat))

    upstreamstr, repregion, downstreamstr = get3part(
        mgloc, gene_start_end, repeat_start_end, repeatName, unique_file_id,
        analysis_file_id, hgfile, specifiedOptions)

    if len(repregion) == 0:
        logging.error("Not repeat region! please check!!" + repeatName + (
            ' gene_location=[%d, %d], repeat_location=[%d, %d]' %
            (gene_start_end[0], gene_start_end[1],
             repeat_start_end[0], repeat_start_end[1])))
        sys.exit(1)

    logging.info("Test " + repeatName + (
        ' gene_location=[%d, %d], repeat_location=[%d, %d]; upstreamsize=%d, downstreamsize=%d'
        % (gene_start_end[0], gene_start_end[1],
           repeat_start_end[0], repeat_start_end[1],
           repeat_start_end[0] - gene_start_end[0],
           gene_start_end[1] - repeat_start_end[1])))
    logging.info("Normal/Pathogenic repeats: %s" % mgloc[5])

    orirepeat = int(len(repregion) / float(len_repPat))
    logging.info("Orignal Test read=" + '<<<' + repregion + '>>>' +
                 (" #repeat=%d; #len=%d" % (orirepeat, len(repregion))))

    # bwa band width: a base value clamped to [100, 500], widened by 40% of
    # the locus length and capped at 500 again.
    max_w_option, min_w_option = 500, 100
    bwamem_w_option = min(max(90 * 4, min_w_option), max_w_option)
    bwamem_w_option += int(len(upstreamstr + repregion + downstreamstr) * 0.4)
    bwamem_w_option = min(bwamem_w_option, max_w_option)

    bamfile = fastafile + unique_file_id + '.bam'
    specifiedOptions['bamfile'] = bamfile

    myret = {}
    myretdetail = {}

    cmd = ('bwa mem -k17 -w' + str(bwamem_w_option)
           + ' -W40 -r10 -A1 -B1 -O1 -E1 -L1 -t ' + mthreads + ' -v 2 '
           + hgfile + ' ' + fastafile
           + ' | samtools view -S -b | samtools sort > ' + bamfile)
    logging.info(cmd)
    os.system(cmd)

    cmd = 'samtools index ' + bamfile
    logging.info(cmd)
    os.system(cmd)

    def run_stage(key, start_msg, none_msg, detect):
        """Run one detector, record its result under ``key`` and report."""
        stage_start = time.time()
        if commonOptions['outlog'] <= M_INFO and 'thread' not in specifiedOptions:
            print(start_msg)
        sys.stdout.flush()

        result = detect(commonOptions, specifiedOptions, moreOptions)
        memres = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
        if result is None:
            print(none_msg + ' ' + str(moreOptions['repeatName']) + ' ' + str(mgloc))
            logging.error(none_msg + ': ' + str(moreOptions['repeatName'])
                          + ' ' + str(mgloc))
        else:
            myBAMhandler.addSumForAGene(result, myret, myretdetail, key, 2)

        stage_end = time.time()
        if commonOptions['outlog'] <= M_WARNING and 'thread' not in specifiedOptions:
            print('%s end---running time%.0f mem%d'
                  % (key, stage_end - stage_start, memres))
        sys.stdout.flush()

    if (commonOptions['SplitAndReAlign'] in [0, 2]) or testall:
        run_stage('p2bamhmm', 'p2bamhmm start', 'ERROR None detection',
                  myBAMhandler.getRepeatForGivenGene)

    if (commonOptions['SplitAndReAlign'] in [1, 2]) or testall:
        # The realignment detector reads its input from the BAM just built.
        moreOptions['fafqfile'] = bamfile
        moreOptions['fafqtype'] = 'bam'
        run_stage('p2sp', 'start p2sp', 'ERROR None detection (sp)',
                  myRepeatReAlignment.getRepeatCounts)

    # Clean up the temporary alignment and its index without spawning a
    # shell; a file that was never produced is simply skipped.
    for leftover in (bamfile, bamfile + '.bai'):
        if os.path.isfile(leftover):
            os.remove(leftover)

    return [myret, myretdetail]