def discover(args):
    """Run CNV discovery with an HMM over every sample in the data file.

    Expected attributes on args: datafile, output, params and sample
    (an empty sample string means "process all samples").  Results are
    written as a tab-separated table to the output file.
    """
    datafile = args.datafile
    outputfile = args.output
    paramsfile = args.params
    sample_req = args.sample
    sample_flag = False  # used to check whether sample_req exists
    print 'Loading data file...'
    dataloader = DataManager(datafile)
    params = dataloader.getParams(paramsfile)
    dataloader.skipHeadline()
    sample = dataloader.getNextSample()
    targets_list = dataloader.getTargetsList()
    output = file(outputfile, 'w')
    output.write(
        'SAMPLE_ID\tCNV\tFULL_INTERVAL\tINDEX\tINTERVAL\tREAD_DEPTH\n')
    while sample:
        # Process this sample when no specific one was requested, or
        # when its id matches the requested one.
        if sample_req == '' or (sample_req != '' and sample['sample_id'] == sample_req):
            sample_flag = True
            # target_index is used to split observations sequence
            target_index_begin = 0
            target_index_end = 0
            temp = 1  # 1-based progress counter over target groups
            for targets in targets_list:
                print 'Running HMM for sample[' + sample['sample_id'] + ']: ',
                print 'chr' + targets[0]._chr + ' [' + str(temp) + '\\' + str(
                    len(targets_list)) + ']'
                temp += 1
                target_index_end = target_index_begin + len(targets)
                modelParams = ModelParams(params, targets)
                # the 'observations' of sample is splitted
                model = Model(
                    modelParams, sample['observations']
                    [target_index_begin:target_index_end])
                pathlist = model.forwardBackward_Viterbi()
                dataloader.outputCNV(
                    output, sample['sample_id'], targets, pathlist,
                    sample['observations']
                    [target_index_begin:target_index_end])
                target_index_begin = target_index_end
        sample = dataloader.getNextSample()
    output.close()
    dataloader.closeFile()
    if not sample_flag:
        print 'Could not find the sample_id specified.'
def discover(args) : datafile = args.datafile outputfile = args.output paramsfile = args.params sample_req = args.sample sample_flag = False #used to check whether sample_req exists print 'Loading data file...' dataloader = DataManager(datafile) params = dataloader.getParams(paramsfile) dataloader.skipHeadline() sample = dataloader.getNextSample() targets_list = dataloader.getTargetsList() output = file(outputfile, 'w') output.write('SAMPLE_ID\tCNV\tFULL_INTERVAL\tINDEX\tINTERVAL\tREAD_DEPTH\n') while sample : if sample_req == '' or (sample_req != '' and sample['sample_id'] == sample_req): sample_flag = True #target_index is used to split observations sequence target_index_begin = 0 target_index_end = 0 temp = 1 for targets in targets_list: print 'Running HMM for sample[' + sample['sample_id'] + ']: ', print 'chr' + targets[0]._chr + ' [' + str(temp) + '\\' + str(len(targets_list)) + ']' temp += 1 target_index_end = target_index_begin + len(targets) modelParams = ModelParams(params, targets) #the 'observations' of sample is splitted model = Model(modelParams, sample['observations'][target_index_begin:target_index_end]) pathlist = model.forwardBackward_Viterbi() dataloader.outputCNV(output, sample['sample_id'], targets, pathlist, sample['observations'][target_index_begin:target_index_end]) target_index_begin = target_index_end sample = dataloader.getNextSample() output.close() dataloader.closeFile() if not sample_flag: print 'Could not find the sample_id specified.'
# Word-segmentation demo continuation: estimate the transition and
# emission tables from the training corpus.  'pro', 'train',
# 'test_wordcount', 'word_count', 'hidden_states', 'S', 'observation',
# 'test' and Model are defined earlier in this file, outside this excerpt.
conf_prob, trans_prob = pro._tran_conf_prob(train, test_wordcount, word_count, hidden_states)
'''
test = [["中华人民共和国今天成立了中国人民从此站起来了"],
        ["江泽民的三个代表是中国在社会主义改革过程中的智慧结晶"],
        ["人民日报称改革开发的伟大旗帜要坚定不移动的走下去"],
        ["日理万机的周总理"],
        ["国务院今天颁发了关于农业的改革方向前进步伐"],
        ["机器学习及其翻译激起了人们极其浓厚的兴趣"],
        ["中共中央书记"]]
observations = pro._str2words(test)
'''
observations = test
# Initial distribution over the hidden tags; presumably the B/M/E/S
# word-segmentation tags (a word starts with B or is a single-char S)
# — TODO confirm against the tagger definition earlier in the file.
phi = {'B': 0.5, 'E': 0, 'M': 0, 'S': 0.5}
model = Model(S, observation, phi, trans_prob, conf_prob)
o_hstate = []  # presumably collects decoded tag sequences later (beyond this excerpt)
for obser in observations:
    '''
    Notice,if a setence is too long,when we use viterbi algorithm it may result in the beta = 0
    There are two solution,one is split the setence into serval sub_setence,another is use log function for the viterbi
    here we select the first method
    '''
    length = len(obser)
    index, sub_obser, state = 0, [], []
    while index < length:
        sub_obser.append(obser[index])
        if obser[index] == '。' or obser[index] == ',':
            # Sentence boundary: decode the buffered sub-sentence and
            # reset the buffer.
            sub_state = model.decode(sub_obser)
            sub_obser = []
            # NOTE(review): this excerpt ends mid-loop; the index
            # increment and trailing-buffer handling continue beyond it.
# -*- coding: utf-8 -*- from hmm import Model states = ("rainy", "sunny") symbols = ("walk", "shop", "clean") start_prob = {"rainy": 0.5, "sunny": 0.5} trans_prob = {"rainy": {"rainy": 0.7, "sunny": 0.3}, "sunny": {"rainy": 0.4, "sunny": 0.6}} emit_prob = {"rainy": {"walk": 0.1, "shop": 0.4, "clean": 0.5}, "sunny": {"walk": 0.6, "shop": 0.3, "clean": 0.1}} sequence = ["walk", "shop", "clean", "clean", "walk", "walk", "walk", "clean"] model = Model(states, symbols, start_prob, trans_prob, emit_prob) print model.evaluate(sequence) print model.decode(sequence)
# Rainy/sunny HMM demo continuation ('states', 'symbols', 'start_prob'
# and Model are defined earlier in this file, outside this excerpt).
# Transition probabilities P(next state | current state).
trans_prob = {
    'rainy': {
        'rainy': 0.7,
        'sunny': 0.3
    },
    'sunny': {
        'rainy': 0.4,
        'sunny': 0.6
    }
}
# Emission probabilities P(observed activity | hidden state).
emit_prob = {
    'rainy': {
        'walk': 0.1,
        'shop': 0.4,
        'clean': 0.5
    },
    'sunny': {
        'walk': 0.6,
        'shop': 0.3,
        'clean': 0.1
    }
}
# One run of observed activities to score and decode.
sequence = ['walk', 'shop', 'clean', 'clean', 'walk', 'walk', 'walk', 'clean']
model = Model(states, symbols, start_prob, trans_prob, emit_prob)
print model.evaluate(sequence)  # presumably the sequence likelihood — see Model.evaluate
print model.decode(sequence)    # decoded hidden-state path
# Driver chunk: run the CNV-calling HMM over every sample in the data
# file ('datafile', DataManager, ModelParams and Model come from
# earlier in this file, outside this excerpt).
outputfile = 'output'
paramsfile = 'params.txt'
print 'Loading data file...'
dataloader = DataManager(datafile)
params = dataloader.getParams(paramsfile)
dataloader.skipHeadline()
sample = dataloader.getNextSample()
targets_list = dataloader.getTargetsList()
output = file(outputfile, 'w')
while sample:
    # target_index is used to split observations sequence
    target_index_begin = 0
    target_index_end = 0
    temp = 1  # 1-based progress counter over target groups
    for targets in targets_list:
        print 'Running HMM for sample[' + sample['sample_id'] + ']: ',
        print 'chr' + targets[0]._chr + ' [' + str(temp) + '\\' + str(len(targets_list)) + ']'
        temp += 1
        target_index_end = target_index_begin + len(targets)
        modelParams = ModelParams(params, targets)
        # the 'observations' of sample is splitted
        model = Model(modelParams, sample['observations'][target_index_begin:target_index_end])
        pathlist = model.forwardBackward_Viterbi()
        dataloader.outputCNV(output, sample['sample_id'], targets, pathlist, sample['observations'][target_index_begin:target_index_end])
        target_index_begin = target_index_end
    sample = dataloader.getNextSample()
output.close()
#The hidden states states = [1,2,3] #The observation states observation = [1,2] #The intial probability for the hidden states phi = {1:0.333,2:0.333,3:0.333} #The trans prob for the hidden states trans_prob = { 1:{1:0.333,2:0.333,3:0.333}, 2:{1:0.333,2:0.333,3:0.333}, 3:{1:0.333,2:0.333,3:0.333} } #The prob of observation in condition of a hidden state conf_prob = { 1:{1:0.5,2:0.5}, 2:{1:0.75,2:0.25}, 3:{1:0.25,2:0.75} } observations =[1,1,1,1,2,1,2,2,2,2] model = Model(states,observation,phi,trans_prob,conf_prob) print model.evaluate(observations) print model.decode(observations)
# Forward/backward sanity-check chunk ('states', 'symbols', 'start_prob',
# 'trans_prob' and Model come from earlier in this file, outside this
# excerpt).
# Emission probabilities P(observed activity | hidden state).
emit_prob = {
    'rainy': {'walk': 0.1, 'shop': 0.4, 'clean': 0.5},
    'sunny': {'walk': 0.6, 'shop': 0.3, 'clean': 0.1},
}

sequence = ['walk', 'shop', 'clean', 'clean', 'walk', 'walk', 'walk', 'clean']
sequence2 = ['walk', 'walk', 'walk', 'walk', 'walk', 'walk', 'walk', 'walk']

model = Model(states, symbols, start_prob, trans_prob, emit_prob)

print(model.evaluate(sequence))

# Sanity check: at each position i, sum_s alpha_i(s) * beta_i(s) should be
# constant across i.  Compute the forward/backward tables ONCE instead of
# four times per loop iteration as the original did (assumes _forward /
# _backward are pure table computations with no side effects — they take
# only the sequence and are re-called identically, so hoisting preserves
# the printed output).
fwd = model._forward(sequence)
bwd = model._backward(sequence)
for i in range(0, len(sequence)):
    print(i)
    print(fwd[i]['sunny'] * bwd[i]['sunny'] +
          fwd[i]['rainy'] * bwd[i]['rainy'])

print(model.decode(sequence))
print(model.decode(sequence2))
print(fwd)
# Word-segmentation demo continuation: estimate the transition and
# emission tables from the training corpus.  'pro', 'train',
# 'test_wordcount', 'word_count', 'hidden_states', 'S', 'observation',
# 'test' and Model are defined earlier in this file, outside this excerpt.
conf_prob, trans_prob = pro._tran_conf_prob(train, test_wordcount, word_count, hidden_states)
'''
test = [["中华人民共和国今天成立了中国人民从此站起来了"],
        ["江泽民的三个代表是中国在社会主义改革过程中的智慧结晶"],
        ["人民日报称改革开发的伟大旗帜要坚定不移动的走下去"],
        ["日理万机的周总理"],
        ["国务院今天颁发了关于农业的改革方向前进步伐"],
        ["机器学习及其翻译激起了人们极其浓厚的兴趣"],
        ["中共中央书记"]]
observations = pro._str2words(test)
'''
observations = test
# Initial distribution over the hidden tags; presumably the B/M/E/S
# word-segmentation tags (a word starts with B or is a single-char S)
# — TODO confirm against the tagger definition earlier in the file.
phi = {'B': 0.5, 'E': 0, 'M': 0, 'S': 0.5}
model = Model(S, observation, phi, trans_prob, conf_prob)
o_hstate = []  # presumably collects decoded tag sequences later (beyond this excerpt)
for obser in observations:
    '''
    Notice,if a setence is too long,when we use viterbi algorithm it may result in the beta = 0
    There are two solution,one is split the setence into serval sub_setence,another is use log function for the viterbi
    here we select the first method
    '''
    length = len(obser)
    index, sub_obser, state = 0, [], []
    while index < length:
        sub_obser.append(obser[index])
        if obser[index] == '。' or obser[index] == ',':
            # Sentence boundary: decode the buffered sub-sentence and
            # reset the buffer.
            sub_state = model.decode(sub_obser)
            sub_obser = []
            # NOTE(review): this excerpt ends mid-loop; the index
            # increment and trailing-buffer handling continue beyond it.
# Continuation of the CNV-discovery driver: 'dataloader', 'outputfile',
# 'paramsfile', DataManager, ModelParams and Model come from earlier in
# this file, outside this excerpt.
params = dataloader.getParams(paramsfile)
dataloader.skipHeadline()
sample = dataloader.getNextSample()
targets_list = dataloader.getTargetsList()
output = file(outputfile, 'w')
while sample:
    # target_index is used to split observations sequence
    target_index_begin = 0
    target_index_end = 0
    temp = 1  # 1-based progress counter over target groups
    for targets in targets_list:
        print 'Running HMM for sample[' + sample['sample_id'] + ']: ',
        print 'chr' + targets[0]._chr + ' [' + str(temp) + '\\' + str(
            len(targets_list)) + ']'
        temp += 1
        target_index_end = target_index_begin + len(targets)
        modelParams = ModelParams(params, targets)
        # the 'observations' of sample is splitted
        model = Model(
            modelParams,
            sample['observations'][target_index_begin:target_index_end])
        pathlist = model.forwardBackward_Viterbi()
        dataloader.outputCNV(
            output, sample['sample_id'], targets, pathlist,
            sample['observations'][target_index_begin:target_index_end])
        target_index_begin = target_index_end
    sample = dataloader.getNextSample()
output.close()
#The prob of observation in condition of a hidden state conf_prob = { 'rainy': { 'walk': 0.1, 'shop': 0.3, 'clean': 0.6 }, 'sunny': { 'walk': 0.4, 'shop': 0.5, 'clean': 0.1 }, 'cloudy': { 'walk': 0.6, 'shop': 0.25, 'clean': 0.15 } } observations = [ 'walk', 'shop', 'clean', 'clean', 'walk', 'walk', 'walk', 'clean' ] #The iter_num is the iteration number in the EM algorithm iter_num = 50 model = Model(states, observation, phi, trans_prob, conf_prob, iter_num) print model.evaluate(observations) print model.decode(observations)
def discover(args) : paramsfile = args.params sample_req = args.sample hetsnp = args.hetsnp tagsnp = args.tagsnp vcf_file = args.vcf if hetsnp == 'True' or hetsnp == 'TRUE': hetsnp = True else: hetsnp = False if tagsnp == 'True' or tagsnp == 'TRUE': tagsnp = True else: tagsnp = False datafile = args.rpkm_matrix f_dir = os.path.dirname(datafile) if f_dir != '': f_dir = f_dir + '/' if args.output: outputfile = f_dir + str(args.output) tagsnp_file = args.tagsnp_file mode = args.mode sample_flag = False #used to check whether sample_req exists # Build a reference set if mode == 'single' or mode == 'baseline' or mode == 'reference' or mode == 'ref': print 'Building the reference dataset...' dataloader = DataManager(datafile) samples_np = dataloader.getAllSamples() dataloader.closeFile() print 'Baseline is Done.' print 'Loading data file...', dataloader = DataManager(datafile) print 'Done!' print 'Loading paramters...', params = dataloader.getParams(paramsfile) print 'Done!' dataloader.skipHeadline() sample = dataloader.getNextSample() targets_list = dataloader.getTargetsList() output_aux = file(outputfile+'.aux', 'w') output_aux.write('SAMPLE_ID\tCNV_TYPE\tFULL_INTERVAL\tINDEX\tINTERVAL\tREAD_DEPTH\n') output = file(outputfile,'w') output.write('SAMPLE_ID\tCNV_TYPE\tINTERVAL\tCHROMOSOME\tSTART\tSTOP\tLENGTH\n') if (hetsnp or tagsnp) and vcf_file == '': print 'Error: please indicate a vcf file!' system.exit(0) if vcf_file != '': vcf_reader = VCFReader(vcf_file) else: vcf_reader = False if tagsnp: print 'Loading tagSNP information ...', cnp_dict = vcf_reader.loadTagSNP(tagsnp_file) print 'Done!' while sample : if sample_req == '' or (sample_req != '' and sample['sample_id'] == sample_req): sample_flag = True print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) ,sample_req,'......' #Renjie added: To check whether the VCF contains sample_req. 
vcf_checker = vcf.Reader(open(vcf_file,'r')) if sample['sample_id'] in vcf_checker.samples: sample_in_VCF = True elif sample_req in vcf_checker.samples: sample_in_VCF = True else: print 'No sample %s in VCF file.'%sample_req sample_in_VCF = False if hetsnp and sample_in_VCF : print 'Parsing SNV information from VCF file for: ' + sample['sample_id'] snp_info = vcf_reader.getSNPInfo(sample['sample_id'], targets_list) if tagsnp and sample_in_VCF: print 'Analysing tagSNP information from tagSNP database for: ' + sample['sample_id'], cnp_list = vcf_reader.findTagSNPForSample(sample['sample_pop'], sample['sample_id'], cnp_dict) tagsnp_info_list = vcf_reader.findExonWithTagSNP(cnp_list, targets_list, overlap_threshold=0.5) print len(tagsnp_info_list) #estimate NB paramters from sample['observations'] sample_observations = [] remove_list = [] sample['observations'] = [ float(x) for x in sample['observations']] #slicing: target_index is used to split observations sequence target_index_begin = 0 target_index_end = 0 temp = 1 sample_observations_list = [] snp_info_list = [] for i, targets in enumerate(targets_list): target_index_end = target_index_begin + len(targets) if hetsnp and sample_in_VCF: snp_info_list.append(snp_info[target_index_begin:target_index_end]) sample_observations_list.append(sample['observations'][target_index_begin:target_index_end]) target_index_begin = target_index_end # Filtering: if mode == 'svd' or mode == 'SVD' or mode == 'pooled' or mode == 'pooled-sample': for i in range(len(sample_observations_list)): sample_observations_list[i] = ndarray.tolist(stats.zscore(sample_observations_list[i])) elif mode == 'baseline' or mode == 'reference' or mode == 'single' or mode == 'single-sample': # filtering lists whose observation equals to 0 for i in range(len(targets_list)): rem_index = [] for j in range(len(targets_list[i])): value = sample_observations_list[i][j] if np.isnan(float(value)): rem_index.append(j) #filter target_list, snp_list and 
observation_list targets_list[i] = jf.filter_list_by_list(targets_list[i], rem_index) sample_observations_list[i] = jf.filter_list_by_list(sample_observations_list[i], rem_index) if hetsnp and sample_in_VCF: snp_info_list[i] = jf.filter_list_by_list(snp_info_list[i], rem_index) if tagsnp and sample_in_VCF: tagsnp_info_list[i] = jf.filter_list_by_list(tagsnp_info_list[i], rem_index) #Parameters estimation observations_all_list = [] for i in range(len(sample_observations_list)): observations_all_list.extend(sample_observations_list[i]) parameterLoader = ParameterEstimation(observations_all_list) parameterList = parameterLoader.fit(observations_all_list,0.01,0.99) print "Estimated Paramters: ",parameterList params.append(parameterList[0])#mu params.append(parameterList[1])#sd for i, targets in enumerate(targets_list): print 'Running HMM for sample[' + sample['sample_id'] + ']: ', print 'chr' + targets[0]._chr + ' [' + str(temp) + '|' + str(len(targets_list)) + ']' temp += 1 #Run the HMM if not hetsnp and not tagsnp: modelParams = ModelParams(mode, params, targets, het_nums=0, tagsnp=0) elif sample_in_VCF and hetsnp and not tagsnp: modelParams = ModelParams(mode, params, targets, snp_info_list[i], tagsnp=0) elif sample_in_VCF and not hetsnp and tagsnp: modelParams = ModelParams(mode, params, targets, het_nums=0, tagsnp=tagsnp_info_list[i]) elif sample_in_VCF and hetsnp and tagsnp: modelParams = ModelParams(mode, params, targets, snp_info_list[i], tagsnp_info_list[i]) elif not sample_in_VCF and hetsnp and tagsnp: modelParams = ModelParams(mode, params, targets, het_nums=0, tagsnp=0) else: pdb.set_trace() model = Model(mode, modelParams, sample_observations_list[i]) pathlist = list() if vcf_reader and sample_in_VCF: pathlist = model.forwardBackward_Viterbi(mode, if_snp = True) else: pathlist = model.forwardBackward_Viterbi(mode, if_snp = False) dataloader.outputCNVaux(output_aux, sample['sample_id'], targets, pathlist, sample_observations_list[i]) 
dataloader.outputCNV(output, sample['sample_id'], targets, pathlist, sample_observations_list[i]) sample = dataloader.getNextSample() output.close() output_aux.close() dataloader.closeFile() if not sample_flag: print 'Could not find the sample_id specified.'