def discover(args):
    """Run the HMM on every requested sample and write the CNV calls.

    Reads ``args.datafile`` through a ``DataManager``, loads the model
    parameters from ``args.params`` and, for each sample (or only the one
    whose ``sample_id`` equals ``args.sample`` when that is non-empty),
    runs ``Model.forwardBackward_Viterbi`` once per entry of the loader's
    targets list. Results are written tab-separated to ``args.output`` via
    ``DataManager.outputCNV``.

    Args:
        args: object exposing the attributes ``datafile``, ``output``,
            ``params`` and ``sample`` (an empty string means "all samples").

    The output file and the data loader are released even when processing
    raises, so a failure half-way through does not leak open handles.
    """
    datafile = args.datafile
    outputfile = args.output
    paramsfile = args.params
    sample_req = args.sample
    sample_flag = False  # becomes True once a matching sample is processed

    print('Loading data file...')
    dataloader = DataManager(datafile)
    try:
        params = dataloader.getParams(paramsfile)
        dataloader.skipHeadline()
        sample = dataloader.getNextSample()
        targets_list = dataloader.getTargetsList()

        with open(outputfile, 'w') as output:
            output.write(
                'SAMPLE_ID\tCNV\tFULL_INTERVAL\tINDEX\tINTERVAL\tREAD_DEPTH\n')
            while sample:
                if sample_req == '' or sample['sample_id'] == sample_req:
                    sample_flag = True
                    # The sample's observations are one flat sequence;
                    # [begin:end) is the window that belongs to the
                    # current group of targets.
                    target_index_begin = 0
                    for position, targets in enumerate(targets_list, 1):
                        # Emitted as one string; the double space after
                        # ':' reproduces what the former two-part print
                        # statement produced.
                        print('Running HMM for sample[' +
                              sample['sample_id'] + ']:  ' +
                              'chr' + targets[0]._chr +
                              ' [' + str(position) + '\\' +
                              str(len(targets_list)) + ']')
                        target_index_end = target_index_begin + len(targets)
                        window = slice(target_index_begin, target_index_end)
                        modelParams = ModelParams(params, targets)
                        # Each consumer gets its own fresh slice of the
                        # observations, exactly as before.
                        model = Model(modelParams,
                                      sample['observations'][window])
                        pathlist = model.forwardBackward_Viterbi()
                        dataloader.outputCNV(output, sample['sample_id'],
                                             targets, pathlist,
                                             sample['observations'][window])
                        target_index_begin = target_index_end
                sample = dataloader.getNextSample()
    finally:
        # Previously skipped whenever anything above raised.
        dataloader.closeFile()

    if not sample_flag:
        print('Could not find the sample_id specified.')
# Per-state probability tables handed to Model below: trans_prob is keyed
# state -> next state, emit_prob is keyed state -> emitted symbol.
trans_prob = {
    'rainy': dict(rainy=0.7, sunny=0.3),
    'sunny': dict(rainy=0.4, sunny=0.6),
}
emit_prob = {
    'rainy': dict(walk=0.1, shop=0.4, clean=0.5),
    'sunny': dict(walk=0.6, shop=0.3, clean=0.1),
}

# Observed symbol sequence fed to the model.
sequence = 'walk shop clean clean walk walk walk clean'.split()

model = Model(states, symbols, start_prob, trans_prob, emit_prob)
# Print the result of evaluate() and then of decode() for the sequence.
print(model.evaluate(sequence))
print(model.decode(sequence))
params = dataloader.getParams(paramsfile) dataloader.skipHeadline() sample = dataloader.getNextSample() targets_list = dataloader.getTargetsList() output = file(outputfile, 'w') while sample: #target_index is used to split observations sequence target_index_begin = 0 target_index_end = 0 temp = 1 for targets in targets_list: print 'Running HMM for sample[' + sample['sample_id'] + ']: ', print 'chr' + targets[0]._chr + ' [' + str(temp) + '\\' + str( len(targets_list)) + ']' temp += 1 target_index_end = target_index_begin + len(targets) modelParams = ModelParams(params, targets) #the 'observations' of sample is splitted model = Model( modelParams, sample['observations'][target_index_begin:target_index_end]) pathlist = model.forwardBackward_Viterbi() dataloader.outputCNV( output, sample['sample_id'], targets, pathlist, sample['observations'][target_index_begin:target_index_end]) target_index_begin = target_index_end sample = dataloader.getNextSample() output.close()
conf_prob, trans_prob = pro._tran_conf_prob(train, test_wordcount, word_count, hidden_states) ''' test = [["中华人民共和国今天成立了中国人民从此站起来了"], ["江泽民的三个代表是中国在社会主义改革过程中的智慧结晶"], ["人民日报称改革开发的伟大旗帜要坚定不移动的走下去"], ["日理万机的周总理"], ["国务院今天颁发了关于农业的改革方向前进步伐"], ["机器学习及其翻译激起了人们极其浓厚的兴趣"], ["中共中央书记"]] observations = pro._str2words(test) ''' observations = test phi = {'B': 0.5, 'E': 0, 'M': 0, 'S': 0.5} model = Model(S, observation, phi, trans_prob, conf_prob) o_hstate = [] for obser in observations: ''' Notice,if a setence is too long,when we use viterbi algorithm it may result in the beta = 0 There are two solution,one is split the setence into serval sub_setence,another is use log function for the viterbi here we select the first method ''' length = len(obser) index, sub_obser, state = 0, [], [] while index < length: sub_obser.append(obser[index]) if obser[index] == '。' or obser[index] == ',': sub_state = model.decode(sub_obser) sub_obser = []
# Probability of each observation conditioned on the hidden state.
conf_prob = {
    'rainy': dict(walk=0.1, shop=0.3, clean=0.6),
    'sunny': dict(walk=0.4, shop=0.5, clean=0.1),
    'cloudy': dict(walk=0.6, shop=0.25, clean=0.15),
}

# Observation sequence fed to the model.
observations = 'walk shop clean clean walk walk walk clean'.split()

# Number of iterations for the EM algorithm.
iter_num = 50

model = Model(states, observation, phi, trans_prob, conf_prob, iter_num)
# Print the result of evaluate() and then of decode() for the observations.
print(model.evaluate(observations))
print(model.decode(observations))