Example #1
0
def discover(args):
    """Run HMM-based CNV discovery over every sample in the data file.

    args is expected to provide: datafile (input read-depth matrix),
    output (result file path), params (HMM parameter file) and sample
    (a sample id to restrict processing to, or '' to process all samples).
    Writes a tab-separated CNV table to the output file; returns nothing.
    """
    datafile = args.datafile
    outputfile = args.output
    paramsfile = args.params
    sample_req = args.sample
    sample_flag = False  #used to check whether sample_req exists

    print 'Loading data file...'
    dataloader = DataManager(datafile)
    params = dataloader.getParams(paramsfile)
    dataloader.skipHeadline()
    sample = dataloader.getNextSample()
    targets_list = dataloader.getTargetsList()
    # Python 2 built-in file(); closed explicitly after the sample loop
    # (but not on error paths).
    output = file(outputfile, 'w')
    output.write(
        'SAMPLE_ID\tCNV\tFULL_INTERVAL\tINDEX\tINTERVAL\tREAD_DEPTH\n')
    while sample:
        # Process every sample when no id was requested, otherwise only the
        # matching one.
        if sample_req == '' or (sample_req != ''
                                and sample['sample_id'] == sample_req):
            sample_flag = True
            #target_index is used to split observations sequence
            target_index_begin = 0
            target_index_end = 0
            temp = 1  # 1-based progress counter over target groups
            for targets in targets_list:
                print 'Running HMM for sample[' + sample['sample_id'] + ']: ',
                print 'chr' + targets[0]._chr + ' [' + str(temp) + '\\' + str(
                    len(targets_list)) + ']'
                temp += 1
                # Each target group consumes the next len(targets)
                # observations of the flat per-sample vector.
                target_index_end = target_index_begin + len(targets)

                modelParams = ModelParams(params, targets)
                #the 'observations' of sample is splitted
                model = Model(
                    modelParams, sample['observations']
                    [target_index_begin:target_index_end])
                pathlist = model.forwardBackward_Viterbi()
                dataloader.outputCNV(
                    output, sample['sample_id'], targets, pathlist,
                    sample['observations']
                    [target_index_begin:target_index_end])
                target_index_begin = target_index_end
        sample = dataloader.getNextSample()

    output.close()
    dataloader.closeFile()

    if not sample_flag:
        print 'Could not find the sample_id specified.'
Example #2
0
def discover(args) :
    """Run HMM-based CNV discovery for each sample in the input matrix.

    args supplies: datafile (input matrix), output (result path), params
    (HMM parameter file) and sample (an id to restrict to, or '' for all
    samples).  Results are appended to a tab-separated table; no return.
    """
    datafile = args.datafile
    outputfile = args.output
    paramsfile = args.params
    sample_req = args.sample
    sample_flag = False #used to check whether sample_req exists

    print 'Loading data file...'
    dataloader = DataManager(datafile)
    params = dataloader.getParams(paramsfile)
    dataloader.skipHeadline()
    sample = dataloader.getNextSample()
    targets_list = dataloader.getTargetsList()
    # Python 2 built-in file(); closed after the sample loop (not on error
    # paths).
    output = file(outputfile, 'w')
    output.write('SAMPLE_ID\tCNV\tFULL_INTERVAL\tINDEX\tINTERVAL\tREAD_DEPTH\n')
    while sample :
        # Process all samples when sample_req is '', otherwise only the match.
        if sample_req == '' or (sample_req != '' and sample['sample_id'] == sample_req):
            sample_flag = True
            #target_index is used to split observations sequence
            target_index_begin = 0
            target_index_end = 0
            temp = 1  # 1-based progress counter over target groups
            for targets in targets_list:
                print 'Running HMM for sample[' + sample['sample_id'] + ']: ',
                print 'chr' + targets[0]._chr + ' [' + str(temp) + '\\' + str(len(targets_list)) + ']'
                temp += 1
                # This target group consumes the next len(targets)
                # observations of the flat per-sample vector.
                target_index_end = target_index_begin + len(targets)

                modelParams = ModelParams(params, targets)
                #the 'observations' of sample is splitted
                model = Model(modelParams, sample['observations'][target_index_begin:target_index_end])
                pathlist = model.forwardBackward_Viterbi()
                dataloader.outputCNV(output, sample['sample_id'], targets, pathlist, sample['observations'][target_index_begin:target_index_end])
                target_index_begin = target_index_end
        sample = dataloader.getNextSample()

    output.close()
    dataloader.closeFile()

    if not sample_flag:
        print 'Could not find the sample_id specified.'
# Emission ('confusion') and transition probabilities estimated from the
# training corpus for a B/E/M/S Chinese word-segmentation HMM.
conf_prob,trans_prob=pro._tran_conf_prob(train,test_wordcount,word_count,hidden_states)

'''
test = [["中华人民共和国今天成立了中国人民从此站起来了"],
        ["江泽民的三个代表是中国在社会主义改革过程中的智慧结晶"],
        ["人民日报称改革开发的伟大旗帜要坚定不移动的走下去"],
        ["日理万机的周总理"],
        ["国务院今天颁发了关于农业的改革方向前进步伐"],
        ["机器学习及其翻译激起了人们极其浓厚的兴趣"],
        ["中共中央书记"]]
observations = pro._str2words(test)
'''
# The commented-out block above holds an inline demo corpus; the live run
# decodes the externally provided 'test' data instead.
observations = test

# Initial state distribution: a word can only start at B(egin) or S(ingle),
# never at E(nd) or M(iddle).
phi = {'B':0.5,'E':0,'M':0,'S':0.5}
model = Model(S,observation,phi,trans_prob,conf_prob)
o_hstate = []  # intended accumulator for decoded hidden-state sequences

for obser in observations:
    '''
    Notice,if a setence is too long,when we use viterbi algorithm it may result in the beta = 0
    There are two solution,one is split the setence into serval sub_setence,another is use log function for the viterbi 
    here we select the first method
    '''
    length = len(obser)
    index,sub_obser,state= 0,[],[]
    # Decode chunks terminated by Chinese punctuation so the Viterbi
    # probabilities do not underflow on long sentences (see note above).
    while index < length:
        sub_obser.append(obser[index])
        if obser[index] == '。' or obser[index]==',':
            sub_state = model.decode(sub_obser)
            sub_obser = []
        # NOTE(review): in this excerpt 'index' is never incremented and
        # 'sub_state'/'state'/'o_hstate' are never extended -- the loop body
        # appears truncated here; confirm against the full source.
Example #4
0
# -*- coding: utf-8 -*-

from hmm import Model

# Classic two-state "weather" toy HMM: hidden weather states emit observed
# daily activities.
states = ("rainy", "sunny")
symbols = ("walk", "shop", "clean")

# Uniform prior over the hidden states.
start_prob = {"rainy": 0.5, "sunny": 0.5}

# P(next weather | current weather).
trans_prob = {"rainy": {"rainy": 0.7, "sunny": 0.3}, "sunny": {"rainy": 0.4, "sunny": 0.6}}

# P(observed activity | weather).
emit_prob = {"rainy": {"walk": 0.1, "shop": 0.4, "clean": 0.5}, "sunny": {"walk": 0.6, "shop": 0.3, "clean": 0.1}}

sequence = ["walk", "shop", "clean", "clean", "walk", "walk", "walk", "clean"]
model = Model(states, symbols, start_prob, trans_prob, emit_prob)

# Parenthesized single-value print produces identical output under Python 2
# and is valid Python 3, matching the print() style used elsewhere in this
# file.
print(model.evaluate(sequence))   # sequence likelihood (forward algorithm)
print(model.decode(sequence))     # most likely state path (Viterbi)
Example #5
0
# Transition probabilities between the two hidden weather states:
# P(next weather | current weather).
trans_prob = {
    'rainy': {
        'rainy': 0.7,
        'sunny': 0.3
    },
    'sunny': {
        'rainy': 0.4,
        'sunny': 0.6
    }
}

# Emission probabilities: P(observed activity | hidden weather state).
emit_prob = {
    'rainy': {
        'walk': 0.1,
        'shop': 0.4,
        'clean': 0.5
    },
    'sunny': {
        'walk': 0.6,
        'shop': 0.3,
        'clean': 0.1
    }
}

sequence = ['walk', 'shop', 'clean', 'clean', 'walk', 'walk', 'walk', 'clean']
# 'states', 'symbols', 'start_prob' and 'Model' are defined earlier in the
# full script; this excerpt only sets up the probability tables and runs the
# model.
model = Model(states, symbols, start_prob, trans_prob, emit_prob)

# Parenthesized single-value print prints identically under Python 2 and is
# valid Python 3, consistent with the print() style used elsewhere in this
# file.
print(model.evaluate(sequence))   # sequence likelihood (forward algorithm)
print(model.decode(sequence))     # most likely state path (Viterbi)
Example #6
0
    # NOTE(review): this excerpt begins mid-function -- the enclosing def and
    # the definition of 'datafile' are outside this view.
    outputfile = 'output'
    paramsfile = 'params.txt'

    print 'Loading data file...'
    dataloader = DataManager(datafile)
    params = dataloader.getParams(paramsfile)
    dataloader.skipHeadline()
    sample = dataloader.getNextSample()
    targets_list = dataloader.getTargetsList()
    # Python 2 built-in file(); closed after the sample loop.
    output = file(outputfile, 'w')
    while sample :
        #target_index is used to split observations sequence
        target_index_begin = 0
        target_index_end = 0
        temp = 1  # 1-based progress counter over target groups
        for targets in targets_list:
            print 'Running HMM for sample[' + sample['sample_id'] + ']: ',
            print 'chr' + targets[0]._chr + ' [' + str(temp) + '\\' + str(len(targets_list)) + ']'
            temp += 1
            # Each target group consumes the next len(targets) observations
            # of the flat per-sample vector.
            target_index_end = target_index_begin + len(targets)

            modelParams = ModelParams(params, targets)
            #the 'observations' of sample is splitted
            model = Model(modelParams, sample['observations'][target_index_begin:target_index_end])
            pathlist = model.forwardBackward_Viterbi()
            dataloader.outputCNV(output, sample['sample_id'], targets, pathlist, sample['observations'][target_index_begin:target_index_end])
            target_index_begin = target_index_end
        sample = dataloader.getNextSample()

    output.close()
Example #7
0
# Toy HMM with integer-labelled states and observations, near-uniform
# transitions and distinct emission profiles per state.

#The hidden states
states = [1,2,3]

#The observation states
observation = [1,2]


#The initial probability for the hidden states
phi = {1:0.333,2:0.333,3:0.333}

#The trans prob for the hidden states
trans_prob = {
    1:{1:0.333,2:0.333,3:0.333},
    2:{1:0.333,2:0.333,3:0.333},
    3:{1:0.333,2:0.333,3:0.333}
}

#The prob of observation in condition of a hidden state
conf_prob = {
    1:{1:0.5,2:0.5},
    2:{1:0.75,2:0.25},
    3:{1:0.25,2:0.75}
}

observations =[1,1,1,1,2,1,2,2,2,2]

model = Model(states,observation,phi,trans_prob,conf_prob)

# Parenthesized single-value print produces identical output under Python 2
# and is valid Python 3 (consistent with the print() style used elsewhere in
# this file).
print(model.evaluate(observations))   # sequence likelihood
print(model.decode(observations))     # most likely state path (Viterbi)
Example #8
0
# Emission probabilities: P(observed activity | hidden weather state).
emit_prob = {
    'rainy': {
        'walk': 0.1,
        'shop': 0.4,
        'clean': 0.5
    },
    'sunny': {
        'walk': 0.6,
        'shop': 0.3,
        'clean': 0.1
    }
}

sequence = ['walk', 'shop', 'clean', 'clean', 'walk', 'walk', 'walk', 'clean']
sequence2 = ['walk', 'walk', 'walk', 'walk', 'walk', 'walk', 'walk', 'walk']
# 'states', 'symbols', 'start_prob', 'trans_prob' and 'Model' are defined
# earlier in the full script.
model = Model(states, symbols, start_prob, trans_prob, emit_prob)

print(model.evaluate(sequence))

# Hoist the forward/backward tables out of the loop: both depend only on the
# sequence, not on i, and recomputing them every iteration made this
# accidentally quadratic in the sequence length.
alpha = model._forward(sequence)
beta = model._backward(sequence)
for i in range(len(sequence)):
    print(i)
    # For any position i, sum over states of alpha[i][s] * beta[i][s] is the
    # full-sequence likelihood, so each printed value should match
    # evaluate(sequence) above.
    print(alpha[i]['sunny'] * beta[i]['sunny'] +
          alpha[i]['rainy'] * beta[i]['rainy'])

print(model.decode(sequence))
print(model.decode(sequence2))

print(model._forward(sequence))
Example #9
0
# Emission ('confusion') and transition probabilities estimated from the
# training corpus for a B/E/M/S Chinese word-segmentation HMM.
conf_prob, trans_prob = pro._tran_conf_prob(train, test_wordcount, word_count,
                                            hidden_states)
'''
test = [["中华人民共和国今天成立了中国人民从此站起来了"],
        ["江泽民的三个代表是中国在社会主义改革过程中的智慧结晶"],
        ["人民日报称改革开发的伟大旗帜要坚定不移动的走下去"],
        ["日理万机的周总理"],
        ["国务院今天颁发了关于农业的改革方向前进步伐"],
        ["机器学习及其翻译激起了人们极其浓厚的兴趣"],
        ["中共中央书记"]]
observations = pro._str2words(test)
'''
# The commented-out block above holds an inline demo corpus; the live run
# decodes the externally provided 'test' data instead.
observations = test

# Initial state distribution: a word can only start at B(egin) or S(ingle),
# never at E(nd) or M(iddle).
phi = {'B': 0.5, 'E': 0, 'M': 0, 'S': 0.5}
model = Model(S, observation, phi, trans_prob, conf_prob)
o_hstate = []  # intended accumulator for decoded hidden-state sequences

for obser in observations:
    '''
    Notice,if a setence is too long,when we use viterbi algorithm it may result in the beta = 0
    There are two solution,one is split the setence into serval sub_setence,another is use log function for the viterbi 
    here we select the first method
    '''
    length = len(obser)
    index, sub_obser, state = 0, [], []
    # Decode chunks terminated by Chinese punctuation so the Viterbi
    # probabilities do not underflow on long sentences (see note above).
    while index < length:
        sub_obser.append(obser[index])
        if obser[index] == '。' or obser[index] == ',':
            sub_state = model.decode(sub_obser)
            sub_obser = []
        # NOTE(review): in this excerpt 'index' is never incremented and
        # 'sub_state'/'state'/'o_hstate' are never extended -- the loop body
        # appears truncated here; confirm against the full source.
Example #10
0
    # NOTE(review): this excerpt begins mid-function -- the enclosing def and
    # the definitions of 'dataloader', 'paramsfile' and 'outputfile' are
    # outside this view.
    params = dataloader.getParams(paramsfile)
    dataloader.skipHeadline()
    sample = dataloader.getNextSample()
    targets_list = dataloader.getTargetsList()
    # Python 2 built-in file(); closed after the sample loop.
    output = file(outputfile, 'w')
    while sample:
        #target_index is used to split observations sequence
        target_index_begin = 0
        target_index_end = 0
        temp = 1  # 1-based progress counter over target groups
        for targets in targets_list:
            print 'Running HMM for sample[' + sample['sample_id'] + ']: ',
            print 'chr' + targets[0]._chr + ' [' + str(temp) + '\\' + str(
                len(targets_list)) + ']'
            temp += 1
            # Each target group consumes the next len(targets) observations
            # of the flat per-sample vector.
            target_index_end = target_index_begin + len(targets)

            modelParams = ModelParams(params, targets)
            #the 'observations' of sample is splitted
            model = Model(
                modelParams,
                sample['observations'][target_index_begin:target_index_end])
            pathlist = model.forwardBackward_Viterbi()
            dataloader.outputCNV(
                output, sample['sample_id'], targets, pathlist,
                sample['observations'][target_index_begin:target_index_end])
            target_index_begin = target_index_end
        sample = dataloader.getNextSample()

    output.close()
Example #11
0
#The prob of observation in condition of a hidden state
# (three hidden weather states; 'states', 'observation', 'phi', 'trans_prob'
# and 'Model' are defined earlier in the full script).
conf_prob = {
    'rainy': {
        'walk': 0.1,
        'shop': 0.3,
        'clean': 0.6
    },
    'sunny': {
        'walk': 0.4,
        'shop': 0.5,
        'clean': 0.1
    },
    'cloudy': {
        'walk': 0.6,
        'shop': 0.25,
        'clean': 0.15
    }
}

observations = [
    'walk', 'shop', 'clean', 'clean', 'walk', 'walk', 'walk', 'clean'
]
#The iter_num is the iteration number in the EM algorithm
iter_num = 50

model = Model(states, observation, phi, trans_prob, conf_prob, iter_num)

# Parenthesized single-value print produces identical output under Python 2
# and is valid Python 3 (consistent with the print() style used elsewhere in
# this file).
print(model.evaluate(observations))   # sequence likelihood
print(model.decode(observations))     # most likely state path (Viterbi)
Example #12
0
def discover(args) :
    paramsfile = args.params
    sample_req = args.sample
    hetsnp = args.hetsnp
    tagsnp = args.tagsnp
    vcf_file = args.vcf

    if hetsnp == 'True' or hetsnp == 'TRUE':
        hetsnp = True
    else:
        hetsnp = False
    
    if tagsnp == 'True' or tagsnp == 'TRUE':
        tagsnp = True
    else:
        tagsnp = False

    datafile = args.rpkm_matrix
    f_dir = os.path.dirname(datafile)
    if f_dir != '':
        f_dir = f_dir + '/'

    if args.output:
		outputfile = f_dir + str(args.output)

    tagsnp_file = args.tagsnp_file
    mode = args.mode

    sample_flag = False #used to check whether sample_req exists

    # Build a reference set 
    if mode == 'single' or mode == 'baseline' or mode == 'reference' or mode == 'ref':
        print 'Building the reference dataset...'
        dataloader = DataManager(datafile)
        samples_np = dataloader.getAllSamples()
        dataloader.closeFile()
        print 'Baseline is Done.'

    print 'Loading data file...',
    dataloader = DataManager(datafile)
    print 'Done!'
    print 'Loading paramters...',
    params = dataloader.getParams(paramsfile)
    print 'Done!'
    dataloader.skipHeadline()
    sample = dataloader.getNextSample()

    targets_list = dataloader.getTargetsList()
    output_aux = file(outputfile+'.aux', 'w')
    output_aux.write('SAMPLE_ID\tCNV_TYPE\tFULL_INTERVAL\tINDEX\tINTERVAL\tREAD_DEPTH\n')
    output = file(outputfile,'w')
    output.write('SAMPLE_ID\tCNV_TYPE\tINTERVAL\tCHROMOSOME\tSTART\tSTOP\tLENGTH\n')

    if (hetsnp or tagsnp) and vcf_file == '':
        print 'Error: please indicate a vcf file!'
        system.exit(0)

    if vcf_file != '':
        vcf_reader = VCFReader(vcf_file)
    else:
	vcf_reader = False

    if tagsnp:
        print 'Loading tagSNP information ...',
        cnp_dict = vcf_reader.loadTagSNP(tagsnp_file)
        print 'Done!'

    while sample :
        if sample_req == '' or (sample_req != '' and sample['sample_id'] == sample_req):
            sample_flag = True
            print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) ,sample_req,'......'

            #Renjie added: To check whether the VCF contains sample_req.
            vcf_checker = vcf.Reader(open(vcf_file,'r'))
            if sample['sample_id'] in vcf_checker.samples:
                sample_in_VCF = True
            elif sample_req in vcf_checker.samples:
                sample_in_VCF = True
            else:
                print 'No sample %s in VCF file.'%sample_req
                sample_in_VCF = False

            if hetsnp and sample_in_VCF :
                print 'Parsing SNV information from VCF file for: ' + sample['sample_id']
                snp_info = vcf_reader.getSNPInfo(sample['sample_id'], targets_list)

            if tagsnp and sample_in_VCF:
                print 'Analysing tagSNP information from tagSNP database for: ' + sample['sample_id'],
                cnp_list = vcf_reader.findTagSNPForSample(sample['sample_pop'], sample['sample_id'], cnp_dict)
                tagsnp_info_list = vcf_reader.findExonWithTagSNP(cnp_list, targets_list, overlap_threshold=0.5)
                print len(tagsnp_info_list)

            #estimate NB paramters from sample['observations']  
            sample_observations = []
            remove_list = []
            sample['observations'] = [ float(x) for x in sample['observations']]
            
            #slicing: target_index is used to split observations sequence
            target_index_begin = 0
            target_index_end = 0
            temp = 1

            sample_observations_list = []
            snp_info_list = []

            for i, targets in enumerate(targets_list):
                target_index_end = target_index_begin + len(targets)
                if hetsnp and sample_in_VCF:
                    snp_info_list.append(snp_info[target_index_begin:target_index_end])
                sample_observations_list.append(sample['observations'][target_index_begin:target_index_end])

                target_index_begin = target_index_end

            # Filtering:
            if mode == 'svd' or mode == 'SVD' or mode == 'pooled' or mode == 'pooled-sample':
                for i in range(len(sample_observations_list)):
                    sample_observations_list[i] = ndarray.tolist(stats.zscore(sample_observations_list[i]))

            elif mode == 'baseline' or mode == 'reference'  or mode == 'single' or mode == 'single-sample':
                # filtering lists whose observation equals to 0

                for i in range(len(targets_list)):
                    rem_index = []
                    for j in range(len(targets_list[i])):
                        value = sample_observations_list[i][j]
                        if np.isnan(float(value)):
                            rem_index.append(j)
                    #filter target_list, snp_list and observation_list    
                    targets_list[i] = jf.filter_list_by_list(targets_list[i], rem_index)
                    sample_observations_list[i] = jf.filter_list_by_list(sample_observations_list[i], rem_index)
                    if hetsnp and sample_in_VCF:
                        snp_info_list[i] = jf.filter_list_by_list(snp_info_list[i], rem_index)
                    if tagsnp and sample_in_VCF:
                        tagsnp_info_list[i] = jf.filter_list_by_list(tagsnp_info_list[i], rem_index)

                #Parameters estimation
                observations_all_list = []
                for i in range(len(sample_observations_list)):
                    observations_all_list.extend(sample_observations_list[i])

                parameterLoader = ParameterEstimation(observations_all_list)
                parameterList = parameterLoader.fit(observations_all_list,0.01,0.99)
                print "Estimated Paramters: ",parameterList
                params.append(parameterList[0])#mu
                params.append(parameterList[1])#sd
                    
            for i, targets in enumerate(targets_list):
                print 'Running HMM for sample[' + sample['sample_id'] + ']: ',
                print 'chr' + targets[0]._chr + ' [' + str(temp) + '|' + str(len(targets_list)) + ']'
                temp += 1
		
                #Run the HMM 
                if not hetsnp and not tagsnp:
                    modelParams = ModelParams(mode, params, targets, het_nums=0, tagsnp=0)
                elif sample_in_VCF and hetsnp and not tagsnp:
                	modelParams = ModelParams(mode, params, targets, snp_info_list[i], tagsnp=0)
                elif sample_in_VCF and not hetsnp and tagsnp:
                	modelParams = ModelParams(mode, params, targets, het_nums=0, tagsnp=tagsnp_info_list[i])
                elif sample_in_VCF and hetsnp and tagsnp:
                	modelParams = ModelParams(mode, params, targets, snp_info_list[i], tagsnp_info_list[i])
                elif not sample_in_VCF and hetsnp and tagsnp:
                    modelParams = ModelParams(mode, params, targets, het_nums=0, tagsnp=0)
                else:
                    pdb.set_trace()
	
                model = Model(mode, modelParams, sample_observations_list[i])
                pathlist = list()
                
                if vcf_reader and sample_in_VCF:
                    pathlist = model.forwardBackward_Viterbi(mode, if_snp = True)
                else:
                    pathlist = model.forwardBackward_Viterbi(mode, if_snp = False)
                dataloader.outputCNVaux(output_aux, sample['sample_id'], targets, pathlist, sample_observations_list[i])
                dataloader.outputCNV(output, sample['sample_id'], targets, pathlist, sample_observations_list[i])

        sample = dataloader.getNextSample()

    output.close()
    output_aux.close()
    dataloader.closeFile()

    if not sample_flag:
        print 'Could not find the sample_id specified.'