Example #1
0
def discover(args) :
    paramsfile = args.params
    sample_req = args.sample
    hetsnp = args.hetsnp
    tagsnp = args.tagsnp
    vcf_file = args.vcf

    if hetsnp == 'True' or hetsnp == 'TRUE':
        hetsnp = True
    else:
        hetsnp = False
    
    if tagsnp == 'True' or tagsnp == 'TRUE':
        tagsnp = True
    else:
        tagsnp = False

    datafile = args.rpkm_matrix
    f_dir = os.path.dirname(datafile)
    if f_dir != '':
        f_dir = f_dir + '/'

    if args.output:
		outputfile = f_dir + str(args.output)

    tagsnp_file = args.tagsnp_file
    mode = args.mode

    sample_flag = False #used to check whether sample_req exists

    # Build a reference set 
    if mode == 'single' or mode == 'baseline' or mode == 'reference' or mode == 'ref':
        print 'Building the reference dataset...'
        dataloader = DataManager(datafile)
        samples_np = dataloader.getAllSamples()
        dataloader.closeFile()
        print 'Baseline is Done.'

    print 'Loading data file...',
    dataloader = DataManager(datafile)
    print 'Done!'
    print 'Loading paramters...',
    params = dataloader.getParams(paramsfile)
    print 'Done!'
    dataloader.skipHeadline()
    sample = dataloader.getNextSample()

    targets_list = dataloader.getTargetsList()
    output_aux = file(outputfile+'.aux', 'w')
    output_aux.write('SAMPLE_ID\tCNV_TYPE\tFULL_INTERVAL\tINDEX\tINTERVAL\tREAD_DEPTH\n')
    output = file(outputfile,'w')
    output.write('SAMPLE_ID\tCNV_TYPE\tINTERVAL\tCHROMOSOME\tSTART\tSTOP\tLENGTH\n')

    if (hetsnp or tagsnp) and vcf_file == '':
        print 'Error: please indicate a vcf file!'
        system.exit(0)

    if vcf_file != '':
        vcf_reader = VCFReader(vcf_file)
    else:
	vcf_reader = False

    if tagsnp:
        print 'Loading tagSNP information ...',
        cnp_dict = vcf_reader.loadTagSNP(tagsnp_file)
        print 'Done!'

    while sample :
        if sample_req == '' or (sample_req != '' and sample['sample_id'] == sample_req):
            sample_flag = True
            print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) ,sample_req,'......'

            #Renjie added: To check whether the VCF contains sample_req.
            vcf_checker = vcf.Reader(open(vcf_file,'r'))
            if sample['sample_id'] in vcf_checker.samples:
                sample_in_VCF = True
            elif sample_req in vcf_checker.samples:
                sample_in_VCF = True
            else:
                print 'No sample %s in VCF file.'%sample_req
                sample_in_VCF = False

            if hetsnp and sample_in_VCF :
                print 'Parsing SNV information from VCF file for: ' + sample['sample_id']
                snp_info = vcf_reader.getSNPInfo(sample['sample_id'], targets_list)

            if tagsnp and sample_in_VCF:
                print 'Analysing tagSNP information from tagSNP database for: ' + sample['sample_id'],
                cnp_list = vcf_reader.findTagSNPForSample(sample['sample_pop'], sample['sample_id'], cnp_dict)
                tagsnp_info_list = vcf_reader.findExonWithTagSNP(cnp_list, targets_list, overlap_threshold=0.5)
                print len(tagsnp_info_list)

            #estimate NB paramters from sample['observations']  
            sample_observations = []
            remove_list = []
            sample['observations'] = [ float(x) for x in sample['observations']]
            
            #slicing: target_index is used to split observations sequence
            target_index_begin = 0
            target_index_end = 0
            temp = 1

            sample_observations_list = []
            snp_info_list = []

            for i, targets in enumerate(targets_list):
                target_index_end = target_index_begin + len(targets)
                if hetsnp and sample_in_VCF:
                    snp_info_list.append(snp_info[target_index_begin:target_index_end])
                sample_observations_list.append(sample['observations'][target_index_begin:target_index_end])

                target_index_begin = target_index_end

            # Filtering:
            if mode == 'svd' or mode == 'SVD' or mode == 'pooled' or mode == 'pooled-sample':
                for i in range(len(sample_observations_list)):
                    sample_observations_list[i] = ndarray.tolist(stats.zscore(sample_observations_list[i]))

            elif mode == 'baseline' or mode == 'reference'  or mode == 'single' or mode == 'single-sample':
                # filtering lists whose observation equals to 0

                for i in range(len(targets_list)):
                    rem_index = []
                    for j in range(len(targets_list[i])):
                        value = sample_observations_list[i][j]
                        if np.isnan(float(value)):
                            rem_index.append(j)
                    #filter target_list, snp_list and observation_list    
                    targets_list[i] = jf.filter_list_by_list(targets_list[i], rem_index)
                    sample_observations_list[i] = jf.filter_list_by_list(sample_observations_list[i], rem_index)
                    if hetsnp and sample_in_VCF:
                        snp_info_list[i] = jf.filter_list_by_list(snp_info_list[i], rem_index)
                    if tagsnp and sample_in_VCF:
                        tagsnp_info_list[i] = jf.filter_list_by_list(tagsnp_info_list[i], rem_index)

                #Parameters estimation
                observations_all_list = []
                for i in range(len(sample_observations_list)):
                    observations_all_list.extend(sample_observations_list[i])

                parameterLoader = ParameterEstimation(observations_all_list)
                parameterList = parameterLoader.fit(observations_all_list,0.01,0.99)
                print "Estimated Paramters: ",parameterList
                params.append(parameterList[0])#mu
                params.append(parameterList[1])#sd
                    
            for i, targets in enumerate(targets_list):
                print 'Running HMM for sample[' + sample['sample_id'] + ']: ',
                print 'chr' + targets[0]._chr + ' [' + str(temp) + '|' + str(len(targets_list)) + ']'
                temp += 1
		
                #Run the HMM 
                if not hetsnp and not tagsnp:
                    modelParams = ModelParams(mode, params, targets, het_nums=0, tagsnp=0)
                elif sample_in_VCF and hetsnp and not tagsnp:
                	modelParams = ModelParams(mode, params, targets, snp_info_list[i], tagsnp=0)
                elif sample_in_VCF and not hetsnp and tagsnp:
                	modelParams = ModelParams(mode, params, targets, het_nums=0, tagsnp=tagsnp_info_list[i])
                elif sample_in_VCF and hetsnp and tagsnp:
                	modelParams = ModelParams(mode, params, targets, snp_info_list[i], tagsnp_info_list[i])
                elif not sample_in_VCF and hetsnp and tagsnp:
                    modelParams = ModelParams(mode, params, targets, het_nums=0, tagsnp=0)
                else:
                    pdb.set_trace()
	
                model = Model(mode, modelParams, sample_observations_list[i])
                pathlist = list()
                
                if vcf_reader and sample_in_VCF:
                    pathlist = model.forwardBackward_Viterbi(mode, if_snp = True)
                else:
                    pathlist = model.forwardBackward_Viterbi(mode, if_snp = False)
                dataloader.outputCNVaux(output_aux, sample['sample_id'], targets, pathlist, sample_observations_list[i])
                dataloader.outputCNV(output, sample['sample_id'], targets, pathlist, sample_observations_list[i])

        sample = dataloader.getNextSample()

    output.close()
    output_aux.close()
    dataloader.closeFile()

    if not sample_flag:
        print 'Could not find the sample_id specified.'