def discover(args):
    """Run the HMM on the requested sample(s) and write the CNV calls.

    Reads ``args.datafile``, ``args.params``, ``args.output`` and
    ``args.sample``.  Every sample returned by the data loader is visited;
    a sample is processed when ``args.sample`` is the empty string or equals
    the sample's ``'sample_id'``.  For each entry of the loader's targets
    list the matching slice of the sample's observations is decoded with
    ``Model.forwardBackward_Viterbi`` and handed to ``dataloader.outputCNV``.

    The output file and the data loader are closed even when a step raises.
    A notice is printed when no sample matched ``args.sample``.
    """
    datafile = args.datafile
    outputfile = args.output
    paramsfile = args.params
    sample_req = args.sample
    sample_flag = False  # set once a sample matching sample_req was seen

    print('Loading data file...')
    dataloader = DataManager(datafile)
    try:
        params = dataloader.getParams(paramsfile)
        dataloader.skipHeadline()
        sample = dataloader.getNextSample()
        targets_list = dataloader.getTargetsList()

        # open() instead of the Python-2-only file() builtin; the context
        # manager guarantees the handle is closed on every exit path.
        with open(outputfile, 'w') as output:
            output.write(
                'SAMPLE_ID\tCNV\tFULL_INTERVAL\tINDEX\tINTERVAL\tREAD_DEPTH\n')

            while sample:
                if sample_req == '' or sample['sample_id'] == sample_req:
                    sample_flag = True
                    sample_id = sample['sample_id']

                    # The flat observation sequence is cut into consecutive
                    # pieces, one per entry of targets_list.
                    begin = 0
                    for number, targets in enumerate(targets_list, 1):
                        # Single-argument print() emits the same text on
                        # Python 2 as the original two print statements
                        # (which produced a double space after the colon).
                        print('Running HMM for sample[' + sample_id + ']:  '
                              + 'chr' + targets[0]._chr
                              + ' [' + str(number) + '\\'
                              + str(len(targets_list)) + ']')

                        end = begin + len(targets)
                        modelParams = ModelParams(params, targets)
                        model = Model(modelParams,
                                      sample['observations'][begin:end])
                        pathlist = model.forwardBackward_Viterbi()
                        # A fresh slice is passed on purpose, as the original
                        # did, so the writer never shares a list with Model.
                        dataloader.outputCNV(
                            output, sample_id, targets, pathlist,
                            sample['observations'][begin:end])
                        begin = end

                sample = dataloader.getNextSample()
    finally:
        dataloader.closeFile()

    if not sample_flag:
        print('Could not find the sample_id specified.')
def discover(args):
    """Decode CNVs for the selected sample(s) and write them to a file.

    ``args`` must provide ``datafile``, ``output``, ``params`` and
    ``sample``.  An empty ``args.sample`` selects every sample; otherwise
    only samples whose ``'sample_id'`` equals it are processed.  Each
    selected sample's observations are split according to the lengths of the
    loader's target lists, decoded with ``Model.forwardBackward_Viterbi``
    and written through ``dataloader.outputCNV``.

    Both the output file and the data loader are released in a ``finally``
    clause so that a failure in the middle of a run does not leak them.
    """
    datafile = args.datafile
    outputfile = args.output
    paramsfile = args.params
    sample_req = args.sample
    sample_flag = False  # becomes True once a requested sample is processed

    print('Loading data file...')
    dataloader = DataManager(datafile)
    output = None
    try:
        params = dataloader.getParams(paramsfile)
        dataloader.skipHeadline()
        sample = dataloader.getNextSample()
        targets_list = dataloader.getTargetsList()

        # open() replaces file(), which only exists on Python 2.
        output = open(outputfile, 'w')
        output.write(
            'SAMPLE_ID\tCNV\tFULL_INTERVAL\tINDEX\tINTERVAL\tREAD_DEPTH\n')

        while sample:
            wanted = sample_req == '' or sample['sample_id'] == sample_req
            if wanted:
                sample_flag = True
                total = len(targets_list)

                # target_index_* delimit the slice of the observation
                # sequence that belongs to the current target list.
                target_index_begin = 0
                for position, targets in enumerate(targets_list, 1):
                    # One print() call; the text (including the double space
                    # after the colon) matches the original paired prints.
                    progress = ('Running HMM for sample['
                                + sample['sample_id'] + ']:  '
                                + 'chr' + targets[0]._chr
                                + ' [' + str(position) + '\\'
                                + str(total) + ']')
                    print(progress)

                    target_index_end = target_index_begin + len(targets)
                    modelParams = ModelParams(params, targets)
                    model = Model(
                        modelParams,
                        sample['observations'][
                            target_index_begin:target_index_end])
                    pathlist = model.forwardBackward_Viterbi()
                    dataloader.outputCNV(
                        output, sample['sample_id'], targets, pathlist,
                        sample['observations'][
                            target_index_begin:target_index_end])
                    target_index_begin = target_index_end

            sample = dataloader.getNextSample()
    finally:
        # Same release order as the original: output first, then the loader.
        if output is not None:
            output.close()
        dataloader.closeFile()

    if not sample_flag:
        print('Could not find the sample_id specified.')
# Script fragment: run the HMM on every sample of `datafile` (defined outside
# this fragment) and write the calls to the file named by `outputfile`.
outputfile = 'output'
paramsfile = 'params.txt'

print('Loading data file...')
dataloader = DataManager(datafile)
params = dataloader.getParams(paramsfile)
dataloader.skipHeadline()
sample = dataloader.getNextSample()
targets_list = dataloader.getTargetsList()

# open() replaces the Python-2-only file() builtin, and the context manager
# closes the handle even if a sample fails part-way through.
# NOTE(review): the loader itself is never closed here; other variants of this
# code call dataloader.closeFile() -- confirm whether that applies.
with open(outputfile, 'w') as output:
    while sample:
        # target_index_* delimit the slice of the observation sequence that
        # belongs to the current target list.
        target_index_begin = 0
        target_index_end = 0
        temp = 1  # 1-based progress counter over targets_list
        for targets in targets_list:
            # Single print(); same text as the original paired print
            # statements, including the double space after the colon.
            print('Running HMM for sample[' + sample['sample_id'] + ']:  '
                  + 'chr' + targets[0]._chr
                  + ' [' + str(temp) + '\\' + str(len(targets_list)) + ']')
            temp += 1

            target_index_end = target_index_begin + len(targets)
            modelParams = ModelParams(params, targets)
            model = Model(
                modelParams,
                sample['observations'][target_index_begin:target_index_end])
            pathlist = model.forwardBackward_Viterbi()
            dataloader.outputCNV(
                output, sample['sample_id'], targets, pathlist,
                sample['observations'][target_index_begin:target_index_end])
            target_index_begin = target_index_end

        sample = dataloader.getNextSample()
# Script fragment: `dataloader`, `paramsfile` and `outputfile` are defined
# before this point.  Every sample is decoded and written to `outputfile`.
params = dataloader.getParams(paramsfile)
dataloader.skipHeadline()
sample = dataloader.getNextSample()
targets_list = dataloader.getTargetsList()

# open() replaces the Python-2-only file() builtin.
output = open(outputfile, 'w')
try:
    while sample:
        # target_index_* delimit the slice of the observation sequence that
        # belongs to the current target list.
        target_index_begin = 0
        target_index_end = 0
        temp = 1  # 1-based progress counter over targets_list
        for targets in targets_list:
            # Single print(); same text as the original paired print
            # statements, including the double space after the colon.
            print('Running HMM for sample[' + sample['sample_id'] + ']:  '
                  + 'chr' + targets[0]._chr
                  + ' [' + str(temp) + '\\' + str(len(targets_list)) + ']')
            temp += 1

            target_index_end = target_index_begin + len(targets)
            modelParams = ModelParams(params, targets)
            model = Model(
                modelParams,
                sample['observations'][target_index_begin:target_index_end])
            pathlist = model.forwardBackward_Viterbi()
            dataloader.outputCNV(
                output, sample['sample_id'], targets, pathlist,
                sample['observations'][target_index_begin:target_index_end])
            target_index_begin = target_index_end

        sample = dataloader.getNextSample()
finally:
    # Close the output even when a sample fails part-way through.
    output.close()
def discover(args):
    """Run the SNP-aware HMM on the requested sample(s) and write CNV calls.

    Attributes read from ``args``: ``params``, ``sample``, ``hetsnp``,
    ``tagsnp``, ``vcf``, ``rpkm_matrix``, ``output``, ``tagsnp_file`` and
    ``mode``.  ``hetsnp`` / ``tagsnp`` are enabled only by the strings
    ``'True'`` or ``'TRUE'``.  Two files are written next to the RPKM
    matrix: ``<output>`` and ``<output>.aux``.

    A sample is processed when ``args.sample`` is empty or equals the
    sample's ``'sample_id'``.  SNP information is only used for samples that
    are listed in the VCF header; all other samples fall back to the plain
    model.

    Exits the process with status 1 when no output name is given, or when
    SNP evidence is requested without a VCF file.

    Fixes relative to the previous implementation:
      * ``system.exit`` (undefined name) replaced by ``sys.exit``;
      * missing ``args.output`` is reported instead of raising a NameError;
      * the VCF header is read once, its handle is closed, and running
        without a VCF no longer tries to ``open('')``;
      * per-sample filtering no longer shrinks the shared ``targets_list``,
        which mis-sliced the observations of every following sample;
      * the estimated parameters are appended to a per-sample copy of
        ``params`` instead of accumulating across samples;
      * the reachable ``pdb.set_trace()`` branch (SNP evidence requested,
        sample absent from the VCF) now uses the plain model;
      * output files and the data loader are closed on every exit path;
      * progress / warning messages name the sample being processed rather
        than ``args.sample`` (which is empty when all samples are run).
    """
    import sys  # local import: the file's import block is not visible here

    paramsfile = args.params
    sample_req = args.sample
    vcf_file = args.vcf
    hetsnp = args.hetsnp in ('True', 'TRUE')
    tagsnp = args.tagsnp in ('True', 'TRUE')
    datafile = args.rpkm_matrix
    tagsnp_file = args.tagsnp_file
    mode = args.mode
    sample_flag = False  # set once a sample matching sample_req was seen

    # Fail fast, before any file is created or any data is loaded.
    if not args.output:
        print('Error: please indicate an output file!')
        sys.exit(1)
    if (hetsnp or tagsnp) and vcf_file == '':
        print('Error: please indicate a vcf file!')
        sys.exit(1)

    # Outputs are written into the directory of the RPKM matrix.
    f_dir = os.path.dirname(datafile)
    if f_dir != '':
        f_dir = f_dir + '/'
    outputfile = f_dir + str(args.output)

    # Build a reference set.
    # NOTE(review): the result of getAllSamples() was never used by this
    # function; the call is kept in case it has side effects -- confirm.
    # NOTE(review): the mode aliases here ('ref') differ from the ones used
    # for filtering below ('single-sample'); kept as found -- confirm intent.
    if mode in ('single', 'baseline', 'reference', 'ref'):
        print('Building the reference dataset...')
        dataloader = DataManager(datafile)
        try:
            dataloader.getAllSamples()
        finally:
            dataloader.closeFile()
        print('Baseline is Done.')

    # sys.stdout.write + print() reproduces the text of the original
    # "print 'x'," / "print 'Done!'" pairs on one line.
    sys.stdout.write('Loading data file... ')
    dataloader = DataManager(datafile)
    output = None
    output_aux = None
    try:
        print('Done!')

        sys.stdout.write('Loading paramters... ')
        params = dataloader.getParams(paramsfile)
        print('Done!')

        dataloader.skipHeadline()
        sample = dataloader.getNextSample()
        targets_list = dataloader.getTargetsList()

        output_aux = open(outputfile + '.aux', 'w')
        output_aux.write(
            'SAMPLE_ID\tCNV_TYPE\tFULL_INTERVAL\tINDEX\tINTERVAL\tREAD_DEPTH\n')
        output = open(outputfile, 'w')
        output.write(
            'SAMPLE_ID\tCNV_TYPE\tINTERVAL\tCHROMOSOME\tSTART\tSTOP\tLENGTH\n')

        if vcf_file != '':
            vcf_reader = VCFReader(vcf_file)
            # Read the sample names from the VCF header once instead of
            # re-opening (and leaking) the file for every sample.
            with open(vcf_file, 'r') as vcf_handle:
                vcf_samples = set(vcf.Reader(vcf_handle).samples)
        else:
            vcf_reader = False
            vcf_samples = set()

        if tagsnp:
            sys.stdout.write('Loading tagSNP information ... ')
            cnp_dict = vcf_reader.loadTagSNP(tagsnp_file)
            print('Done!')

        while sample:
            if sample_req == '' or sample['sample_id'] == sample_req:
                sample_flag = True
                sample_id = sample['sample_id']
                print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                      + ' ' + sample_id + ' ......')

                # SNP evidence can only be used for samples present in the VCF.
                sample_in_VCF = sample_id in vcf_samples
                if vcf_file != '' and not sample_in_VCF:
                    print('No sample %s in VCF file.' % sample_id)
                use_het = hetsnp and sample_in_VCF
                use_tag = tagsnp and sample_in_VCF

                if use_het:
                    print('Parsing SNV information from VCF file for: '
                          + sample_id)
                    snp_info = vcf_reader.getSNPInfo(sample_id, targets_list)
                if use_tag:
                    sys.stdout.write(
                        'Analysing tagSNP information from tagSNP database '
                        'for: ' + sample_id + ' ')
                    cnp_list = vcf_reader.findTagSNPForSample(
                        sample['sample_pop'], sample_id, cnp_dict)
                    tagsnp_info_list = vcf_reader.findExonWithTagSNP(
                        cnp_list, targets_list, overlap_threshold=0.5)
                    print(len(tagsnp_info_list))

                sample['observations'] = [
                    float(x) for x in sample['observations']]

                # Slicing: cut the flat observation (and SNP) sequences into
                # consecutive pieces, one per entry of targets_list.
                sample_observations_list = []
                snp_info_list = []
                begin = 0
                for targets in targets_list:
                    end = begin + len(targets)
                    if use_het:
                        snp_info_list.append(snp_info[begin:end])
                    sample_observations_list.append(
                        sample['observations'][begin:end])
                    begin = end

                # Per-sample view of the targets: filtering below must not
                # alter targets_list, whose lengths drive the slicing of
                # every later sample.
                sample_targets_list = list(targets_list)

                # Filtering
                if mode in ('svd', 'SVD', 'pooled', 'pooled-sample'):
                    sample_observations_list = [
                        ndarray.tolist(stats.zscore(chunk))
                        for chunk in sample_observations_list]
                elif mode in ('baseline', 'reference', 'single',
                              'single-sample'):
                    # Drop every target whose observation is NaN, keeping the
                    # target, observation and SNP lists aligned.
                    for i, targets in enumerate(sample_targets_list):
                        chunk = sample_observations_list[i]
                        rem_index = [j for j in range(len(targets))
                                     if np.isnan(chunk[j])]
                        # A copy is passed because it is not visible here
                        # whether filter_list_by_list works in place.
                        sample_targets_list[i] = jf.filter_list_by_list(
                            list(targets), rem_index)
                        sample_observations_list[i] = jf.filter_list_by_list(
                            chunk, rem_index)
                        if use_het:
                            snp_info_list[i] = jf.filter_list_by_list(
                                snp_info_list[i], rem_index)
                        if use_tag:
                            tagsnp_info_list[i] = jf.filter_list_by_list(
                                tagsnp_info_list[i], rem_index)

                # Parameters estimation over all observations of the sample.
                observations_all_list = []
                for chunk in sample_observations_list:
                    observations_all_list.extend(chunk)
                parameterLoader = ParameterEstimation(observations_all_list)
                parameterList = parameterLoader.fit(
                    observations_all_list, 0.01, 0.99)
                print('Estimated Paramters:  ' + str(parameterList))

                # Per-sample copy so the estimates of one sample do not pile
                # up in the shared params of the following samples.
                sample_params = params[:]
                sample_params.append(parameterList[0])  # mu
                sample_params.append(parameterList[1])  # sd

                total = len(sample_targets_list)
                for i, targets in enumerate(sample_targets_list):
                    print('Running HMM for sample[' + sample_id + ']:  '
                          + 'chr' + targets[0]._chr
                          + ' [' + str(i + 1) + '|' + str(total) + ']')

                    # Run the HMM; 0 stands for "no SNP evidence", as in the
                    # original calls.
                    tag_arg = tagsnp_info_list[i] if use_tag else 0
                    if use_het:
                        modelParams = ModelParams(
                            mode, sample_params, targets, snp_info_list[i],
                            tagsnp=tag_arg)
                    else:
                        modelParams = ModelParams(
                            mode, sample_params, targets, het_nums=0,
                            tagsnp=tag_arg)

                    model = Model(mode, modelParams,
                                  sample_observations_list[i])
                    pathlist = model.forwardBackward_Viterbi(
                        mode, if_snp=bool(vcf_reader and sample_in_VCF))

                    dataloader.outputCNVaux(
                        output_aux, sample_id, targets, pathlist,
                        sample_observations_list[i])
                    dataloader.outputCNV(
                        output, sample_id, targets, pathlist,
                        sample_observations_list[i])

            sample = dataloader.getNextSample()
    finally:
        # Same release order as the original: output, aux output, loader.
        if output is not None:
            output.close()
        if output_aux is not None:
            output_aux.close()
        dataloader.closeFile()

    if not sample_flag:
        print('Could not find the sample_id specified.')