def execute_findAmplificationsAndCalculateStats_fromGff(self,
            #analysis_id_I,
            experiment_id_I,
            strand_start, strand_stop,
            sample_names_I=None,
            scale_factor=True, downsample_factor=2000,
            reads_min=1.5, reads_max=5.0,
            indices_min=200, consecutive_tol=10):
    '''Find amplifications and calculate amplification statistics from gff
    files, then add both result sets to the database.

    NOTE: multiple chromosomes not yet supported in sequencing_utilities

    INPUT:
    experiment_id_I = experiment id used to look up sample names and data dirs
    strand_start, strand_stop = passed unchanged to
        gff_coverage.findAndCalculate_amplificationStats_fromGff
    sample_names_I = optional list of sample names; when empty or None the
        sample names are queried from the coverage table for experiment_id_I
    scale_factor, downsample_factor, indices_min, consecutive_tol = passed
        unchanged (as keywords) to the gff_coverage call
    reads_min, reads_max = accepted for interface compatibility only
        NOTE(review): these are not forwarded to gff_coverage here, so the
        callee's own defaults apply -- confirm whether that is intended

    OUTPUT:
    None; the collected rows are handed to
    add_dataStage01ResequencingAmplifications and
    add_dataStage01ResequencingAmplificationStats
    '''
    gffcoverage = gff_coverage()

    # an explicit, non-empty list of sample names takes precedence over the
    # database lookup
    if sample_names_I:
        sample_names = sample_names_I
    else:
        sample_names = self.get_sampleNames_experimentID_dataStage01ResequencingCoverage(experiment_id_I)

    data_O = []
    stats_O = []
    for sn in sample_names:
        # only the first returned data dir is used for each sample
        filename = self.get_dataDirs_experimentIDAndSampleName_dataStage01ResequencingCoverage(experiment_id_I, sn)
        # find amplifications and calculate stats
        gffcoverage.findAndCalculate_amplificationStats_fromGff(
            filename[0], strand_start, strand_stop,
            experiment_id_I=experiment_id_I, sample_name_I=sn,
            indices_min=indices_min, consecutive_tol=consecutive_tol,
            scale_factor=scale_factor, downsample_factor=downsample_factor)
        # copy the results before clear_data() is called on the reused
        # gff_coverage object
        data_O.extend(copy(gffcoverage.amplifications))
        stats_O.extend(copy(gffcoverage.amplificationStats))
        gffcoverage.clear_data()

    # add data to the DB
    self.add_dataStage01ResequencingAmplifications(data_O)
    self.add_dataStage01ResequencingAmplificationStats(stats_O)
def import_resequencingCoverageData_add(
    self,
    filename,
    # analysis_id,
    experiment_id,
    sample_name,
    strand_start,
    strand_stop,
    scale_factor=True,
    downsample_factor=2000,
):
    """Parse coverage from a gff file and add it to the coverage table.

    NOTE: multiple chromosomes not yet supported in sequencing_utilities

    INPUT:
    filename = path of the gff file; any name containing ".bam" is rejected
        because .bam -> .gff conversion is not implemented
    experiment_id, sample_name = passed to gff_coverage.extract_coverage_fromGff
        as experiment_id_I / sample_name_I
    strand_start, strand_stop = converted with int() before use
    scale_factor = bool, or a string such as "True"/"False" (see below)
    downsample_factor = converted with float() before use

    OUTPUT:
    None; the parsed coverage is handed to add_dataStage01ResequencingCoverage

    RAISES:
    SystemExit(2) when filename contains ".bam"
    ValueError when a numeric argument cannot be converted
    """
    # local import: the file's own import block is not visible from here
    import sys

    if ".bam" in filename:
        # TODO convert .bam to .gff using makegff.py from sequencing_utilities
        print("conversion of .bam to .gff not yet supported")
        # sys.exit instead of the interactive-shell helper exit(), which is
        # only injected by the site module; both raise SystemExit(2)
        sys.exit(2)

    # convert strings to int, float and bool
    strand_start = int(strand_start)
    strand_stop = int(strand_stop)
    downsample_factor = float(downsample_factor)
    if isinstance(scale_factor, str):
        # bool("False") is True, so textual flags have to be parsed
        # explicitly; any string not recognised as "false" stays truthy,
        # as it was with plain bool()
        false_strings = ("", "false", "f", "0", "no", "n", "off", "none")
        scale_factor = scale_factor.strip().lower() not in false_strings
    else:
        scale_factor = bool(scale_factor)

    # parse the gff file
    gffcoverage = gff_coverage()
    gffcoverage.extract_coverage_fromGff(
        filename,
        strand_start,
        strand_stop,
        scale_factor=scale_factor,
        downsample_factor=downsample_factor,
        experiment_id_I=experiment_id,
        sample_name_I=sample_name,
    )
    coverage_data = gffcoverage.coverage

    # add data to the database:
    self.add_dataStage01ResequencingCoverage(coverage_data)
def execute_findAmplifications_fromGff(self,
            #analysis_id_I,
            experiment_id_I,
            strand_start, strand_stop,
            sample_names_I=None,
            scale_factor=True, downsample_factor=0,
            reads_min=1.5, reads_max=5.0,
            indices_min=200, consecutive_tol=10):
    '''Find amplifications from gff files and add them to the database.

    NOTE: multiple chromosomes not yet supported in sequencing_utilities

    INPUT:
    experiment_id_I = experiment id used to look up sample names and data dirs
    strand_start, strand_stop = passed unchanged to
        gff_coverage.find_amplifications_fromGff
    sample_names_I = optional list of sample names; when empty or None the
        sample names are queried from the coverage table for experiment_id_I
    scale_factor, downsample_factor = passed to the gff_coverage call as the
        keywords scale / downsample
    reads_min, reads_max, indices_min, consecutive_tol = accepted for
        interface compatibility only
        NOTE(review): these are not forwarded to gff_coverage here, so the
        callee's own defaults apply -- confirm whether that is intended

    OUTPUT:
    None; the collected rows are handed to
    add_dataStage01ResequencingAmplifications
    '''
    gffcoverage = gff_coverage()

    # an explicit, non-empty list of sample names takes precedence over the
    # database lookup
    if sample_names_I:
        sample_names = sample_names_I
    else:
        sample_names = self.get_sampleNames_experimentID_dataStage01ResequencingCoverage(experiment_id_I)

    data_O = []
    for sn in sample_names:
        # only the first returned data dir is used for each sample
        filename = self.get_dataDirs_experimentIDAndSampleName_dataStage01ResequencingCoverage(experiment_id_I, sn)
        gffcoverage.find_amplifications_fromGff(
            filename[0], strand_start, strand_stop,
            experiment_id_I, sn,
            scale=scale_factor, downsample=downsample_factor)
        # copy the results before clear_data() is called on the reused
        # gff_coverage object
        data_O.extend(copy(gffcoverage.amplifications))
        gffcoverage.clear_data()

    # add data to the DB
    self.add_dataStage01ResequencingAmplifications(data_O)
def execute_coverageStats_fromGff(self,
            #analysis_id_I,
            experiment_id_I,
            strand_start, strand_stop,
            scale_factor=True, downsample_factor=0,
            sample_names_I=None):
    '''Calculate coverage statistics from gff files and add them to the
    database.

    NOTE: multiple chromosomes not yet supported in sequencing_utilities

    INPUT:
    experiment_id_I = experiment id used to look up sample names and data dirs
    strand_start, strand_stop = passed unchanged to
        gff_coverage.calculate_coverageStats_fromGff
    scale_factor, downsample_factor = passed unchanged (as keywords) to the
        gff_coverage call
    sample_names_I = optional list of sample names; when empty or None the
        sample names are queried from the coverage table for experiment_id_I

    OUTPUT:
    None; the collected rows are handed to
    add_dataStage01ResequencingCoverageStats
    '''
    gffcoverage = gff_coverage()

    # an explicit, non-empty list of sample names takes precedence over the
    # database lookup
    if sample_names_I:
        sample_names = sample_names_I
    else:
        sample_names = self.get_sampleNames_experimentID_dataStage01ResequencingCoverage(experiment_id_I)

    data_O = []
    for sn in sample_names:
        # only the first returned data dir is used for each sample
        filename = self.get_dataDirs_experimentIDAndSampleName_dataStage01ResequencingCoverage(experiment_id_I, sn)
        # calculate the coverage statistics
        gffcoverage.calculate_coverageStats_fromGff(
            filename[0],
            strand_start, strand_stop,
            scale_factor=scale_factor, downsample_factor=downsample_factor,
            experiment_id_I=experiment_id_I, sample_name_I=sn)
        # copy the results before clear_data() is called on the reused
        # gff_coverage object
        data_O.extend(copy(gffcoverage.coverageStats))
        gffcoverage.clear_data()

    # add data to the database
    self.add_dataStage01ResequencingCoverageStats(data_O)
# define search paths manually import sys # dependency dirs sys.path.append('C:/Users/dmccloskey-sbrg/Documents/GitHub/sequencing_analysis') sys.path.append('C:/Users/dmccloskey-sbrg/Documents/GitHub/io_utilities') sys.path.append('C:/Users/dmccloskey-sbrg/Documents/GitHub/sequencing_utilities') sys.path.append('C:/Users/dmccloskey-sbrg/Documents/GitHub/calculate_utilities') from sequencing_analysis.genome_diff import genome_diff from sequencing_analysis.mutations_lineage import mutations_lineage from sequencing_analysis.mutations_endpoints import mutations_endpoints from sequencing_analysis.mutations_heatmap import mutations_heatmap from sequencing_analysis.gff_coverage import gff_coverage #analyze the coverage for a particular strain gffcoverage = gff_coverage(); gffcoverage.extract_coverage_fromGff(gff_file = '//proline/Users/dmccloskey/Resequencing_DNA/Evo04ptsHIcrrEvo04EP/Evo04ptsHIcrrEvo04EP/data/Evo04ptsHIcrrEvo04EP_reference.gff', strand_start = 0,strand_stop = 4640000, scale_factor = False,downsample_factor = 2000, experiment_id_I = 'ALEsKOs01', sample_name_I = 'Evo04ptsHIcrrEvo04EP'); # calculate the coverage statistics gffcoverage.calculate_coverageStats_fromGff(gff_file = '//proline/Users/dmccloskey/Resequencing_DNA/Evo04ptsHIcrrEvo04EP/Evo04ptsHIcrrEvo04EP/data/Evo04ptsHIcrrEvo04EP_reference.gff', strand_start = 0,strand_stop = 4640000, scale_factor = False,downsample_factor = 0, experiment_id_I = 'ALEsKOs01', sample_name_I = 'Evo04ptsHIcrrEvo04EP') gffcoverage.export_coverageStats('Evo04ptsHIcrrEvo04EP_coverage.csv'); gffcoverage.export_coverage_js(); # find amplifications gffcoverage.findAndCalculate_amplificationStats_fromGff(gff_file = '//proline/Users/dmccloskey/Resequencing_DNA/Evo04ptsHIcrrEvo04EP/Evo04ptsHIcrrEvo04EP/data/Evo04ptsHIcrrEvo04EP_reference.gff',