def test_correct_calls_are_made(self):
    """
    Tests that the correct arguments are passed to the method which calls
    the normalization script.  Mostly tests the path renaming, etc.
    """
    self.module.call_script = mock.Mock()

    project = Project()
    project.raw_count_matrices = [
        '/path/to/raw_counts/raw_count_matrix.primary.counts',
        '/path/to/raw_counts/raw_count_matrix.primary.dedup.counts'
    ]

    project_params = Params()
    component_params = Params()
    project_params.add(raw_count_matrix_file_prefix='raw_count_matrix')
    component_params.add(normalized_counts_file_prefix='normalized_count_matrix')
    component_params.add(normalized_counts_output_dir='/path/to/final/norm_counts_dir')
    component_params.add(normalization_script='normalize.R')
    project_params.add(sample_annotation_file='/path/to/samples.txt')
    project.add_parameters(project_params)

    m = mock.MagicMock(side_effect=[True, True])
    path = self.module.os.path
    with mock.patch.object(path, 'isfile', m):
        self.module.normalize(project, component_params)
        calls = [
            mock.call('normalize.R',
                      '/path/to/raw_counts/raw_count_matrix.primary.counts',
                      '/path/to/final/norm_counts_dir/normalized_count_matrix.primary.counts',
                      '/path/to/samples.txt'),
            mock.call('normalize.R',
                      '/path/to/raw_counts/raw_count_matrix.primary.dedup.counts',
                      '/path/to/final/norm_counts_dir/normalized_count_matrix.primary.dedup.counts',
                      '/path/to/samples.txt')
        ]
        self.module.call_script.assert_has_calls(calls)
def test_missing_countfile_raises_exception(self):
    """
    Tests the case where the first file is found but the second is not
    (for whatever reason).  Checks that we throw an exception and that
    the one successful call was indeed made correctly.
    """
    self.module.call_script = mock.Mock()

    project = Project()
    project.raw_count_matrices = [
        '/path/to/raw_counts/raw_count_matrix.primary.counts',
        '/path/to/raw_counts/raw_count_matrix.primary.dedup.counts'
    ]

    project_params = Params()
    component_params = Params()
    project_params.add(raw_count_matrix_file_prefix='raw_count_matrix')
    component_params.add(normalized_counts_file_prefix='normalized_count_matrix')
    component_params.add(normalized_counts_output_dir='/path/to/final/norm_counts_dir')
    component_params.add(normalization_script='normalize.R')
    project_params.add(sample_annotation_file='/path/to/samples.txt')
    project.add_parameters(project_params)

    m = mock.MagicMock(side_effect=[True, False])
    path = self.module.os.path
    with mock.patch.object(path, 'isfile', m):
        with self.assertRaises(self.module.MissingCountMatrixFileException):
            self.module.normalize(project, component_params)
        calls = [
            mock.call('normalize.R',
                      '/path/to/raw_counts/raw_count_matrix.primary.counts',
                      '/path/to/final/norm_counts_dir/normalized_count_matrix.primary.counts',
                      '/path/to/samples.txt')
        ]
        self.module.call_script.assert_has_calls(calls)
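# For reference, a minimal sketch of the normalize() behavior the two tests
# above pin down, reconstructed from the mocked calls.  The exception class,
# the helper name, the parameter keys, and the assumption that
# project.add_parameters() exposes the params as project.parameters are all
# read off the tests, not the component's actual source.
import os

class MissingCountMatrixFileException(Exception):
    """Placeholder for the component's exception type."""
    pass

def _sketch_normalize(project, component_params, call_script, isfile=os.path.isfile):
    project_params = project.parameters
    for raw_matrix in project.raw_count_matrices:
        if not isfile(raw_matrix):
            raise MissingCountMatrixFileException(raw_matrix)
        # swap the raw prefix for the normalized prefix, keeping the rest of
        # the basename (e.g. '.primary.dedup.counts'), and redirect the output
        # into the component's output directory:
        new_name = os.path.basename(raw_matrix).replace(
            project_params.get('raw_count_matrix_file_prefix'),
            component_params.get('normalized_counts_file_prefix'))
        call_script(component_params.get('normalization_script'),
                    raw_matrix,
                    os.path.join(component_params.get('normalized_counts_output_dir'),
                                 new_name),
                    project_params.get('sample_annotation_file'))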
def test_generate_figures(self):
    """
    This is not a unit test in the conventional sense; it is a full-scale
    mockup which creates an output pdf and everything.
    """
    project = Project()
    parameters = {
        'aligner': 'star',
        'skip_align': False,
        'sample_dir_prefix': 'Sample_',
        'alignment_dir': 'aln',
        'project_directory': 'foo',
        'chromosomes': ['chr1', 'chr2', 'chrM']
    }
    project.parameters = parameters

    component_params = cp.read_config(
        os.path.join(root, 'components', 'pdf_report', 'report.cfg'),
        'COMPONENT_SPECIFIC')
    extra_params = cp.read_config(
        os.path.join(root, 'components', 'pdf_report', 'report.cfg'), 'STAR')

    mock_sample_ids = [
        os.path.basename(x).split('.')[0] for x in glob.glob(
            os.path.join('test_data',
                         '*' + component_params.get('coverage_file_suffix')))
    ]
    project.samples = [Sample(x, 'X') for x in mock_sample_ids]

    component_params['report_output_dir'] = os.path.join(
        os.path.abspath(os.path.dirname(__file__)), test_output_dir,
        component_params.get('report_output_dir'))
    if not os.path.isdir(component_params['report_output_dir']):
        os.mkdir(component_params['report_output_dir'])

    # link the test files so they 'appear' in the correct location:
    for x in glob.glob(
            os.path.join('test_data',
                         '*' + component_params.get('coverage_file_suffix'))):
        os.symlink(
            os.path.abspath(x),
            os.path.join(component_params['report_output_dir'],
                         os.path.basename(x)))

    mock_log_data = mock_log_data_structure(project, extra_params)
    self.module.star_methods.process_star_logs = mock.Mock()
    self.module.star_methods.process_star_logs.return_value = mock_log_data
    self.module.get_bam_counts = mock.Mock()
    self.module.get_bam_counts.return_value = mock_bam_counts(
        mock_log_data.keys())
    self.module.calculate_coverage_data = mock.Mock()
    self.module.calculate_coverage_data.return_value = None

    self.module.generate_figures(project, component_params, extra_params)
def test_general_portion_of_template_injected_correctly(self):
    template = ('STAR=%STAR%\n'
                'SAMTOOLS=%SAMTOOLS%\n'
                'PICARD_DIR=%PICARD_DIR%\n'
                'GTF=%GTF%\n'
                'GENOME_INDEX=%GENOME_INDEX%')
    expected_result = ('STAR=STARPATH\n'
                       'SAMTOOLS=SAM\n'
                       'PICARD_DIR=PIC\n'
                       'GTF=my.gtf\n'
                       'GENOME_INDEX=GI')

    p = Params()
    p.add(star_align='STARPATH')
    p.add(samtools='SAM')
    p.add(gtf='my.gtf')
    p.add(star_genome_index='GI')
    p.add(picard='PIC')

    myproject = Project()
    myproject.parameters = p

    result = self.module.fill_out_general_template_portion(myproject, template)
    self.assertEqual(result, expected_result)
def test_bad_bamfile_path_raises_exception(self):
    self.module.subprocess = mock.Mock()

    p = Params()
    p.add(gtf='/path/to/GTF/mock.gtf')
    p.add(feature_counts='/path/to/bin/featureCounts')
    p.add(feature_counts_file_extension='counts')
    p.add(feature_counts_output_dir='/path/to/final/featureCounts')
    p.add(paired_alignment=False)

    s1 = Sample('A', 'X')
    s1.bamfiles = [
        '/path/to/bamdir/A.bam',
        '/path/to/bamdir/A.primary.bam',
        '/path/to/bamdir/A.primary.dedup.bam'
    ]
    s2 = Sample('B', 'X')
    s2.bamfiles = ['/path/to/bamdir/B.bam', '/bad/path/B.sort.bam']

    project = Project()
    project.add_parameters(p)
    project.add_samples([s1, s2])

    m = mock.MagicMock(side_effect=[True, True, True, True, False])
    path = self.module.os.path
    with mock.patch.object(path, 'isfile', m):
        with self.assertRaises(self.module.MissingBamFileException):
            self.module.execute_counting(project, util_methods)
def test_group_countfiles_raises_exception_if_missing_type(self):
    """
    Tests the method that aggregates all the countfiles generated from each
    'type' of bam file.  That is, we may have multiple bam files for each
    sample (e.g. primary alignments, deduplicated, etc), and we generate a
    countfile for each one of those.  When we assemble the count matrix, we
    group the files of a particular 'type' (e.g. those coming from
    deduplicated BAM files).

    This tests that the glob methods are called with the correct parameters
    given the sample annotations prescribed, and that an exception is raised
    if one of the countfile 'types' is missing.  Here, sample B is missing
    the countfile corresponding to the primary.counts-based BAM files.
    """
    p = Params()
    p.add(feature_counts_output_dir='/path/to/final/featureCounts')

    s1 = Sample('A', 'X')
    s1.countfiles = [
        '/path/to/final/featureCounts/A.counts',
        '/path/to/final/featureCounts/A.primary.counts',
        '/path/to/final/featureCounts/A.primary.dedup.counts'
    ]
    s2 = Sample('B', 'Y')
    s2.countfiles = [
        '/path/to/final/featureCounts/B.counts',
        '/path/to/final/featureCounts/B.primary.dedup.counts'
    ]
    s3 = Sample('C', 'Z')
    s3.countfiles = [
        '/path/to/final/featureCounts/C.counts',
        '/path/to/final/featureCounts/C.primary.counts',
        '/path/to/final/featureCounts/C.primary.dedup.counts'
    ]

    project = Project()
    project.add_parameters(p)
    project.add_samples([s1, s2, s3])

    mock_case_insensitive_glob = mock.Mock()
    mock_case_insensitive_glob.side_effect = [
        [
            '/path/to/final/featureCounts/A.counts',
            '/path/to/final/featureCounts/B.counts',
            '/path/to/final/featureCounts/C.counts'
        ],
        [
            '/path/to/final/featureCounts/A.primary.counts',
            '/path/to/final/featureCounts/C.primary.counts'
        ],
        [
            '/path/to/final/featureCounts/A.primary.dedup.counts',
            '/path/to/final/featureCounts/B.primary.dedup.counts',
            '/path/to/final/featureCounts/C.primary.dedup.counts'
        ]
    ]

    with self.assertRaises(self.module.CountfileQuantityException):
        self.module.get_countfile_groupings(project, mock_case_insensitive_glob)
def test_group_countfiles(self):
    """
    Tests the method that aggregates all the countfiles generated from each
    'type' of bam file.  That is, we may have multiple bam files for each
    sample (e.g. primary alignments, deduplicated, etc), and we generate a
    countfile for each one of those.  When we assemble the count matrix, we
    group the files of a particular 'type' (e.g. those coming from
    deduplicated BAM files).

    This tests that the glob methods are called with the correct parameters
    given the sample annotations prescribed.
    """
    p = Params()
    cp = Params()
    cp.add(feature_counts_output_dir='/path/to/final/featureCounts')

    s1 = Sample('A', 'X')
    s1.countfiles = [
        '/path/to/final/featureCounts/A.counts',
        '/path/to/final/featureCounts/A.primary.counts',
        '/path/to/final/featureCounts/A.primary.dedup.counts'
    ]
    s2 = Sample('B', 'Y')
    s2.countfiles = [
        '/path/to/final/featureCounts/B.counts',
        '/path/to/final/featureCounts/B.primary.counts',
        '/path/to/final/featureCounts/B.primary.dedup.counts'
    ]
    s3 = Sample('C', 'Z')
    s3.countfiles = [
        '/path/to/final/featureCounts/C.counts',
        '/path/to/final/featureCounts/C.primary.counts',
        '/path/to/final/featureCounts/C.primary.dedup.counts'
    ]

    project = Project()
    project.add_parameters(p)
    project.add_samples([s1, s2, s3])

    result = self.module.get_countfile_groupings(project, cp)
    expected_result = [
        [
            '/path/to/final/featureCounts/A.counts',
            '/path/to/final/featureCounts/B.counts',
            '/path/to/final/featureCounts/C.counts'
        ],
        [
            '/path/to/final/featureCounts/A.primary.counts',
            '/path/to/final/featureCounts/B.primary.counts',
            '/path/to/final/featureCounts/C.primary.counts'
        ],
        [
            '/path/to/final/featureCounts/A.primary.dedup.counts',
            '/path/to/final/featureCounts/B.primary.dedup.counts',
            '/path/to/final/featureCounts/C.primary.dedup.counts'
        ]
    ]
    self.assertEqual(result, expected_result)
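# A sketch of the grouping that the test above verifies: countfiles are
# grouped by the suffix that follows the sample name (e.g. 'counts',
# 'primary.counts', 'primary.dedup.counts'), yielding one file per sample per
# group.  This is an illustration under that assumption only; the real
# implementation globs on the output directory instead of walking samples.
import os

def _sketch_group_countfiles(samples):
    groups = {}
    for s in samples:
        for f in s.countfiles:
            # 'A.primary.counts' -> suffix 'primary.counts'
            suffix = os.path.basename(f).split('.', 1)[1]
            groups.setdefault(suffix, []).append(f)
    # the component raises CountfileQuantityException when any group does not
    # contain exactly one countfile per sample (see the preceding test)
    return [sorted(files) for _, files in sorted(groups.items())]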
def test_system_calls_single_end_experiment(self):
    self.module.subprocess = mock.Mock()

    p = Params()
    p.add(gtf='/path/to/GTF/mock.gtf')
    p.add(feature_counts='/path/to/bin/featureCounts')
    p.add(feature_counts_file_extension='counts')
    p.add(feature_counts_output_dir='/path/to/final/featureCounts')
    p.add(paired_alignment=False)

    s1 = Sample('A', 'X')
    s1.bamfiles = [
        '/path/to/bamdir/A.bam',
        '/path/to/bamdir/A.primary.bam',
        '/path/to/bamdir/A.primary.dedup.bam'
    ]

    project = Project()
    project.add_parameters(p)
    project.add_samples([s1])

    m = mock.MagicMock(side_effect=[True, True, True])
    path = self.module.os.path
    with mock.patch.object(path, 'isfile', m):
        self.module.execute_counting(project, util_methods)
        calls = [
            mock.call(
                '/path/to/bin/featureCounts -a /path/to/GTF/mock.gtf -t exon -g gene_name -o /path/to/final/featureCounts/A.counts /path/to/bamdir/A.bam',
                shell=True),
            mock.call(
                '/path/to/bin/featureCounts -a /path/to/GTF/mock.gtf -t exon -g gene_name -o /path/to/final/featureCounts/A.primary.counts /path/to/bamdir/A.primary.bam',
                shell=True),
            mock.call(
                '/path/to/bin/featureCounts -a /path/to/GTF/mock.gtf -t exon -g gene_name -o /path/to/final/featureCounts/A.primary.dedup.counts /path/to/bamdir/A.primary.dedup.bam',
                shell=True)
        ]
        self.module.subprocess.check_call.assert_has_calls(calls)

        # check that the sample contains paths to the new count files in the
        # correct locations:
        expected_files = [
            os.path.join('/path/to/final/featureCounts',
                         re.sub('bam', 'counts', os.path.basename(f)))
            for f in s1.bamfiles
        ]
        actual_files = s1.countfiles
        self.assertEqual(actual_files, expected_files)
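# The expected countfile paths above follow a simple translation of each BAM
# path into the featureCounts output directory.  A sketch of that rule, as a
# hypothetical helper (the test itself uses an unanchored
# re.sub('bam', 'counts', ...); the anchored pattern below is a slightly safer
# equivalent for these inputs):
import os
import re

def _sketch_countfile_path(bam_path, output_dir, extension='counts'):
    name = re.sub(r'bam$', extension, os.path.basename(bam_path))
    return os.path.join(output_dir, name)

# e.g. _sketch_countfile_path('/path/to/bamdir/A.primary.bam',
#                             '/path/to/final/featureCounts')
# -> '/path/to/final/featureCounts/A.primary.counts'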
def test_correct_calls_are_made(self):
    """
    Tests that the correct arguments are passed to the method which calls
    the DESeq script.  Mostly tests the path renaming, etc.
    """
    self.module.call_script = mock.Mock()

    project = Project()
    project.raw_count_matrices = [
        '/path/to/raw_counts/raw_count_matrix.primary.counts',
        '/path/to/raw_counts/raw_count_matrix.primary.dedup.counts'
    ]

    project_params = Params()
    component_params = Params()
    project_params.add(raw_count_matrix_file_prefix='raw_count_matrix')
    project_params.add(feature_counts_file_extension='counts')
    component_params.add(deseq_output_dir='/path/to/final/deseq_dir')
    component_params.add(deseq_script='deseq_original.R')
    project_params.add(sample_annotation_file='/path/to/samples.txt')
    component_params.add(deseq_output_tag='deseq')
    component_params.add(deseq_contrast_flag='_vs_')
    component_params.add(number_of_genes_for_heatmap='30')
    component_params.add(heatmap_file_tag='heatmap.png')
    project.add_parameters(project_params)
    project.contrasts = [('X', 'Y'), ('X', 'Z')]

    # construct the expected call strings:
    call_1 = ('/path/to/raw_counts/raw_count_matrix.primary.counts '
              '/path/to/samples.txt X Y '
              '/path/to/final/deseq_dir/Y_vs_X.primary.deseq '
              '/path/to/final/deseq_dir/Y_vs_X.primary.heatmap.png 30')
    call_2 = ('/path/to/raw_counts/raw_count_matrix.primary.counts '
              '/path/to/samples.txt X Z '
              '/path/to/final/deseq_dir/Z_vs_X.primary.deseq '
              '/path/to/final/deseq_dir/Z_vs_X.primary.heatmap.png 30')
    call_3 = ('/path/to/raw_counts/raw_count_matrix.primary.dedup.counts '
              '/path/to/samples.txt X Y '
              '/path/to/final/deseq_dir/Y_vs_X.primary.dedup.deseq '
              '/path/to/final/deseq_dir/Y_vs_X.primary.dedup.heatmap.png 30')
    call_4 = ('/path/to/raw_counts/raw_count_matrix.primary.dedup.counts '
              '/path/to/samples.txt X Z '
              '/path/to/final/deseq_dir/Z_vs_X.primary.dedup.deseq '
              '/path/to/final/deseq_dir/Z_vs_X.primary.dedup.heatmap.png 30')

    m = mock.MagicMock(side_effect=[True, True])
    path = self.module.os.path
    with mock.patch.object(path, 'isfile', m):
        self.module.call_deseq(project, component_params)
        calls = [
            mock.call('deseq_original.R', call_1),
            mock.call('deseq_original.R', call_2),
            mock.call('deseq_original.R', call_3),
            mock.call('deseq_original.R', call_4)
        ]
        self.module.call_script.assert_has_calls(calls)
def test_missing_countfile_raises_exception(self):
    """
    Tests the case where the first file is found but the second is not
    (for whatever reason).  Checks that we throw an exception and that
    the one successful call was indeed made correctly.
    """
    self.module.call_script = mock.Mock()

    project = Project()
    project.raw_count_matrices = [
        '/path/to/raw_counts/raw_count_matrix.primary.counts',
        '/path/to/raw_counts/raw_count_matrix.primary.dedup.counts'
    ]

    project_params = Params()
    component_params = Params()
    project_params.add(raw_count_matrix_file_prefix='raw_count_matrix')
    project_params.add(feature_counts_file_extension='counts')
    component_params.add(deseq_output_dir='/path/to/final/deseq_dir')
    component_params.add(deseq_script='deseq_original.R')
    project_params.add(sample_annotation_file='/path/to/samples.txt')
    component_params.add(deseq_output_tag='deseq')
    component_params.add(deseq_contrast_flag='_vs_')
    component_params.add(number_of_genes_for_heatmap='30')
    component_params.add(heatmap_file_tag='heatmap.png')
    project.add_parameters(project_params)
    project.contrasts = [('X', 'Y'), ('X', 'Z')]

    # construct the expected call strings:
    call_1 = ('/path/to/raw_counts/raw_count_matrix.primary.counts '
              '/path/to/samples.txt X Y '
              '/path/to/final/deseq_dir/Y_vs_X.primary.deseq '
              '/path/to/final/deseq_dir/Y_vs_X.primary.heatmap.png 30')
    call_2 = ('/path/to/raw_counts/raw_count_matrix.primary.counts '
              '/path/to/samples.txt X Z '
              '/path/to/final/deseq_dir/Z_vs_X.primary.deseq '
              '/path/to/final/deseq_dir/Z_vs_X.primary.heatmap.png 30')

    m = mock.MagicMock(side_effect=[True, False])
    path = self.module.os.path
    with mock.patch.object(path, 'isfile', m):
        with self.assertRaises(self.module.MissingCountMatrixFileException):
            self.module.call_deseq(project, component_params)
        calls = [
            mock.call('deseq_original.R', call_1),
            mock.call('deseq_original.R', call_2)
        ]
        self.module.call_script.assert_has_calls(calls)
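# A sketch of how call_deseq() appears to assemble the argument string for
# each (base, case) contrast and each raw count matrix, reconstructed from the
# expected call strings above.  The helper name and the exact parameter keys
# are assumptions taken from the tests, not the component's source.
import os

def _sketch_deseq_args(raw_matrix, project_params, cp, base, case):
    basename = os.path.basename(raw_matrix)
    # strip the prefix and extension:
    # 'raw_count_matrix.primary.counts' -> infix '.primary'
    prefix = project_params.get('raw_count_matrix_file_prefix')
    ext = '.' + project_params.get('feature_counts_file_extension')
    infix = basename[len(prefix):-len(ext)]
    contrast = case + cp.get('deseq_contrast_flag') + base  # e.g. 'Y_vs_X'
    out_dir = cp.get('deseq_output_dir')
    deseq_out = os.path.join(out_dir,
                             contrast + infix + '.' + cp.get('deseq_output_tag'))
    heatmap_out = os.path.join(out_dir,
                               contrast + infix + '.' + cp.get('heatmap_file_tag'))
    return ' '.join([raw_matrix,
                     project_params.get('sample_annotation_file'),
                     base, case, deseq_out, heatmap_out,
                     cp.get('number_of_genes_for_heatmap')])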
from sklearn.ensemble import ExtraTreesClassifier  # Extra Trees
from models.hmp_model import HMP_Model
from classifiers.base_classification import Base_Classification
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from pre_processing.processing_db_files import Processing_DB_Files
from utils.debug import Debug
from utils.project import Project, slash
from scripts.save_workspace import save
import numpy as np

#===INITIALIZATION===#
Debug.DEBUG = 0
hmp = HMP_Model()
processing = Processing_DB_Files()
project = Project()
extra_trees = ExtraTreesClassifier(n_estimators=10000, random_state=0)
base_classification = Base_Classification(hmp, extra_trees)

#===LOAD FEATURES===#
# Iterate over thresholds to find the best value
s = save()
person_list = ["f1", "m1", "m2"]
accuracy_threshould_list = []
data = {}
threshold = 0.35
project.log(
    "=========== HMP Outlier Accuracy, Threshold = {}===========".format(
        threshold))
for p in person_list:
def test_missing_count_matrix_files_raises_exception(self):
    project = Project()
    cp = Params()
    with self.assertRaises(self.module.NoCountMatricesException):
        self.module.call_deseq(project, cp)
from sklearn.ensemble import ExtraTreesClassifier  # Extra Trees
from models.arcma_model import ARCMA_Model
from classifiers.base_classification import Base_Classification
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from pre_processing.processing_db_files import Processing_DB_Files
from utils.debug import Debug
from utils.project import Project, slash
from scripts.save_workspace import save
import numpy as np
from pre_processing.balance_data import BalanceData

#===INITIALIZATION===#
Debug.DEBUG = 0
arcma = ARCMA_Model()
processing = Processing_DB_Files()
project = Project()
extra_trees = ExtraTreesClassifier(n_estimators=10000, random_state=0)
base_classification = Base_Classification(arcma, extra_trees)
balance_data = BalanceData()
threshold_balance_data = 40

#===LOAD FEATURES===#
# Iterate over thresholds to find the best value
s = save()
person_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
accuracy_threshould_list = []
data = {}
threshold = 0.60
project.log(
    "=========== ARCMA Outlier Accuracy, Threshold = {}===========".format(
        threshold))
from sklearn.ensemble import ExtraTreesClassifier  # Extra Trees
from models.hmp_model import HMP_Model
import pandas as pd
from pre_processing.processing_db_files import Processing_DB_Files
from utils.debug import Debug
from utils.project import Project, slash
from scripts.save_workspace import save
from pre_processing.get_accuracy import Get_Accuracy
from sklearn.model_selection import StratifiedKFold
from pre_processing.balance_data import BalanceData
import numpy as np

#===INITIALIZATION===#
Debug.DEBUG = 0
hmp = HMP_Model()
processing = Processing_DB_Files()
project = Project()
extra_trees = ExtraTreesClassifier(n_estimators=1000, random_state=0)
get_accuracy = Get_Accuracy()
balance_data = BalanceData()
threshold_balance_data = 40

#===LOAD FEATURES===#
# Iterate over thresholds to find the best value
persons = [
    "f1", "m1", "m2", "f2", "m3", "f3", "m4", "m5", "m6", "m7", "f4", "m8",
    "m9", "f5", "m10", "m11"
]
accuracy_by_person = pd.DataFrame()
threshold = 0.65
project.log(
def test_fill_template(self):
    project = Project()
    parameters = {
        'bam_filter_level': 'sort.primary',
        'project_directory': 'abc/foo/AB_12345',
        'genome': 'hg19',
        'genome_source_link':
            'ftp://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/dna/',
        'skip_align': False,
        'skip_analysis': False
    }
    project.parameters = parameters

    component_params = cp.read_config(
        os.path.join(root, 'components', 'pdf_report', 'report.cfg'),
        'COMPONENT_SPECIFIC')
    extra_params = cp.read_config(
        os.path.join(root, 'components', 'pdf_report', 'report.cfg'), 'STAR')

    mock_sample_ids = [
        os.path.basename(x).split('.')[0] for x in glob.glob(
            os.path.join('test_data',
                         '*' + component_params.get('coverage_file_suffix')))
    ]
    project.samples = [Sample(x, 'X') for x in mock_sample_ids]
    project.contrasts = [('X', 'Y'), ('X', 'Z'), ('Y', 'Z')]

    component_params['report_output_dir'] = os.path.join(
        os.path.abspath(os.path.dirname(__file__)), test_output_dir,
        component_params.get('report_output_dir'))
    if not os.path.isdir(component_params['report_output_dir']):
        os.mkdir(component_params['report_output_dir'])

    # link figures so they appear where they should be.
    figure_list = glob.glob(
        os.path.join(os.path.dirname(__file__), 'test_data',
                     '*' + component_params.get('coverage_plot_suffix')))
    figure_list += [
        os.path.join(os.path.dirname(__file__), 'test_data', 'bamfile_reads.pdf'),
        os.path.join(os.path.dirname(__file__), 'test_data', 'mapping_composition.pdf'),
        os.path.join(os.path.dirname(__file__), 'test_data', 'total_reads.pdf'),
        os.path.join('components', 'pdf_report', 'igv_typical.png'),
        os.path.join('components', 'pdf_report', 'igv_duplicates.png')
    ]
    for f in figure_list:
        os.symlink(
            os.path.join(root, f),
            os.path.join(component_params['report_output_dir'],
                         os.path.basename(f)))

    self.module.get_diff_exp_gene_summary = mock.Mock()
    self.module.get_diff_exp_gene_summary.return_value = [
        ['X', 'Y', 100, 200],
        ['Y_1', 'Z_2', 400, 300],
        ['X_2', 'Z_3', 150, 300]
    ]

    env = jinja2.Environment(loader=jinja2.FileSystemLoader(
        os.path.join(root, 'components', 'pdf_report')))
    template = env.get_template(component_params.get('report_template'))

    self.module.fill_template(template, project, component_params)
    self.module.compile_report(project, component_params)
def test_system_call_to_bedtools(self):
    project = Project()
    parameters = {
        'bam_filter_level': 'sort.primary',
        'project_directory': 'abc/foo/AB_12345',
        'genome': 'hg19',
        'genome_source_link':
            'ftp://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/dna/',
        'skip_align': False,
        'skip_analysis': False
    }
    project.parameters = parameters

    mock_dir = '/abc/def/'
    mock_sample_names = ['AAA', 'BBB', 'CCC']
    levels = ['sort.bam', 'sort.primary.bam', 'sort.primary.dedup.bam']
    all_samples = []
    for sn in mock_sample_names:
        bamfiles = [os.path.join(mock_dir, sn + '.' + x) for x in levels]
        s = Sample(sn, 'X', bamfiles=bamfiles)
        all_samples.append(s)
    project.samples = all_samples

    component_params = cp.read_config(
        os.path.join(root, 'components', 'pdf_report', 'report.cfg'),
        'COMPONENT_SPECIFIC')

    self.module.subprocess.Popen = mock.Mock()
    mock_process = mock.Mock()
    mock_process.communicate.return_value = ('abc', 'def')
    mock_process.returncode = 0
    self.module.subprocess.Popen.return_value = mock_process
    self.module.subprocess.STDOUT = 'abc'
    self.module.subprocess.STDERR = 'def'

    m = mock.mock_open()
    with mock.patch.object(__builtin__, 'open', m):
        expected_calls = [
            mock.call([
                component_params.get('bedtools_path'),
                component_params.get('bedtools_cmd'), '-ibam',
                '/abc/def/AAA.sort.primary.bam', '-bga'
            ], stderr='abc', stdout=m()),
            mock.call().communicate(),
            mock.call([
                component_params.get('bedtools_path'),
                component_params.get('bedtools_cmd'), '-ibam',
                '/abc/def/BBB.sort.primary.bam', '-bga'
            ], stderr='abc', stdout=m()),
            mock.call().communicate(),
            mock.call([
                component_params.get('bedtools_path'),
                component_params.get('bedtools_cmd'), '-ibam',
                '/abc/def/CCC.sort.primary.bam', '-bga'
            ], stderr='abc', stdout=m()),
            mock.call().communicate()
        ]
        self.module.calculate_coverage_data(project, component_params)
        self.module.subprocess.Popen.assert_has_calls(expected_calls)
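# For reference, the per-sample bedtools invocation the expected calls above
# encode: the component selects each sample's '.sort.primary.bam' (per
# 'bam_filter_level') and shells out once per sample.  A sketch only; the
# argument list is inferred from the test, and the real component may build
# the command differently.
def _sketch_coverage_cmd(bam_path, component_params):
    return [component_params.get('bedtools_path'),
            component_params.get('bedtools_cmd'),
            '-ibam', bam_path,
            '-bga']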
def test_system_calls_paired_experiment(self):
    mock_process = mock.Mock(name='mock_process')
    mock_process.communicate.return_value = ('', '')
    mock_process.returncode = 0
    mock_popen = mock.Mock(name='mock_popen')
    mock_popen.return_value = mock_process

    self.module.subprocess = mock.Mock()
    self.module.subprocess.Popen = mock_popen
    self.module.subprocess.STDOUT = ''
    self.module.subprocess.PIPE = ''

    p = Params()
    cp = Params()
    p.add(gtf='/path/to/GTF/mock.gtf')
    cp.add(feature_counts='/path/to/bin/featureCounts')
    cp.add(feature_counts_file_extension='counts')
    cp.add(feature_counts_output_dir='/path/to/final/featureCounts')
    p.add(paired_alignment=True)

    s1 = Sample('A', 'X')
    s1.bamfiles = [
        '/path/to/bamdir/A.bam',
        '/path/to/bamdir/A.primary.bam',
        '/path/to/bamdir/A.primary.dedup.bam'
    ]

    project = Project()
    project.add_parameters(p)
    project.add_samples([s1])

    m = mock.MagicMock(side_effect=[True, True, True])
    path = self.module.os.path
    with mock.patch.object(path, 'isfile', m):
        self.module.execute_counting(project, cp, util_methods)
        calls = [
            mock.call(
                '/path/to/bin/featureCounts -a /path/to/GTF/mock.gtf -t exon -g gene_name -p -o /path/to/final/featureCounts/A.counts /path/to/bamdir/A.bam',
                shell=True,
                stderr=self.module.subprocess.STDOUT,
                stdout=self.module.subprocess.PIPE),
            mock.call(
                '/path/to/bin/featureCounts -a /path/to/GTF/mock.gtf -t exon -g gene_name -p -o /path/to/final/featureCounts/A.primary.counts /path/to/bamdir/A.primary.bam',
                shell=True,
                stderr=self.module.subprocess.STDOUT,
                stdout=self.module.subprocess.PIPE),
            mock.call(
                '/path/to/bin/featureCounts -a /path/to/GTF/mock.gtf -t exon -g gene_name -p -o /path/to/final/featureCounts/A.primary.dedup.counts /path/to/bamdir/A.primary.dedup.bam',
                shell=True,
                stderr=self.module.subprocess.STDOUT,
                stdout=self.module.subprocess.PIPE)
        ]
        mock_popen.assert_has_calls(calls)

        # check that the sample contains paths to the new count files in the
        # correct locations:
        expected_files = [
            os.path.join('/path/to/final/featureCounts',
                         re.sub('bam', 'counts', os.path.basename(f)))
            for f in s1.bamfiles
        ]
        actual_files = s1.countfiles
        self.assertEqual(actual_files, expected_files)
from tsfresh import extract_relevant_features
from tsfresh.utilities.dataframe_functions import impute
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier  # Extra Trees
from sklearn.metrics import accuracy_score
from models.hmp_model import HMP_Model
from classifiers.base_classification import Base_Classification
from pre_processing.processing_db_files import Processing_DB_Files
from utils.debug import Debug
from utils.project import Project
import itertools
import time

#===INITIALIZATION===#
Debug.DEBUG = 0
hmp = HMP_Model()
processing = Processing_DB_Files()
project = Project()
extra_trees = ExtraTreesClassifier(n_estimators=1000,
                                   max_depth=1000,
                                   random_state=0)  # good performer
base_classification = Base_Classification(hmp, extra_trees)
_, _, _ = base_classification.predict_outliers_for_list_people_with_proba(
    36, ["f1", "m1", "m2"], "eat_soup", 0.55, remove_outliers=0.05)

#===Extract TsFresh Features===#
dataframe_1 = hmp.data_with_window["f1"]["training"]
dataframe_2 = pd.DataFrame()
labels = []
id = 1
for d in dataframe_1:
    if len(np.unique(d[hmp.label_tag])) < 2:
        d["id"] = pd.Series(np.full((1, d.shape[0]), id)[0], index=d.index)
from sklearn.ensemble import ExtraTreesClassifier  # Extra Trees
from models.umafall_model import UMAFALL_Model
from classifiers.base_classification import Base_Classification
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from pre_processing.processing_db_files import Processing_DB_Files
from utils.debug import Debug
from utils.project import Project, slash
from scripts.save_workspace import save
import numpy as np

#===INITIALIZATION===#
Debug.DEBUG = 0
umafall = UMAFALL_Model()
processing = Processing_DB_Files()
project = Project()
extra_trees = ExtraTreesClassifier(n_estimators=10000, random_state=0)
base_classification = Base_Classification(umafall, extra_trees)

#===LOAD FEATURES===#
# Iterate over thresholds to find the best value
s = save()
person_list = [14, 15, 16, 17]
accuracy_threshould_list = []
data = {}
threshold = 0.65
project.log(
    "=========== UMAFALL Outlier Accuracy, Threshold = {}===========".format(
        threshold),
    file="umafall_log.log")
for p in person_list:
    project.log("===========Person {}===========".format(p),
                file="umafall_log.log")
    data = s.load_var(
        "umafall_relevant_features{}relevant_features_{}.pkl".format(slash, p))
from models.umafall_model import UMAFALL_Model
from pre_processing.processing_db_files import Processing_DB_Files
from utils.debug import Debug
from utils.project import Project
from sklearn.ensemble import ExtraTreesClassifier  # Extra Trees
import pandas as pd
from sklearn.model_selection import train_test_split
from pre_processing.get_accuracy import Get_Accuracy
import numpy as np
from tsfresh import extract_relevant_features
import time
from pre_processing.balance_data import BalanceData

#===INITIALIZATION===#
Debug.DEBUG = 0
umafall = UMAFALL_Model()
processing = Processing_DB_Files()
project = Project()
persons = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
get_accuracy = Get_Accuracy()
balance_data = BalanceData()
threshold_balance_data = 40

# Select the best window
t = time.time()
best_model = ExtraTreesClassifier(n_estimators=1000, random_state=0)
w_accuracies = pd.DataFrame(columns=["window", "accurary"])
p = 1  # person with the most records
project.log(
    "=====================UMAFALL_SELECT_BEST_WINDOWS=====================",
    file="umafall_log_best_window.log")
for w in range(10, 110, 10):
    print("Load data with window len = {}".format(w))
from utils.debug import Debug
from models.arcma_model import ARCMA_Model
from pre_processing.processing_db_files import Processing_DB_Files
from utils.project import Project
from sklearn import tree  # Decision Tree
from sklearn.ensemble import RandomForestClassifier  # Random Forest
from sklearn.ensemble import ExtraTreesClassifier  # Extra Trees
from sklearn.naive_bayes import GaussianNB  # Naive Bayes
from sklearn.neighbors import KNeighborsClassifier  # KNN
from sklearn import svm  # SVM
from sklearn.neural_network import MLPClassifier  # multi-layer perceptron
import pandas as pd
from sklearn.model_selection import train_test_split
from pre_processing.get_accuracy import Get_Accuracy
from scripts.save_workspace import save
import numpy as np
from pre_processing.balance_data import BalanceData
import statistics as st

#===INITIALIZATION===#
Debug.DEBUG = 0
arcma = ARCMA_Model()
processing = Processing_DB_Files()
project = Project()

# hidden-layer tuple for the MLP: 500 layers of 500 units each
t_aux = []
for i in range(0, 500):
    t_aux.append(500)
t = tuple(t_aux)
####
classifiers = {
    "MPL": MLPClassifier(random_state=1, solver="adam", activation="relu",
                         max_iter=100000, alpha=1e-5, hidden_layer_sizes=t),
    "Extratrees": ExtraTreesClassifier(n_estimators=1000, random_state=1),
    "Knn": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "RandomForest": RandomForestClassifier(n_estimators=1000, random_state=1),
    "Decision Tree": tree.DecisionTreeClassifier(random_state=1),
    "SVM": svm.SVC(probability=True, random_state=1)
}
persons = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
get_accuracy = Get_Accuracy()
balance_data = BalanceData()
threshold_balance_data = 40

# Select the best classifier
accuracy_mean = pd.DataFrame(columns=["Classifier", "Accuracy"])
project.log(
    "=====================ARCMA_SELECT_BEST_ALGORITHM=====================",
    file="arcma_best_algorithm.log")
for c in classifiers:
from sklearn.ensemble import ExtraTreesClassifier  # Extra Trees
from models.arcma_model import ARCMA_Model
from classifiers.base_classification import Base_Classification
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pre_processing.processing_db_files import Processing_DB_Files
from utils.debug import Debug
from utils.project import Project, slash
from scripts.save_workspace import save
import statistics as st
from pre_processing.balance_data import BalanceData

#===INITIALIZATION===#
Debug.DEBUG = 0
arcma = ARCMA_Model()
processing = Processing_DB_Files()
project = Project()
extra_trees = ExtraTreesClassifier(n_estimators=1000, random_state=0)
base_classification = Base_Classification(arcma, extra_trees)
balance_data = BalanceData()
threshold_balance_data = 40

# Iterate over thresholds to find the best value
s = save()
person_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
accuracy_threshould_temp_aux = pd.DataFrame(
    columns=["accuracy", "discarted", "len_activity", "threshold"])
accuracy_mean = pd.DataFrame(
    columns=["accuracy", "discarted", "len_activity", "threshold"])
project.log(
    "====================ARCMA BEST THRESHOLD============================",
    file="arcma_log_best_threshold.log")
for t in np.arange(0.05, 1, 0.05):
    accuracy_threshould_temp_aux = pd.DataFrame(
        columns=["accuracy", "discarted", "len_activity"])
    for p in person_list:
def test_countfile_merging(self):
    """
    This tests that the correct files are used in the merge.  The result
    (a data structure) of the merging is mocked out.  Tests that the
    expected data is written to the file and that the file ends up in the
    correct location.
    """

    # a dummy method to mock the reading/concatenating of the data in the
    # individual files
    def mock_read(matrix, f):
        dummy = [['geneA', '0', '100', '200'],
                 ['geneB', '1', '101', '201'],
                 ['geneC', '2', '102', '202']]
        if len(matrix) == 0:
            for k in range(len(dummy)):
                matrix.append([])
        for i, l in enumerate(dummy):
            matrix[i] = l

    # mock out the actual implementations
    self.module.get_countfile_groupings = mock.Mock()
    self.module.get_countfile_groupings.return_value = [
        [
            '/path/to/final/featureCounts/A.counts',
            '/path/to/final/featureCounts/C.counts',
            '/path/to/final/featureCounts/B.counts'
        ],
        [
            '/path/to/final/featureCounts/A.primary.counts',
            '/path/to/final/featureCounts/C.primary.counts'
        ],
        [
            '/path/to/final/featureCounts/A.primary.dedup.counts',
            '/path/to/final/featureCounts/B.primary.dedup.counts',
            '/path/to/final/featureCounts/C.primary.dedup.counts'
        ]
    ]
    self.module.read = mock_read

    p = Params()
    p.add(raw_count_matrix_file_prefix='merged_counts')

    s1 = Sample('A', 'X')
    s1.countfiles = [
        '/path/to/final/featureCounts/A.primary.counts',
        '/path/to/final/featureCounts/A.counts',
        '/path/to/final/featureCounts/A.primary.dedup.counts'
    ]
    s2 = Sample('B', 'Y')
    s2.countfiles = [
        '/path/to/final/featureCounts/B.counts',
        '/path/to/final/featureCounts/B.primary.dedup.counts',
        '/path/to/final/featureCounts/B.primary.counts'
    ]
    s3 = Sample('C', 'Z')
    s3.countfiles = [
        '/path/to/final/featureCounts/C.counts',
        '/path/to/final/featureCounts/C.primary.counts',
        '/path/to/final/featureCounts/C.primary.dedup.counts'
    ]

    project = Project()
    project.add_parameters(p)
    project.add_samples([s1, s3, s2])

    m = mock.mock_open()
    with mock.patch.object(__builtin__, 'open', m):
        self.module.create_count_matrices(project, mock.Mock())
        m.assert_any_call(
            '/path/to/final/featureCounts/merged_counts.counts', 'w')
        m.assert_any_call(
            '/path/to/final/featureCounts/merged_counts.primary.counts', 'w')
        m.assert_any_call(
            '/path/to/final/featureCounts/merged_counts.primary.dedup.counts',
            'w')

        handle = m()
        calls = [
            mock.call('Gene\tA\tB\tC\n'),
            mock.call('geneA\t0\t100\t200\n'),
            mock.call('geneB\t1\t101\t201\n'),
            mock.call('geneC\t2\t102\t202\n')
        ] * 3
        handle.write.assert_has_calls(calls)
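# A sketch of the write-out these assertions pin down: each group of
# per-sample countfiles is concatenated into a matrix (via the module's own
# read(), mocked above), then written as a tab-delimited file with a 'Gene'
# header followed by one column per sample, sample names sorted.  Illustrative
# helper only, not the module's implementation.
def _sketch_write_matrix(matrix, sample_names, out_path):
    with open(out_path, 'w') as fout:
        fout.write('Gene\t' + '\t'.join(sorted(sample_names)) + '\n')
        for row in matrix:
            # each row is [gene, count_sample1, count_sample2, ...]
            fout.write('\t'.join(row) + '\n')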
from models.arcma_model import ARCMA_Model
from pre_processing.processing_db_files import Processing_DB_Files
from utils.debug import Debug
from utils.project import Project
from sklearn.ensemble import ExtraTreesClassifier  # Extra Trees
import pandas as pd
from sklearn.model_selection import train_test_split
from pre_processing.get_accuracy import Get_Accuracy
import numpy as np
from tsfresh import extract_relevant_features
import time
from pre_processing.balance_data import BalanceData

#===INITIALIZATION===#
Debug.DEBUG = 0
arcma = ARCMA_Model()
processing = Processing_DB_Files()
project = Project()
persons = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
get_accuracy = Get_Accuracy()
balance_data = BalanceData()
threshold_balance_data = 40

# Select the best window
t = time.time()
best_model = ExtraTreesClassifier(n_estimators=1000, random_state=0)
w_accuracies = pd.DataFrame(columns=["window", "accurary"])
p = 15  # person with the most records
project.log(
    "=====================ARCMA_SELECT_BEST_WINDOWS=====================",
    file="arcma_log_best_window.log")
for w in range(20, 110, 10):
    print("Load data with window len = {}".format(w))
    data = arcma.load_training_data_by_people(p)
    print("Slicing Window....")
def test_missing_count_matrix_files_raises_exception(self):
    project = Project()
    component_params = Params()
    with self.assertRaises(self.module.NoCountMatricesException):
        self.module.normalize(project, component_params)
# -*- coding: utf-8 -*-

# IMPORTS #
from utils.debug import Debug
from models.arcma_model import ARCMA_Model
from tsfresh import extract_relevant_features
from pre_processing.processing_db_files import Processing_DB_Files
from utils.project import Project, slash
from scripts.save_workspace import save

#===INITIALIZATION===#
Debug.DEBUG = 0
arcma = ARCMA_Model()
processing = Processing_DB_Files()
project = Project()
s = save()
# window = 26  # fixed window
window = 50  # best window
persons = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
for p in persons:
    data = arcma.load_training_data_by_people(p)
    print("Slicing Window....")
    data_tsfresh, y = arcma.slice_by_window_tsfresh(data, window)
    y.index += 1
    del data_tsfresh["activity"]
    classes_counts = y.value_counts()
    if len(classes_counts) > 1:
        relevant_features = extract_relevant_features(data_tsfresh, y,
from sklearn import tree  # Decision Tree
from sklearn.ensemble import RandomForestClassifier  # Random Forest
from sklearn.ensemble import ExtraTreesClassifier  # Extra Trees
from sklearn.naive_bayes import GaussianNB  # Naive Bayes
from sklearn import svm  # SVM
from sklearn.neural_network import MLPClassifier  # multi-layer perceptron
#==== Models ====#
from models.hmp_model import HMP_Model
from models.umafall_model import UMAFALL_Model
from models.arcma_model import ARCMA_Model
#==== Utils ====#
from utils.debug import Debug
from utils.project import Project
from pre_processing.processing_db_files import Processing_DB_Files
from pre_processing.get_accuracy import Get_Accuracy
from scripts.save_workspace import save

#===INITIALIZATION===#
Debug.DEBUG = 0
processing = Processing_DB_Files()
project = Project()
s = save()
get_accuracy = Get_Accuracy()

#===INIT BASES===#
hmp_persons = ["f1", "m1", "m2", "f2", "m3", "f3", "m4", "f4"]  # at least 5 activities
umafall_persons = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
arcma_persons = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
models = []
#models.append({"model_name": "hmp", "model": HMP_Model(), "persons": hmp_persons, "window": 90})
models.append({"model_name": "umafall", "model": UMAFALL_Model(),
               "persons": umafall_persons, "window": 10})
models.append({"model_name": "arcma", "model": ARCMA_Model(),
               "persons": arcma_persons, "window": 40})

# hidden-layer tuple for the MLP: 500 layers of 500 units each
t_aux = []
for i in range(0, 500):
    t_aux.append(500)