Example #1
	def test_correct_calls_are_made(self):
		"""
		Tests that the correct arguments are passed to the method which calls the normalization script.
		Mostly tests the path renaming, etc.
		"""
		self.module.call_script = mock.Mock()
		project = Project()
		project.raw_count_matrices = ['/path/to/raw_counts/raw_count_matrix.primary.counts',
					'/path/to/raw_counts/raw_count_matrix.primary.dedup.counts']
		project_params = Params()
		component_params = Params()
		project_params.add(raw_count_matrix_file_prefix = 'raw_count_matrix')
		component_params.add(normalized_counts_file_prefix = 'normalized_count_matrix')
		component_params.add(normalized_counts_output_dir = '/path/to/final/norm_counts_dir')
		component_params.add(normalization_script = 'normalize.R')
		project_params.add(sample_annotation_file = '/path/to/samples.txt')
		project.add_parameters(project_params)

		m = mock.MagicMock(side_effect = [True, True])
		path = self.module.os.path
		with mock.patch.object(path, 'isfile', m):
			self.module.normalize(project, component_params)
			calls = [mock.call('normalize.R', '/path/to/raw_counts/raw_count_matrix.primary.counts', 
					'/path/to/final/norm_counts_dir/normalized_count_matrix.primary.counts', '/path/to/samples.txt' ), 
				mock.call('normalize.R', '/path/to/raw_counts/raw_count_matrix.primary.dedup.counts', 
					'/path/to/final/norm_counts_dir/normalized_count_matrix.primary.dedup.counts', '/path/to/samples.txt' )]
			self.module.call_script.assert_has_calls(calls)
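The two normalization tests (this one and Example #2 below) pin down the expected behavior of the normalize entry point. As orientation only, a minimal sketch consistent with the assertions might look like this; the Params/Project accessors and helper names are assumptions inferred from the tests, not the project's actual code:

import os

def normalize(project, component_params):
    # hypothetical sketch inferred from the test assertions
    params = project.parameters  # assumed accessor for the added Params
    for raw_matrix in project.raw_count_matrices:
        if not os.path.isfile(raw_matrix):
            raise MissingCountMatrixFileException()
        # swap the raw prefix for the normalized prefix, keeping the
        # '.primary' / '.primary.dedup' portion of the filename intact:
        basename = os.path.basename(raw_matrix).replace(
            params.get('raw_count_matrix_file_prefix'),
            component_params.get('normalized_counts_file_prefix'))
        call_script(
            component_params.get('normalization_script'),
            raw_matrix,
            os.path.join(component_params.get('normalized_counts_output_dir'),
                         basename),
            params.get('sample_annotation_file'))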
Example #2
	def test_missing_countfile_raises_exception(self):
		"""
		Test one of the files is ok (the first), but the second is not found (for whatever reason).  Test that we throw an exception, 
		and that the one successful call was indeed made correctly.
		"""
		self.module.call_script = mock.Mock()
		project = Project()
		project.raw_count_matrices = ['/path/to/raw_counts/raw_count_matrix.primary.counts',
					'/path/to/raw_counts/raw_count_matrix.primary.dedup.counts']
		project_params = Params()
		component_params = Params()
		project_params.add(raw_count_matrix_file_prefix = 'raw_count_matrix')
		component_params.add(normalized_counts_file_prefix = 'normalized_count_matrix')
		component_params.add(normalized_counts_output_dir = '/path/to/final/norm_counts_dir')
		component_params.add(normalization_script = 'normalize.R')
		project_params.add(sample_annotation_file = '/path/to/samples.txt')
		project.add_parameters(project_params)

		m = mock.MagicMock(side_effect = [True, False])
		path = self.module.os.path
		with mock.patch.object(path, 'isfile', m):
			with self.assertRaises(self.module.MissingCountMatrixFileException):
				self.module.normalize(project, component_params)
			calls = [mock.call('normalize.R', '/path/to/raw_counts/raw_count_matrix.primary.counts', 
					'/path/to/final/norm_counts_dir/normalized_count_matrix.primary.counts', '/path/to/samples.txt' )]
			self.module.call_script.assert_has_calls(calls)
Example #3
    def test_generate_figures(self):
        """
		This is not a unit test in the conventional sense-- this is a full-scale mockup which will
		create an output pdf and everything.
		"""

        project = Project()
        parameters = {
            'aligner': 'star',
            'skip_align': False,
            'sample_dir_prefix': 'Sample_',
            'alignment_dir': 'aln',
            'project_directory': 'foo',
            'chromosomes': ['chr1', 'chr2', 'chrM']
        }
        project.parameters = parameters

        component_params = cp.read_config(
            os.path.join(root, 'components', 'pdf_report', 'report.cfg'),
            'COMPONENT_SPECIFIC')
        extra_params = cp.read_config(
            os.path.join(root, 'components', 'pdf_report', 'report.cfg'),
            'STAR')

        mock_sample_ids = [
            os.path.basename(x).split('.')[0] for x in glob.glob(
                os.path.join(
                    'test_data', '*' +
                    component_params.get('coverage_file_suffix')))
        ]
        project.samples = [Sample(x, 'X') for x in mock_sample_ids]

        component_params['report_output_dir'] = os.path.join(
            os.path.abspath(os.path.dirname(__file__)), test_output_dir,
            component_params.get('report_output_dir'))
        if not os.path.isdir(component_params['report_output_dir']):
            os.mkdir(component_params['report_output_dir'])

        # link the test files so they 'appear' in the correct location:
        for x in glob.glob(
                os.path.join('test_data',
                             '*' + component_params.get('coverage_file_suffix'))):
            os.symlink(
                os.path.abspath(x),
                os.path.join(component_params['report_output_dir'],
                             os.path.basename(x)))

        mock_log_data = mock_log_data_structure(project, extra_params)
        self.module.star_methods.process_star_logs = mock.Mock()
        self.module.star_methods.process_star_logs.return_value = mock_log_data

        self.module.get_bam_counts = mock.Mock()
        self.module.get_bam_counts.return_value = mock_bam_counts(
            mock_log_data.keys())
        self.module.calculate_coverage_data = mock.Mock()
        self.module.calculate_coverage_data.return_value = None
        self.module.generate_figures(project, component_params, extra_params)
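mock_log_data_structure and mock_bam_counts are helpers defined elsewhere in this test module. Purely as an assumption about their shape (the test only requires that both key their results on sample IDs), minimal stand-ins could be:

def mock_log_data_structure(project, extra_params):
    # hypothetical stand-in: one dict of parsed STAR log fields per sample ID
    return {'SampleA': {'input_reads': 2000, 'uniquely_mapped': 1500},
            'SampleB': {'input_reads': 1800, 'uniquely_mapped': 1400}}

def mock_bam_counts(sample_ids):
    # hypothetical stand-in: one read count per sample ID
    return {name: 1000 for name in sample_ids}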
Example #4
	def test_general_portion_of_template_injected_correctly(self):
		template = 'STAR=%STAR%\nSAMTOOLS=%SAMTOOLS%\nPICARD_DIR=%PICARD_DIR%\nGTF=%GTF%\nGENOME_INDEX=%GENOME_INDEX%'
		expected_result = 'STAR=STARPATH\nSAMTOOLS=SAM\nPICARD_DIR=PIC\nGTF=my.gtf\nGENOME_INDEX=GI'
		p = Params()
		p.add(star_align = 'STARPATH')
		p.add(samtools = 'SAM')
		p.add(gtf = 'my.gtf')
		p.add(star_genome_index = 'GI')
		p.add(picard = 'PIC')
		myproject = Project()
		myproject.parameters = p

		result = self.module.fill_out_general_template_portion(myproject, template)
		self.assertEqual(result, expected_result)
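Inferred from the template and expected string above, the function under test performs literal placeholder substitution. A minimal sketch (the parameter-lookup API is an assumption):

def fill_out_general_template_portion(project, template):
    # hypothetical sketch: replace each %PLACEHOLDER% with the matching
    # project parameter, mirroring the test's expectations
    mapping = {
        '%STAR%': project.parameters.get('star_align'),
        '%SAMTOOLS%': project.parameters.get('samtools'),
        '%PICARD_DIR%': project.parameters.get('picard'),
        '%GTF%': project.parameters.get('gtf'),
        '%GENOME_INDEX%': project.parameters.get('star_genome_index'),
    }
    for placeholder, value in mapping.items():
        template = template.replace(placeholder, value)
    return template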
Example #5
    def test_bad_bamfile_path_raises_exception(self):

        self.module.subprocess = mock.Mock()

        p = Params()
        p.add(gtf='/path/to/GTF/mock.gtf')
        p.add(feature_counts='/path/to/bin/featureCounts')
        p.add(feature_counts_file_extension='counts')
        p.add(feature_counts_output_dir='/path/to/final/featureCounts')
        p.add(paired_alignment=False)

        s1 = Sample('A', 'X')
        s1.bamfiles = [
            '/path/to/bamdir/A.bam', '/path/to/bamdir/A.primary.bam',
            '/path/to/bamdir/A.primary.dedup.bam'
        ]
        s2 = Sample('B', 'X')
        s2.bamfiles = ['/path/to/bamdir/B.bam', '/bad/path/B.sort.bam']

        project = Project()
        project.add_parameters(p)
        project.add_samples([s1, s2])

        m = mock.MagicMock(side_effect=[True, True, True, True, False])
        path = self.module.os.path
        with mock.patch.object(path, 'isfile', m):
            with self.assertRaises(self.module.MissingBamFileException):
                self.module.execute_counting(project, util_methods)
Example #6
    def test_group_countfiles_raises_exception_if_missing_type(self):
        """
        Test the method that aggregates all the countfiles generated from each 'type' of bam file.  That is, we may have multiple bam files for each sample (e.g. primary alignments, deduplicated, etc).
        We will be generating a countfile for each one of those.  When we assemble into a count matrix, we obviously group the files of a particular 'type' (e.g. those coming from deduplicated BAM files).
        This tests that the glob methods are called with the correct parameters given the sample annotations prescribed.

        This one tests that an exception is raised if one of the countfile 'types' is missing.  Here, sample B is missing a countfile corresponding to the primary.counts-based BAM files.
        """

        p = Params()
        p.add(feature_counts_output_dir='/path/to/final/featureCounts')

        s1 = Sample('A', 'X')
        s1.countfiles = [
            '/path/to/final/featureCounts/A.counts',
            '/path/to/final/featureCounts/A.primary.counts',
            '/path/to/final/featureCounts/A.primary.dedup.counts'
        ]
        s2 = Sample('B', 'Y')
        s2.countfiles = [
            '/path/to/final/featureCounts/B.counts',
            '/path/to/final/featureCounts/B.primary.dedup.counts'
        ]
        s3 = Sample('C', 'Z')
        s3.countfiles = [
            '/path/to/final/featureCounts/C.counts',
            '/path/to/final/featureCounts/C.primary.counts',
            '/path/to/final/featureCounts/C.primary.dedup.counts'
        ]

        project = Project()
        project.add_parameters(p)
        project.add_samples([s1, s2, s3])

        mock_util_methods = mock.Mock()
        mock_case_insensitive_glob = mock.Mock()
        mock_case_insensitive_glob.side_effect = [
            [
                '/path/to/final/featureCounts/A.counts',
                '/path/to/final/featureCounts/B.counts',
                '/path/to/final/featureCounts/C.counts'
            ],
            [
                '/path/to/final/featureCounts/A.primary.counts',
                '/path/to/final/featureCounts/C.primary.counts'
            ],
            [
                '/path/to/final/featureCounts/A.primary.dedup.counts',
                '/path/to/final/featureCounts/B.primary.dedup.counts',
                '/path/to/final/featureCounts/C.primary.dedup.counts'
            ]
        ]
        with self.assertRaises(self.module.CountfileQuantityException):
            result = self.module.get_countfile_groupings(
                project, mock_case_insensitive_glob)
Example #7
    def test_group_countfiles(self):
        """
		Test the method that aggregates all the countfiles generated from each 'type' of bam file.  That is, we may have multiple bam files for each sample (e.g. primary alignments, deduplicated, etc).
		We will be generating a countfile for each one of those.  When we assemble into a count matrix, we obviously group the files of a particular 'type' (e.g. those coming from deduplicated BAM files).
		This tests that the the glob methods are called with the correct parameters given the sample annotations prescribed.
		"""

        p = Params()
        cp = Params()
        cp.add(feature_counts_output_dir='/path/to/final/featureCounts')

        s1 = Sample('A', 'X')
        s1.countfiles = [
            '/path/to/final/featureCounts/A.counts',
            '/path/to/final/featureCounts/A.primary.counts',
            '/path/to/final/featureCounts/A.primary.dedup.counts'
        ]
        s2 = Sample('B', 'Y')
        s2.countfiles = [
            '/path/to/final/featureCounts/B.counts',
            '/path/to/final/featureCounts/B.primary.counts',
            '/path/to/final/featureCounts/B.primary.dedup.counts'
        ]
        s3 = Sample('C', 'Z')
        s3.countfiles = [
            '/path/to/final/featureCounts/C.counts',
            '/path/to/final/featureCounts/C.primary.counts',
            '/path/to/final/featureCounts/C.primary.dedup.counts'
        ]

        project = Project()
        project.add_parameters(p)
        project.add_samples([s1, s2, s3])

        result = self.module.get_countfile_groupings(project, cp)
        expected_result = [
            [
                '/path/to/final/featureCounts/A.counts',
                '/path/to/final/featureCounts/B.counts',
                '/path/to/final/featureCounts/C.counts'
            ],
            [
                '/path/to/final/featureCounts/A.primary.counts',
                '/path/to/final/featureCounts/B.primary.counts',
                '/path/to/final/featureCounts/C.primary.counts'
            ],
            [
                '/path/to/final/featureCounts/A.primary.dedup.counts',
                '/path/to/final/featureCounts/B.primary.dedup.counts',
                '/path/to/final/featureCounts/C.primary.dedup.counts'
            ]
        ]
        self.assertEqual(result, expected_result)
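Taken together, this test and Example #6 say that get_countfile_groupings buckets countfiles by 'type' and demands exactly one file per sample in every bucket. The real module appears to glob the output directory (Example #6 injects a case-insensitive glob helper); the sketch below instead buckets the samples' recorded countfiles, which reproduces the asserted grouping and the CountfileQuantityException without touching the filesystem. All names outside the tests are assumptions:

import os
from collections import defaultdict

def get_countfile_groupings(project, component_params):
    # hypothetical sketch: bucket every sample's countfiles by shared suffix
    # ('counts', 'primary.counts', 'primary.dedup.counts', ...)
    groups = defaultdict(list)
    for sample in project.samples:
        for f in sample.countfiles:
            suffix = '.'.join(os.path.basename(f).split('.')[1:])
            groups[suffix].append(f)

    groupings = []
    for suffix in sorted(groups, key=len):  # shortest suffix first
        files = sorted(groups[suffix])
        if len(files) != len(project.samples):
            # some sample is missing a countfile of this 'type'
            raise CountfileQuantityException()
        groupings.append(files)
    return groupings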
Example #8
    def test_system_calls_single_end_experiment(self):
        self.module.subprocess = mock.Mock()

        p = Params()
        p.add(gtf='/path/to/GTF/mock.gtf')
        p.add(feature_counts='/path/to/bin/featureCounts')
        p.add(feature_counts_file_extension='counts')
        p.add(feature_counts_output_dir='/path/to/final/featureCounts')
        p.add(paired_alignment=False)

        s1 = Sample('A', 'X')
        s1.bamfiles = [
            '/path/to/bamdir/A.bam', '/path/to/bamdir/A.primary.bam',
            '/path/to/bamdir/A.primary.dedup.bam'
        ]

        project = Project()
        project.add_parameters(p)
        project.add_samples([s1])

        m = mock.MagicMock(side_effect=[True, True, True])
        path = self.module.os.path
        with mock.patch.object(path, 'isfile', m):
            self.module.execute_counting(project, util_methods)

            calls = [
                mock.call(
                    '/path/to/bin/featureCounts -a /path/to/GTF/mock.gtf -t exon -g gene_name -o /path/to/final/featureCounts/A.counts /path/to/bamdir/A.bam',
                    shell=True),
                mock.call(
                    '/path/to/bin/featureCounts -a /path/to/GTF/mock.gtf -t exon -g gene_name -o /path/to/final/featureCounts/A.primary.counts /path/to/bamdir/A.primary.bam',
                    shell=True),
                mock.call(
                    '/path/to/bin/featureCounts -a /path/to/GTF/mock.gtf -t exon -g gene_name -o /path/to/final/featureCounts/A.primary.dedup.counts /path/to/bamdir/A.primary.dedup.bam',
                    shell=True)
            ]
            self.module.subprocess.check_call.assert_has_calls(calls)

            # check that the sample contains paths to the new count files in the correct locations:
            expected_files = [
                os.path.join('/path/to/final/featureCounts',
                             re.sub('bam', 'counts', os.path.basename(f)))
                for f in s1.bamfiles
            ]
            actual_files = s1.countfiles
            self.assertEqual(actual_files, expected_files)
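The asserted command strings fix how execute_counting assembles each featureCounts invocation and where the resulting countfiles land. Below is a condensed sketch consistent with this test and with the paired-end variant in Example #17 (which adds the -p flag and takes component params separately); everything not asserted by the tests is an assumption:

import os
import re
import subprocess

def execute_counting(project, util_methods):
    # hypothetical sketch inferred from the asserted system calls
    params = project.parameters
    for sample in project.samples:
        countfiles = []
        for bam in sample.bamfiles:
            if not os.path.isfile(bam):
                raise MissingBamFileException()
            countfile = os.path.join(
                params.get('feature_counts_output_dir'),
                re.sub('bam$', params.get('feature_counts_file_extension'),
                       os.path.basename(bam)))
            cmd = '{0} -a {1} -t exon -g gene_name {2}-o {3} {4}'.format(
                params.get('feature_counts'),
                params.get('gtf'),
                '-p ' if params.get('paired_alignment') else '',
                countfile,
                bam)
            subprocess.check_call(cmd, shell=True)
            countfiles.append(countfile)
        sample.countfiles = countfiles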
Example #9
    def test_correct_calls_are_made(self):
        """
		Tests that the correct arguments are passed to the method which calls the DESeq script.
		Mostly tests the path renaming, etc.
		"""
        self.module.call_script = mock.Mock()
        project = Project()
        project.raw_count_matrices = [
            '/path/to/raw_counts/raw_count_matrix.primary.counts',
            '/path/to/raw_counts/raw_count_matrix.primary.dedup.counts'
        ]
        project_params = Params()
        component_params = Params()
        project_params.add(raw_count_matrix_file_prefix='raw_count_matrix')
        project_params.add(feature_counts_file_extension='counts')
        component_params.add(deseq_output_dir='/path/to/final/deseq_dir')
        component_params.add(deseq_script='deseq_original.R')
        project_params.add(sample_annotation_file='/path/to/samples.txt')
        component_params.add(deseq_output_tag='deseq')
        component_params.add(deseq_contrast_flag='_vs_')
        component_params.add(number_of_genes_for_heatmap='30')
        component_params.add(heatmap_file_tag='heatmap.png')

        project.add_parameters(project_params)
        project.contrasts = [('X', 'Y'), ('X', 'Z')]

        # construct the expected call strings:
        call_1 = '/path/to/raw_counts/raw_count_matrix.primary.counts /path/to/samples.txt X Y /path/to/final/deseq_dir/Y_vs_X.primary.deseq /path/to/final/deseq_dir/Y_vs_X.primary.heatmap.png 30'
        call_2 = '/path/to/raw_counts/raw_count_matrix.primary.counts /path/to/samples.txt X Z /path/to/final/deseq_dir/Z_vs_X.primary.deseq /path/to/final/deseq_dir/Z_vs_X.primary.heatmap.png 30'
        call_3 = '/path/to/raw_counts/raw_count_matrix.primary.dedup.counts /path/to/samples.txt X Y /path/to/final/deseq_dir/Y_vs_X.primary.dedup.deseq /path/to/final/deseq_dir/Y_vs_X.primary.dedup.heatmap.png 30'
        call_4 = '/path/to/raw_counts/raw_count_matrix.primary.dedup.counts /path/to/samples.txt X Z /path/to/final/deseq_dir/Z_vs_X.primary.dedup.deseq /path/to/final/deseq_dir/Z_vs_X.primary.dedup.heatmap.png 30'

        m = mock.MagicMock(side_effect=[True, True])
        path = self.module.os.path
        with mock.patch.object(path, 'isfile', m):
            self.module.call_deseq(project, component_params)
            calls = [
                mock.call('deseq_original.R', call_1),
                mock.call('deseq_original.R', call_2),
                mock.call('deseq_original.R', call_3),
                mock.call('deseq_original.R', call_4)
            ]
            self.module.call_script.assert_has_calls(calls)
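The four call strings encode the output-naming scheme: for each raw count matrix and each (base, case) contrast, the script receives the matrix, the annotation file, the two conditions, an output file named case_vs_base.<bam type>.<tag>, a matching heatmap path, and the heatmap gene count. A sketch of just that argument construction, under assumed accessors:

import os

def build_deseq_args(project, component_params):
    # hypothetical helper reproducing the argument strings asserted above
    params = project.parameters
    arg_strings = []
    for raw_matrix in project.raw_count_matrices:
        base = os.path.basename(raw_matrix)
        # the '.primary' / '.primary.dedup' portion between prefix and extension:
        bam_type = base[len(params.get('raw_count_matrix_file_prefix')):
                        -len('.' + params.get('feature_counts_file_extension'))]
        for base_condition, case_condition in project.contrasts:
            contrast = (case_condition +
                        component_params.get('deseq_contrast_flag') +
                        base_condition + bam_type + '.')
            output = os.path.join(
                component_params.get('deseq_output_dir'),
                contrast + component_params.get('deseq_output_tag'))
            heatmap = os.path.join(
                component_params.get('deseq_output_dir'),
                contrast + component_params.get('heatmap_file_tag'))
            arg_strings.append(' '.join([
                raw_matrix, params.get('sample_annotation_file'),
                base_condition, case_condition, output, heatmap,
                component_params.get('number_of_genes_for_heatmap')]))
    return arg_strings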
Example #10
    def test_missing_countfile_raises_exception(self):
        """
		Test one of the files is ok (the first), but the second is not found (for whatever reason).  Test that we throw an exception, 
		and that the one successful call was indeed made correctly.
		"""
        self.module.call_script = mock.Mock()
        project = Project()
        project.raw_count_matrices = [
            '/path/to/raw_counts/raw_count_matrix.primary.counts',
            '/path/to/raw_counts/raw_count_matrix.primary.dedup.counts'
        ]

        project_params = Params()
        component_params = Params()
        project_params.add(raw_count_matrix_file_prefix='raw_count_matrix')
        project_params.add(feature_counts_file_extension='counts')
        component_params.add(deseq_output_dir='/path/to/final/deseq_dir')
        component_params.add(deseq_script='deseq_original.R')
        project_params.add(sample_annotation_file='/path/to/samples.txt')
        component_params.add(deseq_output_tag='deseq')
        component_params.add(deseq_contrast_flag='_vs_')
        component_params.add(number_of_genes_for_heatmap='30')
        component_params.add(heatmap_file_tag='heatmap.png')

        project.add_parameters(project_params)
        project.contrasts = [('X', 'Y'), ('X', 'Z')]

        # construct the expected call strings:
        call_1 = '/path/to/raw_counts/raw_count_matrix.primary.counts /path/to/samples.txt X Y /path/to/final/deseq_dir/Y_vs_X.primary.deseq /path/to/final/deseq_dir/Y_vs_X.primary.heatmap.png 30'
        call_2 = '/path/to/raw_counts/raw_count_matrix.primary.counts /path/to/samples.txt X Z /path/to/final/deseq_dir/Z_vs_X.primary.deseq /path/to/final/deseq_dir/Z_vs_X.primary.heatmap.png 30'

        m = mock.MagicMock(side_effect=[True, False])
        path = self.module.os.path
        with mock.patch.object(path, 'isfile', m):
            with self.assertRaises(
                    self.module.MissingCountMatrixFileException):
                self.module.call_deseq(project, component_params)
            calls = [
                mock.call('deseq_original.R', call_1),
                mock.call('deseq_original.R', call_2)
            ]
            self.module.call_script.assert_has_calls(calls)
Example #11

from sklearn.ensemble import ExtraTreesClassifier  # Extra Trees
from utils.debug import Debug
from models.hmp_model import HMP_Model
from classifiers.base_classification import Base_Classification
import pandas as pd
from sklearn.model_selection import train_test_split
from pre_processing.processing_db_files import Processing_DB_Files
from utils.project import Project, slash
from scripts.save_workspace import save
from sklearn.model_selection import StratifiedKFold
import numpy as np

#===INITIALIZATION===#
Debug.DEBUG = 0
hmp = HMP_Model()
processing = Processing_DB_Files()
project = Project()
extra_trees = ExtraTreesClassifier(n_estimators=10000, random_state=0)
base_classification = Base_Classification(hmp, extra_trees)

#===LOAD FEATURES===#

# Iterate over thresholds to find the best value #
s = save()
person_list = ["f1", "m1", "m2"]
accuracy_threshould_list = []
data = {}
threshold = 0.35
project.log(
    "=========== HMP Outlier Accuracy, Threshold = {}===========".format(
        threshold))
for p in person_list:
Example #12
	def test_missing_count_matrix_files_raises_exception(self):
		project = Project()
		cp = Params()
		with self.assertRaises(self.module.NoCountMatricesException):
			self.module.call_deseq(project, cp)
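This test (and its normalize twin in Example #25) implies a simple guard at the top of the entry point; a plausible form, stated as an assumption:

def call_deseq(project, component_params):
    # hypothetical guard implied by the test above
    if len(getattr(project, 'raw_count_matrices', [])) == 0:
        raise NoCountMatricesException()
    # ... the rest proceeds as exercised in Examples #9 and #10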
Example #13
from sklearn.ensemble import ExtraTreesClassifier  # Extra Trees
from utils.debug import Debug
from models.arcma_model import ARCMA_Model
from classifiers.base_classification import Base_Classification
import pandas as pd
from sklearn.model_selection import train_test_split
from pre_processing.processing_db_files import Processing_DB_Files
from utils.project import Project, slash
from scripts.save_workspace import save
from sklearn.model_selection import StratifiedKFold
import numpy as np
from pre_processing.balance_data import BalanceData

#===INITIALIZATION===#
Debug.DEBUG = 0
arcma = ARCMA_Model()
processing = Processing_DB_Files()
project = Project()
extra_trees = ExtraTreesClassifier(n_estimators=10000, random_state=0)
base_classification = Base_Classification(arcma, extra_trees)
balance_data = BalanceData()
threshold_balance_data = 40

#===LOAD FEATURES===#

# Iterate over thresholds to find the best value #
s = save()
person_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
accuracy_threshould_list = []
data = {}
threshold = 0.60
project.log(
    "=========== ARCMA Outlier Accuracy, Threshold = {}===========".format(
        threshold))
Example #14
from sklearn.ensemble import ExtraTreesClassifier  # Extra Trees
from utils.debug import Debug
from models.hmp_model import HMP_Model
import pandas as pd
from pre_processing.processing_db_files import Processing_DB_Files
from utils.project import Project, slash
from scripts.save_workspace import save
from pre_processing.get_accuracy import Get_Accuracy
from sklearn.model_selection import StratifiedKFold
from pre_processing.balance_data import BalanceData
import numpy as np

#===INITIALIZATION===#
Debug.DEBUG = 0
hmp = HMP_Model()
processing = Processing_DB_Files()
project = Project()
extra_trees = ExtraTreesClassifier(n_estimators=1000, random_state=0)
get_accuracy = Get_Accuracy()
balance_data = BalanceData()
threshold_balance_data = 40

#===LOAD FEATURES===#

# Iterate over thresholds to find the best value #
persons = [
    "f1", "m1", "m2", "f2", "m3", "f3", "m4", "m5", "m6", "m7", "f4", "m8",
    "m9", "f5", "m10", "m11"
]
accuracy_by_person = pd.DataFrame()
threshold = 0.65
project.log(
Example #15
    def test_fill_template(self):

        project = Project()
        parameters = {
            'bam_filter_level': 'sort.primary',
            'project_directory': 'abc/foo/AB_12345',
            'genome': 'hg19',
            'genome_source_link':
            'ftp://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/dna/',
            'skip_align': False,
            'skip_analysis': False
        }

        project.parameters = parameters

        component_params = cp.read_config(
            os.path.join(root, 'components', 'pdf_report', 'report.cfg'),
            'COMPONENT_SPECIFIC')
        extra_params = cp.read_config(
            os.path.join(root, 'components', 'pdf_report', 'report.cfg'),
            'STAR')

        mock_sample_ids = [
            os.path.basename(x).split('.')[0] for x in glob.glob(
                os.path.join(
                    'test_data', '*' +
                    component_params.get('coverage_file_suffix')))
        ]
        project.samples = [Sample(x, 'X') for x in mock_sample_ids]
        project.contrasts = [('X', 'Y'), ('X', 'Z'), ('Y', 'Z')]

        component_params['report_output_dir'] = os.path.join(
            os.path.abspath(os.path.dirname(__file__)), test_output_dir,
            component_params.get('report_output_dir'))
        if not os.path.isdir(component_params['report_output_dir']):
            os.mkdir(component_params['report_output_dir'])

        # link figures so they appear where they should be.
        figure_list = glob.glob(
            os.path.join(os.path.dirname(__file__), 'test_data',
                         '*' + component_params.get('coverage_plot_suffix')))
        figure_list += [
            os.path.join(os.path.dirname(__file__), 'test_data',
                         'bamfile_reads.pdf'),
            os.path.join(os.path.dirname(__file__), 'test_data',
                         'mapping_composition.pdf'),
            os.path.join(os.path.dirname(__file__), 'test_data',
                         'total_reads.pdf'),
            os.path.join('components', 'pdf_report', 'igv_typical.png'),
            os.path.join('components', 'pdf_report', 'igv_duplicates.png')
        ]
        for f in figure_list:
            os.symlink(
                os.path.join(root, f),
                os.path.join(component_params['report_output_dir'],
                             os.path.basename(f)))

        self.module.get_diff_exp_gene_summary = mock.Mock()
        self.module.get_diff_exp_gene_summary.return_value = [[
            'X', 'Y', 100, 200
        ], ['Y_1', 'Z_2', 400, 300], ['X_2', 'Z_3', 150, 300]]

        env = jinja2.Environment(loader=jinja2.FileSystemLoader(
            os.path.join(root, 'components', 'pdf_report')))
        template = env.get_template(component_params.get('report_template'))

        self.module.fill_template(template, project, component_params)
        self.module.compile_report(project, component_params)
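fill_template is exercised only end-to-end here, so its internals are not pinned down by assertions. As a pure assumption about its shape, it presumably renders the jinja2 template with project-derived data and writes the LaTeX source that compile_report then builds into the PDF; the context keys and output filename below are guesses:

import os

def fill_template(template, project, component_params):
    # hypothetical sketch: render the report template and write the LaTeX source
    context = {
        'project': project,
        'samples': project.samples,
        'diff_exp_summary': get_diff_exp_gene_summary(project),  # mocked in the test
    }
    rendered = template.render(context)
    out_path = os.path.join(component_params['report_output_dir'], 'report.tex')
    with open(out_path, 'w') as fh:
        fh.write(rendered)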
Example #16
    def test_system_call_to_bedtools(self):

        project = Project()
        parameters = {
            'bam_filter_level': 'sort.primary',
            'project_directory': 'abc/foo/AB_12345',
            'genome': 'hg19',
            'genome_source_link':
            'ftp://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/dna/',
            'skip_align': False,
            'skip_analysis': False
        }

        project.parameters = parameters

        mock_dir = '/abc/def/'
        mock_sample_names = ['AAA', 'BBB', 'CCC']
        levels = ['sort.bam', 'sort.primary.bam', 'sort.primary.dedup.bam']

        all_samples = []
        for sn in mock_sample_names:
            bamfiles = map(lambda x: os.path.join(mock_dir, sn + '.' + x),
                           levels)
            s = Sample(sn, 'X', bamfiles=bamfiles)
            all_samples.append(s)

        project.samples = all_samples

        component_params = cp.read_config(
            os.path.join(root, 'components', 'pdf_report', 'report.cfg'),
            'COMPONENT_SPECIFIC')

        self.module.subprocess.Popen = mock.Mock()

        mock_process = mock.Mock()
        mock_process.communicate.return_value = ('abc', 'def')
        mock_process.returncode = 0
        self.module.subprocess.Popen.return_value = mock_process
        self.module.subprocess.STDOUT = 'abc'
        self.module.subprocess.STDERR = 'def'

        m = mock.mock_open()
        with mock.patch.object(__builtin__, 'open', m) as x:
            expected_calls = [
                mock.call([
                    component_params.get('bedtools_path'),
                    component_params.get('bedtools_cmd'), '-ibam',
                    '/abc/def/AAA.sort.primary.bam', '-bga'
                ],
                          stderr='abc',
                          stdout=m()),
                mock.call().communicate(),
                mock.call([
                    component_params.get('bedtools_path'),
                    component_params.get('bedtools_cmd'), '-ibam',
                    '/abc/def/BBB.sort.primary.bam', '-bga'
                ],
                          stderr='abc',
                          stdout=m()),
                mock.call().communicate(),
                mock.call([
                    component_params.get('bedtools_path'),
                    component_params.get('bedtools_cmd'), '-ibam',
                    '/abc/def/CCC.sort.primary.bam', '-bga'
                ],
                          stderr='abc',
                          stdout=m()),
                mock.call().communicate()
            ]
            self.module.calculate_coverage_data(project, component_params)

        self.module.subprocess.Popen.assert_has_calls(expected_calls)
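From the expected Popen calls, calculate_coverage_data runs the configured bedtools command once per sample against the BAM matching bam_filter_level, streaming stdout into an opened coverage file. A sketch under those assumptions (the coverage-file naming is guessed):

import os
import subprocess

def calculate_coverage_data(project, component_params):
    # hypothetical sketch inferred from the asserted Popen calls
    level = project.parameters['bam_filter_level']  # e.g. 'sort.primary'
    for sample in project.samples:
        bam = next(b for b in sample.bamfiles if b.endswith(level + '.bam'))
        out_path = os.path.join(
            component_params['report_output_dir'],
            os.path.basename(bam) + component_params.get('coverage_file_suffix'))
        with open(out_path, 'w') as out:
            p = subprocess.Popen(
                [component_params.get('bedtools_path'),
                 component_params.get('bedtools_cmd'),
                 '-ibam', bam, '-bga'],
                stderr=subprocess.STDOUT,
                stdout=out)
            p.communicate()
        if p.returncode != 0:
            raise Exception('bedtools call failed for {0}'.format(bam))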
Example #17
    def test_system_calls_paired_experiment(self):

        mock_process = mock.Mock(name='mock_process')
        mock_process.communicate.return_value = ('', '')
        mock_process.returncode = 0

        mock_popen = mock.Mock(name='mock_popen')
        mock_popen.return_value = mock_process

        self.module.subprocess = mock.Mock()
        self.module.subprocess.Popen = mock_popen
        self.module.subprocess.STDOUT = ''
        self.module.subprocess.PIPE = ''

        p = Params()
        cp = Params()
        p.add(gtf='/path/to/GTF/mock.gtf')
        cp.add(feature_counts='/path/to/bin/featureCounts')
        cp.add(feature_counts_file_extension='counts')
        cp.add(feature_counts_output_dir='/path/to/final/featureCounts')
        p.add(paired_alignment=True)

        s1 = Sample('A', 'X')
        s1.bamfiles = [
            '/path/to/bamdir/A.bam', '/path/to/bamdir/A.primary.bam',
            '/path/to/bamdir/A.primary.dedup.bam'
        ]

        project = Project()
        project.add_parameters(p)
        project.add_samples([s1])

        m = mock.MagicMock(side_effect=[True, True, True])
        path = self.module.os.path
        with mock.patch.object(path, 'isfile', m):
            self.module.execute_counting(project, cp, util_methods)

            calls = [
                mock.call(
                    '/path/to/bin/featureCounts -a /path/to/GTF/mock.gtf -t exon -g gene_name -p -o /path/to/final/featureCounts/A.counts /path/to/bamdir/A.bam',
                    shell=True,
                    stderr=self.module.subprocess.STDOUT,
                    stdout=self.module.subprocess.PIPE),
                mock.call(
                    '/path/to/bin/featureCounts -a /path/to/GTF/mock.gtf -t exon -g gene_name -p -o /path/to/final/featureCounts/A.primary.counts /path/to/bamdir/A.primary.bam',
                    shell=True,
                    stderr=self.module.subprocess.STDOUT,
                    stdout=self.module.subprocess.PIPE),
                mock.call(
                    '/path/to/bin/featureCounts -a /path/to/GTF/mock.gtf -t exon -g gene_name -p -o /path/to/final/featureCounts/A.primary.dedup.counts /path/to/bamdir/A.primary.dedup.bam',
                    shell=True,
                    stderr=self.module.subprocess.STDOUT,
                    stdout=self.module.subprocess.PIPE)
            ]
            mock_popen.assert_has_calls(calls)

        # check that the sample contains paths to the new count files in the correct locations:
        expected_files = [
            os.path.join('/path/to/final/featureCounts',
                         re.sub('bam', 'counts', os.path.basename(f)))
            for f in s1.bamfiles
        ]
        actual_files = s1.countfiles
        self.assertEqual(actual_files, expected_files)
Example #18

from sklearn.ensemble import ExtraTreesClassifier  # Extra Trees
from models.hmp_model import HMP_Model
from classifiers.base_classification import Base_Classification
from utils.debug import Debug
from tsfresh import extract_relevant_features
from tsfresh.utilities.dataframe_functions import impute
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pre_processing.processing_db_files import Processing_DB_Files
import itertools
from sklearn.metrics import accuracy_score
from utils.project import Project
import time

#===INITIALIZATION===#
Debug.DEBUG = 0
hmp = HMP_Model()
processing = Processing_DB_Files()
project = Project()
extra_trees = ExtraTreesClassifier(n_estimators=1000,
                                   max_depth=1000,
                                   random_state=0)  #Good performer
base_classification = Base_Classification(hmp, extra_trees)
_, _, _ = base_classification.predict_outliers_for_list_people_with_proba(
    36, ["f1", "m1", "m2"], "eat_soup", 0.55, remove_outliers=0.05)

#===Extract TsFresh Features===#
dataframe_1 = hmp.data_with_window["f1"]["training"]
dataframe_2 = pd.DataFrame()
labels = []
id = 1
for d in dataframe_1:
    if len(np.unique(d[hmp.label_tag])) < 2:
        d["id"] = pd.Series(np.full((1, d.shape[0]), id)[0], index=d.index)
Example #19
from sklearn.ensemble import ExtraTreesClassifier  # Extra Trees
from utils.debug import Debug
from models.umafall_model import UMAFALL_Model
from classifiers.base_classification import Base_Classification
import pandas as pd
from sklearn.model_selection import train_test_split
from pre_processing.processing_db_files import Processing_DB_Files  
from utils.project import Project, slash
from scripts.save_workspace import save
from sklearn.model_selection import StratifiedKFold
import numpy as np


#===INITIALIZATION===#
Debug.DEBUG = 0
umafall = UMAFALL_Model()
processing = Processing_DB_Files()
project = Project()
extra_trees = ExtraTreesClassifier(n_estimators = 10000, random_state=0)
base_classification = Base_Classification(umafall, extra_trees)

#===LOAD FEATURES===#

# Iterate over thresholds to find the best value #
s = save()
person_list = [14,15, 16, 17]
accuracy_threshould_list = []
data = {}
threshold = 0.65
project.log("=========== UMAFALL Outlier Accuracy, Threshold = {}===========".format(threshold), file="umafall_log.log")
for p in person_list:
    project.log("===========Person {}===========".format(p), file="umafall_log.log")
    data = s.load_var("umafall_relevant_features{}relevant_features_{}.pkl".format(slash, p))
Example #20

from models.umafall_model import UMAFALL_Model
from utils.debug import Debug
from pre_processing.processing_db_files import Processing_DB_Files
from utils.project import Project
from sklearn.ensemble import ExtraTreesClassifier  # Extra Trees
import pandas as pd
from sklearn.model_selection import train_test_split
from pre_processing.get_accuracy import Get_Accuracy
import numpy as np
from tsfresh import extract_relevant_features
import time
from pre_processing.balance_data import BalanceData

#===INITIALIZATION===#
Debug.DEBUG = 0
umafall = UMAFALL_Model()
processing = Processing_DB_Files()
project = Project()
persons = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
get_accuracy = Get_Accuracy()
balance_data = BalanceData()
threshold_balance_data = 40
# Select the best window
t = time.time()
best_model = ExtraTreesClassifier(n_estimators=1000, random_state=0)
w_accuracies = pd.DataFrame(columns=["window", "accurary"])
p = 1  # the person with the most records
project.log(
    "=====================UMAFALL_SELECT_BEST_WINDOWS=====================",
    file="umafall_log_best_window.log")
for w in range(10, 110, 10):

    print("Load data with window len = {}".format(w))
Example #21

from sklearn import tree  # Decision Tree
from sklearn.ensemble import RandomForestClassifier  # Random Forest
from sklearn.ensemble import ExtraTreesClassifier  # Extra Trees
from sklearn.neighbors import KNeighborsClassifier  # KNN
from sklearn.naive_bayes import GaussianNB  # Naive Bayes
from sklearn import svm  # SVM
from sklearn.neural_network import MLPClassifier #multi-layer percept
import pandas as pd
from sklearn.model_selection import train_test_split
from pre_processing.get_accuracy import Get_Accuracy
from scripts.save_workspace import save
import numpy as np
from pre_processing.balance_data import BalanceData
from pre_processing.processing_db_files import Processing_DB_Files
from models.arcma_model import ARCMA_Model
from utils.project import Project
from utils.debug import Debug
import statistics as st

#===INITIALIZATION===#

Debug.DEBUG = 0
arcma = ARCMA_Model()
processing = Processing_DB_Files()
project = Project()
# hidden-layer tuple for the MLP
t_aux = []
for i in range(0, 500):
    t_aux.append(500)
t = tuple(t_aux)
####
classifiers = {
    "MPL": MLPClassifier(random_state=1, solver="adam", activation="relu",
                         max_iter=100000, alpha=1e-5, hidden_layer_sizes=t),
    "Extratrees": ExtraTreesClassifier(n_estimators=1000, random_state=1),
    "Knn": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "RandomForest": RandomForestClassifier(n_estimators=1000, random_state=1),
    "Decision Tree": tree.DecisionTreeClassifier(random_state=1),
    "SVM": svm.SVC(probability=True, random_state=1)
}
persons = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
get_accuracy = Get_Accuracy()
balance_data = BalanceData()
threshold_balance_data = 40
#Select the best classifier
accuracy_mean = pd.DataFrame(columns=["Classifier", "Accuracy"])
project.log("=====================ARCMA_SELECT_BEST_ALGORITHM=====================", file="arcma_best_algorithm.log")
for c in classifiers:
Example #22
from sklearn.ensemble import ExtraTreesClassifier  # Extra Trees
from utils.debug import Debug
from models.arcma_model import ARCMA_Model
from classifiers.base_classification import Base_Classification
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pre_processing.processing_db_files import Processing_DB_Files  
from utils.project import Project, slash
from scripts.save_workspace import save
import statistics as st
from pre_processing.balance_data import BalanceData

#===INITIALIZATION===#
Debug.DEBUG = 0
arcma = ARCMA_Model()
processing = Processing_DB_Files()
project = Project()
extra_trees = ExtraTreesClassifier(n_estimators = 1000, random_state=0)
base_classification = Base_Classification(arcma, extra_trees)
balance_data = BalanceData()
threshold_balance_data = 40

# Iterate over thresholds to find the best value #

s = save()
person_list = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
accuracy_threshould_temp_aux = pd.DataFrame(columns=["accuracy","discarted", "len_activity", "threshold"])
accuracy_mean = pd.DataFrame(columns=["accuracy","discarted", "len_activity", "threshold"])
project.log("====================ARCMA BEST THRESHOLD============================", file="arcma_log_best_threshold.log")
for t in np.arange(0.05, 1, 0.05):
    accuracy_threshould_temp_aux = pd.DataFrame(columns=["accuracy","discarted", "len_activity"])
    for p in person_list:
Example #23
    def test_countfile_merging(self):
        """
		This tests that the correct files are used to merge.  The result (a data structure) of the merging is mocked out.
		Tests that the expected data is written to the file and tests that the file ends up in the correct location
		"""

        # a dummy method to mock the reading/concatenating of the data in the individual files
        def mock_read(matrix, f):
            dummy = [['geneA', '0', '100',
                      '200'], ['geneB', '1', '101', '201'],
                     ['geneC', '2', '102', '202']]
            if len(matrix) == 0:
                for k in range(len(dummy)):
                    matrix.append([])

            for i, l in enumerate(dummy):
                matrix[i] = l

        # mock out the actual implementations
        self.module.get_countfile_groupings = mock.Mock()

        self.module.get_countfile_groupings.return_value = [
            [
                '/path/to/final/featureCounts/A.counts',
                '/path/to/final/featureCounts/C.counts',
                '/path/to/final/featureCounts/B.counts'
            ],
            [
                '/path/to/final/featureCounts/A.primary.counts',
                '/path/to/final/featureCounts/C.primary.counts'
            ],
            [
                '/path/to/final/featureCounts/A.primary.dedup.counts',
                '/path/to/final/featureCounts/B.primary.dedup.counts',
                '/path/to/final/featureCounts/C.primary.dedup.counts'
            ]
        ]

        self.module.read = mock_read

        p = Params()
        p.add(raw_count_matrix_file_prefix='merged_counts')

        s1 = Sample('A', 'X')
        s1.countfiles = [
            '/path/to/final/featureCounts/A.primary.counts',
            '/path/to/final/featureCounts/A.counts',
            '/path/to/final/featureCounts/A.primary.dedup.counts'
        ]
        s2 = Sample('B', 'Y')
        s2.countfiles = [
            '/path/to/final/featureCounts/B.counts',
            '/path/to/final/featureCounts/B.primary.dedup.counts',
            '/path/to/final/featureCounts/B.primary.counts'
        ]
        s3 = Sample('C', 'Z')
        s3.countfiles = [
            '/path/to/final/featureCounts/C.counts',
            '/path/to/final/featureCounts/C.primary.counts',
            '/path/to/final/featureCounts/C.primary.dedup.counts'
        ]

        project = Project()
        project.add_parameters(p)
        project.add_samples([s1, s3, s2])

        m = mock.mock_open()
        with mock.patch.object(__builtin__, 'open', m):
            self.module.create_count_matrices(project, mock.Mock())
            m.assert_any_call(
                '/path/to/final/featureCounts/merged_counts.counts', 'w')
            m.assert_any_call(
                '/path/to/final/featureCounts/merged_counts.primary.counts',
                'w')
            m.assert_any_call(
                '/path/to/final/featureCounts/merged_counts.primary.dedup.counts',
                'w')
            handle = m()
            calls = [
                mock.call('Gene\tA\tB\tC\n'),
                mock.call('geneA\t0\t100\t200\n'),
                mock.call('geneB\t1\t101\t201\n'),
                mock.call('geneC\t2\t102\t202\n')
            ] * 3
            handle.write.assert_has_calls(calls)
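The assertions imply that create_count_matrices names each merged matrix after the grouping's shared suffix, accumulates the per-sample files via the module-level read helper (mocked above), and writes a tab-delimited table headed by the sample names. A compressed sketch on those assumptions (the sample_name attribute is a guess):

import os

def create_count_matrices(project, component_params):
    # hypothetical sketch inferred from the asserted open() and write() calls
    prefix = project.parameters.get('raw_count_matrix_file_prefix')
    sample_names = sorted(s.sample_name for s in project.samples)

    for group in get_countfile_groupings(project, component_params):
        # e.g. 'A.primary.counts' -> shared suffix 'primary.counts':
        suffix = '.'.join(os.path.basename(group[0]).split('.')[1:])
        out_path = os.path.join(os.path.dirname(group[0]),
                                prefix + '.' + suffix)
        matrix = []
        for countfile in sorted(group):
            read(matrix, countfile)  # module-level helper, mocked in the test

        with open(out_path, 'w') as out:
            out.write('Gene\t' + '\t'.join(sample_names) + '\n')
            for row in matrix:
                out.write('\t'.join(row) + '\n')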
Example #24

from models.arcma_model import ARCMA_Model
from utils.debug import Debug
from pre_processing.processing_db_files import Processing_DB_Files
from utils.project import Project
from sklearn.ensemble import ExtraTreesClassifier # Extra Trees
import pandas as pd
from sklearn.model_selection import train_test_split
from pre_processing.get_accuracy import Get_Accuracy
import numpy as np
from tsfresh import extract_relevant_features
import time
from pre_processing.balance_data import BalanceData

#===INITIALIZATION===#
Debug.DEBUG = 0
arcma = ARCMA_Model()
processing = Processing_DB_Files()
project = Project()
persons = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
get_accuracy = Get_Accuracy()
balance_data = BalanceData()
threshold_balance_data = 40
# Select the best window
t = time.time()
best_model = ExtraTreesClassifier(n_estimators = 1000, random_state=0)
w_accuracies = pd.DataFrame(columns=["window", "accurary"])
p = 15  # the person with the most records
project.log("=====================ARCMA_SELECT_BEST_WINDOWS=====================", file="arcma_log_best_window.log")
for w in range(20,110,10):
    
    print("Load data with window len = {}".format(w))
    data = arcma.load_training_data_by_people(p)
    print("Slicing Window....")
Example #25
	def test_missing_count_matrix_files_raises_exception(self):
		project = Project()
		component_params = Params()
		with self.assertRaises(self.module.NoCountMatricesException):
			self.module.normalize(project, component_params)
Example #26
# -*- coding: utf-8 -*-
# IMPORTS #
from utils.debug import Debug
from models.arcma_model import ARCMA_Model
from tsfresh import extract_relevant_features
from pre_processing.processing_db_files import Processing_DB_Files
from utils.project import Project, slash
from scripts.save_workspace import save

#===INITIALIZATION===#
Debug.DEBUG = 0
arcma = ARCMA_Model()
processing = Processing_DB_Files()
project = Project()
s = save()
# window = 26  # fixed window
window = 50  # best window
persons = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

for p in persons:

    data = arcma.load_training_data_by_people(p)
    print("Slicing Window....")
    data_tsfresh, y = arcma.slice_by_window_tsfresh(data, window)
    y.index += 1
    del data_tsfresh["activity"]

    classes_counts = y.value_counts()
    if len(classes_counts) > 1:
        relevant_features = extract_relevant_features(data_tsfresh,
                                                      y,
Example #27
from sklearn import tree # Decision Tree
from sklearn.ensemble import RandomForestClassifier # Random Forest
from sklearn.ensemble import ExtraTreesClassifier # Extra Trees
from sklearn.naive_bayes import GaussianNB #Naive Bayes
from sklearn import svm #SVM
from sklearn.neural_network import MLPClassifier #multi-layer percept
#==== Models ====#
from models.hmp_model import HMP_Model
from models.umafall_model import UMAFALL_Model
from models.arcma_model import ARCMA_Model
#==== Utils ====#
from pre_processing.processing_db_files import Processing_DB_Files
from pre_processing.get_accuracy import Get_Accuracy
from utils.project import Project
from utils.debug import Debug
from scripts.save_workspace import save

#===INITIALIZATION===#
Debug.DEBUG = 0
processing = Processing_DB_Files()
project = Project()
s = save()
get_accuracy = Get_Accuracy()
#===INIT BASES===#
hmp_persons = ["f1", "m1", "m2", "f2", "m3", "f3", "m4", "f4"] # at least 5 activities
umafall_persons = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17]
arcma_persons = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
models = []
#models.append({"model_name":"hmp", "model":HMP_Model(), "persons":hmp_persons, "window":90})
models.append({"model_name":"umafall", "model":UMAFALL_Model(), "persons":umafall_persons, "window":10})
models.append({"model_name":"arcma", "model":ARCMA_Model(), "persons":arcma_persons, "window":40})

# hidden-layer tuple for the MLP
t_aux = []
for i in range(0,500):
    t_aux.append(500)