def setUp(self):
     ##
     # Five features:
     # Three correlated, with two in close RT proximity
     # Two uncorrelated
     ##
     self.msData = nPYc.MSDataset('', fileType='empty')
     self.msData.sampleMetadata = pandas.DataFrame(['a', 'b', 'c', 'd'],
                                                   columns=['Sample Name'])
     self.msData.intensityData = numpy.array(
         [[1, 21, 10.5, 4, 5], [2, 22, 12.5, 5, 6], [3, 23, 11.5, 6, 5],
          [4, 24, 12, 7, 6]],
         dtype=float)
     self.msData.featureMetadata = pandas.DataFrame(
         [[
             3.12, 5, '100 - 10 - 1', '3.12_127.1212m/z', numpy.nan,
             127.1212
         ],
          [
              3.13, 5, '100 - 10 - 1', '3.13_220.1419n',
              'M+H, M+Na, M+K, 2M+Na', 219.1419
          ], [3.12, 5, '100 - 20', '3.12_170.2233m/z', numpy.nan, 170.2233],
          [5.32, 5, '100', '5.32_89.9812m/z', numpy.nan, 89.9812],
          [
              0.56, 5, '90 - 100 - 50', '0.56_214.1245n',
              'M+H, M+Na, M+K, 2M+Na', 213.1245
          ]],
         columns=[
             'Retention Time', 'Peak Width', 'Isotope Distribution',
             'Feature Name', 'Adducts', 'm/z'
         ])
     self.msData.Attributes['FeatureExtractionSoftware'] = 'Progenesis QI'
     self.msData.initialiseMasks()
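A quick standalone check (not part of the original test class) confirms the correlation structure described in the comment above; the indices below refer to the columns of the fixture's intensity matrix.

import numpy

# Intensity matrix from the fixture above: rows are samples, columns are features
intensities = numpy.array([[1, 21, 10.5, 4, 5], [2, 22, 12.5, 5, 6],
                           [3, 23, 11.5, 6, 5], [4, 24, 12, 7, 6]], dtype=float)

# Feature-by-feature correlation matrix (columns as variables)
print(numpy.round(numpy.corrcoef(intensities, rowvar=False), 2))
# Columns 0, 1 and 3 are perfectly correlated (r = 1.0); the first two of these are
# the features at retention times 3.12 and 3.13. Columns 2 and 4 correlate only
# weakly with that trio (r ≈ 0.5).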
Example 2
	def setUp(self):
		from nPYc.enumerations import AssayRole, SampleType

		self.msData = nPYc.MSDataset('', fileType='empty')

		##
		# Variables:
		# Above blank
		# Below blank (default)
		# Below blank * 5
		##
		self.msData.intensityData = numpy.array([[54, 53, 121],
												[57, 49, 15],
												[140, 41, 97],
												[52, 60, 42],
												[12, 48, 8],
												[1, 60, 41],
												[2, 21, 42,]],
												dtype=float)

		self.msData.sampleMetadata = pandas.DataFrame(data=[ [numpy.nan, 1, 1, 1, AssayRole.Assay, SampleType.StudySample],
															[numpy.nan, 1, 1, 1, AssayRole.Assay, SampleType.StudySample],
															[numpy.nan, 1, 1, 1, AssayRole.Assay, SampleType.StudySample],
															[numpy.nan, 1, 1, 1, AssayRole.Assay, SampleType.StudySample],
															[numpy.nan, 1, 1, 1, AssayRole.Assay, SampleType.StudySample],
															[0, 1, 1, 1, AssayRole.Assay, SampleType.ProceduralBlank],
															[0, 1, 1, 1, AssayRole.Assay, SampleType.ProceduralBlank]],
															columns=['Dilution', 'Batch', 'Correction Batch', 'Well', 'AssayRole', 'SampleType'])

		self.msData.featureMetadata = pandas.DataFrame(data=[['Feature_1', 0.5, 100., 0.3],
															['Feature_2', 0.55, 100.04, 0.3],
															['Feature_3', 0.75, 200., 0.1]],
															columns=['Feature Name','Retention Time','m/z','Peak Width'])

		self.msData.initialiseMasks()
Example 3
	def test_blank_filter_raises(self):
		msData = nPYc.MSDataset('', fileType='empty')

		with self.subTest(msg='Invalid threshold'):
			self.assertRaises(TypeError, nPYc.utilities._filters.blankFilter, msData, threshold='A string')

		with self.subTest(msg='True threshold'):
			self.assertRaises(TypeError, nPYc.utilities._filters.blankFilter, msData, threshold=True)
Example 4
	def test_correctMSdataset_raises(self):

		with self.subTest(msg='Object type'):
			self.assertRaises(TypeError, nPYc.batchAndROCorrection.correctMSdataset, 's')

		with self.subTest(msg='Parallelise type'):
			dataset = nPYc.MSDataset('', fileType='empty')
			self.assertRaises(TypeError, nPYc.batchAndROCorrection.correctMSdataset, dataset, parallelise=1)
Example 5
    def setUp(self):
        ##
        # Five features:
        # Three correlated, with two in close RT proximity
        # Two uncorrelated
        ##
        self.msData = nPYc.MSDataset('', fileType='empty')
        self.msData.sampleMetadata = pandas.DataFrame(['a', 'b', 'c', 'd'],
                                                      columns=['Sample Name'])
        self.msData.intensityData = numpy.array(
            [[1, 21, 10.5, 4, 5], [2, 22, 12.5, 5, 6], [3, 23, 11.5, 6, 5],
             [4, 24, 12, 7, 6]],
            dtype=float)
        self.msData.featureMetadata = pandas.DataFrame(
            [[
                3.12, 5, '100 - 10 - 1', '3.12_127.1212m/z', numpy.nan,
                127.1212
            ],
             [
                 3.13, 5, '100 - 10 - 1', '3.13_220.1419n',
                 'M+H, M+Na, M+K, 2M+Na', 219.1419
             ], [3.12, 5, '100 - 20', '3.12_170.2233m/z', numpy.nan, 170.2233],
             [5.32, 5, '100', '5.32_89.9812m/z', numpy.nan, 89.9812],
             [
                 0.56, 5, '90 - 100 - 50', '0.56_214.1245n',
                 'M+H, M+Na, M+K, 2M+Na', 213.1245
             ]],
            columns=[
                'Retention Time', 'Peak Width', 'Isotope Distribution',
                'Feature Name', 'Adducts', 'm/z'
            ])

        self.msData.featureMetadata['Exclusion Details'] = None
        self.msData.featureMetadata['User Excluded'] = False
        self.msData.featureMetadata[[
            'rsdFilter', 'varianceRatioFilter', 'correlationToDilutionFilter',
            'blankFilter', 'artifactualFilter'
        ]] = pandas.DataFrame([[True, True, True, True, True]],
                              index=self.msData.featureMetadata.index)

        self.msData.featureMetadata[['rsdSP', 'rsdSS/rsdSP', 'correlationToDilution', 'blankValue']] \
         = pandas.DataFrame([[numpy.nan, numpy.nan, numpy.nan, numpy.nan]], index=self.msData.featureMetadata.index)

        self.msData.Attributes['FeatureExtractionSoftware'] = 'Progenesis QI'
        self.msData.initialiseMasks()
Example 6
	def test_generatesrdmask(self):

		# Create an empty object with simple filenames
		msData = nPYc.MSDataset('', fileType='empty')

		msData.sampleMetadata['Sample File Name'] = ['Test1_HPOS_ToF01_B1SRD01', 'Test1_HPOS_ToF01_B1SRD02', 'Test1_HPOS_ToF01_B1SRD43',
													'Test1_HPOS_ToF01_B1SRD44','Test1_HPOS_ToF01_B1SRD45','Test1_HPOS_ToF01_B1SRD46',
													'Test1_HPOS_ToF01_B1SRD47','Test1_HPOS_ToF01_B1SRD48','Test1_HPOS_ToF01_B1SRD49',
													'Test1_HPOS_ToF01_B1SRD50','Test1_HPOS_ToF01_B1SRD51','Test1_HPOS_ToF01_B1SRD92',
													'Test1_HPOS_ToF01_B2SRD01','Test1_HPOS_ToF01_B2SRD02','Test1_HPOS_ToF01_B2SRD43',
													'Test1_HPOS_ToF01_B2SRD44','Test1_HPOS_ToF01_B2SRD45','Test1_HPOS_ToF01_B2SRD46',
													'Test1_HPOS_ToF01_P2W30','Test1_HPOS_ToF01_P2W31_SR','Test1_HPOS_ToF01_P2W32',
													'Test1_HPOS_ToF01_B2SRD47','Test1_HPOS_ToF01_B2SRD48','Test1_HPOS_ToF01_B2SRD49',
													'Test1_HPOS_ToF01_B2SRD50','Test1_HPOS_ToF01_B2SRD51','Test1_HPOS_ToF01_B2SRD92',
													'Test1_HPOS_ToF01_B3SRD01','Test1_HPOS_ToF01_B3SRD02','Test1_HPOS_ToF01_B3SRD43',
													'Test1_HPOS_ToF01_B3SRD44','Test1_HPOS_ToF01_B3SRD45','Test1_HPOS_ToF01_B3SRD46',
													'Test1_HPOS_ToF01_B3SRD47','Test1_HPOS_ToF01_B3SRD48','Test1_HPOS_ToF01_B3SRD49',
													'Test1_HPOS_ToF01_B3SRD50','Test1_HPOS_ToF01_B3SRD51','Test1_HPOS_ToF01_B3SRD92']

		msData.intensityData = numpy.zeros((39,2))
		msData.initialiseMasks()
		msData.sampleMetadata['Run Order'] = msData.sampleMetadata.index + 1
		msData.addSampleInfo(descriptionFormat='Filenames')
		msData.addSampleInfo(descriptionFormat='Batches')
		msData.corrExclusions = msData.sampleMask

		srdMask = nPYc.utilities.ms.generateLRmask(msData)

		# Expected masks for the 39 samples: SRD (serial dilution) samples grouped by batch and dilution series
		canonicalMask = {'Batch 1.0, series 1.0': numpy.zeros(39, dtype=bool),
						 'Batch 2.0, series 1.0': numpy.zeros(39, dtype=bool),
						 'Batch 2.0, series 2.0': numpy.zeros(39, dtype=bool),
						 'Batch 3.0, series 2.0': numpy.zeros(39, dtype=bool)}
		canonicalMask['Batch 1.0, series 1.0'][0:12] = True
		canonicalMask['Batch 2.0, series 1.0'][12:18] = True
		canonicalMask['Batch 2.0, series 2.0'][21:27] = True
		canonicalMask['Batch 3.0, series 2.0'][27:39] = True

		numpy.testing.assert_equal(srdMask, canonicalMask)
def generateTestDataset(noSamp,
                        noFeat,
                        dtype='Dataset',
                        variableType=VariableType.Discrete,
                        sop='Generic'):
    """
	Generate a dataset object with the specified number of samples and features, and random contents.

	.. warning:: Objects returned by this function are not expected to be fully functional!

	:param int noSamp: Number of samples
	:param int noFeat: Number of features
	:param str dtype: Type of dataset to generate ('Dataset', 'MSDataset' or 'NMRDataset')
	:param VariableType variableType: Type of the variables in the dataset
	:param str sop: Name of the SOP used to initialise the object
	"""
    if dtype == 'Dataset':
        data = nPYc.Dataset(sop=sop)
    elif dtype == 'MSDataset':
        data = nPYc.MSDataset('', fileType='empty', sop=sop)
    elif dtype == 'NMRDataset':
        data = nPYc.NMRDataset('', fileType='empty', sop=sop)
    else:
        raise ValueError("dtype must be one of 'Dataset', 'MSDataset' or 'NMRDataset'")

    data.intensityData = numpy.random.lognormal(size=(noSamp, noFeat)) + 1

    data.sampleMetadata = pandas.DataFrame(0,
                                           index=numpy.arange(noSamp),
                                           columns=[
                                               'Sample File Name',
                                               'SampleType', 'AssayRole',
                                               'Acquired Time', 'Run Order',
                                               'Dilution', 'Detector',
                                               'Correction Batch'
                                           ])

    data.sampleMetadata[
        'SampleType'] = nPYc.enumerations.SampleType.StudySample
    data.sampleMetadata['AssayRole'] = nPYc.enumerations.AssayRole.Assay
    data.sampleMetadata['Run Order'] = numpy.arange(noSamp)
    data.sampleMetadata['Detector'] = numpy.arange(noSamp) * 5
    data.sampleMetadata['Batch'] = 1
    data.sampleMetadata['Correction Batch'] = 2
    data.sampleMetadata.loc[0:int(noSamp / 2), 'Correction Batch'] = 1
    data.sampleMetadata['Exclusion Details'] = ''

    data.sampleMetadata['Sample File Name'] = [
        randomword(10) for x in range(0, noSamp)
    ]
    data.sampleMetadata['Dilution'] = numpy.random.rand(noSamp)

    noClasses = numpy.random.randint(2, 5)
    classNames = [str(i) for i in range(0, noClasses)]
    classProbabilities = numpy.random.rand(noClasses)
    classProbabilities = classProbabilities / sum(classProbabilities)

    data.sampleMetadata['Classes'] = numpy.random.choice(classNames,
                                                         size=noSamp,
                                                         p=classProbabilities)

    data.sampleMetadata['Acquired Time'] = [
        d
        for d in datetime_range(datetime.now(), noSamp, timedelta(minutes=15))
    ]
    data.sampleMetadata['Acquired Time'] = data.sampleMetadata[
        'Acquired Time'].astype(datetime)

    data.sampleMetadata.iloc[::10, 1] = nPYc.enumerations.SampleType.StudyPool
    data.sampleMetadata.iloc[::10,
                             2] = nPYc.enumerations.AssayRole.PrecisionReference

    data.sampleMetadata.iloc[
        5::10, 1] = nPYc.enumerations.SampleType.ExternalReference
    data.sampleMetadata.iloc[
        5::10, 2] = nPYc.enumerations.AssayRole.PrecisionReference

    if dtype == 'MSDataset' or dtype == 'Dataset':
        data.featureMetadata = pandas.DataFrame(0,
                                                index=numpy.arange(noFeat),
                                                columns=['m/z'])

        data.featureMetadata['m/z'] = numpy.linspace(50, 800, noFeat)
        data.featureMetadata['Retention Time'] = (
            720 - 50) * numpy.random.rand(noFeat) + 50
        data.featureMetadata['Feature Name'] = [
            randomword(10) for x in range(0, noFeat)
        ]
        data.featureMetadata['ppm'] = numpy.linspace(10, -1, noFeat)

    elif dtype == 'NMRDataset':
        data.featureMetadata = pandas.DataFrame(numpy.linspace(10, -1, noFeat),
                                                columns=('ppm', ),
                                                dtype=float)

    data.VariableType = variableType
    data.initialiseMasks()

    return data
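For illustration, this helper might be exercised along the following lines; the counts and checks below are arbitrary choices for the sketch, not taken from the original test suite.

# Hypothetical usage of generateTestDataset as defined above
msData = generateTestDataset(20, 10, dtype='MSDataset')

print(msData.intensityData.shape)              # (20, 10)
print(msData.featureMetadata['Feature Name'].nunique())
print(msData.sampleMask.sum())                 # all 20 samples retained after initialiseMasks()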
Example 8
	def test_generatesrdmask_raises(self):

		dataset = nPYc.MSDataset('', fileType='empty')

		self.assertRaises(ValueError, nPYc.utilities.ms.generateLRmask, dataset)
def generateTestDataset(noSamp,
                        noFeat,
                        dtype='Dataset',
                        variableType=VariableType.Discrete,
                        sop='Generic'):
    """
	Generate a dataset object with the specified number of samples and features, and random contents.

	.. warning:: Objects returned by this function are not expected to be fully functional!

	:param int noSamp: Number of samples
	:param int noFeat: Number of features
	:param str dtype: Type of dataset to generate ('Dataset', 'MSDataset', 'NMRDataset' or 'TargetedDataset')
	:param VariableType variableType: Type of the variables in the dataset
	:param str sop: Name of the SOP used to initialise the object
	"""
    if dtype == 'Dataset':
        data = nPYc.Dataset(sop=sop)
    elif dtype == 'MSDataset':
        data = nPYc.MSDataset('', fileType='empty', sop=sop)
    elif dtype == 'NMRDataset':
        data = nPYc.NMRDataset('', fileType='empty', sop=sop)
    elif dtype == 'TargetedDataset':
        data = nPYc.TargetedDataset('', fileType='empty', sop=sop)
    else:
        raise ValueError("dtype must be one of 'Dataset', 'MSDataset', 'NMRDataset' or 'TargetedDataset'")

    data.intensityData = numpy.random.lognormal(size=(noSamp, noFeat)) + 1

    data.sampleMetadata = pandas.DataFrame(0,
                                           index=numpy.arange(noSamp),
                                           columns=[
                                               'Sample File Name',
                                               'SampleType', 'AssayRole',
                                               'Acquired Time', 'Run Order',
                                               'Dilution', 'Detector',
                                               'Correction Batch'
                                           ])

    data.sampleMetadata[
        'SampleType'] = nPYc.enumerations.SampleType.StudySample
    data.sampleMetadata['AssayRole'] = nPYc.enumerations.AssayRole.Assay
    data.sampleMetadata['Run Order'] = numpy.arange(noSamp, dtype='int64')
    data.sampleMetadata['Detector'] = numpy.arange(noSamp) * 5
    data.sampleMetadata['Batch'] = 1
    data.sampleMetadata['Correction Batch'] = 2
    data.sampleMetadata.loc[0:int(noSamp / 2), 'Correction Batch'] = 1
    data.sampleMetadata['Exclusion Details'] = ''

    data.sampleMetadata['Sample File Name'] = [
        randomword(10) for x in range(0, noSamp)
    ]
    data.sampleMetadata['Sample ID'] = [
        randomword(10) for x in range(0, noSamp)
    ]
    data.sampleMetadata['Dilution'] = numpy.random.rand(noSamp)

    noClasses = numpy.random.randint(2, 5)
    classNames = [str(i) for i in range(0, noClasses)]
    classProbabilities = numpy.random.rand(noClasses)
    classProbabilities = classProbabilities / sum(classProbabilities)

    data.sampleMetadata['Classes'] = numpy.random.choice(classNames,
                                                         size=noSamp,
                                                         p=classProbabilities)

    data.sampleMetadata['Acquired Time'] = [
        d
        for d in datetime_range(datetime.now(), noSamp, timedelta(minutes=15))
    ]
    # Ensure seconds are not recorded, otherwise it is impossible to compare datasets read from disk with datasets generated on the fly.
    data.sampleMetadata['Acquired Time'] = [
        datetime.strptime(d.strftime("%Y-%m-%d %H:%M"), "%Y-%m-%d %H:%M")
        for d in data.sampleMetadata['Acquired Time']
    ]
    data.sampleMetadata['Acquired Time'] = data.sampleMetadata[
        'Acquired Time'].dt.to_pydatetime()

    data.sampleMetadata.iloc[::10, 1] = nPYc.enumerations.SampleType.StudyPool
    data.sampleMetadata.iloc[::10,
                             2] = nPYc.enumerations.AssayRole.PrecisionReference

    data.sampleMetadata.iloc[
        5::10, 1] = nPYc.enumerations.SampleType.ExternalReference
    data.sampleMetadata.iloc[
        5::10, 2] = nPYc.enumerations.AssayRole.PrecisionReference

    if dtype == 'MSDataset':
        data.featureMetadata = pandas.DataFrame(0,
                                                index=numpy.arange(noFeat),
                                                columns=['m/z'])

        data.featureMetadata['m/z'] = (800 -
                                       40) * numpy.random.rand(noFeat) + 40
        data.featureMetadata['Retention Time'] = (
            720 - 50) * numpy.random.rand(noFeat) + 50
        data.featureMetadata['Feature Name'] = [
            randomword(10) for x in range(0, noFeat)
        ]

        data.featureMetadata['Exclusion Details'] = None
        data.featureMetadata['User Excluded'] = False
        data.featureMetadata[[
            'rsdFilter', 'varianceRatioFilter', 'correlationToDilutionFilter',
            'blankFilter', 'artifactualFilter'
        ]] = pandas.DataFrame([[True, True, True, True, True]],
                              index=data.featureMetadata.index)

        data.featureMetadata[['rsdSP', 'rsdSS/rsdSP', 'correlationToDilution', 'blankValue']] \
         = pandas.DataFrame([[numpy.nan, numpy.nan, numpy.nan, numpy.nan]], index=data.featureMetadata.index)

        data.Attributes['Feature Names'] = 'Feature Name'

    elif dtype == 'Dataset':
        data.featureMetadata = pandas.DataFrame(0,
                                                index=numpy.arange(noFeat),
                                                columns=['m/z'])

        data.featureMetadata['m/z'] = (800 -
                                       40) * numpy.random.rand(noFeat) + 40
        data.featureMetadata['Retention Time'] = (
            720 - 50) * numpy.random.rand(noFeat) + 50
        data.featureMetadata['Feature Name'] = [
            randomword(10) for x in range(0, noFeat)
        ]

        data.Attributes['Feature Names'] = 'Feature Name'

    elif dtype == 'NMRDataset':
        data.featureMetadata = pandas.DataFrame(numpy.linspace(10, -1, noFeat),
                                                columns=('ppm', ),
                                                dtype=float)
        data.featureMetadata['Feature Name'] = data.featureMetadata[
            'ppm'].astype(str)
        data.sampleMetadata['Delta PPM'] = numpy.random.rand(noSamp)
        data.sampleMetadata['Line Width (Hz)'] = numpy.random.rand(noSamp)
        data.sampleMetadata['CalibrationFail'] = False
        data.sampleMetadata['LineWidthFail'] = False
        data.sampleMetadata['WaterPeakFail'] = False
        data.sampleMetadata['BaselineFail'] = False

        data.Attributes['Feature Names'] = 'ppm'

    data.VariableType = variableType
    data.initialiseMasks()

    return data
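Similarly, the NMR branch of this extended helper might be exercised as follows; again this is only a sketch with arbitrary sample and feature counts.

# Hypothetical usage of the extended generateTestDataset for an NMR dataset
nmrData = generateTestDataset(12, 100, dtype='NMRDataset')

print(nmrData.featureMetadata['ppm'].iloc[0], nmrData.featureMetadata['ppm'].iloc[-1])  # 10.0 -1.0
print(nmrData.sampleMetadata['LineWidthFail'].any())  # False for the generated data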