def setUp(self):
    """
    Build a five-feature test dataset: three correlated features, two of
    which sit in close RT proximity, plus two uncorrelated features.
    """
    self.msData = nPYc.MSDataset('', fileType='empty')

    self.msData.sampleMetadata = pandas.DataFrame(['a', 'b', 'c', 'd'],
                                                  columns=['Sample Name'])

    self.msData.intensityData = numpy.array(
        [[1, 21, 10.5, 4, 5],
         [2, 22, 12.5, 5, 6],
         [3, 23, 11.5, 6, 5],
         [4, 24, 12, 7, 6]],
        dtype=float)

    # One row per feature: RT, peak width, isotope distribution, name,
    # adducts, m/z
    featureRows = [
        [3.12, 5, '100 - 10 - 1', '3.12_127.1212m/z', numpy.nan, 127.1212],
        [3.13, 5, '100 - 10 - 1', '3.13_220.1419n', 'M+H, M+Na, M+K, 2M+Na', 219.1419],
        [3.12, 5, '100 - 20', '3.12_170.2233m/z', numpy.nan, 170.2233],
        [5.32, 5, '100', '5.32_89.9812m/z', numpy.nan, 89.9812],
        [0.56, 5, '90 - 100 - 50', '0.56_214.1245n', 'M+H, M+Na, M+K, 2M+Na', 213.1245],
    ]
    self.msData.featureMetadata = pandas.DataFrame(
        featureRows,
        columns=['Retention Time', 'Peak Width', 'Isotope Distribution',
                 'Feature Name', 'Adducts', 'm/z'])

    self.msData.Attributes['FeatureExtractionSoftware'] = 'Progenesis QI'
    self.msData.initialiseMasks()
def setUp(self):
    """
    Build a blank-filter test dataset with three variables — one above
    blank, one below the default blank threshold, one below blank * 5 —
    across five study samples and two procedural blanks.
    """
    from nPYc.enumerations import AssayRole, SampleType

    self.msData = nPYc.MSDataset('', fileType='empty')

    self.msData.intensityData = numpy.array(
        [[54, 53, 121],
         [57, 49, 15],
         [140, 41, 97],
         [52, 60, 42],
         [12, 48, 8],
         [1, 60, 41],
         [2, 21, 42]],
        dtype=float)

    # Five study samples followed by two procedural blanks
    studyRow = [numpy.nan, 1, 1, 1, AssayRole.Assay, SampleType.StudySample]
    blankRow = [0, 1, 1, 1, AssayRole.Assay, SampleType.ProceduralBlank]
    self.msData.sampleMetadata = pandas.DataFrame(
        data=[list(studyRow) for _ in range(5)] + [list(blankRow) for _ in range(2)],
        columns=['Dilution', 'Batch', 'Correction Batch', 'Well',
                 'AssayRole', 'SampleType'])

    self.msData.featureMetadata = pandas.DataFrame(
        data=[['Feature_1', 0.5, 100., 0.3],
              ['Feature_2', 0.55, 100.04, 0.3],
              ['Feature_3', 0.75, 200., 0.1]],
        columns=['Feature Name', 'Retention Time', 'm/z', 'Peak Width'])

    self.msData.initialiseMasks()
def test_blank_filter_raises(self):
    """blankFilter must reject non-numeric ``threshold`` values."""
    msData = nPYc.MSDataset('', fileType='empty')

    # Strings and booleans are both invalid threshold types
    for description, badThreshold in (('Invalid threshold', 'A string'),
                                      ('True threshold', True)):
        with self.subTest(msg=description):
            self.assertRaises(TypeError,
                              nPYc.utilities._filters.blankFilter,
                              msData,
                              threshold=badThreshold)
def test_correctMSdataset_raises(self):
    """correctMSdataset must validate the types of its arguments."""
    with self.subTest(msg='Object type'):
        # A plain string is not an MSDataset
        self.assertRaises(TypeError, nPYc.batchAndROCorrection.correctMSdataset, 's')
    with self.subTest(msg='Parallelise type'):
        # BUGFIX: the empty-dataset constructor keyword is ``fileType``
        # (as used everywhere else in this file), not ``type``.
        dataset = nPYc.MSDataset('', fileType='empty')
        self.assertRaises(TypeError,
                          nPYc.batchAndROCorrection.correctMSdataset,
                          dataset,
                          parallelise=1)
def setUp(self):
    """
    Build a five-feature test dataset — three correlated features (two in
    close RT proximity) and two uncorrelated — pre-populated with the
    feature-filter bookkeeping columns.
    """
    self.msData = nPYc.MSDataset('', fileType='empty')

    self.msData.sampleMetadata = pandas.DataFrame(['a', 'b', 'c', 'd'],
                                                  columns=['Sample Name'])

    self.msData.intensityData = numpy.array(
        [[1, 21, 10.5, 4, 5],
         [2, 22, 12.5, 5, 6],
         [3, 23, 11.5, 6, 5],
         [4, 24, 12, 7, 6]],
        dtype=float)

    # One row per feature: RT, peak width, isotope distribution, name,
    # adducts, m/z
    featureRows = [
        [3.12, 5, '100 - 10 - 1', '3.12_127.1212m/z', numpy.nan, 127.1212],
        [3.13, 5, '100 - 10 - 1', '3.13_220.1419n', 'M+H, M+Na, M+K, 2M+Na', 219.1419],
        [3.12, 5, '100 - 20', '3.12_170.2233m/z', numpy.nan, 170.2233],
        [5.32, 5, '100', '5.32_89.9812m/z', numpy.nan, 89.9812],
        [0.56, 5, '90 - 100 - 50', '0.56_214.1245n', 'M+H, M+Na, M+K, 2M+Na', 213.1245],
    ]
    self.msData.featureMetadata = pandas.DataFrame(
        featureRows,
        columns=['Retention Time', 'Peak Width', 'Isotope Distribution',
                 'Feature Name', 'Adducts', 'm/z'])

    # Feature-filter bookkeeping columns: all filters pass, all metrics unset
    self.msData.featureMetadata['Exclusion Details'] = None
    self.msData.featureMetadata['User Excluded'] = False
    self.msData.featureMetadata[['rsdFilter', 'varianceRatioFilter',
                                 'correlationToDilutionFilter', 'blankFilter',
                                 'artifactualFilter']] = pandas.DataFrame(
        [[True, True, True, True, True]],
        index=self.msData.featureMetadata.index)
    self.msData.featureMetadata[['rsdSP', 'rsdSS/rsdSP',
                                 'correlationToDilution', 'blankValue']] = pandas.DataFrame(
        [[numpy.nan, numpy.nan, numpy.nan, numpy.nan]],
        index=self.msData.featureMetadata.index)

    self.msData.Attributes['FeatureExtractionSoftware'] = 'Progenesis QI'
    self.msData.initialiseMasks()
def test_generatesrdmask(self):
    """
    generateLRmask should partition the serial-dilution (SRD) samples into
    one boolean mask per (batch, dilution series) pair.
    """
    # Build an empty object whose filenames encode batch and SRD position
    msData = nPYc.MSDataset('', fileType='empty')

    # Each batch uses the same twelve SRD well suffixes
    srdSuffixes = ['01', '02', '43', '44', '45', '46',
                   '47', '48', '49', '50', '51', '92']
    sampleNames = ['Test1_HPOS_ToF01_B1SRD' + suffix for suffix in srdSuffixes]
    sampleNames += ['Test1_HPOS_ToF01_B2SRD' + suffix for suffix in srdSuffixes[:6]]
    # Three non-SRD acquisitions interrupt batch two
    sampleNames += ['Test1_HPOS_ToF01_P2W30',
                    'Test1_HPOS_ToF01_P2W31_SR',
                    'Test1_HPOS_ToF01_P2W32']
    sampleNames += ['Test1_HPOS_ToF01_B2SRD' + suffix for suffix in srdSuffixes[6:]]
    sampleNames += ['Test1_HPOS_ToF01_B3SRD' + suffix for suffix in srdSuffixes]
    msData.sampleMetadata['Sample File Name'] = sampleNames

    msData.intensityData = numpy.zeros((39, 2))
    msData.initialiseMasks()
    msData.sampleMetadata['Run Order'] = msData.sampleMetadata.index + 1
    msData.addSampleInfo(descriptionFormat='Filenames')
    msData.addSampleInfo(descriptionFormat='Batches')
    msData.corrExclusions = msData.sampleMask

    srdMask = nPYc.utilities.ms.generateLRmask(msData)

    def maskWithTrueAt(indices):
        # 39-sample boolean mask, True only at the given positions
        mask = numpy.zeros(39, dtype=bool)
        mask[list(indices)] = True
        return mask

    canonicalMask = {
        'Batch 1.0, series 1.0': maskWithTrueAt(range(0, 12)),
        'Batch 2.0, series 1.0': maskWithTrueAt(range(12, 18)),
        'Batch 2.0, series 2.0': maskWithTrueAt(range(21, 27)),
        'Batch 3.0, series 2.0': maskWithTrueAt(range(27, 39)),
    }

    numpy.testing.assert_equal(srdMask, canonicalMask)
def generateTestDataset(noSamp, noFeat, dtype='Dataset',
                        variableType=VariableType.Discrete, sop='Generic'):
    """
    Generate a dataset object with random sample and feature numbers, and
    random contents.

    .. warning:: Objects returned by this function are not expected to be
        fully functional!

    :param int noSamp: Number of samples
    :param int noFeat: Number of features
    :param str dtype: Dataset class to build ('Dataset', 'MSDataset' or 'NMRDataset')
    :param VariableType variableType: Type of enumerations
    :param str sop: SOP passed through to the dataset constructor
    :raises ValueError: if ``dtype`` is not a recognised dataset type
    """
    if dtype == 'Dataset':
        data = nPYc.Dataset(sop=sop)
    elif dtype == 'MSDataset':
        data = nPYc.MSDataset('', fileType='empty', sop=sop)
    elif dtype == 'NMRDataset':
        data = nPYc.NMRDataset('', fileType='empty', sop=sop)
    else:
        raise ValueError

    data.intensityData = numpy.random.lognormal(size=(noSamp, noFeat)) + 1

    data.sampleMetadata = pandas.DataFrame(
        0,
        index=numpy.arange(noSamp),
        columns=['Sample File Name', 'SampleType', 'AssayRole', 'Acquired Time',
                 'Run Order', 'Dilution', 'Detector', 'Correction Batch'])
    data.sampleMetadata['SampleType'] = nPYc.enumerations.SampleType.StudySample
    data.sampleMetadata['AssayRole'] = nPYc.enumerations.AssayRole.Assay
    data.sampleMetadata['Run Order'] = numpy.arange(noSamp)
    data.sampleMetadata['Detector'] = numpy.arange(noSamp) * 5
    data.sampleMetadata['Batch'] = 1
    data.sampleMetadata['Correction Batch'] = 2
    # The first half of the run belongs to correction batch one
    data.sampleMetadata.loc[0:int(noSamp / 2), 'Correction Batch'] = 1
    data.sampleMetadata['Exclusion Details'] = ''
    data.sampleMetadata['Sample File Name'] = [randomword(10) for _ in range(noSamp)]
    data.sampleMetadata['Dilution'] = numpy.random.rand(noSamp)

    # Random class labels drawn with random, normalised probabilities
    noClasses = numpy.random.randint(2, 5)
    classNames = [str(label) for label in range(noClasses)]
    classProbabilities = numpy.random.rand(noClasses)
    classProbabilities = classProbabilities / sum(classProbabilities)
    data.sampleMetadata['Classes'] = numpy.random.choice(classNames,
                                                         size=noSamp,
                                                         p=classProbabilities)

    # Acquisition times at fifteen-minute intervals starting from now
    data.sampleMetadata['Acquired Time'] = list(
        datetime_range(datetime.now(), noSamp, timedelta(minutes=15)))
    data.sampleMetadata['Acquired Time'] = \
        data.sampleMetadata['Acquired Time'].astype(datetime)

    # Every tenth sample is a study pool; offset by five, an external reference
    data.sampleMetadata.iloc[::10, 1] = nPYc.enumerations.SampleType.StudyPool
    data.sampleMetadata.iloc[::10, 2] = nPYc.enumerations.AssayRole.PrecisionReference
    data.sampleMetadata.iloc[5::10, 1] = nPYc.enumerations.SampleType.ExternalReference
    data.sampleMetadata.iloc[5::10, 2] = nPYc.enumerations.AssayRole.PrecisionReference

    if dtype in ('MSDataset', 'Dataset'):
        data.featureMetadata = pandas.DataFrame(0,
                                                index=numpy.arange(noFeat),
                                                columns=['m/z'])
        data.featureMetadata['m/z'] = numpy.linspace(50, 800, noFeat)
        data.featureMetadata['Retention Time'] = (720 - 50) * numpy.random.rand(noFeat) + 50
        data.featureMetadata['Feature Name'] = [randomword(10) for _ in range(noFeat)]
        data.featureMetadata['ppm'] = numpy.linspace(10, -1, noFeat)
    elif dtype == 'NMRDataset':
        data.featureMetadata = pandas.DataFrame(numpy.linspace(10, -1, noFeat),
                                                columns=('ppm', ),
                                                dtype=float)

    data.VariableType = variableType
    data.initialiseMasks()

    return data
def test_generatesrdmask_raises(self):
    """generateLRmask must raise on a dataset without batch/SRD metadata."""
    emptyDataset = nPYc.MSDataset('', fileType='empty')
    self.assertRaises(ValueError,
                      nPYc.utilities.ms.generateLRmask,
                      emptyDataset)
def generateTestDataset(noSamp, noFeat, dtype='Dataset',
                        variableType=VariableType.Discrete, sop='Generic'):
    """
    Generate a dataset object with random sample and feature numbers, and
    random contents.

    .. warning:: Objects returned by this function are not expected to be
        fully functional!

    :param int noSamp: Number of samples
    :param int noFeat: Number of features
    :param str dtype: Dataset class to build ('Dataset', 'MSDataset',
        'NMRDataset' or 'TargetedDataset')
    :param VariableType variableType: Type of enumerations
    :param str sop: SOP passed through to the dataset constructor
    :raises ValueError: if ``dtype`` is not a recognised dataset type
    """
    if dtype == 'Dataset':
        data = nPYc.Dataset(sop=sop)
    elif dtype == 'MSDataset':
        data = nPYc.MSDataset('', fileType='empty', sop=sop)
    elif dtype == 'NMRDataset':
        data = nPYc.NMRDataset('', fileType='empty', sop=sop)
    elif dtype == 'TargetedDataset':
        data = nPYc.TargetedDataset('', fileType='empty', sop=sop)
    else:
        raise ValueError

    data.intensityData = numpy.random.lognormal(size=(noSamp, noFeat)) + 1

    data.sampleMetadata = pandas.DataFrame(
        0,
        index=numpy.arange(noSamp),
        columns=['Sample File Name', 'SampleType', 'AssayRole', 'Acquired Time',
                 'Run Order', 'Dilution', 'Detector', 'Correction Batch'])
    data.sampleMetadata['SampleType'] = nPYc.enumerations.SampleType.StudySample
    data.sampleMetadata['AssayRole'] = nPYc.enumerations.AssayRole.Assay
    data.sampleMetadata['Run Order'] = numpy.arange(noSamp, dtype='int64')
    data.sampleMetadata['Detector'] = numpy.arange(noSamp) * 5
    data.sampleMetadata['Batch'] = 1
    data.sampleMetadata['Correction Batch'] = 2
    # The first half of the run belongs to correction batch one
    data.sampleMetadata.loc[0:int(noSamp / 2), 'Correction Batch'] = 1
    data.sampleMetadata['Exclusion Details'] = ''
    data.sampleMetadata['Sample File Name'] = [randomword(10) for _ in range(noSamp)]
    data.sampleMetadata['Sample ID'] = [randomword(10) for _ in range(noSamp)]
    data.sampleMetadata['Dilution'] = numpy.random.rand(noSamp)

    # Random class labels drawn with random, normalised probabilities
    noClasses = numpy.random.randint(2, 5)
    classNames = [str(label) for label in range(noClasses)]
    classProbabilities = numpy.random.rand(noClasses)
    classProbabilities = classProbabilities / sum(classProbabilities)
    data.sampleMetadata['Classes'] = numpy.random.choice(classNames,
                                                         size=noSamp,
                                                         p=classProbabilities)

    # Acquisition times at fifteen-minute intervals starting from now
    data.sampleMetadata['Acquired Time'] = list(
        datetime_range(datetime.now(), noSamp, timedelta(minutes=15)))
    # Ensure seconds are not recorded, otherwise its impossible to test
    # datasets read with datasets recorded on the fly.
    data.sampleMetadata['Acquired Time'] = [
        datetime.strptime(stamp.strftime("%Y-%m-%d %H:%M"), "%Y-%m-%d %H:%M")
        for stamp in data.sampleMetadata['Acquired Time']
    ]
    data.sampleMetadata['Acquired Time'] = \
        data.sampleMetadata['Acquired Time'].dt.to_pydatetime()

    # Every tenth sample is a study pool; offset by five, an external reference
    data.sampleMetadata.iloc[::10, 1] = nPYc.enumerations.SampleType.StudyPool
    data.sampleMetadata.iloc[::10, 2] = nPYc.enumerations.AssayRole.PrecisionReference
    data.sampleMetadata.iloc[5::10, 1] = nPYc.enumerations.SampleType.ExternalReference
    data.sampleMetadata.iloc[5::10, 2] = nPYc.enumerations.AssayRole.PrecisionReference

    if dtype == 'MSDataset':
        data.featureMetadata = pandas.DataFrame(0,
                                                index=numpy.arange(noFeat),
                                                columns=['m/z'])
        data.featureMetadata['m/z'] = (800 - 40) * numpy.random.rand(noFeat) + 40
        data.featureMetadata['Retention Time'] = (720 - 50) * numpy.random.rand(noFeat) + 50
        data.featureMetadata['Feature Name'] = [randomword(10) for _ in range(noFeat)]
        # Feature-filter bookkeeping columns: all filters pass, metrics unset
        data.featureMetadata['Exclusion Details'] = None
        data.featureMetadata['User Excluded'] = False
        data.featureMetadata[['rsdFilter', 'varianceRatioFilter',
                              'correlationToDilutionFilter', 'blankFilter',
                              'artifactualFilter']] = pandas.DataFrame(
            [[True, True, True, True, True]],
            index=data.featureMetadata.index)
        data.featureMetadata[['rsdSP', 'rsdSS/rsdSP',
                              'correlationToDilution', 'blankValue']] = pandas.DataFrame(
            [[numpy.nan, numpy.nan, numpy.nan, numpy.nan]],
            index=data.featureMetadata.index)
        data.Attributes['Feature Names'] = 'Feature Name'
    elif dtype == 'Dataset':
        data.featureMetadata = pandas.DataFrame(0,
                                                index=numpy.arange(noFeat),
                                                columns=['m/z'])
        data.featureMetadata['m/z'] = (800 - 40) * numpy.random.rand(noFeat) + 40
        data.featureMetadata['Retention Time'] = (720 - 50) * numpy.random.rand(noFeat) + 50
        data.featureMetadata['Feature Name'] = [randomword(10) for _ in range(noFeat)]
        data.Attributes['Feature Names'] = 'Feature Name'
    elif dtype == 'NMRDataset':
        data.featureMetadata = pandas.DataFrame(numpy.linspace(10, -1, noFeat),
                                                columns=('ppm', ),
                                                dtype=float)
        data.featureMetadata['Feature Name'] = data.featureMetadata['ppm'].astype(str)
        # NMR-specific per-sample QC metrics and failure flags
        data.sampleMetadata['Delta PPM'] = numpy.random.rand(noSamp)
        data.sampleMetadata['Line Width (Hz)'] = numpy.random.rand(noSamp)
        data.sampleMetadata['CalibrationFail'] = False
        data.sampleMetadata['LineWidthFail'] = False
        data.sampleMetadata['WaterPeakFail'] = False
        data.sampleMetadata['BaselineFail'] = False
        data.Attributes['Feature Names'] = 'ppm'

    data.VariableType = variableType
    data.initialiseMasks()

    return data