def test_equality(andi):
    """Equality holds for an identical rebuild; inequality for anything else."""
    # Rebuilding from the fixture's own time/scan lists compares equal.
    assert andi == GCMS_data(andi.time_list, andi.scan_list)

    # A different time axis must break equality.
    assert andi != GCMS_data(list(range(len(andi.scan_list))), andi.scan_list)

    # Comparison against unrelated types is always unequal.
    for other in (
            test_string,
            test_int,
            test_float,
            test_list_ints,
            test_list_strs,
            test_tuple,
            test_dict,
            ):
        assert andi != other
def test_equality(data):
    """Equality holds for an identical rebuild; inequality for anything else."""
    # Rebuilding from the fixture's own time/scan lists compares equal.
    assert data == GCMS_data(data.time_list, data.scan_list)

    # Replacing the time axis with a synthetic one breaks equality.
    assert data != GCMS_data(list(range(len(data.scan_list))), data.scan_list)

    # Unrelated types never compare equal.
    unrelated = [
            test_string,
            test_int,
            test_float,
            test_list_ints,
            test_list_strs,
            test_tuple,
            test_dict,
            ]
    for other in unrelated:
        assert data != other
def test_GCMS_data(andi):
    """GCMS_data accepts valid inputs and raises TypeError for bad ones."""
    assert isinstance(andi, GCMS_data)

    # Constructing from the fixture's own lists must not raise.
    GCMS_data(andi.time_list, andi.scan_list)

    # Errors: invalid 'time_list' arguments.
    bad_time_args = [test_string, *test_numbers, test_list_strs, test_dict]
    for bad in bad_time_args:
        with pytest.raises(TypeError):
            GCMS_data(bad, andi.scan_list)

    # Errors: invalid 'scan_list' arguments.
    bad_scan_args = [test_string, *test_numbers, *test_sequences, test_dict]
    for bad in bad_scan_args:
        with pytest.raises(TypeError):
            GCMS_data(andi.time_list, bad)
def test_GCMS_data(data):
    """GCMS_data accepts valid inputs and raises TypeError for bad ones."""
    assert isinstance(data, GCMS_data)

    # Round-trip construction from the fixture's own lists must succeed.
    GCMS_data(data.time_list, data.scan_list)

    # Errors
    for bad_times in [test_string, *test_numbers, test_list_strs, test_dict]:
        with pytest.raises(TypeError):
            GCMS_data(bad_times, data.scan_list)  # type: ignore

    for bad_scans in [test_string, *test_numbers, *test_sequences, test_dict]:
        with pytest.raises(TypeError):
            GCMS_data(data.time_list, bad_scans)  # type: ignore
def ANDI_reader(file_name):
    """
    Read an ANDI-MS NetCDF file and return a GCMS_data object.

    @param file_name: Path of the ANDI-MS (NetCDF) file to read.
    @return: GC-MS data object built from the scans and acquisition times.
    """
    # NOTE(review, translated from Chinese): pyMS is written very badly!!!!
    # Much of its architecture is seriously unreasonable and will inevitably
    # make computation slow!!!
    # NOTE(review, translated from Chinese): minimise in-memory copying to
    # improve performance; ideally the CDF data would be used directly as
    # read in, without excessive memory copies.
    print "ok in andi"  # debug output (Python 2 print statement)
    scan_list, scan_acquisition_time = CDF_Reader(file_name)
    data = GCMS_data(scan_list=scan_list, time_list=scan_acquisition_time)
    return data
def Agilent_reader(file_name):
    """
    Read an Agilent data directory containing a ``data.ms`` file and return
    a GCMS_data object.

    :param file_name: Path of the Agilent data directory.
    :return: GC-MS data object.
    :raises TypeError: if ``file_name`` is not a str or pathlib.Path.
    :raises ValueError: if no ``DATA.MS``/``data.ms`` file is present.
    """
    if not isinstance(file_name, (str, pathlib.Path)):
        raise TypeError(
            "'file_name' must be a string or a pathlib.Path object")

    file_name = pathlib.Path(file_name)
    assert os.path.exists(file_name)

    # The data file may be stored with either an upper- or lower-case name.
    d_file = None
    for candidate in ('DATA.MS', 'data.ms'):
        if (file_name / candidate).exists():
            d_file = open(file_name / candidate, 'rb')
            break

    if d_file is None:
        print(f'Error: {file_name} does not contain a data.ms file.')
        raise ValueError(
            f'Error: {file_name} does not contain a data.ms file.')

    # Parse the binary file into the intermediate Agilent structures.
    data = AgilentGCMSData()
    options = Options()
    load_file_info(d_file, data, options)
    load_tic(d_file, data, options)
    load_xic(d_file, data, options)

    # One Scan per row of the XIC matrix, sharing the common m/z axis.
    time_list = list(data.time)
    n_scans, _ = data.xic.shape
    scan_list = [
            Scan(data.mz, list(data.xic[row, :]))
            for row in range(n_scans)
            ]

    return GCMS_data(time_list, scan_list)
def ANDI_reader(file_name):
    """
    A reader for ANDI-MS NetCDF files

    :param file_name: The path of the ANDI-MS file
    :type file_name: str or os.PathLike

    :return: GC-MS data object
    :rtype: :class:`pyms.GCMS.Class.GCMS_data`

    :raises TypeError: if ``file_name`` is not a str or pathlib.Path.
    :raises ValueError: if the file's mass/intensity/time data are
        inconsistent with each other.

    :author: Qiao Wang
    :author: Andrew Isaac
    :author: Vladimir Likic
    :author: Dominic Davis-Foster
    """
    if not isinstance(file_name, (str, pathlib.Path)):
        raise TypeError(
            "'file_name' must be a string or a pathlib.Path object")

    # Open read-only: the previous "r+" mode opened the raw data file for
    # writing, risking accidental modification.
    rootgrp = Dataset(file_name, "r", format='NETCDF3_CLASSIC')
    # TODO: find out if netCDF4 throws specific errors that we can use here

    print(f" -> Reading netCDF file '{file_name}'")

    try:
        mass_values = rootgrp.variables[__MASS_STRING][:].tolist()
        intensity_values = rootgrp.variables[__INTENSITY_STRING][:].tolist()
        # The number of data points in each scan
        scan_lengths = rootgrp.variables["point_count"][:]
        time_list = rootgrp.variables[__TIME_STRING][:].tolist()
    finally:
        # Always release the file handle (the original leaked it).
        rootgrp.close()

    if len(mass_values) != len(intensity_values):
        raise ValueError("The lengths of the mass and intensity lists differ!")

    # Slice the flat mass/intensity arrays into one Scan per 'point_count'
    # entry.  Validation raises ValueError rather than using 'assert',
    # which is stripped under 'python -O'.
    scan_list = []
    offset = 0
    for length in scan_lengths:
        mass_list = mass_values[offset:offset + length]
        intensity_list = intensity_values[offset:offset + length]
        scan_list.append(Scan(mass_list, intensity_list))
        offset += length

    if offset != len(mass_values):
        raise ValueError(
            "scan lengths do not sum to the total number of data points")

    # sanity check
    if not len(time_list) == len(scan_list):
        raise ValueError(
            "number of time points does not equal the number of scans")

    return GCMS_data(time_list, scan_list)
def mzML_reader(file_name):
    """
    @summary: A reader for mzML files, returns a GC-MS data object

    @param file_name: The name of the mzML file
    @type file_name: StringType

    @author: Sean O'Callaghan
    """
    # NOTE(review): Python 2 code (print statements below).
    if not is_str(file_name):
        error("'file_name' must be a string")

    try:
        mzml_file = pymzml.run.Reader(file_name)
    except:
        # NOTE(review): bare except hides the real cause of the failure.
        error("Cannot open file '%s'" % file_name)

    print " -> Reading mzML file '%s'" % (file_name)

    scan_list = []
    time_list = []

    for spectrum in mzml_file:
        # Collect the (m/z, intensity) pairs for this spectrum.
        mass_list = []
        intensity_list = []
        for mz,i in spectrum.peaks:
            mass_list.append(mz)
            intensity_list.append(i)
        #scan_list.append(Scan(mass_list, intensity_list))
        for element in spectrum.xmlTree:
            # For some reason there are spectra with no time value,
            # Ignore these????????????
            if element.get('accession') == "MS:1000016": #time value
                # We need time in seconds not minutes
                time_list.append(60*float(element.get('value')))
                # Scan is stored only when a time value exists, keeping
                # time_list and scan_list the same length.
                scan_list.append(Scan(mass_list, intensity_list))

    print "time:", len(time_list)
    print "scan:", len(scan_list)

    data = GCMS_data(time_list, scan_list)

    return data
def agilent_reader(
        file_name: PathLike) -> GCMS_data:  # pragma: no cover (!Windows)
    """
    Reader for Agilent MassHunter ``.d`` files.

    :param file_name: Path of the file to read.

    :return: GC-MS data object.

    :raises TypeError: if ``file_name`` is not a str or pathlib.Path.
    :raises ValueError: if the number of time points and scans differ.
    """
    if not isinstance(file_name, (str, pathlib.Path)):
        raise TypeError(
            "'file_name' must be a string or a pathlib.Path object")

    file_name = pathlib.Path(file_name)

    print(f" -> Reading Agilent data file '{file_name}'")

    reader = MassSpecDataReader(file_name)
    total_scans = reader.file_information.ms_scan_file_info.total_scans

    scan_list = []
    time_list = []

    for scan_no in range(total_scans):
        spectrum = reader.get_spectrum_by_scan(scan_no)
        scan_list.append(Scan(spectrum.x_data, spectrum.y_data))
        # mean(...) * 60.0 — presumably a minutes→seconds conversion of the
        # acquisition-time midpoint; confirm against the reader's docs.
        time_list.append(mean(spectrum.acquired_time_ranges[0]) * 60.0)

    # sanity check
    time_len = len(time_list)
    scan_len = len(scan_list)
    if time_len != scan_len:  # pragma: no cover
        print(time_list)
        print(scan_list)
        raise ValueError(
            f"Number of time points ({time_len}) does not equal the number of scans ({scan_len})."
        )

    return GCMS_data(time_list, scan_list)
def JCAMP_reader(file_name: Union[str, os.PathLike]) -> GCMS_data:
    """
    Generic reader for JCAMP DX files

    :param file_name: Path of the file to read
    :type file_name: str or os.PathLike

    :return: GC-MS data object
    :rtype: :class:`pyms.GCMS.Class.GCMS_data`

    :authors: Qiao Wang, Andrew Isaac, Vladimir Likic, David Kainer, Dominic Davis-Foster (pathlib support)
    """
    if not is_path(file_name):
        raise TypeError("'file_name' must be a string or a PathLike object")

    file_name = prepare_filepath(file_name, mkdirs=False)

    print(f" -> Reading JCAMP file '{file_name}'")
    lines_list = file_name.open('r')
    data = []
    page_idx = 0  # number of ##PAGE fields seen since the last flush
    xydata_idx = 0  # number of xy-data tags seen since the last flush
    time_list = []
    scan_list = []

    header_info = {}  # Dictionary containing header information

    for line in lines_list:
        if len(line.strip()) != 0:
            # prefix = line.find('#')
            # if prefix == 0:
            if line.startswith("##"):
                # key word or information
                fields = line.split('=', 1)
                fields[0] = fields[0].lstrip("##").upper()
                fields[1] = fields[1].strip()

                if "PAGE" in fields[0]:
                    if "T=" in fields[1]:
                        # PAGE contains retention time starting with T=
                        # FileConverter Pro style
                        # NOTE(review): lstrip("T=") strips a *character set*
                        # (any leading 'T' or '='), not the prefix "T=" —
                        # harmless for numeric times but worth confirming.
                        time = float(fields[1].lstrip(
                            "T="))  # rt for the scan to be submitted
                        time_list.append(time)
                    page_idx = page_idx + 1
                elif "RETENTION_TIME" in fields[0]:
                    # OpenChrom style
                    time = float(fields[1])  # rt for the scan to be submitted
                    # Check to make sure time is not already in the time list;
                    # Can happen when both ##PAGE and ##RETENTION_TIME are specified
                    # NOTE(review): raises IndexError if RETENTION_TIME
                    # appears before any PAGE (time_list still empty).
                    if time_list[-1] != time:
                        time_list.append(time)
                elif fields[0] in xydata_tags:
                    xydata_idx = xydata_idx + 1
                elif fields[0] in header_info_fields:
                    # Store header values, coerced to int/float when possible.
                    if fields[1].isdigit():
                        header_info[fields[0]] = int(fields[1])
                    elif is_float(fields[1]):
                        header_info[fields[0]] = float(fields[1])
                    else:
                        header_info[fields[0]] = fields[1]
            # elif prefix == -1:
            else:  # Line doesn't start with ##
                # data
                if page_idx > 1 or xydata_idx > 1:
                    # A new page / data table has started: flush the numbers
                    # accumulated so far as one Scan before reading this line.
                    if len(data) % 2 == 1:
                        # TODO: This means the data is not in x, y pairs
                        #  Make a better error message
                        raise ValueError("data not in pair !")

                    mass_list = []
                    intensity_list = []
                    for i in range(len(data) // 2):
                        mass_list.append(data[i * 2])
                        intensity_list.append(data[i * 2 + 1])

                    if len(mass_list) != len(intensity_list):
                        raise ValueError(
                            "len(mass_list) is not equal to len(intensity_list)"
                        )

                    scan_list.append(Scan(mass_list, intensity_list))
                    data = []
                    data_sub = line.strip().split(',')
                    for item in data_sub:
                        if not len(item.strip()) == 0:
                            data.append(float(item.strip()))
                    if page_idx > 1:
                        page_idx = 1
                    if xydata_idx > 1:
                        xydata_idx = 1
                else:
                    # Still inside the current scan: accumulate the numbers.
                    data_sub = line.strip().split(',')
                    for item in data_sub:
                        if not len(item.strip()) == 0:
                            data.append(float(item.strip()))

    if len(data) % 2 == 1:
        # TODO: This means the data is not in x, y pairs
        #  Make a better error message
        raise ValueError("data not in pair !")

    # get last scan
    mass = []
    intensity = []
    for i in range(len(data) // 2):
        mass.append(data[i * 2])
        intensity.append(data[i * 2 + 1])

    if len(mass) != len(intensity):
        raise ValueError("len(mass) is not equal to len(intensity)")
    scan_list.append(Scan(mass, intensity))

    # sanity check
    time_len = len(time_list)
    scan_len = len(scan_list)
    if time_len != scan_len:
        print(time_list)
        print(scan_list)
        raise ValueError(
            f"Number of time points ({time_len}) does not equal the number of scans ({scan_len})"
        )

    data = GCMS_data(time_list, scan_list)

    return data
def JCAMP_reader(file_name):
    """
    @summary: Generic reader for JCAMP DX files, produces GC-MS data object

    @author: Qiao Wang
    @author: Andrew Isaac
    @author: Vladimir Likic
    """
    # NOTE(review): Python 2 code (print statement; integer '/' division).
    if not is_str(file_name):
        error("'file_name' not a string")

    print " -> Reading JCAMP file '%s'" % (file_name)
    lines_list = open(file_name,'r')
    data = []
    page_idx = 0  # ##PAGE fields seen since the last flush
    xydata_idx = 0  # ##DATA TABLE fields seen since the last flush
    time_list = []
    scan_list = []

    for line in lines_list:
        if not len(line.strip()) == 0:
            prefix = line.find('#')
            # key word or information
            if prefix == 0:
                fields = line.split('=')
                if fields[0].find("##PAGE") >= 0:
                    # "##PAGE=T=<rt>" splits on '=' into three fields;
                    # fields[2] is the retention time.
                    time = float(fields[2].strip()) #rt for the scan to be submitted
                    time_list.append(time)
                    page_idx = page_idx + 1
                elif fields[0].find("##DATA TABLE") >= 0:
                    xydata_idx = xydata_idx + 1
            # data
            elif prefix == -1:
                if page_idx > 1 or xydata_idx > 1:
                    # A new page / data table began: flush the accumulated
                    # numbers as one Scan before reading this line.
                    if len(data) % 2 == 1:
                        error("data not in pair !")
                    mass = []
                    intensity = []
                    for i in range(len(data) / 2):
                        mass.append(data[i * 2])
                        intensity.append(data[i * 2 + 1])
                    if not len(mass) == len(intensity):
                        error("len(mass) is not equal to len(intensity)")
                    scan_list.append(Scan(mass, intensity))
                    data = []
                    data_sub = line.strip().split(',')
                    for item in data_sub:
                        if not len(item.strip()) == 0:
                            data.append(float(item.strip()))
                    if page_idx > 1:
                        page_idx = 1
                    if xydata_idx > 1:
                        xydata_idx = 1
                else:
                    # Still inside the current scan: accumulate the numbers.
                    data_sub = line.strip().split(',')
                    for item in data_sub:
                        if not len(item.strip()) == 0:
                            data.append(float(item.strip()))

    if len(data) % 2 == 1:
        error("data not in pair !")

    # get last scan
    mass = []
    intensity = []
    for i in range(len(data) / 2):
        mass.append(data[i * 2])
        intensity.append(data[i * 2 + 1])
    if not len(mass) == len(intensity):
        error("len(mass) is not equal to len(intensity)")
    scan_list.append(Scan(mass, intensity))

    # sanity check
    if not len(time_list) == len(scan_list):
        error("number of time points does not equal the number of scans")

    data = GCMS_data(time_list, scan_list)

    return data
def mzML_reader(file_name):
    """
    A reader for mzML files

    :param file_name: The name of the mzML file
    :type file_name: str or pathlib.Path

    :return: GC-MS data object
    :rtype: :class:`pyms.GCMS.Class.GCMS_data`

    :author: Sean O'Callaghan
    :author: Dominic Davis-Foster (pathlib support)
    """
    if not is_path(file_name):
        raise TypeError("'file_name' must be a string or a PathLike object")

    mzml_file = pymzml.run.Reader(str(file_name))

    try:
        # avoid printing from each rank
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        size = comm.Get_size()

        if rank == 0:
            # Rank 0 gathers every other rank's file name and prints one
            # combined message.
            file_names = []
            for i in range(1, size):
                recv_buffer = ""
                file_n = comm.recv(recv_buffer, i)
                file_names.append(file_n)
            print(" -> Reading mzML files:")
            print(file_name)
            for file_n in file_names:
                print(file_n)
        else:
            comm.send(file_name, dest=0)
    # TODO: Find specific error
    except Exception as e:
        # Fallback path when the MPI exchange fails: behave as a single
        # process and print for this file only.
        print(e)
        print(f" -> Reading mzML file '{file_name}'")

    scan_list = []
    time_list = []

    for spectrum in mzml_file:
        # Collect the (m/z, intensity) pairs for this spectrum.
        mass_list = []
        intensity_list = []
        for mz, i in spectrum.peaks:
            mass_list.append(mz)
            intensity_list.append(i)
        # scan_list.append(Scan(mass_list, intensity_list))
        for element in spectrum.xmlTree:
            # For some reason there are spectra with no time value,
            # Ignore these????????????
            if element.get('accession') == "MS:1000016":  # time value
                # We need time in seconds not minutes
                time_list.append(60 * float(element.get('value')))
                # Scan is stored only when a time value exists, keeping
                # time_list and scan_list the same length.
                scan_list.append(Scan(mass_list, intensity_list))

    # print("time:", len(time_list))
    # print("scan:", len(scan_list))

    data = GCMS_data(time_list, scan_list)

    return data
def ANDI_reader(file_name):
    """
    @summary: A reader for ANDI-MS NetCDF files, returns a GC-MS data object

    @param file_name: The name of the ANDI-MS file
    @type file_name: StringType

    @author: Qiao Wang
    @author: Andrew Isaac
    @author: Vladimir Likic
    """
    ## TODO: use 'point_count' and allow for zero len scans

    # NOTE(review): Python 2 code (print statements below).
    # the keys used to retrieve certain data from the NetCDF file
    __MASS_STRING = "mass_values"
    __INTENSITY_STRING = "intensity_values"
    __TIME_STRING = "scan_acquisition_time"

    if not is_str(file_name):
        error("'file_name' must be a string")

    try:
        file = CDF(file_name)
    except CDFError:
        error("Cannot open file '%s'" % file_name)

    print " -> Reading netCDF file '%s'" % (file_name)

    scan_list = []
    mass = file.var(__MASS_STRING)
    intensity = file.var(__INTENSITY_STRING)
    mass_values = mass.get().tolist()
    mass_list = []
    mass_previous = mass_values[0]
    mass_list.append(mass_previous)
    intensity_values = intensity.get().tolist()
    intensity_list = []
    intensity_previous = intensity_values[0]
    intensity_list.append(intensity_previous)

    if not len(mass_values) == len(intensity_values):
        error("length of mass_list is not equal to length of intensity_list !")

    # Scan boundaries are detected by a drop in m/z: masses ascend within
    # one scan, so a decrease starts a new scan.
    for i in range(len(mass_values) - 1):
        # assume masses in ascending order until new scan
        if mass_previous <= mass_values[i + 1]:
            #print mass_values[i+1]
            mass_list.append(mass_values[i + 1])
            mass_previous = mass_values[i + 1]
            intensity_list.append(intensity_values[i + 1])
            intensity_previous = intensity_values[i + 1]
        # new scan
        else:
            scan_list.append(Scan(mass_list, intensity_list))
            #print "Added scan"
            mass_previous = mass_values[i + 1]
            intensity_previous = intensity_values[i + 1]
            mass_list = []
            intensity_list = []
            mass_list.append(mass_previous)
            intensity_list.append(intensity_previous)

    # store final scan
    scan_list.append(Scan(mass_list, intensity_list))

    time = file.var(__TIME_STRING)
    time_list = time.get().tolist()

    # sanity check
    if not len(time_list) == len(scan_list):
        #JT: Debug for old gcms data
        #JT: time longer than scans so trim
        # NOTE(review): silently truncates the time axis instead of raising;
        # confirm this is really intended for the old data files.
        print "Time list is"
        print len(time_list) - len(scan_list)
        print "longer than scan list. Trimming...."
        time_list = time_list[0:len(scan_list)]
        print len(time_list)
        print len(scan_list)
        #error("number of time points does not equal the number of scans")

    data = GCMS_data(time_list, scan_list)

    return data
def mzML_reader(file_name):
    """
    @summary: A reader for mzML files, returns a GC-MS data object

    @param file_name: The name of the mzML file
    @type file_name: StringType

    @author: Sean O'Callaghan
    """
    # NOTE(review): Python 2 code (print statements, bare excepts below).
    if not is_str(file_name):
        error("'file_name' must be a string")

    try:
        mzml_file = pymzml.run.Reader(file_name)
    except:
        error("Cannot open file '%s'" % file_name)

    try:
        # avoid printing from each rank
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        size = comm.Get_size()

        if rank == 0:
            # Rank 0 gathers every other rank's file name and prints one
            # combined message.
            file_names = []
            for i in range(1, size):
                recv_buffer = ""
                file_n = comm.recv(recv_buffer, i)
                file_names.append(file_n)
            print " -> Reading mzML files:"
            print file_name
            for file_n in file_names:
                print file_n
        else:
            comm.send(file_name, dest=0)
    except:
        # Fallback when the MPI exchange fails: single-process message.
        print " -> Reading mzML file '%s'" % (file_name)

    scan_list = []
    time_list = []

    for spectrum in mzml_file:
        # Collect the (m/z, intensity) pairs for this spectrum.
        mass_list = []
        intensity_list = []
        for mz, i in spectrum.peaks:
            mass_list.append(mz)
            intensity_list.append(i)
        #scan_list.append(Scan(mass_list, intensity_list))
        for element in spectrum.xmlTree:
            # For some reason there are spectra with no time value,
            # Ignore these????????????
            if element.get('accession') == "MS:1000016": #time value
                # We need time in seconds not minutes
                time_list.append(60 * float(element.get('value')))
                # Scan is stored only when a time value exists, keeping
                # time_list and scan_list the same length.
                scan_list.append(Scan(mass_list, intensity_list))

    #print "time:", len(time_list)
    #print "scan:", len(scan_list)

    data = GCMS_data(time_list, scan_list)

    return data
def ANDI_reader(file_name):
    """
    @summary: A reader for ANDI-MS NetCDF files, returns a GC-MS data object

    @param file_name: The name of the ANDI-MS file
    @type file_name: StringType

    @author: Qiao Wang
    @author: Andrew Isaac
    @author: Vladimir Likic
    """
    ## TODO: use 'point_count' and allow for zero len scans

    # NOTE(review): Python 2 code (print statements, bare except below).
    # the keys used to retrieve certain data from the NetCDF file
    __MASS_STRING = "mass_values"
    __INTENSITY_STRING = "intensity_values"
    __TIME_STRING = "scan_acquisition_time"

    if not is_str(file_name):
        error("'file_name' must be a string")

    try:
        file = CDF(file_name)
    except CDFError:
        error("Cannot open file '%s'" % file_name)

    try:# avoid printing from each rank
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        size = comm.Get_size()

        if rank ==0:
            # Rank 0 gathers every other rank's file name and prints one
            # combined message.
            file_names = []
            for i in range(1,size):
                recv_buffer = ""
                file_n = comm.recv(recv_buffer, i)
                file_names.append(file_n)
            print " -> Reading netCDF files:"
            print file_name
            for file_n in file_names:
                print file_n
        else:
            comm.send(file_name, dest=0)
    except:
        # Fallback when the MPI exchange fails: single-process message.
        print " -> Reading netCDF file '%s'" % (file_name)

    scan_list = []
    mass = file.var(__MASS_STRING)
    intensity = file.var(__INTENSITY_STRING)
    mass_values = mass.get().tolist()
    mass_list = []
    mass_previous = mass_values[0]
    mass_list.append(mass_previous)
    intensity_values = intensity.get().tolist()
    intensity_list = []
    intensity_previous = intensity_values[0]
    intensity_list.append(intensity_previous)

    if not len(mass_values) == len(intensity_values):
        error("length of mass_list is not equal to length of intensity_list !")

    # Scan boundaries are detected by a drop in m/z: masses ascend within
    # one scan, so a decrease starts a new scan.
    for i in range(len(mass_values) - 1):
        # assume masses in ascending order until new scan
        if mass_previous <= mass_values[i + 1]:
            #print mass_values[i+1]
            mass_list.append(mass_values[i + 1])
            mass_previous = mass_values[i + 1]
            intensity_list.append(intensity_values[i + 1])
            intensity_previous = intensity_values[i + 1]
        # new scan
        else:
            scan_list.append(Scan(mass_list, intensity_list))
            #print "Added scan"
            mass_previous = mass_values[i + 1]
            intensity_previous = intensity_values[i + 1]
            mass_list = []
            intensity_list = []
            mass_list.append(mass_previous)
            intensity_list.append(intensity_previous)

    # store final scan
    scan_list.append(Scan(mass_list, intensity_list))

    time = file.var(__TIME_STRING)
    time_list = time.get().tolist()

    # sanity check
    if not len(time_list) == len(scan_list):
        error("number of time points (%d) does not equal the number of scans (%d)"%(len(time_list), len(scan_list)))

    data = GCMS_data(time_list, scan_list)

    return data
def ANDI_reader(file_name):
    """
    @summary: A reader for ANDI-MS NetCDF files, returns a GC-MS data object

    @param file_name: The name of the ANDI-MS file
    @type file_name: StringType

    @author: Qiao Wang
    @author: Andrew Isaac
    @author: Vladimir Likic
    @author: Tony Chen
    """
    # NOTE(review): Python 2 code (print statement, bare except below).
    # Keys used to retrieve data from the NetCDF file.
    __MASS_STRING = "mass_values"
    __INTENSITY_STRING = "intensity_values"
    __TIME_STRING = "scan_acquisition_time"

    if not is_str(file_name):
        error("'file_name' must be a string")

    try:
        dataset = Dataset(file_name, 'r')
    except:
        error("Cannot open file '%s'" % file_name)

    print " -> Reading netCDF file '%s'" % (file_name)

    scan_list = list()
    # mass = np.array(dataset.variables[__MASS_STRING])
    # intensity = np.array(dataset.variables[__INTENSITY_STRING])
    mass_values = np.array(dataset.variables[__MASS_STRING])
    mass_list = list()
    mass_previous = mass_values[0]
    mass_list.append(mass_previous)
    intensity_values = np.array(dataset.variables[__INTENSITY_STRING])
    intensity_list = list()
    intensity_previous = intensity_values[0]
    intensity_list.append(intensity_previous)

    if not len(mass_values) == len(intensity_values):
        error("length of mass_list is not equal to length of intensity_list !")

    # Scan boundaries are detected by a drop in m/z: masses ascend within
    # one scan, so a decrease starts a new scan.
    for i in range(len(mass_values) - 1):
        # assume masses in ascending order until new scan
        if mass_previous <= mass_values[i + 1]:
            mass_list.append(mass_values[i + 1])
            mass_previous = mass_values[i + 1]
            intensity_list.append(intensity_values[i + 1])
            intensity_previous = intensity_values[i + 1]
        # new scan
        else:
            scan_list.append(Scan(mass_list, intensity_list))
            mass_previous = mass_values[i + 1]
            intensity_previous = intensity_values[i + 1]
            mass_list = list()
            intensity_list = list()
            mass_list.append(mass_previous)
            intensity_list.append(intensity_previous)

    # store final scan
    scan_list.append(Scan(mass_list, intensity_list))

    time_list = np.array(dataset.variables[__TIME_STRING])

    # sanity check
    if not len(time_list) == len(scan_list):
        error("number of time points does not equal the number of scans")

    # NOTE(review): unlike other reader variants, time_list here is a numpy
    # array, not a plain list — confirm GCMS_data accepts an ndarray.
    data = GCMS_data(time_list, scan_list)

    return data
def ANDI_reader(file_name): """ @summary: A reader for ANDI-MS NetCDF files, returns a GC-MS data object @param file_name: The name of the ANDI-MS file @type file_name: StringType @author: Qiao Wang @author: Andrew Isaac @author: Vladimir Likic """ ## TODO: use 'point_count' and allow for zero len scans # the keys used to retrieve certain data from the NetCDF file __MASS_STRING = "mass_values" __INTENSITY_STRING = "intensity_values" __TIME_STRING = "scan_acquisition_time" if not is_str(file_name): error("'file_name' must be a string") try: #file = CDF(file_name) rootgrp = Dataset(file_name, "r+", format='NETCDF3_CLASSIC') except: raise RuntimeError("Cannot open file '%s'" % file_name) print " -> Reading netCDF file '%s'" % (file_name) print rootgrp.variables[__MASS_STRING][:] scan_list = [] # mass = file.var(__MASS_STRING) # old pycdf way # intensity = file.var(__INTENSITY_STRING) #old pycdf way mass = rootgrp.variables[__MASS_STRING][:] intensity = rootgrp.variables[__INTENSITY_STRING][:] mass_values = mass.tolist() mass_list = [] mass_previous = mass_values[0] mass_list.append(mass_previous) intensity_values = intensity.tolist() intensity_list = [] intensity_previous = intensity_values[0] intensity_list.append(intensity_previous) if not len(mass_values) == len(intensity_values): error("length of mass_list is not equal to length of intensity_list !") for i in range(len(mass_values) - 1): # assume masses in ascending order until new scan if mass_previous <= mass_values[i + 1]: #print mass_values[i+1] mass_list.append(mass_values[i + 1]) mass_previous = mass_values[i + 1] intensity_list.append(intensity_values[i + 1]) intensity_previous = intensity_values[i + 1] # new scan else: scan_list.append(Scan(mass_list, intensity_list)) #print "Added scan" mass_previous = mass_values[i + 1] intensity_previous = intensity_values[i + 1] mass_list = [] intensity_list = [] mass_list.append(mass_previous) intensity_list.append(intensity_previous) # store final scan 
scan_list.append(Scan(mass_list, intensity_list)) # time = file.var(__TIME_STRING) #old pycdf way time = rootgrp.variables[__TIME_STRING][:] time_list = time.tolist() # sanity check if not len(time_list) == len(scan_list): raise RuntimeError( "number of time points does not equal the number of scans") data = GCMS_data(time_list, scan_list) return data
def diff(data1: GCMS_data, data2: GCMS_data):
    """
    Compares two GCMS_data objects

    Prints the comparison report to stdout and returns ``None``; it returns
    early if the data sets have different numbers of time points or scan
    lengths.

    :param data1: GCMS data set 1
    :type data1: pyms.GCMS.Class.GCMS_data
    :param data2: GCMS data set 2
    :type data2: pyms.GCMS.Class.GCMS_data

    :author: Qiao Wang
    :author: Andrew Isaac
    :author: Vladimir Likic
    """
    # get time attributes
    time_list1 = data1.get_time_list()
    time_list2 = data2.get_time_list()

    # First, check if two data sets have the same number of retention times.
    if len(time_list1) != len(time_list2):
        print(" The number of retention time points differ.")
        print(f" First data set: {len(time_list1):d} time points")
        print(f" Second data set: {len(time_list2):d} time points")
        print(" Data sets are different.")
        return
    else:
        time_rmsd = rmsd(time_list1, time_list2)
        print(" Data sets have the same number of time points.")
        print(f" Time RMSD: {time_rmsd:.2e}")

    # Second, check if each scan has the same number of m/z intensities
    print(" Checking for consistency in scan lengths ...", end='')
    sys.stdout.flush()
    scan_list1 = data1.get_scan_list()
    scan_list2 = data2.get_scan_list()

    if not len(scan_list1) == len(scan_list2):
        # since the number of rention times are the same, this indicated
        # some unexpected problem with data
        raise ValueError("inconsistency in data detected")

    for ii in range(len(scan_list1)):
        scan1 = scan_list1[ii]
        scan2 = scan_list2[ii]
        mass_list1 = scan1.get_mass_list()
        mass_list2 = scan2.get_mass_list()
        if len(mass_list1) != len(mass_list2):
            print(f"\n Different number of points detected in scan no. {ii:d}")
            print(" Data sets are different.")
            return

    print("OK")

    # Third, if here, calculate the max RMSD for m/z and intensities
    print(" Calculating maximum RMSD for m/z values and intensities ...", end='')
    sys.stdout.flush()

    # Track the worst-case per-scan RMSD across the whole run.
    max_mass_rmsd = 0.0
    max_intensity_rmsd = 0.0

    for ii in range(len(scan_list1)):
        scan1 = scan_list1[ii]
        scan2 = scan_list2[ii]
        mass_list1 = scan1.get_mass_list()
        mass_list2 = scan2.get_mass_list()
        intensity_list1 = scan1.get_intensity_list()
        intensity_list2 = scan2.get_intensity_list()
        mass_rmsd = rmsd(mass_list1, mass_list2)
        if mass_rmsd > max_mass_rmsd:
            max_mass_rmsd = mass_rmsd
        intensity_rmsd = rmsd(intensity_list1, intensity_list2)
        if intensity_rmsd > max_intensity_rmsd:
            max_intensity_rmsd = intensity_rmsd

    print(f"\n Max m/z RMSD: {max_mass_rmsd:.2e}")
    print(f" Max intensity RMSD: {max_intensity_rmsd:.2e}")