def Agilent_reader(file_name):
    """
    Reader for Agilent data directories that contain a ``DATA.MS`` (or ``data.ms``) file

    :param file_name: Path of the directory holding the ``DATA.MS`` file
    :type file_name: str or pathlib.Path

    :return: GC-MS data object built from the times, m/z values and XIC rows
        loaded from the data file

    :raises TypeError: if ``file_name`` is neither a string nor a :class:`pathlib.Path`
    :raises ValueError: if the directory contains neither ``DATA.MS`` nor ``data.ms``
    """

    if not isinstance(file_name, (str, pathlib.Path)):
        raise TypeError("'file_name' must be a string or a pathlib.Path object")

    if not isinstance(file_name, pathlib.Path):
        file_name = pathlib.Path(file_name)

    # NOTE: asserts are stripped under ``python -O``; a missing path then
    # falls through to the ValueError below because neither candidate exists.
    assert os.path.exists(file_name)

    # Prefer the upper-case name, then fall back to the lower-case spelling.
    for candidate in ('DATA.MS', 'data.ms'):
        data_path = file_name / candidate
        if data_path.exists():
            break
    else:
        print(f'Error: {file_name} does not contain a data.ms file.')
        raise ValueError(f'Error: {file_name} does not contain a data.ms file.')

    data = AgilentGCMSData()
    options = Options()

    # The context manager guarantees the handle is closed, even when one of
    # the loaders raises part-way through.
    with open(data_path, 'rb') as d_file:
        load_file_info(d_file, data, options)
        load_tic(d_file, data, options)
        load_xic(d_file, data, options)

        # Built while the file is still open in case ``data`` reads lazily
        # from the handle -- TODO confirm whether that is actually needed.
        time_list = list(data.time)
        rows, _ = data.xic.shape
        # One scan per XIC row; every scan shares the same m/z axis.
        scan_list = [Scan(data.mz, list(data.xic[r, :])) for r in range(rows)]

    return GCMS_data(time_list, scan_list)
def ANDI_reader(file_name):
    """
    A reader for ANDI-MS NetCDF files

    :param file_name: The path of the ANDI-MS file
    :type file_name: str or os.PathLike

    :return: GC-MS data object
    :rtype: :class:`pyms.GCMS.Class.GCMS_data`

    :raises TypeError: if ``file_name`` is neither a string nor a :class:`pathlib.Path`
    :raises ValueError: if the mass and intensity arrays differ in length, or the
        number of time points differs from the number of scans

    :author: Qiao Wang
    :author: Andrew Isaac
    :author: Vladimir Likic
    :author: Dominic Davis-Foster
    """

    if not isinstance(file_name, (str, pathlib.Path)):
        raise TypeError("'file_name' must be a string or a pathlib.Path object")

    # The file is only ever read, so open it read-only (the previous "r+"
    # mode needlessly required write access), and use the context manager so
    # the dataset is closed on every exit path.
    # TODO: find out if netCDF4 throws specific errors that we can use here
    with Dataset(file_name, "r", format='NETCDF3_CLASSIC') as rootgrp:

        print(f" -> Reading netCDF file '{file_name}'")

        scan_list = []
        mass = rootgrp.variables[__MASS_STRING][:]
        intensity = rootgrp.variables[__INTENSITY_STRING][:]

        # The number of data points in each scan
        scan_lengths = rootgrp.variables["point_count"]

        mass_values = mass.tolist()
        intensity_values = intensity.tolist()

        if len(mass_values) != len(intensity_values):
            raise ValueError("The lengths of the mass and intensity lists differ!")

        # The flat mass/intensity arrays hold all scans back to back;
        # ``point_count`` says how many consecutive points belong to each scan.
        offset = 0
        for length in scan_lengths:
            mass_list = mass_values[offset:offset + length]
            # A short slice means ``point_count`` overruns the data arrays.
            assert len(mass_list) == length
            intensity_list = intensity_values[offset:offset + length]
            assert len(intensity_list) == length
            scan_list.append(Scan(mass_list, intensity_list))
            offset += length

        # Every data point must have been assigned to a scan.
        assert offset == len(mass_values)

        time = rootgrp.variables[__TIME_STRING][:]
        time_list = time.tolist()

    # sanity check
    if not len(time_list) == len(scan_list):
        raise ValueError("number of time points does not equal the number of scans")

    return GCMS_data(time_list, scan_list)
def agilent_reader(file_name: PathLike) -> GCMS_data:  # pragma: no cover (!Windows)
    """
    Reader for Agilent MassHunter ``.d`` files.

    Every scan reported by the underlying ``MassSpecDataReader`` becomes one
    :class:`Scan`; its time is the mean of the scan's first acquired time
    range multiplied by 60.

    :param file_name: Path of the file to read.

    :return: GC-MS data object.

    :raises TypeError: if ``file_name`` is neither a string nor a :class:`pathlib.Path`.
    :raises ValueError: if the number of time points and scans disagree.
    """

    if not isinstance(file_name, (str, pathlib.Path)):
        raise TypeError("'file_name' must be a string or a pathlib.Path object")

    # Only strings get here un-converted; Path objects are used as they are.
    if isinstance(file_name, str):
        file_name = pathlib.Path(file_name)

    print(f" -> Reading Agilent data file '{file_name}'")

    time_list = []
    scan_list = []

    reader = MassSpecDataReader(file_name)
    total_scans = reader.file_information.ms_scan_file_info.total_scans

    for scan_no in range(total_scans):
        spectrum = reader.get_spectrum_by_scan(scan_no)
        scan_list.append(Scan(spectrum.x_data, spectrum.y_data))

        # presumably converts minutes to seconds -- TODO confirm the reader's units
        scan_time = mean(spectrum.acquired_time_ranges[0]) * 60.0
        time_list.append(scan_time)

    # sanity check
    time_len, scan_len = len(time_list), len(scan_list)
    if time_len != scan_len:  # pragma: no cover
        print(time_list)
        print(scan_list)
        raise ValueError(
            f"Number of time points ({time_len}) does not equal the number of scans ({scan_len})."
        )

    return GCMS_data(time_list, scan_list)
def _jcamp_parse_values(line):
    """Return the non-empty comma-separated fields of *line* as floats."""
    stripped = (item.strip() for item in line.split(','))
    return [float(item) for item in stripped if item]


def _jcamp_build_scan(data):
    """Build a Scan from a flat ``[x0, y0, x1, y1, ...]`` list of values."""
    if len(data) % 2 == 1:
        # TODO: This means the data is not in x, y pairs
        #  Make a better error message
        raise ValueError("data not in pair !")

    # Even positions are masses, odd positions the matching intensities.
    return Scan(data[0::2], data[1::2])


def JCAMP_reader(file_name: Union[str, os.PathLike]) -> GCMS_data:
    """
    Generic reader for JCAMP DX files

    :param file_name: Path of the file to read
    :type file_name: str or os.PathLike

    :return: GC-MS data object
    :rtype: :class:`pyms.GCMS.Class.GCMS_data`

    :raises TypeError: if ``file_name`` is not a string or PathLike object
    :raises ValueError: if a scan's data is not in x, y pairs, or the number of
        time points differs from the number of scans

    :authors: Qiao Wang, Andrew Isaac, Vladimir Likic, David Kainer,
        Dominic Davis-Foster (pathlib support)
    """

    if not is_path(file_name):
        raise TypeError("'file_name' must be a string or a PathLike object")

    file_name = prepare_filepath(file_name, mkdirs=False)

    print(f" -> Reading JCAMP file '{file_name}'")

    data = []  # flat x, y values of the scan currently being read
    page_idx = 0
    xydata_idx = 0
    time_list = []
    scan_list = []
    # Header information; collected here but not part of the returned object.
    header_info = {}

    # Context manager so the file is closed even if parsing fails.
    with file_name.open('r') as lines_list:
        for line in lines_list:

            if not line.strip():
                continue

            if line.startswith("##"):
                # key word or information
                # ``partition`` tolerates a label line that has no '='
                # (the value is then the empty string).
                key, _, value = line.partition('=')
                key = key.lstrip('#').upper()
                value = value.strip()

                if "PAGE" in key:
                    if "T=" in value:
                        # PAGE contains retention time starting with T=
                        # FileConverter Pro style
                        time = float(value.lstrip("T="))  # rt for the scan to be submitted
                        time_list.append(time)
                    page_idx = page_idx + 1

                elif "RETENTION_TIME" in key:
                    # OpenChrom style
                    time = float(value)  # rt for the scan to be submitted

                    # Check to make sure time is not already in the time list;
                    # Can happen when both ##PAGE and ##RETENTION_TIME are specified.
                    # An empty list means no ##PAGE supplied a time, so there
                    # is nothing to compare against.
                    if not time_list or time_list[-1] != time:
                        time_list.append(time)

                elif key in xydata_tags:
                    xydata_idx = xydata_idx + 1

                elif key in header_info_fields:
                    if value.isdigit():
                        header_info[key] = int(value)
                    elif is_float(value):
                        header_info[key] = float(value)
                    else:
                        header_info[key] = value

            else:
                # Line doesn't start with ##: data
                if page_idx > 1 or xydata_idx > 1:
                    # A second PAGE / XYDATA marker has been seen, so the
                    # values gathered so far form a complete scan.
                    scan_list.append(_jcamp_build_scan(data))
                    data = []

                    if page_idx > 1:
                        page_idx = 1
                    if xydata_idx > 1:
                        xydata_idx = 1

                data.extend(_jcamp_parse_values(line))

    # get last scan
    scan_list.append(_jcamp_build_scan(data))

    # sanity check
    time_len = len(time_list)
    scan_len = len(scan_list)
    if time_len != scan_len:
        print(time_list)
        print(scan_list)
        raise ValueError(
            f"Number of time points ({time_len}) does not equal the number of scans ({scan_len})"
        )

    return GCMS_data(time_list, scan_list)
def mzML_reader(file_name):
    """
    A reader for mzML files

    Spectra without a time value (accession ``MS:1000016``) are skipped.

    :param file_name: The name of the mzML file
    :type file_name: str or pathlib.Path

    :return: GC-MS data object
    :rtype: :class:`pyms.GCMS.Class.GCMS_data`

    :raises TypeError: if ``file_name`` is not a string or PathLike object

    :author: Sean O'Callaghan
    :author: Dominic Davis-Foster (pathlib support)
    """

    if not is_path(file_name):
        raise TypeError("'file_name' must be a string or a PathLike object")

    mzml_file = pymzml.run.Reader(str(file_name))

    try:
        # avoid printing from each rank
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        size = comm.Get_size()

        if rank != 0:
            # Every other rank reports its file name to rank 0.
            comm.send(file_name, dest=0)
        else:
            file_names = []
            for source_rank in range(1, size):
                recv_buffer = ""
                file_names.append(comm.recv(recv_buffer, source_rank))

            print(" -> Reading mzML files:")
            print(file_name)
            for other_name in file_names:
                print(other_name)

    # TODO: Find specific error
    except Exception as e:
        # Anything going wrong above falls back to a single-file message.
        print(e)
        print(f" -> Reading mzML file '{file_name}'")

    time_list = []
    scan_list = []

    for spectrum in mzml_file:
        mass_list = []
        intensity_list = []

        for mz, intensity in spectrum.peaks:
            mass_list.append(mz)
            intensity_list.append(intensity)

        for element in spectrum.xmlTree:
            # For some reason there are spectra with no time value,
            # Ignore these????????????
            if element.get('accession') != "MS:1000016":
                continue

            # time value
            # We need time in seconds not minutes
            minutes = float(element.get('value'))
            time_list.append(60 * minutes)
            scan_list.append(Scan(mass_list, intensity_list))

    return GCMS_data(time_list, scan_list)
def test_zero_length():
    """Construct a Scan from empty mass and intensity lists (no assertions yet)."""
    # TODO: finish
    no_masses, no_intensities = [], []
    scan = Scan(no_masses, no_intensities)
def test_equality(im, scan):
    """A Scan equals a copy built from its own lists, but not the scan at index 1234."""
    rebuilt = Scan(scan.mass_list, scan.intensity_list)
    assert scan == rebuilt

    other = im.get_scan_at_index(1234)
    assert scan != other
def test_errors(scan, obj, expects):
    """Passing ``obj`` as either Scan argument must raise ``expects``."""
    # Invalid object in the mass list position
    with pytest.raises(expects):
        intensities = scan.intensity_list
        Scan(obj, intensities)

    # Invalid object in the intensity list position
    with pytest.raises(expects):
        masses = scan.mass_list
        Scan(masses, obj)