Esempio n. 1
0
def Agilent_reader(file_name):
    """
    Reader for Agilent data directories that contain a ``DATA.MS`` file.

    :param file_name: Path of the directory holding the ``DATA.MS``
        (or ``data.ms``) file.
    :type file_name: str or pathlib.Path

    :return: GC-MS data object built from the loaded times and one scan
        per row of the loaded ion chromatogram matrix.

    :raises TypeError: If ``file_name`` is neither a string nor a
        ``pathlib.Path``.
    :raises AssertionError: If ``file_name`` does not exist (only while
        assertions are enabled).
    :raises ValueError: If the directory contains neither ``DATA.MS`` nor
        ``data.ms``.
    """

    if not isinstance(file_name, (str, pathlib.Path)):
        raise TypeError(
            "'file_name' must be a string or a pathlib.Path object")

    if not isinstance(file_name, pathlib.Path):
        file_name = pathlib.Path(file_name)

    assert os.path.exists(file_name)

    # The data file may be spelled in upper or lower case; the upper-case
    # spelling is tried first.
    data_path = None
    for candidate in ('DATA.MS', 'data.ms'):
        if (file_name / candidate).exists():
            data_path = file_name / candidate
            break

    if data_path is None:
        print(f'Error: {file_name} does not contain a data.ms file.')
        raise ValueError(
            f'Error: {file_name} does not contain a data.ms file.')

    data = AgilentGCMSData()
    options = Options()

    # The context manager guarantees the handle is closed even if one of
    # the loaders raises; previously the file was never closed.
    with open(data_path, 'rb') as d_file:
        load_file_info(d_file, data, options)
        load_tic(d_file, data, options)
        load_xic(d_file, data, options)

        # Converted while the file is still open in case the loaders keep
        # a reference to it -- NOTE(review): probably not required.
        time_list = list(data.time)
        rows, _ = data.xic.shape
        scan_list = [Scan(data.mz, list(data.xic[r, :])) for r in range(rows)]

    return GCMS_data(time_list, scan_list)
Esempio n. 2
0
def ANDI_reader(file_name):
    """
    A reader for ANDI-MS NetCDF files

    :param file_name: The path of the ANDI-MS file
    :type file_name: str or os.PathLike

    :return: GC-MS data object
    :rtype: :class:`pyms.GCMS.Class.GCMS_data`

    :raises TypeError: If ``file_name`` is neither a string nor a
        ``pathlib.Path``.
    :raises ValueError: If the mass and intensity arrays differ in length,
        or if the number of time points differs from the number of scans.

    :author: Qiao Wang
    :author: Andrew Isaac
    :author: Vladimir Likic
    :author: Dominic Davis-Foster
    """

    if not isinstance(file_name, (str, pathlib.Path)):
        raise TypeError(
            "'file_name' must be a string or a pathlib.Path object")

    # The file is only ever read, so it is opened read-only (the previous
    # "r+" mode needed write permission for no benefit).  The context
    # manager closes the dataset even when validation below fails.
    # TODO: find out if netCDF4 throws specific errors that we can use here
    with Dataset(file_name, "r", format='NETCDF3_CLASSIC') as rootgrp:

        print(f" -> Reading netCDF file '{file_name}'")

        scan_list = []
        mass = rootgrp.variables[__MASS_STRING][:]
        intensity = rootgrp.variables[__INTENSITY_STRING][:]

        # The number of data points in each scan
        scan_lengths = rootgrp.variables["point_count"]

        mass_values = mass.tolist()
        intensity_values = intensity.tolist()

        if len(mass_values) != len(intensity_values):
            raise ValueError(
                "The lengths of the mass and intensity lists differ!")

        # The mass and intensity values of all scans are stored back to
        # back; walk through them one scan-sized slice at a time.
        offset = 0
        for length in scan_lengths:
            mass_list = mass_values[offset:offset + length]
            assert len(mass_list) == length
            intensity_list = intensity_values[offset:offset + length]
            assert len(intensity_list) == length
            scan_list.append(Scan(mass_list, intensity_list))
            offset += length

        # Every stored value must have been assigned to some scan.
        assert offset == len(mass_values)

        time = rootgrp.variables[__TIME_STRING][:]
        time_list = time.tolist()

    # sanity check
    if not len(time_list) == len(scan_list):
        raise ValueError(
            "number of time points does not equal the number of scans")

    return GCMS_data(time_list, scan_list)
Esempio n. 3
0
def agilent_reader(
        file_name: PathLike) -> GCMS_data:  # pragma: no cover (!Windows)
    """
    Read an Agilent MassHunter ``.d`` file into a GC-MS data object.

    Every scan reported by the reader contributes one :class:`Scan`
    (from the spectrum's x and y data) and one time point (the mean of the
    spectrum's first acquired time range multiplied by 60).

    :param file_name: Path of the file to read.

    :return: GC-MS data object.

    :raises TypeError: If ``file_name`` is neither a string nor a
        ``pathlib.Path``.
    :raises ValueError: If the number of time points differs from the
        number of scans.
    """

    if not isinstance(file_name, (str, pathlib.Path)):
        raise TypeError(
            "'file_name' must be a string or a pathlib.Path object")

    # After the check above anything that is not yet a Path is a string.
    if isinstance(file_name, str) and not isinstance(file_name, pathlib.Path):
        file_name = pathlib.Path(file_name)

    print(f" -> Reading Agilent data file '{file_name}'")

    time_list = []
    scan_list = []

    reader = MassSpecDataReader(file_name)
    total_scans = reader.file_information.ms_scan_file_info.total_scans

    for scan_index in range(total_scans):
        spectrum = reader.get_spectrum_by_scan(scan_index)
        scan_list.append(Scan(spectrum.x_data, spectrum.y_data))
        # presumably minutes -> seconds; confirm against the reader's units
        first_range = spectrum.acquired_time_ranges[0]
        time_list.append(mean(first_range) * 60.0)

    # sanity check
    time_len, scan_len = len(time_list), len(scan_list)
    if time_len != scan_len:  # pragma: no cover
        print(time_list)
        print(scan_list)
        raise ValueError(
            f"Number of time points ({time_len}) does not equal the number of scans ({scan_len})."
        )

    return GCMS_data(time_list, scan_list)
Esempio n. 4
0
def _scan_from_xy_pairs(data):
    """
    Build a :class:`Scan` from a flat ``[x0, y0, x1, y1, ...]`` list.

    :param data: Flat list alternating mass and intensity values.

    :return: Scan whose masses are the even-indexed values and whose
        intensities are the odd-indexed values.

    :raises ValueError: If ``data`` holds an odd number of values.
    """

    if len(data) % 2 == 1:
        # TODO: This means the data is not in x, y pairs
        #  Make a better error message
        raise ValueError("data not in pair !")

    return Scan(data[0::2], data[1::2])


def JCAMP_reader(file_name: Union[str, os.PathLike]) -> GCMS_data:
    """
    Generic reader for JCAMP DX files

    :param file_name: Path of the file to read
    :type file_name: str or os.PathLike

    :return: GC-MS data object
    :rtype: :class:`pyms.GCMS.Class.GCMS_data`

    :raises TypeError: If ``file_name`` is not a string or PathLike object.
    :raises ValueError: If a scan's data is not made of x, y pairs, or if the
        number of time points differs from the number of scans.

    :authors: Qiao Wang, Andrew Isaac, Vladimir Likic, David Kainer, Dominic Davis-Foster (pathlib support)
    """

    if not is_path(file_name):
        raise TypeError("'file_name' must be a string or a PathLike object")

    file_name = prepare_filepath(file_name, mkdirs=False)

    print(f" -> Reading JCAMP file '{file_name}'")

    data = []  # flat x, y values collected for the scan being read
    page_idx = 0
    xydata_idx = 0
    time_list = []
    scan_list = []

    # Dictionary containing header information
    # NOTE(review): populated below but not part of the returned object.
    header_info = {}

    # The context manager closes the file even if parsing raises.
    with file_name.open('r') as lines_list:
        for line in lines_list:

            if len(line.strip()) == 0:
                continue

            if line.startswith("##"):
                # key word or information
                # ``partition`` leaves an empty value when the line has no
                # '=', instead of failing with an IndexError.
                label, _, value = line.partition('=')
                label = label.lstrip("##").upper()
                value = value.strip()

                if "PAGE" in label:
                    if "T=" in value:
                        # PAGE contains retention time starting with T=
                        # FileConverter Pro style
                        # rt for the scan to be submitted
                        time_list.append(float(value.lstrip("T=")))
                    page_idx = page_idx + 1

                elif "RETENTION_TIME" in label:
                    # OpenChrom style
                    time = float(value)  # rt for the scan to be submitted

                    # Check to make sure time is not already in the time list;
                    # Can happen when both ##PAGE and ##RETENTION_TIME are specified.
                    # The list is still empty when the very first time comes
                    # from ##RETENTION_TIME, so guard the [-1] lookup.
                    if not time_list or time_list[-1] != time:
                        time_list.append(time)

                elif label in xydata_tags:
                    xydata_idx = xydata_idx + 1

                elif label in header_info_fields:
                    if value.isdigit():
                        header_info[label] = int(value)
                    elif is_float(value):
                        header_info[label] = float(value)
                    else:
                        header_info[label] = value

            else:
                # Line doesn't start with ##
                # data
                if page_idx > 1 or xydata_idx > 1:
                    # A new page / data table has started since the buffered
                    # values were read: emit them as a finished scan.
                    scan_list.append(_scan_from_xy_pairs(data))
                    data = []
                    if page_idx > 1:
                        page_idx = 1
                    if xydata_idx > 1:
                        xydata_idx = 1

                for item in line.strip().split(','):
                    if not len(item.strip()) == 0:
                        data.append(float(item.strip()))

    # get last scan
    scan_list.append(_scan_from_xy_pairs(data))

    # sanity check
    time_len = len(time_list)
    scan_len = len(scan_list)
    if time_len != scan_len:
        print(time_list)
        print(scan_list)
        raise ValueError(
            f"Number of time points ({time_len}) does not equal the number of scans ({scan_len})"
        )

    return GCMS_data(time_list, scan_list)
Esempio n. 5
0
def mzML_reader(file_name):
    """
    A reader for mzML files

    A scan is recorded for every time element (accession ``MS:1000016``)
    found in a spectrum; spectra without such an element contribute nothing.

    :param file_name: The name of the mzML file
    :type file_name: str or pathlib.Path

    :return: GC-MS data object
    :rtype: :class:`pyms.GCMS.Class.GCMS_data`

    :raises TypeError: If ``file_name`` is not a string or PathLike object.

    :author: Sean O'Callaghan
    :author: Dominic Davis-Foster (pathlib support)
    """

    if not is_path(file_name):
        raise TypeError("'file_name' must be a string or a PathLike object")

    mzml_file = pymzml.run.Reader(str(file_name))

    try:  # avoid printing from each rank
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        size = comm.Get_size()

        if rank != 0:
            # Non-root ranks hand their file name to rank 0 for printing.
            comm.send(file_name, dest=0)
        else:
            other_names = [comm.recv("", source) for source in range(1, size)]

            print(" -> Reading mzML files:")
            print(file_name)
            for other_name in other_names:
                print(other_name)
    # TODO: Find specific error
    except Exception as e:
        # Best effort: fall back to a plain single-file message.
        print(e)
        print(f" -> Reading mzML file '{file_name}'")

    scan_list = []
    time_list = []

    for spectrum in mzml_file:
        peaks = list(spectrum.peaks)
        mass_list = [mz for mz, _ in peaks]
        intensity_list = [intensity for _, intensity in peaks]

        for element in spectrum.xmlTree:
            # For some reason there are spectra with no time value,
            # Ignore these????????????
            if element.get('accession') != "MS:1000016":  # not a time value
                continue

            # We need time in seconds not minutes
            time_list.append(60 * float(element.get('value')))
            scan_list.append(Scan(mass_list, intensity_list))

    return GCMS_data(time_list, scan_list)
Esempio n. 6
0
def test_zero_length():
    """Construct a ``Scan`` from empty mass and intensity lists."""
    # TODO: finish
    Scan([], [])
Esempio n. 7
0
def test_equality(im, scan):
    """A scan equals a copy built from its own lists but not the scan at index 1234."""
    rebuilt = Scan(scan.mass_list, scan.intensity_list)
    assert scan == rebuilt

    other = im.get_scan_at_index(1234)
    assert scan != other
Esempio n. 8
0
def test_errors(scan, obj, expects):
    """``Scan`` must raise ``expects`` when ``obj`` is given as either list."""
    attempts = (
        lambda: Scan(obj, scan.intensity_list),  # invalid mass list
        lambda: Scan(scan.mass_list, obj),  # invalid intensity list
    )

    for attempt in attempts:
        with pytest.raises(expects):
            attempt()