Example #1
def test_equality(andi):
    assert andi == GCMS_data(andi.time_list, andi.scan_list)
    assert andi != GCMS_data(list(range(len(andi.scan_list))), andi.scan_list)
    assert andi != test_string
    assert andi != test_int
    assert andi != test_float
    assert andi != test_list_ints
    assert andi != test_list_strs
    assert andi != test_tuple
    assert andi != test_dict
Example #2
def test_equality(data):
    assert data == GCMS_data(data.time_list, data.scan_list)
    assert data != GCMS_data(list(range(len(data.scan_list))), data.scan_list)
    assert data != test_string
    assert data != test_int
    assert data != test_float
    assert data != test_list_ints
    assert data != test_list_strs
    assert data != test_tuple
    assert data != test_dict
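Examples 1 and 2 rely on a pytest fixture (andi / data) that supplies an already-loaded GCMS_data object. A minimal sketch of such a fixture, assuming the ANDI_reader shown in the later examples and a placeholder path to an ANDI-MS NetCDF file, might look like this:

import pytest
from pyms.GCMS.IO.ANDI import ANDI_reader  # import path assumed from PyMassSpec

@pytest.fixture(scope="module")
def andi():
    # Placeholder path; any ANDI-MS (.cdf) test file would do.
    return ANDI_reader("tests/data/example.cdf")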
Example #3
def test_GCMS_data(andi):
    assert isinstance(andi, GCMS_data)

    GCMS_data(andi.time_list, andi.scan_list)

    # Errors
    for obj in [test_string, *test_numbers, test_list_strs, test_dict]:
        with pytest.raises(TypeError):
            GCMS_data(obj, andi.scan_list)

    for obj in [test_string, *test_numbers, *test_sequences, test_dict]:
        with pytest.raises(TypeError):
            GCMS_data(andi.time_list, obj)
Example #4
def test_GCMS_data(data):
    assert isinstance(data, GCMS_data)

    GCMS_data(data.time_list, data.scan_list)

    # Errors
    for obj in [test_string, *test_numbers, test_list_strs, test_dict]:
        with pytest.raises(TypeError):
            GCMS_data(obj, data.scan_list)  # type: ignore

    for obj in [test_string, *test_numbers, *test_sequences, test_dict]:
        with pytest.raises(TypeError):
            GCMS_data(data.time_list, obj)  # type: ignore
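Examples 3 and 4 iterate over a collection of wrong-type dummy objects (test_string, test_numbers, test_sequences and so on) to check that the GCMS_data constructor raises TypeError. Those names are not defined in the snippets; one plausible set of definitions, inferred purely from the names and therefore only an assumption, is:

# Hypothetical test constants; the real test suite may define these differently.
test_string = "spam"
test_int = 1
test_float = 1.5
test_numbers = [test_int, test_float]
test_list_ints = [1, 2, 3]
test_list_strs = ["a", "b", "c"]
test_sequences = [test_list_ints, test_list_strs]
test_tuple = (1, 2, 3)
test_dict = {"key": "value"}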
Example #5
def ANDI_reader(file_name):
    # pyMs is very poorly written!!!! Much of its architecture is badly
    # designed and will inevitably make computation slow!!!
    # Minimising in-memory copying helps performance. Ideally the CDF data
    # would be used directly as read from the file, without excessive copies.
    print("ok in andi")
    scan_list, scan_acquisition_time = CDF_Reader(file_name)
    data = GCMS_data(scan_list=scan_list, time_list=scan_acquisition_time)
    return data
Example #6
def Agilent_reader(file_name):

    if not isinstance(file_name, (str, pathlib.Path)):
        raise TypeError(
            "'file_name' must be a string or a pathlib.Path object")

    if not isinstance(file_name, pathlib.Path):
        file_name = pathlib.Path(file_name)

    assert os.path.exists(file_name)
    if (file_name / 'DATA.MS').exists():
        d_file = open(file_name / 'DATA.MS', 'rb')
    elif (file_name / 'data.ms').exists():
        d_file = open(file_name / 'data.ms', 'rb')
    else:
        print(f'Error: {file_name} does not contain a data.ms file.')
        raise ValueError(
            f'Error: {file_name} does not contain a data.ms file.')

    data = AgilentGCMSData()
    options = Options()

    load_file_info(d_file, data, options)
    load_tic(d_file, data, options)
    load_xic(d_file, data, options)

    time_list = list(data.time)
    rows, _ = data.xic.shape
    scan_list = [Scan(data.mz, list(data.xic[r, :])) for r in range(rows)]

    return GCMS_data(time_list, scan_list)
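Example 6 reads an Agilent ChemStation data directory: it expects the given path to contain a DATA.MS (or data.ms) file, loads the file info, TIC and XICs, and rebuilds one Scan per row of the XIC matrix. A hedged usage sketch, with a placeholder directory:

# Hypothetical usage; "path/to/run.D" is a placeholder ChemStation data directory.
data = Agilent_reader("path/to/run.D")
print(len(data.scan_list), "scans,", len(data.time_list), "time points")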
Example #7
def ANDI_reader(file_name):
    """
	A reader for ANDI-MS NetCDF files

	:param file_name: The path of the ANDI-MS file
	:type file_name: str or os.PathLike

	:return: GC-MS data object
	:rtype: :class:`pyms.GCMS.Class.GCMS_data`

	:author: Qiao Wang
	:author: Andrew Isaac
	:author: Vladimir Likic
	:author: Dominic Davis-Foster
	"""

    if not isinstance(file_name, (str, pathlib.Path)):
        raise TypeError(
            "'file_name' must be a string or a pathlib.Path object")

    rootgrp = Dataset(file_name, "r+", format='NETCDF3_CLASSIC')
    # TODO: find out if netCDF4 throws specific errors that we can use here

    print(f" -> Reading netCDF file '{file_name}'")

    scan_list = []
    mass = rootgrp.variables[__MASS_STRING][:]
    intensity = rootgrp.variables[__INTENSITY_STRING][:]

    scan_lengths = rootgrp.variables[
        "point_count"]  # The number of data points in each scan

    mass_values = mass.tolist()
    intensity_values = intensity.tolist()

    if len(mass_values) != len(intensity_values):
        raise ValueError("The lengths of the mass and intensity lists differ!")

    offset = 0
    for idx, length in enumerate(scan_lengths):
        mass_list = mass_values[offset:offset + length]
        assert len(mass_values[offset:offset + length]) == length
        intensity_list = intensity_values[offset:offset + length]
        assert len(intensity_values[offset:offset + length]) == length
        scan_list.append(Scan(mass_list, intensity_list))
        offset += length

    assert offset == len(mass_values)

    time = rootgrp.variables[__TIME_STRING][:]
    time_list = time.tolist()

    # sanity check
    if not len(time_list) == len(scan_list):
        raise ValueError(
            "number of time points does not equal the number of scans")

    return GCMS_data(time_list, scan_list)
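Example 7 is the netCDF4-based rewrite of ANDI_reader: it reads the flat mass_values and intensity_values arrays and uses the point_count variable to slice them into one Scan per acquisition, rather than inferring scan boundaries from decreasing m/z values as the older versions below do. A hedged usage sketch, with a placeholder path:

# Hypothetical usage; the .cdf path is a placeholder.
data = ANDI_reader("path/to/run.cdf")
print(len(data.scan_list), "scans between",
      data.time_list[0], "and", data.time_list[-1], "seconds")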
Example #8
def mzML_reader(file_name):

    """
    @summary: A reader for mzML files, returns
        a GC-MS data object

    @param file_name: The name of the mzML file
    @type file_name: StringType

    @author: Sean O'Callaghan
    """

    if not is_str(file_name):
        error("'file_name' must be a string")
    try:
        mzml_file = pymzml.run.Reader(file_name)
    except:
        error("Cannot open file '%s'" % file_name)

    print " -> Reading mzML file '%s'" % (file_name)

    scan_list = []
    time_list = []

    for spectrum in mzml_file:
        mass_list = []
        intensity_list = []

        for mz,i in spectrum.peaks:
            mass_list.append(mz)
            intensity_list.append(i)

        #scan_list.append(Scan(mass_list, intensity_list))
        for element in spectrum.xmlTree:
            # For some reason there are spectra with no time value;
            # ignore these.
            if element.get('accession') == "MS:1000016": #time value
                # We need time in seconds not minutes
                time_list.append(60*float(element.get('value')))
                scan_list.append(Scan(mass_list, intensity_list))

    print "time:", len(time_list)
    print "scan:", len(scan_list)

    data = GCMS_data(time_list, scan_list)

    return data
Example #9
def agilent_reader(
        file_name: PathLike) -> GCMS_data:  # pragma: no cover (!Windows)
    """
	Reader for Agilent MassHunter ``.d`` files.

	:param file_name: Path of the file to read.

	:return: GC-MS data object.
	"""

    if not isinstance(file_name, (str, pathlib.Path)):
        raise TypeError(
            "'file_name' must be a string or a pathlib.Path object")

    if not isinstance(file_name, pathlib.Path):
        file_name = pathlib.Path(file_name)

    print(f" -> Reading Agilent data file '{file_name}'")

    time_list = []
    scan_list = []

    reader = MassSpecDataReader(file_name)

    for scan_no in range(
            reader.file_information.ms_scan_file_info.total_scans):
        spectrum = reader.get_spectrum_by_scan(scan_no)
        scan_list.append(Scan(spectrum.x_data, spectrum.y_data))
        time_list.append(mean(spectrum.acquired_time_ranges[0]) * 60.0)

    # sanity check
    time_len = len(time_list)
    scan_len = len(scan_list)
    if not time_len == scan_len:  # pragma: no cover
        print(time_list)
        print(scan_list)
        raise ValueError(
            f"Number of time points ({time_len}) does not equal the number of scans ({scan_len})."
        )

    data = GCMS_data(time_list, scan_list)

    return data
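Example 9 targets Agilent MassHunter .d directories through MassSpecDataReader and converts the acquired time ranges from minutes to seconds. A hedged usage sketch, with a placeholder path (Windows only, per the pragma comment):

# Hypothetical usage; "path/to/run.d" is a placeholder MassHunter data directory.
data = agilent_reader("path/to/run.d")
print(len(data.scan_list), "scans read")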
Example #10
def JCAMP_reader(file_name: Union[str, os.PathLike]) -> GCMS_data:
    """
	Generic reader for JCAMP DX files

	:param file_name: Path of the file to read
	:type file_name: str or os.PathLike

	:return: GC-MS data object
	:rtype: :class:`pyms.GCMS.Class.GCMS_data`

	:authors: Qiao Wang, Andrew Isaac, Vladimir Likic, David Kainer, Dominic Davis-Foster (pathlib support)
	"""

    if not is_path(file_name):
        raise TypeError("'file_name' must be a string or a PathLike object")

    file_name = prepare_filepath(file_name, mkdirs=False)

    print(f" -> Reading JCAMP file '{file_name}'")
    lines_list = file_name.open('r')
    data = []
    page_idx = 0
    xydata_idx = 0
    time_list = []
    scan_list = []

    header_info = {}  # Dictionary containing header information

    for line in lines_list:

        if len(line.strip()) != 0:
            # prefix = line.find('#')
            # if prefix == 0:
            if line.startswith("##"):
                # key word or information
                fields = line.split('=', 1)
                fields[0] = fields[0].lstrip("##").upper()
                fields[1] = fields[1].strip()

                if "PAGE" in fields[0]:
                    if "T=" in fields[1]:
                        # PAGE contains retention time starting with T=
                        # FileConverter Pro style
                        time = float(fields[1].lstrip(
                            "T="))  # rt for the scan to be submitted
                        time_list.append(time)
                    page_idx = page_idx + 1
                elif "RETENTION_TIME" in fields[0]:
                    # OpenChrom style
                    time = float(fields[1])  # rt for the scan to be submitted

                    # Check to make sure time is not already in the time list;
                    # Can happen when both ##PAGE and ##RETENTION_TIME are specified
                    if time_list[-1] != time:
                        time_list.append(time)

                elif fields[0] in xydata_tags:
                    xydata_idx = xydata_idx + 1

                elif fields[0] in header_info_fields:
                    if fields[1].isdigit():
                        header_info[fields[0]] = int(fields[1])
                    elif is_float(fields[1]):
                        header_info[fields[0]] = float(fields[1])
                    else:
                        header_info[fields[0]] = fields[1]

            # elif prefix == -1:
            else:
                # Line doesn't start with ##
                # data
                if page_idx > 1 or xydata_idx > 1:
                    if len(data) % 2 == 1:
                        # TODO: This means the data is not in x, y pairs
                        #  Make a better error message
                        raise ValueError("data not in pair !")
                    mass_list = []
                    intensity_list = []
                    for i in range(len(data) // 2):
                        mass_list.append(data[i * 2])
                        intensity_list.append(data[i * 2 + 1])
                    if len(mass_list) != len(intensity_list):
                        raise ValueError(
                            "len(mass_list) is not equal to len(intensity_list)"
                        )
                    scan_list.append(Scan(mass_list, intensity_list))
                    data = []
                    data_sub = line.strip().split(',')
                    for item in data_sub:
                        if not len(item.strip()) == 0:
                            data.append(float(item.strip()))
                    if page_idx > 1:
                        page_idx = 1
                    if xydata_idx > 1:
                        xydata_idx = 1
                else:
                    data_sub = line.strip().split(',')
                    for item in data_sub:
                        if not len(item.strip()) == 0:
                            data.append(float(item.strip()))

    if len(data) % 2 == 1:
        # TODO: This means the data is not in x, y pairs
        #  Make a better error message
        raise ValueError("data not in pair !")

    # get last scan
    mass = []
    intensity = []
    for i in range(len(data) // 2):
        mass.append(data[i * 2])
        intensity.append(data[i * 2 + 1])

    if len(mass) != len(intensity):
        raise ValueError("len(mass) is not equal to len(intensity)")
    scan_list.append(Scan(mass, intensity))

    # sanity check
    time_len = len(time_list)
    scan_len = len(scan_list)
    if time_len != scan_len:
        print(time_list)
        print(scan_list)
        raise ValueError(
            f"Number of time points ({time_len}) does not equal the number of scans ({scan_len})"
        )

    data = GCMS_data(time_list, scan_list)

    return data
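Example 10 handles both FileConverter Pro style JCAMP-DX files (retention time embedded in ##PAGE= T=...) and OpenChrom style files (##RETENTION_TIME=), and collects selected header fields into header_info. A hedged usage sketch, with a placeholder path:

# Hypothetical usage; "path/to/run.jdx" is a placeholder JCAMP-DX file.
data = JCAMP_reader("path/to/run.jdx")
assert len(data.time_list) == len(data.scan_list)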
Example #11
def JCAMP_reader(file_name):

    """
    @summary: Generic reader for JCAMP DX files, produces GC-MS data
       object

    @author: Qiao Wang
    @author: Andrew Isaac
    @author: Vladimir Likic
    """

    if not is_str(file_name):
        error("'file_name' not a string")

    print " -> Reading JCAMP file '%s'" % (file_name)
    lines_list = open(file_name,'r')
    data = []
    page_idx = 0
    xydata_idx = 0
    time_list = []
    scan_list = []

    for line in lines_list:
        if not len(line.strip()) == 0:
            prefix = line.find('#')
            # key word or information
            if prefix == 0:
                fields = line.split('=')
                if fields[0].find("##PAGE") >= 0:
                    time = float(fields[2].strip()) #rt for the scan to be submitted
                    time_list.append(time)
                    page_idx = page_idx + 1
                elif fields[0].find("##DATA TABLE") >= 0:
                    xydata_idx = xydata_idx + 1
            # data
            elif prefix == -1:
                if page_idx > 1 or xydata_idx > 1:
                    if len(data) % 2 == 1:
                        error("data not in pair !")
                    mass = []
                    intensity = []
                    for i in range(len(data) / 2):
                        mass.append(data[i * 2])
                        intensity.append(data[i * 2 + 1])
                    if not len(mass) == len(intensity):
                        error("len(mass) is not equal to len(intensity)")
                    scan_list.append(Scan(mass, intensity))
                    data = []
                    data_sub = line.strip().split(',')
                    for item in data_sub:
                        if not len(item.strip()) == 0:
                            data.append(float(item.strip()))
                    if page_idx > 1:
                        page_idx = 1
                    if xydata_idx > 1:
                        xydata_idx = 1
                else:
                    data_sub = line.strip().split(',')
                    for item in data_sub:
                        if not len(item.strip()) == 0:
                            data.append(float(item.strip()))

    if len(data) % 2 == 1:
        error("data not in pair !")
    # get last scan
    mass = []
    intensity = []
    for i in range(len(data) / 2):
        mass.append(data[i * 2])
        intensity.append(data[i * 2 + 1])

    if not len(mass) == len(intensity):
        error("len(mass) is not equal to len(intensity)")
    scan_list.append(Scan(mass, intensity))

    # sanity check
    if not len(time_list) == len(scan_list):
        error("number of time points does not equal the number of scans")

    data = GCMS_data(time_list, scan_list)

    return data
Example #12
def mzML_reader(file_name):
    """
	A reader for mzML files

	:param file_name: The name of the mzML file
	:type file_name: str or pathlib.Path

	:return: GC-MS data object
	:rtype: :class:`pyms.GCMS.Class.GCMS_data`

	:author: Sean O'Callaghan
	:author: Dominic Davis-Foster (pathlib support)
	"""

    if not is_path(file_name):
        raise TypeError("'file_name' must be a string or a PathLike object")

    mzml_file = pymzml.run.Reader(str(file_name))

    try:  # avoid printing from each rank
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        size = comm.Get_size()

        if rank == 0:
            file_names = []

            for i in range(1, size):
                recv_buffer = ""
                file_n = comm.recv(recv_buffer, i)
                file_names.append(file_n)

            print(" -> Reading mzML files:")
            print(file_name)
            for file_n in file_names:
                print(file_n)
        else:
            comm.send(file_name, dest=0)
    # TODO: Find specific error
    except Exception as e:
        print(e)
        print(f" -> Reading mzML file '{file_name}'")

    scan_list = []
    time_list = []

    for spectrum in mzml_file:
        mass_list = []
        intensity_list = []

        for mz, i in spectrum.peaks:
            mass_list.append(mz)
            intensity_list.append(i)

        # scan_list.append(Scan(mass_list, intensity_list))
        for element in spectrum.xmlTree:
            # For some reason there are spectra with no time value;
            # ignore these.
            if element.get('accession') == "MS:1000016":  # time value
                # We need time in seconds not minutes
                time_list.append(60 * float(element.get('value')))
                scan_list.append(Scan(mass_list, intensity_list))

    # print("time:", len(time_list))
    # print("scan:", len(scan_list))

    data = GCMS_data(time_list, scan_list)

    return data
Example #13
def ANDI_reader(file_name):
    """
    @summary: A reader for ANDI-MS NetCDF files, returns
        a GC-MS data object

    @param file_name: The name of the ANDI-MS file
    @type file_name: StringType

    @author: Qiao Wang
    @author: Andrew Isaac
    @author: Vladimir Likic
    """

    ## TODO: use 'point_count' and allow for zero len scans

    # the keys used to retrieve certain data from the NetCDF file
    __MASS_STRING = "mass_values"
    __INTENSITY_STRING = "intensity_values"
    __TIME_STRING = "scan_acquisition_time"

    if not is_str(file_name):
        error("'file_name' must be a string")
    try:
        file = CDF(file_name)
    except CDFError:
        error("Cannot open file '%s'" % file_name)

    print " -> Reading netCDF file '%s'" % (file_name)

    scan_list = []
    mass = file.var(__MASS_STRING)
    intensity = file.var(__INTENSITY_STRING)
    mass_values = mass.get().tolist()
    mass_list = []
    mass_previous = mass_values[0]
    mass_list.append(mass_previous)
    intensity_values = intensity.get().tolist()
    intensity_list = []
    intensity_previous = intensity_values[0]
    intensity_list.append(intensity_previous)
    if not len(mass_values) == len(intensity_values):
        error("length of mass_list is not equal to length of intensity_list !")

    for i in range(len(mass_values) - 1):
        # assume masses in ascending order until new scan
        if mass_previous <= mass_values[i + 1]:
            #print mass_values[i+1]
            mass_list.append(mass_values[i + 1])
            mass_previous = mass_values[i + 1]
            intensity_list.append(intensity_values[i + 1])
            intensity_previous = intensity_values[i + 1]
        # new scan
        else:
            scan_list.append(Scan(mass_list, intensity_list))
            #print "Added scan"
            mass_previous = mass_values[i + 1]
            intensity_previous = intensity_values[i + 1]
            mass_list = []
            intensity_list = []
            mass_list.append(mass_previous)
            intensity_list.append(intensity_previous)
    # store final scan
    scan_list.append(Scan(mass_list, intensity_list))
    time = file.var(__TIME_STRING)
    time_list = time.get().tolist()

    # sanity check
    if not len(time_list) == len(scan_list):
        #JT: Debug for old gcms data
        #JT: time longer than scans so trim
        print "Time list is"
        print len(time_list) - len(scan_list)
        print "longer than scan list. Trimming...."
        time_list = time_list[0:len(scan_list)]
        print len(time_list)
        print len(scan_list)
    #error("number of time points does not equal the number of scans")

    data = GCMS_data(time_list, scan_list)

    return data
Example #14
def mzML_reader(file_name):
    """
    @summary: A reader for mzML files, returns
        a GC-MS data object

    @param file_name: The name of the mzML file
    @type file_name: StringType

    @author: Sean O'Callaghan
    """

    if not is_str(file_name):
        error("'file_name' must be a string")
    try:
        mzml_file = pymzml.run.Reader(file_name)
    except:
        error("Cannot open file '%s'" % file_name)

    try:  # avoid printing from each rank
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        size = comm.Get_size()

        if rank == 0:
            file_names = []

            for i in range(1, size):
                recv_buffer = ""
                file_n = comm.recv(recv_buffer, i)
                file_names.append(file_n)

            print " -> Reading mzML files:"
            print file_name
            for file_n in file_names:
                print file_n
        else:
            comm.send(file_name, dest=0)
    except:
        print " -> Reading mzML file '%s'" % (file_name)

    scan_list = []
    time_list = []

    for spectrum in mzml_file:
        mass_list = []
        intensity_list = []

        for mz, i in spectrum.peaks:
            mass_list.append(mz)
            intensity_list.append(i)

        #scan_list.append(Scan(mass_list, intensity_list))
        for element in spectrum.xmlTree:
            # For some reason there are spectra with no time value;
            # ignore these.
            if element.get('accession') == "MS:1000016":  #time value
                # We need time in seconds not minutes
                time_list.append(60 * float(element.get('value')))
                scan_list.append(Scan(mass_list, intensity_list))

    #print "time:", len(time_list)
    #print "scan:", len(scan_list)

    data = GCMS_data(time_list, scan_list)

    return data
Example #15
def ANDI_reader(file_name):

    """
    @summary: A reader for ANDI-MS NetCDF files, returns
        a GC-MS data object

    @param file_name: The name of the ANDI-MS file
    @type file_name: StringType

    @author: Qiao Wang
    @author: Andrew Isaac
    @author: Vladimir Likic
    """

    ## TODO: use 'point_count' and allow for zero len scans

    # the keys used to retrieve certain data from the NetCDF file
    __MASS_STRING = "mass_values"
    __INTENSITY_STRING = "intensity_values"
    __TIME_STRING = "scan_acquisition_time"

    if not is_str(file_name):
        error("'file_name' must be a string")
    try:
        file = CDF(file_name)
    except CDFError:
        error("Cannot open file '%s'" % file_name)

    try:  # avoid printing from each rank
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        size = comm.Get_size()

        if rank == 0:
            file_names = []

            for i in range(1, size):
                recv_buffer = ""
                file_n = comm.recv(recv_buffer, i)
                file_names.append(file_n)

            print " -> Reading netCDF files:"
            print file_name
            for file_n in file_names:
                print file_n
        else:
            comm.send(file_name, dest=0)
    except:
        print " -> Reading netCDF file '%s'" % (file_name)

    scan_list = []
    mass = file.var(__MASS_STRING)
    intensity = file.var(__INTENSITY_STRING)
    mass_values = mass.get().tolist()
    mass_list = []
    mass_previous = mass_values[0]
    mass_list.append(mass_previous)
    intensity_values = intensity.get().tolist()
    intensity_list = []
    intensity_previous = intensity_values[0]
    intensity_list.append(intensity_previous)
    if not len(mass_values) == len(intensity_values):
        error("length of mass_list is not equal to length of intensity_list !")
    for i in range(len(mass_values) - 1):
        # assume masses in ascending order until new scan
        if mass_previous <= mass_values[i + 1]:
            #print mass_values[i+1]
            mass_list.append(mass_values[i + 1])
            mass_previous = mass_values[i + 1]
            intensity_list.append(intensity_values[i + 1])
            intensity_previous = intensity_values[i + 1]
        # new scan
        else:
            scan_list.append(Scan(mass_list, intensity_list))
            #print "Added scan"
            mass_previous = mass_values[i + 1]
            intensity_previous = intensity_values[i + 1]
            mass_list = []
            intensity_list = []
            mass_list.append(mass_previous)
            intensity_list.append(intensity_previous)
    # store final scan
    scan_list.append(Scan(mass_list, intensity_list))
    time = file.var(__TIME_STRING)
    time_list = time.get().tolist()

    # sanity check
    if not len(time_list) == len(scan_list):
        error("number of time points (%d) does not equal the number of scans (%d)"%(len(time_list), len(scan_list)))

    data = GCMS_data(time_list, scan_list)

    return data
Example #16
def ANDI_reader(file_name):
    """
    @summary: A reader for ANDI-MS NetCDF files, returns
        a GC-MS data object

    @param file_name: The name of the ANDI-MS file
    @type file_name: StringType

    @author: Qiao Wang
    @author: Andrew Isaac
    @author: Vladimir Likic
    @author: Tony Chen
    """

    __MASS_STRING = "mass_values"
    __INTENSITY_STRING = "intensity_values"
    __TIME_STRING = "scan_acquisition_time"
    if not is_str(file_name):
        error("'file_name' must be a string")
    try:
        dataset = Dataset(file_name, 'r')
    except:
        error("Cannot open file '%s'" % file_name)

    print " -> Reading netCDF file '%s'" % (file_name)

    scan_list = list()
    #     mass = np.array(dataset.variables[__MASS_STRING])
    #     intensity =  np.array(dataset.variables[__INTENSITY_STRING])
    mass_values = np.array(dataset.variables[__MASS_STRING])
    mass_list = list()
    mass_previous = mass_values[0]
    mass_list.append(mass_previous)
    intensity_values = np.array(dataset.variables[__INTENSITY_STRING])
    intensity_list = list()
    intensity_previous = intensity_values[0]
    intensity_list.append(intensity_previous)

    if not len(mass_values) == len(intensity_values):
        error("length of mass_list is not equal to length of intensity_list !")

    for i in range(len(mass_values) - 1):
        # assume masses in ascending order until new scan
        if mass_previous <= mass_values[i + 1]:
            mass_list.append(mass_values[i + 1])
            mass_previous = mass_values[i + 1]
            intensity_list.append(intensity_values[i + 1])
            intensity_previous = intensity_values[i + 1]
        # new scan
        else:
            scan_list.append(Scan(mass_list, intensity_list))
            mass_previous = mass_values[i + 1]
            intensity_previous = intensity_values[i + 1]
            mass_list = list()
            intensity_list = list()
            mass_list.append(mass_previous)
            intensity_list.append(intensity_previous)
    # store final scan
    scan_list.append(Scan(mass_list, intensity_list))
    time_list = np.array(dataset.variables[__TIME_STRING])

    # sanity check
    if not len(time_list) == len(scan_list):
        error("number of time points does not equal the number of scans")

    data = GCMS_data(time_list, scan_list)
    return data
Example #17
def ANDI_reader(file_name):
    """
    @summary: A reader for ANDI-MS NetCDF files, returns
        a GC-MS data object

    @param file_name: The name of the ANDI-MS file
    @type file_name: StringType

    @author: Qiao Wang
    @author: Andrew Isaac
    @author: Vladimir Likic
    """

    ## TODO: use 'point_count' and allow for zero len scans

    # the keys used to retrieve certain data from the NetCDF file
    __MASS_STRING = "mass_values"
    __INTENSITY_STRING = "intensity_values"
    __TIME_STRING = "scan_acquisition_time"

    if not is_str(file_name):
        error("'file_name' must be a string")
    try:
        #file = CDF(file_name)
        rootgrp = Dataset(file_name, "r+", format='NETCDF3_CLASSIC')
    except:
        raise RuntimeError("Cannot open file '%s'" % file_name)
    print " -> Reading netCDF file '%s'" % (file_name)
    print rootgrp.variables[__MASS_STRING][:]

    scan_list = []
    # mass = file.var(__MASS_STRING)  # old pycdf way
    # intensity = file.var(__INTENSITY_STRING)  #old pycdf way
    mass = rootgrp.variables[__MASS_STRING][:]
    intensity = rootgrp.variables[__INTENSITY_STRING][:]

    mass_values = mass.tolist()
    mass_list = []
    mass_previous = mass_values[0]
    mass_list.append(mass_previous)
    intensity_values = intensity.tolist()
    intensity_list = []
    intensity_previous = intensity_values[0]
    intensity_list.append(intensity_previous)
    if not len(mass_values) == len(intensity_values):
        error("length of mass_list is not equal to length of intensity_list !")
    for i in range(len(mass_values) - 1):
        # assume masses in ascending order until new scan
        if mass_previous <= mass_values[i + 1]:
            #print mass_values[i+1]
            mass_list.append(mass_values[i + 1])
            mass_previous = mass_values[i + 1]
            intensity_list.append(intensity_values[i + 1])
            intensity_previous = intensity_values[i + 1]
        # new scan
        else:
            scan_list.append(Scan(mass_list, intensity_list))
            #print "Added scan"
            mass_previous = mass_values[i + 1]
            intensity_previous = intensity_values[i + 1]
            mass_list = []
            intensity_list = []
            mass_list.append(mass_previous)
            intensity_list.append(intensity_previous)
    # store final scan
    scan_list.append(Scan(mass_list, intensity_list))
    # time = file.var(__TIME_STRING)  #old pycdf way
    time = rootgrp.variables[__TIME_STRING][:]
    time_list = time.tolist()

    # sanity check
    if not len(time_list) == len(scan_list):
        raise RuntimeError(
            "number of time points does not equal the number of scans")

    data = GCMS_data(time_list, scan_list)

    return data
Example #18
def diff(data1: GCMS_data, data2: GCMS_data):
    """
	Compares two GCMS_data objects

	:param data1: GCMS data set 1
	:type data1: pyms.GCMS.Class.GCMS_data
	:param data2: GCMS data set 2
	:type data2: pyms.GCMS.Class.GCMS_data

	:author: Qiao Wang
	:author: Andrew Isaac
	:author: Vladimir Likic
	"""

    # get time attributes
    time_list1 = data1.get_time_list()
    time_list2 = data2.get_time_list()

    # First, check if two data sets have the same number of retention times.
    if len(time_list1) != len(time_list2):
        print(" The number of retention time points differ.")
        print(f"	First data set: {len(time_list1):d} time points")
        print(f"	Second data set: {len(time_list2):d} time points")
        print(" Data sets are different.")
        return
    else:
        time_rmsd = rmsd(time_list1, time_list2)
        print(" Data sets have the same number of time points.")
        print(f"   Time RMSD: {time_rmsd:.2e}")

    # Second, check if each scan has the same number of m/z intensities
    print(" Checking for consistency in scan lengths ...", end='')
    sys.stdout.flush()

    scan_list1 = data1.get_scan_list()
    scan_list2 = data2.get_scan_list()

    if not len(scan_list1) == len(scan_list2):
        # since the number of retention times is the same, this indicates
        # some unexpected problem with the data
        raise ValueError("inconsistency in data detected")

    for ii in range(len(scan_list1)):
        scan1 = scan_list1[ii]
        scan2 = scan_list2[ii]
        mass_list1 = scan1.get_mass_list()
        mass_list2 = scan2.get_mass_list()
        if len(mass_list1) != len(mass_list2):
            print(f"\n Different number of points detected in scan no. {ii:d}")
            print(" Data sets are different.")
            return

    print("OK")

    # Third, if here, calculate the max RMSD for m/z and intensities
    print(" Calculating maximum RMSD for m/z values and intensities ...",
          end='')
    sys.stdout.flush()

    max_mass_rmsd = 0.0
    max_intensity_rmsd = 0.0

    for ii in range(len(scan_list1)):
        scan1 = scan_list1[ii]
        scan2 = scan_list2[ii]
        mass_list1 = scan1.get_mass_list()
        mass_list2 = scan2.get_mass_list()
        intensity_list1 = scan1.get_intensity_list()
        intensity_list2 = scan2.get_intensity_list()
        mass_rmsd = rmsd(mass_list1, mass_list2)
        if mass_rmsd > max_mass_rmsd:
            max_mass_rmsd = mass_rmsd
        intensity_rmsd = rmsd(intensity_list1, intensity_list2)
        if intensity_rmsd > max_intensity_rmsd:
            max_intensity_rmsd = intensity_rmsd

    print(f"\n   Max m/z RMSD: {max_mass_rmsd:.2e}")
    print(f"   Max intensity RMSD: {max_intensity_rmsd:.2e}")