def store_expr(file_name, expr):

    """
    @summary: Stores an experiment to a file

    The experiment is serialised with cPickle protocol 1. That protocol
    is a binary format, so the file is opened in binary mode.

    @param file_name: The name of the file
    @type file_name: StringType
    @param expr: An experiment object
    @type expr: pyms.Experiment.Class.Experiment

    @return: none
    @rtype: NoneType

    @author: Vladimir Likic
    @author: Andrew Isaac
    """

    if not isinstance(expr, Experiment):
        error("argument not an instance of the class 'Experiment'")

    if not is_str(file_name):
        error("'file_name' not a string")

    # 'wb', not 'w': protocol 1 emits raw bytes, and a text-mode handle
    # may translate newline bytes and corrupt the dump on some platforms
    fp = open(file_name, 'wb')
    try:
        cPickle.dump(expr, fp, 1)
    finally:
        # release the handle even when pickling fails
        fp.close()
def write_intensities_stream(self, file_name):

    """
    @summary: Writes all intensities to a file

    Loops over all scans and, for each scan, writes its intensities to
    the file, one intensity per line. Intensities from different scans
    are joined without any delimiters.

    @param file_name: Output file name
    @type file_name: StringType

    @return: none
    @rtype: NoneType

    @author: Vladimir Likic
    """

    if not is_str(file_name):
        error("'file_name' must be a string")

    print(" -> Writing scans to a file")

    fp = open_for_writing(file_name)
    try:
        for scan in self.__scan_list:
            for intensity in scan.get_intensity_list():
                fp.write("%8.4f\n" % (intensity))
    finally:
        # close the output even if a scan cannot be written
        close_for_writing(fp)
def read_expr_list(file_name):

    """
    @summary: Reads the set of experiment files and returns a list of
        Experiment objects

    Each non-blank line of 'file_name' is stripped of surrounding
    whitespace and passed to load_expr(). Blank lines are skipped.

    @param file_name: The name of the file which lists experiment
        dump file names, one file per line
    @type file_name: StringType

    @return: A list of Experiment instances
    @rtype: ListType

    @author: Vladimir Likic
    """

    if not is_str(file_name):
        error("file_name argument must be a string")
    try:
        fp = open(file_name, 'r')
    except IOError:
        error("error opening file '%s' for reading" % file_name)

    try:
        exprfiles = fp.readlines()
    finally:
        fp.close()

    exprl = []

    for exprfile in exprfiles:

        exprfile = exprfile.strip()

        # a blank line (typically a trailing newline at the end of the
        # list) names no file; skip it instead of trying to load ''
        if not exprfile:
            continue

        exprl.append(load_expr(exprfile))

    return exprl
def load_peaks(file_name):

    """
    @summary: Loads the peak_list stored with 'store_peaks'

    The loaded object must be a non-empty list whose first element is
    a Peak; anything else is reported through error().

    @param file_name: File name of peak list
    @type file_name: StringType

    @return: The list of Peak objects
    @rtype: ListType

    @author: Andrew Isaac
    """

    if not is_str(file_name):
        error("'file_name' not a string")

    # NOTE(review): unpickling can execute arbitrary code; only load
    # files that come from a trusted source.
    # 'rb', not 'r': store_peaks dumps with the binary pickle protocol 1
    fp = open(file_name, 'rb')
    try:
        peak_list = cPickle.load(fp)
    finally:
        fp.close()

    if not is_list(peak_list):
        error("'file_name' is not a List")

    # 'or', not 'and': with 'and' a non-empty list was never type
    # checked, and an empty list raised IndexError on peak_list[0]
    if not len(peak_list) > 0 or not isinstance(peak_list[0], Peak):
        error("'peak_list' must be a list of Peak objects")

    return peak_list
def write(self, file_name, minutes=False):

    """
    @summary: Writes the ion chromatogram to the specified file

    One line is written per time point, holding the time followed by
    the corresponding entry of the intensity array.

    @param file_name: Output file name
    @type file_name: StringType
    @param minutes: A boolean value indicating whether to write
        time in minutes (each time value is divided by 60.0)
    @type minutes: BooleanType

    @return: none
    @rtype: NoneType

    @author: Lewis Lee
    @author: Vladimir Likic
    """

    if not is_str(file_name):
        error("'file_name' must be a string")

    out = open_for_writing(file_name)

    # convert a private copy so the stored time list is never modified
    times = copy.deepcopy(self.__time_list)

    if minutes:
        for idx, value in enumerate(times):
            times[idx] = value/60.0

    for idx, value in enumerate(times):
        out.write("%8.4f %#.6e\n" % (value, self.__ia[idx]))

    close_for_writing(out)
def time_str_secs(time_str):

    """
    @summary: Resolves time string of the form "<NUMBER>s" or
        "<NUMBER>m", returns time in seconds

    The last character selects the unit (case-insensitive) and the
    rest of the string is the number. Invalid input, including an
    empty string, is reported through error().

    @param time_str: A time string, which must be of the form
        "<NUMBER>s" or "<NUMBER>m" where "<NUMBER>" is a valid number
    @type time_str: StringType

    @return: Time in seconds
    @rtype: FloatType

    @author: Vladimir Likic
    """

    if not is_str(time_str):
        error("time string not a string")

    # an empty string has no unit character; report it as an improper
    # time string instead of failing with IndexError on time_str[-1]
    if len(time_str) == 0:
        error("improper time string")

    time_number = time_str[:-1]
    time_spec = time_str[-1].lower()

    if not is_str_num(time_number):
        print(" --> received time string '%s'" % (time_number))
        error("improper time string")

    if time_spec not in ("s", "m"):
        error("time string must end with either 's' or 'm'")

    seconds = float(time_number)

    if time_spec == "m":
        seconds = seconds * 60.0

    return seconds
def time_str_secs(time_str):

    """
    @summary: Resolves time string of the form "<NUMBER>s" or
        "<NUMBER>m", returns time in seconds

    The trailing character is the unit ('s' or 'm', in either case);
    everything before it must be a valid number. An empty string is
    rejected through error() like any other improper time string.

    @param time_str: A time string, which must be of the form
        "<NUMBER>s" or "<NUMBER>m" where "<NUMBER>" is a valid number
    @type time_str: StringType

    @return: Time in seconds
    @rtype: FloatType

    @author: Vladimir Likic
    """

    if not is_str(time_str):
        error("time string not a string")

    # guard against "": indexing time_str[-1] below would otherwise
    # raise IndexError before any validation message is produced
    if len(time_str) == 0:
        error("improper time string")

    time_number = time_str[:-1]
    time_spec = time_str[-1].lower()

    if not is_str_num(time_number):
        print(" --> received time string '%s'" % (time_number))
        error("improper time string")

    if time_spec not in ("s", "m"):
        error("time string must end with either 's' or 'm'")

    seconds = float(time_number)

    # minutes are converted; seconds pass through unchanged
    if time_spec == "m":
        seconds = seconds*60.0

    return seconds
def load_expr(file_name):

    """
    @summary: Loads an experiment saved with 'store_expr'

    @param file_name: Experiment file name
    @type file_name: StringType

    @return: The unpickled experiment object
    @rtype: pyms.Experiment.Class.Experiment

    @author: Vladimir Likic
    @author: Andrew Isaac
    """

    if not is_str(file_name):
        error("'file_name' not a string")

    # NOTE(review): unpickling can execute arbitrary code; only load
    # files that come from a trusted source.
    fp = open(file_name, 'rb')
    try:
        expr = cPickle.load(fp)
    finally:
        # close the handle even when the file is not a valid pickle
        fp.close()

    if not isinstance(expr, Experiment):
        error("'file_name' is not an Experiment object")

    return expr
def store_expr(file_name, expr):

    """
    @summary: Stores an experiment to a file

    The experiment is written with cPickle protocol 1 to a file
    opened in binary mode.

    @param file_name: The name of the file
    @type file_name: StringType
    @param expr: An experiment object
    @type expr: pyms.Experiment.Class.Experiment

    @return: none
    @rtype: NoneType

    @author: Vladimir Likic
    @author: Andrew Isaac
    """

    if not isinstance(expr, Experiment):
        error("argument not an instance of the class 'Experiment'")

    if not is_str(file_name):
        error("'file_name' not a string")

    fp = open(file_name, 'wb')
    try:
        cPickle.dump(expr, fp, 1)
    finally:
        # do not leak the handle if the experiment cannot be pickled
        fp.close()
def load_peaks(file_name):

    """
    @summary: Loads the peak_list stored with 'store_peaks'

    The unpickled object is accepted only if it is a non-empty list
    whose first element is a Peak; otherwise error() is called.

    @param file_name: File name of peak list
    @type file_name: StringType

    @return: The list of Peak objects
    @rtype: ListType

    @author: Andrew Isaac
    """

    if not is_str(file_name):
        error("'file_name' not a string")

    # NOTE(review): unpickling can execute arbitrary code; only load
    # files that come from a trusted source.
    # binary mode matches the binary pickle protocol used by store_peaks
    fp = open(file_name, 'rb')
    try:
        peak_list = cPickle.load(fp)
    finally:
        fp.close()

    if not is_list(peak_list):
        error("'file_name' is not a List")

    # the two conditions must be joined with 'or': joined with 'and',
    # the isinstance test only ran for an empty list (raising
    # IndexError) and was skipped for every non-empty list
    if not len(peak_list) > 0 or not isinstance(peak_list[0], Peak):
        error("'peak_list' must be a list of Peak objects")

    return peak_list
def load_expr(file_name):

    """
    @summary: Loads an experiment saved with 'store_expr'

    @param file_name: Experiment file name
    @type file_name: StringType

    @return: The unpickled experiment object
    @rtype: pyms.Experiment.Class.Experiment

    @author: Vladimir Likic
    @author: Andrew Isaac
    """

    if not is_str(file_name):
        error("'file_name' not a string")

    # NOTE(review): unpickling can execute arbitrary code; only load
    # files that come from a trusted source.
    # 'rb', not 'r': store_expr dumps with the binary pickle protocol 1,
    # which a text-mode handle can corrupt on some platforms
    fp = open(file_name, 'rb')
    try:
        expr = cPickle.load(fp)
    finally:
        fp.close()

    if not isinstance(expr, Experiment):
        error("'file_name' is not an Experiment object")

    return expr
def write(self, file_root):

    """
    @summary: Writes the entire raw data to two files, one
        'file_root'.I.csv (intensities) and 'file_root'.mz.csv
        (m/z values).

    This method writes two CSV files, containing intensities and
    corresponding m/z values, one line per scan. In general these
    are not two-dimensional matrices, because different scans may
    have different number of m/z values recorded.

    @param file_root: The root for the output file names
    @type file_root: StringType

    @return: none
    @rtype: NoneType

    @author: Vladimir Likic
    """

    if not is_str(file_root):
        error("'file_root' must be a string")

    file_name1 = file_root + ".I.csv"
    file_name2 = file_root + ".mz.csv"

    print(" -> Writing intensities to '%s'" % (file_name1))
    print(" -> Writing m/z values to '%s'" % (file_name2))

    fp1 = open_for_writing(file_name1)
    try:
        fp2 = open_for_writing(file_name2)
        try:
            for scan in self.__scan_list:

                intensity_list = scan.get_intensity_list()
                mass_list = scan.get_mass_list()

                # one comma-separated row per scan; an empty scan still
                # produces an (empty) line so both files stay aligned
                fp1.write(",".join(["%.4f" % (v) for v in intensity_list]))
                fp1.write("\n")

                fp2.write(",".join(["%.4f" % (v) for v in mass_list]))
                fp2.write("\n")
        finally:
            close_for_writing(fp2)
    finally:
        close_for_writing(fp1)
def sele_peaks_by_rt(peaks, rt_range):

    """
    @summary: Selects peaks from a retention time range

    A peak is kept when its retention time lies strictly between the
    lower and upper limits.

    @param peaks: A list of peak objects
    @type peaks: ListType
    @param rt_range: A list of two time strings, specifying lower and
        upper retention times
    @type rt_range: ListType

    @return: A list of peak objects
    @rtype: ListType
    """

    if not is_peak_list(peaks):
        error("'peaks' not a peak list")

    if is_list(rt_range):
        if len(rt_range) != 2:
            error("'rt_range' must have exactly two elements")

        if not (is_str(rt_range[0]) and is_str(rt_range[1])):
            error("lower/upper retention time limits must be strings")
    else:
        error("'rt_range' not a list")

    # both limits are converted to seconds before comparing
    rt_lo = time_str_secs(rt_range[0])
    rt_hi = time_str_secs(rt_range[1])

    if not rt_lo < rt_hi:
        error("lower retention time limit must be less than upper")

    # limits are exclusive on both sides
    return [peak for peak in peaks if rt_lo < peak.get_rt() < rt_hi]
def export_leco_csv(self, file_name):

    """
    @summary: Exports data in LECO CSV format

    The file starts with a quoted text header ("Scan","Time", then
    one quoted integer m/z per mass). Each following line holds the
    row index, the time, and the intensity values of that row. Lines
    end with CR/LF.

    @param file_name: File name
    @type file_name: StringType

    @return: none
    @rtype: NoneType

    @author: Andrew Isaac
    @author: Vladimir Likic
    """

    if not is_str(file_name):
        error("'file_name' is not a string")

    masses = self.__mass_list
    times = self.__time_list
    matrix = self.__intensity_matrix

    out = open_for_writing(file_name)

    # Format is text header with:
    # "Scan","Time",...
    # and the rest is "TIC" or m/z as text, i.e. "50","51"...
    # The following lines are:
    # scan_number,time,value,value,...
    # scan_number is an int, rest seem to be fixed format floats.
    # The format is 0.000000e+000

    # header line
    out.write("\"Scan\",\"Time\"")
    for mass in masses:
        if not is_number(mass):
            error("mass list datum not a number")
        else:
            out.write(",\"%d\"" % int(mass))
    out.write("\r\n")  # windows CR/LF

    # one data line per time point
    for row_idx, row_time in enumerate(times):
        out.write("%s,%#.6e" % (row_idx, row_time))
        for datum in matrix[row_idx]:
            if not is_number(datum):
                error("datum not a number")
            else:
                out.write(",%#.6e" % (datum))
        out.write("\r\n")

    close_for_writing(out)
def window_sele_points(ic, window_sele, half_window=False):

    """
    @summary: Converts window selection parameter into points based
        on the time step in an ion chromatogram

    An integer selection is taken as a number of points; a string
    selection is converted to seconds and divided by the time step of
    'ic'. With 'half_window' set, an integer selection must be odd and
    the result is half the window, rounded down.

    @param ic: ion chromatogram object relevant for the conversion
    @type ic: pyms.GCMS.Class.IonChromatogram
    @param window_sele: The window selection parameter. This can be
        an integer or time string. If integer, taken as the number
        of points. If a string, must of the form "<NUMBER>s" or
        "<NUMBER>m", specifying a time in seconds or minutes,
        respectively
    @type window_sele: IntType or StringType
    @param half_window: Specifies whether to return half-window
    @type half_window: BooleanType

    @return: The number of points in the window
    @rtype: IntType

    @author: Vladimir Likic
    """

    if not (is_int(window_sele) or is_str(window_sele)):
        error("'window' must be an integer or a string")

    if is_int(window_sele):
        # selection given directly as a number of points
        if not half_window:
            points = window_sele
        elif window_sele % 2 == 0:
            error("window must be an odd number of points")
        else:
            points = int(math.floor(window_sele*0.5))
    else:
        # selection given as a time string
        span = time_str_secs(window_sele)
        step = ic.get_time_step()

        if half_window:
            span = span*0.5

        points = int(math.floor(span/step))

    # a full window needs at least two points, a half window one
    if half_window and points < 1:
        error("window too small (half window=%d)" % (points))
    elif not half_window and points < 2:
        error("window too small (window=%d)" % (points))

    return points
def window_sele_points(ic, window_sele, half_window=False):

    """
    @summary: Converts window selection parameter into points based
        on the time step in an ion chromatogram

    Integer selections are point counts. String selections are time
    strings that are converted to seconds and then divided by the time
    step reported by 'ic'. When 'half_window' is set the returned
    value is half of the window (rounded down), and an integer
    selection must be odd.

    @param ic: ion chromatogram object relevant for the conversion
    @type ic: pyms.GCMS.Class.IonChromatogram
    @param window_sele: The window selection parameter. This can be
        an integer or time string. If integer, taken as the number
        of points. If a string, must of the form "<NUMBER>s" or
        "<NUMBER>m", specifying a time in seconds or minutes,
        respectively
    @type window_sele: IntType or StringType
    @param half_window: Specifies whether to return half-window
    @type half_window: BooleanType

    @return: The number of points in the window
    @rtype: IntType

    @author: Vladimir Likic
    """

    if not is_int(window_sele) and not is_str(window_sele):
        error("'window' must be an integer or a string")

    given_as_points = is_int(window_sele)

    if given_as_points and half_window:
        if window_sele % 2 == 0:
            error("window must be an odd number of points")
        else:
            points = int(math.floor(window_sele * 0.5))
    elif given_as_points:
        points = window_sele
    else:
        duration = time_str_secs(window_sele)
        time_step = ic.get_time_step()
        if half_window:
            duration = duration * 0.5
        points = int(math.floor(duration / time_step))

    # reject windows too small to be useful
    if half_window:
        if points < 1:
            error("window too small (half window=%d)" % (points))
    elif points < 2:
        error("window too small (window=%d)" % (points))

    return points
def export_ascii(self, root_name, format='dat'):

    """
    @summary: Exports the intensity matrix, retention time vector,
        and m/z vector to the ascii format

    By default, export_ascii("NAME") will create NAME.im.dat,
    NAME.rt.dat, and NAME.mz.dat where these are the intensity
    matrix, retention time vector, and m/z vector. If format='csv',
    the files will be comma separated and named NAME.im.csv,
    NAME.rt.csv, and NAME.mz.csv.

    @param root_name: Root name for the output files
    @type root_name: StringType
    @param format: Output format, either 'dat' or 'csv'
    @type format: StringType

    @return: none
    @rtype: NoneType

    @author: Milica Ng
    @author: Andrew Isaac
    @author: Vladimir Likic
    """

    if not is_str(root_name):
        error("'root_name' is not a string")

    # NOTE(review): earlier documentation described the 'dat' output as
    # tab delimited, but the separator used here is a single space --
    # confirm which one is intended
    if format == 'csv':
        separator, extension = ",", ".csv"
    elif format == 'dat':
        separator, extension = " ", ".dat"
    else:
        error("unkown format '%s'. Only 'dat' or 'csv' supported" % format)

    # 2D matrix of intensities
    save_data(root_name+'.im'+extension, self.__intensity_matrix,
              sep=separator)

    # 1D vector of m/z values
    save_data(root_name+'.mz'+extension, self.__mass_list, sep=separator)

    # 1D vector of retention times
    save_data(root_name+'.rt'+extension, self.__time_list, sep=separator)
def mzML_reader(file_name):

    """
    @summary: A reader for mzML files, returns a GC-MS data object

    Every spectrum that carries a time element (accession
    "MS:1000016") contributes one scan and one time point. Spectra
    without such an element are ignored.

    @param file_name: The name of the mzML file
    @type file_name: StringType

    @return: GC-MS data object built from the time and scan lists
    @rtype: GCMS_data

    @author: Sean O'Callaghan
    """

    if not is_str(file_name):
        error("'file_name' must be a string")
    try:
        mzml_file = pymzml.run.Reader(file_name)
    except Exception:
        # 'Exception' rather than a bare except, so KeyboardInterrupt
        # and SystemExit are not reported as a file error
        error("Cannot open file '%s'" % file_name)

    print(" -> Reading mzML file '%s'" % (file_name))

    scan_list = []
    time_list = []

    for spectrum in mzml_file:
        mass_list = []
        intensity_list = []

        for mz, intensity in spectrum.peaks:
            mass_list.append(mz)
            intensity_list.append(intensity)

        for element in spectrum.xmlTree:
            # For some reason there are spectra with no time value;
            # those never match below and are therefore skipped
            if element.get('accession') == "MS:1000016":  # time value
                # We need time in seconds not minutes
                # NOTE(review): assumes the file stores minutes; the
                # unit of the value is not checked here
                time_list.append(60*float(element.get('value')))
                scan_list.append(Scan(mass_list, intensity_list))

    print("time: %d" % len(time_list))
    print("scan: %d" % len(scan_list))

    data = GCMS_data(time_list, scan_list)

    return data
def store_peaks(peak_list, file_name):

    """
    @summary: Store the list of peak objects

    The list is serialised with cPickle protocol 1. That protocol is a
    binary format, so the file is opened in binary mode.

    @param peak_list: A list of peak objects
    @type peak_list: pyms.Peaks.Class.Peak
    @param file_name: File name to store peak list
    @type file_name: StringType

    @return: none
    @rtype: NoneType

    @author: Andrew Isaac
    """

    if not is_str(file_name):
        error("'file_name' must be a string")

    # 'wb', not 'w': protocol 1 emits raw bytes, and a text-mode handle
    # may translate newline bytes and corrupt the dump on some platforms
    fp = open(file_name, 'wb')
    try:
        cPickle.dump(peak_list, fp, 1)
    finally:
        fp.close()
def __init__(self, expr_code, peak_list):

    """
    @summary: Models an experiment

    Both arguments are validated and then stored on the instance.

    @param expr_code: Unique identifier for the experiment
    @type expr_code: StringType
    @param peak_list: A list of peak objects
    @type peak_list: ListType
    """

    code_is_string = is_str(expr_code)
    if not code_is_string:
        error("'expr_code' must be a string")

    peaks_are_valid = is_peak_list(peak_list)
    if not peaks_are_valid:
        error("'peak_list' must be a list of Peak objects")

    self.__peak_list = peak_list
    self.__expr_code = expr_code
def file_lines(file_name, filter=False):

    """
    @summary: Returns lines from a file, as a list

    @param file_name: Name of a file
    @type file_name: StringType
    @param filter: If True, lines are pre-processed. The newline
        character and leading and trailing whitespace are removed,
        and empty lines and lines starting with '#' are discarded
    @type filter: BooleanType

    @return: A list of lines
    @rtype: ListType

    @author: Vladimir Likic
    """

    if not is_str(file_name):
        error("'file_name' is not a string")

    fp = open_for_reading(file_name)
    try:
        lines = fp.readlines()
    finally:
        close_for_reading(fp)

    if filter:
        # strip leading and trailing whitespace
        stripped = [line.strip() for line in lines]

        # drop empty lines and comments in a single pass; this keeps
        # the same lines, in the same order, as removing them one by one
        lines = [line for line in stripped
                 if len(line) > 0 and line[0] != "#"]

    return lines
def store_peaks(peak_list, file_name):

    """
    @summary: Store the list of peak objects

    The peak list is pickled with protocol 1 (a binary format) into a
    file opened in binary mode.

    @param peak_list: A list of peak objects
    @type peak_list: pyms.Peaks.Class.Peak
    @param file_name: File name to store peak list
    @type file_name: StringType

    @return: none
    @rtype: NoneType

    @author: Andrew Isaac
    """

    if not is_str(file_name):
        error("'file_name' must be a string")

    # binary mode is required for the binary pickle protocol; a
    # text-mode handle can alter newline bytes on some platforms
    fp = open(file_name, 'wb')
    try:
        cPickle.dump(peak_list, fp, 1)
    finally:
        # close the handle even if the list cannot be pickled
        fp.close()
def open_for_writing(file_name):

    """
    @summary: Opens file for writing, returns file pointer

    The file is opened in text write mode ("w"). A non-string name or
    an IOError while opening is reported through error().

    @param file_name: Name of the file to be opened for writing
    @type file_name: StringType

    @return: Pointer to the opened file
    @rtype: FileType

    @author: Vladimir Likic
    """

    name_is_string = is_str(file_name)
    if not name_is_string:
        error("'file_name' is not a string")

    try:
        handle = open(file_name, "w")
    except IOError:
        error("Cannot open '%s' for writing" % (file_name))

    return handle
def ANDI_reader(file_name):

    """
    @summary: A reader for ANDI-MS NetCDF files, returns
        a GC-MS data object

    The flat mass and intensity arrays are split into scans: a new
    scan starts whenever a mass value is smaller than the one before
    it. The number of scans must match the number of acquisition
    times stored in the file.

    @param file_name: The name of the ANDI-MS file
    @type file_name: StringType

    @return: GC-MS data object built from the time and scan lists
    @rtype: GCMS_data

    @author: Qiao Wang
    @author: Andrew Isaac
    @author: Vladimir Likic
    """

    ## TODO: use 'point_count' and allow for zero len scans

    # the keys used to retrieve certain data from the NetCDF file
    __MASS_STRING = "mass_values"
    __INTENSITY_STRING = "intensity_values"
    __TIME_STRING = "scan_acquisition_time"

    if not is_str(file_name):
        error("'file_name' must be a string")
    try:
        # read-only mode: a reader must not require write permission
        # on the data file (the original opened it with "r+")
        rootgrp = Dataset(file_name, "r", format='NETCDF3_CLASSIC')
    except Exception:
        ## <TODO> to find out if netCDF4 throws specific errors
        ## that we can use here
        error("Cannot open file '%s'" % file_name)

    print(" -> Reading netCDF file '%s'" % (file_name))

    try:
        mass = rootgrp.variables[__MASS_STRING][:]
        intensity = rootgrp.variables[__INTENSITY_STRING][:]
        time_values = rootgrp.variables[__TIME_STRING][:]
    finally:
        # everything needed has been copied out of the dataset
        rootgrp.close()

    # NOTE(review): this dump of the mass array looks like leftover
    # debug output; kept so the console output does not change
    print(mass)

    mass_values = mass.tolist()
    intensity_values = intensity.tolist()

    if not len(mass_values) == len(intensity_values):
        error("length of mass_list is not equal to length of intensity_list !")

    scan_list = []
    mass_previous = mass_values[0]
    mass_list = [mass_previous]
    intensity_list = [intensity_values[0]]

    for i in range(1, len(mass_values)):
        mass_current = mass_values[i]
        # assume masses in ascending order until new scan
        if mass_previous <= mass_current:
            mass_list.append(mass_current)
            intensity_list.append(intensity_values[i])
        # new scan
        else:
            scan_list.append(Scan(mass_list, intensity_list))
            mass_list = [mass_current]
            intensity_list = [intensity_values[i]]
        mass_previous = mass_current

    # store final scan
    scan_list.append(Scan(mass_list, intensity_list))

    time_list = time_values.tolist()

    # sanity check
    if not len(time_list) == len(scan_list):
        error("number of time points does not equal the number of scans")

    data = GCMS_data(time_list, scan_list)

    return data
def trim(self, begin=None, end=None):

    """
    @summary: trims data in the time domain

    The arguments 'begin' and 'end' can be either integers (in
    which case they are taken as the first/last scan number for
    trimming) or strings in which case they are treated as time
    strings and converted to scan numbers. Each argument is
    processed independently; an omitted argument defaults to the
    first/last scan.

    After trimming, the time information, the min/max mass and the
    TIC are recomputed from the remaining scans.

    @param begin: begin parameter designating start time or
        scan number
    @type begin: IntType or StrType
    @param end: end parameter designating end time or
        scan number
    @type end: IntType or StrType

    @return: none
    @rtype: NoneType

    @author: Vladimir Likic
    """

    # trim called with defaults, or silly arguments
    if begin is None and end is None:
        print("Nothing to do.")
        return  # exit immediately

    N = len(self.__scan_list)

    # process 'begin' and 'end'; both become 0-based scan indices
    if begin is None:
        first_scan = 0
    elif is_int(begin):
        first_scan = begin-1
    elif is_str(begin):
        time = time_str_secs(begin)
        first_scan = self.get_index_at_time(time) + 1
    else:
        error("invalid 'begin' argument")

    # NOTE(review): an integer 'begin' is shifted by -1 but an integer
    # 'end' is used as-is as an inclusive index, so trim(1, 10) keeps
    # 11 scans -- confirm whether this asymmetry is intended
    if end is None:
        last_scan = N-1
    elif is_int(end):
        last_scan = end
    elif is_str(end):
        time = time_str_secs(end)
        last_scan = self.get_index_at_time(time) + 1
    else:
        error("invalid 'end' argument")

    # sanity checks
    if not last_scan > first_scan:
        error("last scan=%d, first scan=%d" % (last_scan, first_scan))
    elif first_scan < 0:
        error("scan number must be greater than one")
    elif last_scan > N-1:
        error("last scan=%d, total number of scans=%d" % (last_scan, N))

    print("Trimming data to between %d and %d scans" %
          (first_scan+1, last_scan+1))

    scan_list_new = []
    time_list_new = []
    for ii, scan in enumerate(self.__scan_list):
        # both limits are inclusive
        if first_scan <= ii <= last_scan:
            scan_list_new.append(scan)
            time_list_new.append(self.__time_list[ii])

    # update info
    self.__scan_list = scan_list_new
    self.__set_time(time_list_new)
    self.__set_min_max_mass()
    self.__calc_tic()
def ANDI_writer(file_name, im): """ @summary: A reader for ANDI-MS NetCDF files, returns a GC-MS data object @param file_name: The name of the ANDI-MS file @type file_name: StringType @param im: The IntensityMatrix @type file_name: pyms.GCMS.Class.IntensityMatrix @author: Andrew Isaac """ # netCDF header info for compatability # attributes #dataset_completeness 0 CHAR 6 C1+C2 #dataset_origin 4 CHAR 16 Santa Clara, CA #experiment_date_time_stamp 7 CHAR 20 20081218044500+1100 #experiment_title 6 CHAR 7 mix ma #experiment_type 10 CHAR 25 Centroided Mass Spectrum #external_file_ref_0 9 CHAR 8 MA_5C.M #languages 3 CHAR 8 English #ms_template_revision 1 CHAR 6 1.0.1 #netcdf_file_date_time_stamp 5 CHAR 20 20090114001531+1100 #netcdf_revision 2 CHAR 6 2.3.2 #number_of_times_calibrated 12 INT 1 0 #number_of_times_processed 11 INT 1 1 #operator_name 8 CHAR 12 Dave and Su #raw_data_intensity_format 25 CHAR 6 Float #raw_data_mass_format 23 CHAR 6 Float #raw_data_time_format 24 CHAR 6 Short #sample_state 13 CHAR 12 Other State #test_detector_type 18 CHAR 20 Electron Multiplier #test_ionization_mode 16 CHAR 16 Electron Impact #test_ionization_polarity 17 CHAR 18 Positive Polarity #test_ms_inlet 15 CHAR 17 Capillary Direct #test_resolution_type 19 CHAR 20 Constant Resolution #test_scan_direction 21 CHAR 3 Up #test_scan_function 20 CHAR 10 Mass Scan #test_scan_law 22 CHAR 7 Linear #test_separation_type 14 CHAR 18 No Chromatography # dimensions #_128_byte_string 6 128 #_16_byte_string 3 16 #_255_byte_string 7 255 #_2_byte_string 0 2 #_32_byte_string 4 32 #_4_byte_string 1 4 #_64_byte_string 5 64 #_8_byte_string 2 8 #error_number 10 1 #instrument_number 12 1 #point_number 9 554826 X #range 8 2 #scan_number 11 9865 # variables #a_d_coaddition_factor 2 SHORT 0 scan_number(9865) #a_d_sampling_rate 1 DOUBLE 0 scan_number(9865) #actual_scan_number 7 INT 0 scan_number(9865) #error_log 0 CHAR 0 error_number(1), _64_byte_string(64) #flag_count 15 INT 0 scan_number(9865) 
#instrument_app_version 27 CHAR 0 instrument_number(1), #_32_byte_string(32) #instrument_comments 28 CHAR 0 instrument_number(1), #_32_byte_string(32) #instrument_fw_version 25 CHAR 0 instrument_number(1), #_32_byte_string(32) #instrument_id 20 CHAR 0 instrument_number(1), #_32_byte_string(32) #instrument_mfr 21 CHAR 0 instrument_number(1), #_32_byte_string(32) #instrument_model 22 CHAR 0 instrument_number(1), #_32_byte_string(32) #instrument_name 19 CHAR 0 instrument_number(1), #_32_byte_string(32) #instrument_os_version 26 CHAR 0 instrument_number(1), #_32_byte_string(32) #instrument_serial_no 23 CHAR 0 instrument_number(1), #_32_byte_string(32) #instrument_sw_version 24 CHAR 0 instrument_number(1), #_32_byte_string(32) #intensity_values 18 FLOAT 3 point_number(554826) #inter_scan_time 5 DOUBLE 0 scan_number(9865) #mass_range_max 10 DOUBLE 0 scan_number(9865) #mass_range_min 9 DOUBLE 0 scan_number(9865) #mass_values 16 FLOAT 2 point_number(554826) #point_count 14 INT 0 scan_number(9865) #resolution 6 DOUBLE 0 scan_number(9865) #scan_acquisition_time 3 DOUBLE 0 scan_number(9865) #scan_duration 4 DOUBLE 0 scan_number(9865) #scan_index 13 INT 0 scan_number(9865) #time_range_max 12 DOUBLE 0 scan_number(9865) #time_range_min 11 DOUBLE 0 scan_number(9865) #time_values 17 FLOAT 2 point_number(554826) #total_intensity 8 DOUBLE 1 scan_number(9865) # variable information #intensity_values attributes #name idx type len value #-------------------- --- ---- --- ----- #add_offset 1 DOUBLE 1 0.0 #scale_factor 2 DOUBLE 1 1.0 #units 0 CHAR 26 Arbitrary Intensity Units #mass_values attributes #name idx type len value #-------------------- --- ---- --- ----- #scale_factor 1 DOUBLE 1 1.0 #units 0 CHAR 4 M/Z #time_values attributes #name idx type len value #-------------------- --- ---- --- ----- #scale_factor 1 DOUBLE 1 1.0 #units 0 CHAR 8 Seconds #total_intensity attributes #name idx type len value #-------------------- --- ---- --- ----- #units 0 CHAR 26 Arbitrary Intensity Units 
# netCDF dimension names __POINT_NUMBER = "point_number" __SCAN_NUMBER = "scan_number" # the keys used to create certain data from the NetCDF file __MASS_STRING = "mass_values" __INTENSITY_STRING = "intensity_values" __TIME_STRING = "scan_acquisition_time" __POINT_COUNT = "point_count" if not is_str(file_name): error("'file_name' must be a string") try: # Open netCDF file in overwrite mode, creating it if inexistent. nc = CDF(file_name, NC.WRITE|NC.TRUNC|NC.CREATE) # Automatically set define and data modes. nc.automode() except CDFError: error("Cannot create file '%s'" % file_name) mass_list = im.get_mass_list() time_list = im.get_time_list() # direct access, don't modify intensity_matrix = im.intensity_matrix # compress by ignoring zero intensities # included for consistency with imported netCDF format mass_values = [] intensity_values = [] point_count_values = [] for row in xrange(len(intensity_matrix)): pc = 0 # point count for col in xrange(len(intensity_matrix[0])): # all rows same len if (intensity_matrix[row][col] > 0): mass_values.append(mass_list[col]) intensity_values.append(intensity_matrix[row][col]) pc += 1 point_count_values.append(pc) # sanity checks if not len(time_list) == len(point_count_values): error("number of time points does not equal the number of scans") # create dimensions # total number of data points dim_point_number = nc.def_dim(__POINT_NUMBER, len(mass_values)) # number of scans dim_scan_number = nc.def_dim(__SCAN_NUMBER, len(point_count_values)) # create variables # points var_mass_values = nc.def_var(__MASS_STRING, NC.FLOAT, dim_point_number) var_intensity_values = nc.def_var(__INTENSITY_STRING, NC.FLOAT, dim_point_number) # scans var_time_list = nc.def_var(__TIME_STRING, NC.DOUBLE, dim_scan_number) var_point_count_values = nc.def_var(__POINT_COUNT, NC.INT, dim_scan_number) # populate variables # points var_mass_values[:] = mass_values var_intensity_values[:] = intensity_values # scans var_time_list[:] = time_list 
var_point_count_values[:] = point_count_values # close file nc.close()
def import_leco_csv(self, file_name):

    """
    @summary: Imports data in LECO CSV format

    The first non-blank line is treated as the header: its numeric
    labels become the mass list, and the time column is the column
    whose label contains "time" (or, failing that, the column just
    before the first numeric label greater than one). Every later
    non-blank line is a data row; rows whose number of values does
    not match the number of masses are skipped with a warning.

    The parsed data replaces the mass list, time list and intensity
    matrix of this object.

    @param file_name: File name
    @type file_name: StringType

    @return: none
    @rtype: NoneType

    @author: Andrew Isaac
    """

    if not is_str(file_name):
        error("'file_name' not a string")

    data = []
    time_list = []
    mass_list = []

    # Format is text header with:
    # "Scan","Time",...
    # and the rest is "TIC" or m/z as text, i.e. "50","51"...
    # The following lines are:
    # scan_number,time,value,value,...
    # scan_number is an int, rest seem to be fixed format floats.
    # The format is 0.000000e+000

    num_mass = 0
    FIRST = True
    HEADER = True
    data_col = -1
    time_col = -1

    lines_list = open(file_name, 'r')
    try:
        # get each line
        for line in lines_list:
            stripped = line.strip()
            if len(stripped) == 0:
                continue

            cols = -1
            data_row = []

            # get each value in line
            for item in stripped.split(','):
                item = item.strip()
                item = item.strip('\'"')  # remove quotes (in header)
                cols += 1

                if len(item) == 0:
                    continue

                # Get header
                if HEADER:
                    if item.lower().find("time") > -1:
                        time_col = cols
                    try:
                        value = float(item)
                    except ValueError:
                        # text label such as "Scan" or "Time"
                        continue
                    # find 1st col with number as header
                    if FIRST and value > 1:  # assume >1 mass
                        data_col = cols
                        # assume time col is previous col
                        if time_col < 0:
                            time_col = cols - 1
                        FIRST = False
                    mass_list.append(value)
                    num_mass += 1
                # Get rest
                else:
                    try:
                        value = float(item)
                    except ValueError:
                        # non-numeric cell, ignored
                        continue
                    if cols == time_col:
                        time_list.append(value)
                    elif cols >= data_col:
                        data_row.append(value)

            # check row length
            if not HEADER:
                if len(data_row) == num_mass:
                    data.append(data_row)
                else:
                    print("Warning: ignoring row")

            # only the first non-blank line is the header
            HEADER = False
    finally:
        # the input file was previously never closed
        lines_list.close()

    # check col lengths
    if len(time_list) != len(data):
        print("Warning: number of data rows and time list length differ")

    self.__mass_list = mass_list
    self.__time_list = time_list
    self.__intensity_matrix = data
    # Direct access for speed (DANGEROUS)
    self.intensity_matrix = self.__intensity_matrix
def JCAMP_reader(file_name):

    """
    @summary: Generic reader for JCAMP DX files, produces GC-MS data
        object

    Lines starting with '#' are keywords: each "##PAGE" line supplies
    the time of a scan and each "##DATA TABLE" line marks a data
    table. Lines without any '#' hold comma separated values that are
    read as alternating mass, intensity pairs. The values collected
    so far are turned into a scan when the data of the next page or
    table begins, and once more at the end of the file.

    @param file_name: The name of the JCAMP DX file
    @type file_name: StringType

    @return: GC-MS data object built from the time and scan lists
    @rtype: GCMS_data

    @author: Qiao Wang
    @author: Andrew Isaac
    @author: Vladimir Likic
    """

    if not is_str(file_name):
        error("'file_name' not a string")

    print(" -> Reading JCAMP file '%s'" % (file_name))
    lines_list = open(file_name, 'r')
    data = []
    page_idx = 0
    xydata_idx = 0
    time_list = []
    scan_list = []

    try:
        for line in lines_list:
            if len(line.strip()) == 0:
                continue

            prefix = line.find('#')
            # key word or information
            if prefix == 0:
                fields = line.split('=')
                if fields[0].find("##PAGE") >= 0:
                    # rt for the scan to be submitted
                    time_list.append(float(fields[2].strip()))
                    page_idx = page_idx + 1
                elif fields[0].find("##DATA TABLE") >= 0:
                    xydata_idx = xydata_idx + 1
            # data
            elif prefix == -1:
                if page_idx > 1 or xydata_idx > 1:
                    # first data line of a new page/table: the values
                    # gathered so far form the previous scan
                    scan_list.append(_jcamp_scan_from_pairs(data))
                    data = []
                    if page_idx > 1:
                        page_idx = 1
                    if xydata_idx > 1:
                        xydata_idx = 1
                data.extend(_jcamp_parse_values(line))
    finally:
        # the input file was previously never closed
        lines_list.close()

    # get last scan
    scan_list.append(_jcamp_scan_from_pairs(data))

    # sanity check
    if not len(time_list) == len(scan_list):
        error("number of time points does not equal the number of scans")

    gcms_data = GCMS_data(time_list, scan_list)

    return gcms_data


def _jcamp_parse_values(line):

    """
    @summary: Returns the non-empty comma separated fields of a JCAMP
        data line, converted to floats

    @param line: One data line of a JCAMP file
    @type line: StringType

    @return: The values found on the line
    @rtype: ListType
    """

    values = []
    for item in line.strip().split(','):
        item = item.strip()
        if len(item) != 0:
            values.append(float(item))
    return values


def _jcamp_scan_from_pairs(data):

    """
    @summary: Builds a Scan from a flat list of alternating mass and
        intensity values

    An odd number of values is reported through error().

    @param data: Flat list [mass, intensity, mass, intensity, ...]
    @type data: ListType

    @return: The scan holding the separated masses and intensities
    @rtype: Scan
    """

    if len(data) % 2 == 1:
        error("data not in pair !")

    # floor division: identical to '/' on Python 2 integers
    n_pairs = len(data) // 2
    mass = data[0:2 * n_pairs:2]
    intensity = data[1:2 * n_pairs:2]

    return Scan(mass, intensity)
def ANDI_reader(file_name):
    """
    @summary: A reader for ANDI-MS NetCDF files, returns a GC-MS data
        object

    The file stores all mass/intensity points in two flat arrays. They
    are split into scans by assuming that masses ascend within a scan,
    so a drop in mass marks the start of the next scan.

    @param file_name: The name of the ANDI-MS file
    @type file_name: StringType

    @return: The GCMS_data object built from the times and scans read
    @rtype: GCMS_data

    @author: Qiao Wang
    @author: Andrew Isaac
    @author: Vladimir Likic
    """
    ## TODO: use 'point_count' and allow for zero len scans

    # the keys used to retrieve certain data from the NetCDF file
    __MASS_STRING = "mass_values"
    __INTENSITY_STRING = "intensity_values"
    __TIME_STRING = "scan_acquisition_time"

    if not is_str(file_name):
        error("'file_name' must be a string")
    try:
        cdf_file = CDF(file_name)
    except CDFError:
        error("Cannot open file '%s'" % file_name)

    print(" -> Reading netCDF file '%s'" % (file_name))

    # read everything up front so the file can be closed straight away
    try:
        mass_values = cdf_file.var(__MASS_STRING).get().tolist()
        intensity_values = cdf_file.var(__INTENSITY_STRING).get().tolist()
        time_list = cdf_file.var(__TIME_STRING).get().tolist()
    finally:
        cdf_file.close()

    if not len(mass_values) == len(intensity_values):
        error("length of mass_list is not equal to length of intensity_list !")

    scan_list = []
    mass_list = []
    intensity_list = []
    for idx in range(len(mass_values)):
        mass = mass_values[idx]
        # assume masses in ascending order until new scan
        if len(mass_list) > 0 and not mass_list[-1] <= mass:
            scan_list.append(Scan(mass_list, intensity_list))
            mass_list = []
            intensity_list = []
        mass_list.append(mass)
        intensity_list.append(intensity_values[idx])
    # store final scan (nothing to store when the file holds no points)
    if len(mass_list) > 0:
        scan_list.append(Scan(mass_list, intensity_list))

    # sanity check
    if not len(time_list) == len(scan_list):
        error("number of time points (%d) does not equal the number of scans (%d)"%(len(time_list), len(scan_list)))

    return GCMS_data(time_list, scan_list)
def mzML_reader(file_name):
    """
    @summary: A reader for mzML files, returns a GC-MS data object

    A spectrum is only kept when its XML tree holds an element with
    accession "MS:1000016" (the time value); spectra without one are
    skipped.

    @param file_name: The name of the mzML file
    @type file_name: StringType

    @return: The GCMS_data object built from the times and scans read
    @rtype: GCMS_data

    @author: Sean O'Callaghan
    """
    if not is_str(file_name):
        error("'file_name' must be a string")
    try:
        mzml_file = pymzml.run.Reader(file_name)
    except Exception:
        error("Cannot open file '%s'" % file_name)

    # avoid printing from each rank: rank 0 collects the file names of
    # all other ranks and prints them in one go. Anything going wrong
    # here (e.g. MPI not available) falls back to a plain message, but
    # KeyboardInterrupt/SystemExit are no longer swallowed.
    try:
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        size = comm.Get_size()

        if rank == 0:
            file_names = []
            for i in range(1, size):
                recv_buffer = ""
                file_n = comm.recv(recv_buffer, i)
                file_names.append(file_n)

            print(" -> Reading mzML files:")
            print(file_name)
            for file_n in file_names:
                print(file_n)
        else:
            comm.send(file_name, dest=0)
    except Exception:
        print(" -> Reading mzML file '%s'" % (file_name))

    scan_list = []
    time_list = []

    for spectrum in mzml_file:
        mass_list = []
        intensity_list = []

        for mz, i in spectrum.peaks:
            mass_list.append(mz)
            intensity_list.append(i)

        for element in spectrum.xmlTree:
            # For some reason there are spectra with no time value,
            # these are ignored
            if element.get('accession') == "MS:1000016":  # time value
                # We need time in seconds not minutes
                # NOTE(review): assumes the stored value is in minutes
                time_list.append(60 * float(element.get('value')))
                scan_list.append(Scan(mass_list, intensity_list))

    return GCMS_data(time_list, scan_list)
def mzML_reader(file_name):
    """
    @summary: A reader for mzML files, returns a GC-MS data object

    Only spectra whose XML tree contains an element with accession
    "MS:1000016" (the time value) contribute a scan; the others are
    ignored.

    @param file_name: The name of the mzML file
    @type file_name: StringType

    @return: The GCMS_data object built from the times and scans read
    @rtype: GCMS_data

    @author: Sean O'Callaghan
    """
    if not is_str(file_name):
        error("'file_name' must be a string")

    try:
        mzml_file = pymzml.run.Reader(file_name)
    except Exception:
        error("Cannot open file '%s'" % file_name)

    try:
        # avoid printing from each rank
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        size = comm.Get_size()

        if rank != 0:
            comm.send(file_name, dest=0)
        else:
            # rank 0 gathers the names read by the other ranks
            file_names = []
            for source_rank in range(1, size):
                recv_buffer = ""
                file_names.append(comm.recv(recv_buffer, source_rank))

            print(" -> Reading mzML files:")
            print(file_name)
            for other_name in file_names:
                print(other_name)
    except Exception:
        # best effort only: without a working MPI just report this file.
        # 'Exception' (not a bare except) lets Ctrl-C and sys.exit through.
        print(" -> Reading mzML file '%s'" % (file_name))

    scan_list = []
    time_list = []

    for spectrum in mzml_file:
        mass_list = []
        intensity_list = []
        for mz, intensity in spectrum.peaks:
            mass_list.append(mz)
            intensity_list.append(intensity)

        for element in spectrum.xmlTree:
            if element.get('accession') != "MS:1000016":
                continue
            # time value found. We need time in seconds not minutes
            # NOTE(review): assumes the stored value is in minutes
            time_list.append(60 * float(element.get('value')))
            scan_list.append(Scan(mass_list, intensity_list))

    return GCMS_data(time_list, scan_list)
def ANDI_reader(file_name):
    """
    @summary: A reader for ANDI-MS NetCDF files, returns a GC-MS data
        object

    The file stores all mass/intensity points in two flat arrays. They
    are split into scans by assuming that masses ascend within a scan,
    so a drop in mass marks the start of the next scan.

    @param file_name: The name of the ANDI-MS file
    @type file_name: StringType

    @return: The GCMS_data object built from the times and scans read
    @rtype: GCMS_data

    @author: Qiao Wang
    @author: Andrew Isaac
    @author: Vladimir Likic
    """
    ## TODO: use 'point_count' and allow for zero len scans

    # the keys used to retrieve certain data from the NetCDF file
    __MASS_STRING = "mass_values"
    __INTENSITY_STRING = "intensity_values"
    __TIME_STRING = "scan_acquisition_time"

    if not is_str(file_name):
        error("'file_name' must be a string")
    try:
        cdf_file = CDF(file_name)
    except CDFError:
        error("Cannot open file '%s'" % file_name)

    # avoid printing from each rank: rank 0 collects the file names of
    # all other ranks and prints them in one go. Anything going wrong
    # here (e.g. MPI not available) falls back to a plain message, but
    # KeyboardInterrupt/SystemExit are no longer swallowed.
    try:
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        size = comm.Get_size()

        if rank == 0:
            file_names = []
            for i in range(1, size):
                recv_buffer = ""
                file_n = comm.recv(recv_buffer, i)
                file_names.append(file_n)

            print(" -> Reading netCDF files:")
            print(file_name)
            for file_n in file_names:
                print(file_n)
        else:
            comm.send(file_name, dest=0)
    except Exception:
        print(" -> Reading netCDF file '%s'" % (file_name))

    # read everything up front so the file can be closed straight away
    try:
        mass_values = cdf_file.var(__MASS_STRING).get().tolist()
        intensity_values = cdf_file.var(__INTENSITY_STRING).get().tolist()
        time_list = cdf_file.var(__TIME_STRING).get().tolist()
    finally:
        cdf_file.close()

    if not len(mass_values) == len(intensity_values):
        error("length of mass_list is not equal to length of intensity_list !")

    scan_list = []
    mass_list = []
    intensity_list = []
    for idx in range(len(mass_values)):
        mass = mass_values[idx]
        # assume masses in ascending order until new scan
        if len(mass_list) > 0 and not mass_list[-1] <= mass:
            scan_list.append(Scan(mass_list, intensity_list))
            mass_list = []
            intensity_list = []
        mass_list.append(mass)
        intensity_list.append(intensity_values[idx])
    # store final scan (nothing to store when the file holds no points)
    if len(mass_list) > 0:
        scan_list.append(Scan(mass_list, intensity_list))

    # sanity check
    if not len(time_list) == len(scan_list):
        error("number of time points (%d) does not equal the number of scans (%d)"%(len(time_list), len(scan_list)))

    return GCMS_data(time_list, scan_list)
def save_data(file_name, data, format_str="%.6f", prepend="", sep=" ",
        compressed=False):
    """
    @summary: Saves a list of numbers or a list of lists of numbers
        to a file with specific formatting

    A list of numbers is written one number per line; a list of lists
    is written one inner list per line. An empty list produces an
    empty file.

    @param file_name: Name of a file
    @type file_name: StringType
    @param data: A list of numbers, or a list of lists
    @type data: ListType
    @param format_str: A format string for individual entries
    @type format_str: StringType
    @param prepend: A string, printed before each row
    @type prepend: StringType
    @param sep: A string, printed between the numbers of a row
    @type sep: StringType
    @param compressed: A boolean. If True, the output will be gzipped
    @type compressed: BooleanType

    @return: none
    @rtype: NoneType

    @author: Vladimir Likic
    """
    # local import: only needed for the optional compression step
    import subprocess

    if not is_str(file_name):
        error("'file_name' is not a string")

    if not is_list(data):
        error("'data' is not a list")

    if not is_str(prepend):
        error("'prepend' is not a string")

    if not is_str(sep):
        error("'sep' is not a string")

    # decide whether data is a vector or matrix. This is done before the
    # file is opened so that invalid input does not truncate an existing
    # file; an empty list is treated as an empty vector.
    if len(data) == 0 or is_number(data[0]):
        for item in data:
            if not is_number(item):
                error("not all elements of the list are numbers")
        data_is_matrix = 0
    else:
        for item in data:
            if not is_list(item):
                error("not all elements of the list are lists")
        data_is_matrix = 1

    fp = open_for_writing(file_name)
    try:
        if data_is_matrix:
            for row in data:
                fp.write(prepend)
                last = len(row) - 1
                for jj, datum in enumerate(row):
                    if is_number(datum):
                        fp.write(format_str % (datum))
                        if jj < last:
                            fp.write(sep)
                    else:
                        error("datum not a number")
                fp.write("\n")
        else:
            for datum in data:
                fp.write(prepend)
                fp.write(format_str % (datum))
                fp.write("\n")
    finally:
        close_for_writing(fp)

    if compressed:
        # argument list instead of a shell string, so that file names
        # with spaces or shell metacharacters are handled correctly
        try:
            status = subprocess.call(['gzip', file_name])
        except OSError:
            # gzip could not be started at all
            status = 1
        if status != 0:
            error("gzip compress failed")
def save_data(file_name, data, format_str="%.6f", prepend="", sep=" ",
        compressed=False):
    """
    @summary: Saves a list of numbers or a list of lists of numbers
        to a file with specific formatting

    For a list of numbers every number goes on its own line; for a
    list of lists every inner list goes on its own line. An empty list
    produces an empty file.

    @param file_name: Name of a file
    @type file_name: StringType
    @param data: A list of numbers, or a list of lists
    @type data: ListType
    @param format_str: A format string for individual entries
    @type format_str: StringType
    @param prepend: A string, printed before each row
    @type prepend: StringType
    @param sep: A string, printed between the numbers of a row
    @type sep: StringType
    @param compressed: A boolean. If True, the output will be gzipped
    @type compressed: BooleanType

    @return: none
    @rtype: NoneType

    @author: Vladimir Likic
    """
    # local import: only needed for the optional compression step
    import subprocess

    if not is_str(file_name):
        error("'file_name' is not a string")
    if not is_list(data):
        error("'data' is not a list")
    if not is_str(prepend):
        error("'prepend' is not a string")
    if not is_str(sep):
        error("'sep' is not a string")

    # Validate the layout (vector or matrix) before touching the file,
    # so that bad input cannot leave a truncated file behind. An empty
    # list counts as an empty vector.
    data_is_matrix = len(data) > 0 and not is_number(data[0])
    if data_is_matrix:
        for item in data:
            if not is_list(item):
                error("not all elements of the list are lists")
    else:
        for item in data:
            if not is_number(item):
                error("not all elements of the list are numbers")

    fp = open_for_writing(file_name)
    try:
        if not data_is_matrix:
            for value in data:
                fp.write(prepend)
                fp.write(format_str % (value))
                fp.write("\n")
        else:
            for row in data:
                fp.write(prepend)
                for position, value in enumerate(row):
                    if not is_number(value):
                        error("datum not a number")
                        continue
                    fp.write(format_str % (value))
                    # separator between entries, none after the last
                    if position < len(row) - 1:
                        fp.write(sep)
                fp.write("\n")
    finally:
        # always release the handle, even if a write or check failed
        close_for_writing(fp)

    if compressed:
        # Pass the name as a separate argument rather than through a
        # shell command line: names with spaces or shell metacharacters
        # would otherwise be split or interpreted.
        try:
            status = subprocess.call(['gzip', file_name])
        except OSError:
            # gzip could not be started at all
            status = 1
        if status != 0:
            error("gzip compress failed")
def ANDI_writer(file_name, im):
    """
    @summary: A writer for ANDI-MS NetCDF files, stores an
        IntensityMatrix in a netCDF file

    Only intensities greater than zero are written. Four variables are
    created: "mass_values" and "intensity_values" (one entry per stored
    point), "scan_acquisition_time" and "point_count" (one entry per
    scan). An existing file of the same name is overwritten.

    @param file_name: The name of the ANDI-MS file
    @type file_name: StringType
    @param im: The IntensityMatrix
    @type im: pyms.GCMS.Class.IntensityMatrix

    @return: none
    @rtype: NoneType

    @author: Andrew Isaac
    """
    # netCDF header info for compatibility
    # attributes
    #dataset_completeness          0 CHAR   6 C1+C2
    #dataset_origin                4 CHAR  16 Santa Clara, CA
    #experiment_date_time_stamp    7 CHAR  20 20081218044500+1100
    #experiment_title              6 CHAR   7 mix ma
    #experiment_type              10 CHAR  25 Centroided Mass Spectrum
    #external_file_ref_0           9 CHAR   8 MA_5C.M
    #languages                     3 CHAR   8 English
    #ms_template_revision          1 CHAR   6 1.0.1
    #netcdf_file_date_time_stamp   5 CHAR  20 20090114001531+1100
    #netcdf_revision               2 CHAR   6 2.3.2
    #number_of_times_calibrated   12 INT    1 0
    #number_of_times_processed    11 INT    1 1
    #operator_name                 8 CHAR  12 Dave and Su
    #raw_data_intensity_format    25 CHAR   6 Float
    #raw_data_mass_format         23 CHAR   6 Float
    #raw_data_time_format         24 CHAR   6 Short
    #sample_state                 13 CHAR  12 Other State
    #test_detector_type           18 CHAR  20 Electron Multiplier
    #test_ionization_mode         16 CHAR  16 Electron Impact
    #test_ionization_polarity     17 CHAR  18 Positive Polarity
    #test_ms_inlet                15 CHAR  17 Capillary Direct
    #test_resolution_type         19 CHAR  20 Constant Resolution
    #test_scan_direction          21 CHAR   3 Up
    #test_scan_function           20 CHAR  10 Mass Scan
    #test_scan_law                22 CHAR   7 Linear
    #test_separation_type         14 CHAR  18 No Chromatography

    # dimensions
    #_128_byte_string      6    128
    #_16_byte_string       3     16
    #_255_byte_string      7    255
    #_2_byte_string        0      2
    #_32_byte_string       4     32
    #_4_byte_string        1      4
    #_64_byte_string       5     64
    #_8_byte_string        2      8
    #error_number         10      1
    #instrument_number    12      1
    #point_number          9 554826 X
    #range                 8      2
    #scan_number          11   9865

    # variables
    #a_d_coaddition_factor    2 SHORT  0 scan_number(9865)
    #a_d_sampling_rate        1 DOUBLE 0 scan_number(9865)
    #actual_scan_number       7 INT    0 scan_number(9865)
    #error_log                0 CHAR   0 error_number(1), _64_byte_string(64)
    #flag_count              15 INT    0 scan_number(9865)
    #instrument_app_version  27 CHAR   0 instrument_number(1),
    #_32_byte_string(32)
    #instrument_comments     28 CHAR   0 instrument_number(1),
    #_32_byte_string(32)
    #instrument_fw_version   25 CHAR   0 instrument_number(1),
    #_32_byte_string(32)
    #instrument_id           20 CHAR   0 instrument_number(1),
    #_32_byte_string(32)
    #instrument_mfr          21 CHAR   0 instrument_number(1),
    #_32_byte_string(32)
    #instrument_model        22 CHAR   0 instrument_number(1),
    #_32_byte_string(32)
    #instrument_name         19 CHAR   0 instrument_number(1),
    #_32_byte_string(32)
    #instrument_os_version   26 CHAR   0 instrument_number(1),
    #_32_byte_string(32)
    #instrument_serial_no    23 CHAR   0 instrument_number(1),
    #_32_byte_string(32)
    #instrument_sw_version   24 CHAR   0 instrument_number(1),
    #_32_byte_string(32)
    #intensity_values        18 FLOAT  3 point_number(554826)
    #inter_scan_time          5 DOUBLE 0 scan_number(9865)
    #mass_range_max          10 DOUBLE 0 scan_number(9865)
    #mass_range_min           9 DOUBLE 0 scan_number(9865)
    #mass_values             16 FLOAT  2 point_number(554826)
    #point_count             14 INT    0 scan_number(9865)
    #resolution               6 DOUBLE 0 scan_number(9865)
    #scan_acquisition_time    3 DOUBLE 0 scan_number(9865)
    #scan_duration            4 DOUBLE 0 scan_number(9865)
    #scan_index              13 INT    0 scan_number(9865)
    #time_range_max          12 DOUBLE 0 scan_number(9865)
    #time_range_min          11 DOUBLE 0 scan_number(9865)
    #time_values             17 FLOAT  2 point_number(554826)
    #total_intensity          8 DOUBLE 1 scan_number(9865)

    # variable information
    #intensity_values attributes
    #name                 idx type   len value
    #-------------------- --- ----   --- -----
    #add_offset             1 DOUBLE   1 0.0
    #scale_factor           2 DOUBLE   1 1.0
    #units                  0 CHAR    26 Arbitrary Intensity Units

    #mass_values attributes
    #name                 idx type   len value
    #-------------------- --- ----   --- -----
    #scale_factor           1 DOUBLE   1 1.0
    #units                  0 CHAR     4 M/Z

    #time_values attributes
    #name                 idx type   len value
    #-------------------- --- ----   --- -----
    #scale_factor           1 DOUBLE   1 1.0
    #units                  0 CHAR     8 Seconds

    #total_intensity attributes
    #name                 idx type   len value
    #-------------------- --- ----   --- -----
    #units                  0 CHAR    26 Arbitrary Intensity Units

    # netCDF dimension names
    __POINT_NUMBER = "point_number"
    __SCAN_NUMBER = "scan_number"

    # the keys used to create certain data from the NetCDF file
    __MASS_STRING = "mass_values"
    __INTENSITY_STRING = "intensity_values"
    __TIME_STRING = "scan_acquisition_time"
    __POINT_COUNT = "point_count"

    if not is_str(file_name):
        error("'file_name' must be a string")

    mass_list = im.get_mass_list()
    time_list = im.get_time_list()

    # direct access, don't modify
    intensity_matrix = im.intensity_matrix

    # compress by ignoring zero intensities
    # included for consistency with imported netCDF format
    mass_values = []
    intensity_values = []
    point_count_values = []
    for row in intensity_matrix:
        pc = 0  # point count
        # all rows same len
        for col in range(len(intensity_matrix[0])):
            value = row[col]
            if value > 0:
                mass_values.append(mass_list[col])
                intensity_values.append(value)
                pc += 1
        point_count_values.append(pc)

    # sanity checks, done before the output file is created so that
    # inconsistent input does not overwrite an existing file
    if not len(time_list) == len(point_count_values):
        error("number of time points does not equal the number of scans")

    try:
        # Open netCDF file in overwrite mode, creating it if inexistent.
        nc = CDF(file_name, NC.WRITE | NC.TRUNC | NC.CREATE)
        # Automatically set define and data modes.
        nc.automode()
    except CDFError:
        error("Cannot create file '%s'" % file_name)

    try:
        # create dimensions
        # total number of data points
        dim_point_number = nc.def_dim(__POINT_NUMBER, len(mass_values))
        # number of scans
        dim_scan_number = nc.def_dim(__SCAN_NUMBER, len(point_count_values))

        # create variables
        # points
        var_mass_values = nc.def_var(__MASS_STRING, NC.FLOAT,
            dim_point_number)
        var_intensity_values = nc.def_var(__INTENSITY_STRING, NC.FLOAT,
            dim_point_number)
        # scans
        var_time_list = nc.def_var(__TIME_STRING, NC.DOUBLE, dim_scan_number)
        var_point_count_values = nc.def_var(__POINT_COUNT, NC.INT,
            dim_scan_number)

        # populate variables
        # points
        var_mass_values[:] = mass_values
        var_intensity_values[:] = intensity_values
        # scans
        var_time_list[:] = time_list
        var_point_count_values[:] = point_count_values
    finally:
        # close file, also when defining or writing a variable failed
        nc.close()
def ANDI_reader(file_name):
    """
    @summary: A reader for ANDI-MS NetCDF files, returns a GC-MS data
        object

    The file stores all mass/intensity points in two flat arrays. They
    are split into scans by assuming that masses ascend within a scan,
    so a drop in mass marks the start of the next scan.

    @param file_name: The name of the ANDI-MS file
    @type file_name: StringType

    @return: The GCMS_data object built from the times and scans read
    @rtype: GCMS_data

    @author: Qiao Wang
    @author: Andrew Isaac
    @author: Vladimir Likic
    """
    ## TODO: use 'point_count' and allow for zero len scans

    # the keys used to retrieve certain data from the NetCDF file
    __MASS_STRING = "mass_values"
    __INTENSITY_STRING = "intensity_values"
    __TIME_STRING = "scan_acquisition_time"

    if not is_str(file_name):
        error("'file_name' must be a string")
    try:
        # the file is only read, so it is opened read-only; this also
        # works for files without write permission
        rootgrp = Dataset(file_name, "r", format='NETCDF3_CLASSIC')
    except Exception:
        raise RuntimeError("Cannot open file '%s'" % file_name)

    print(" -> Reading netCDF file '%s'" % (file_name))

    # read everything up front so the file can be closed straight away
    try:
        mass_values = rootgrp.variables[__MASS_STRING][:].tolist()
        intensity_values = rootgrp.variables[__INTENSITY_STRING][:].tolist()
        time_list = rootgrp.variables[__TIME_STRING][:].tolist()
    finally:
        rootgrp.close()

    if not len(mass_values) == len(intensity_values):
        error("length of mass_list is not equal to length of intensity_list !")

    scan_list = []
    mass_list = []
    intensity_list = []
    for idx in range(len(mass_values)):
        mass = mass_values[idx]
        # assume masses in ascending order until new scan
        if len(mass_list) > 0 and not mass_list[-1] <= mass:
            scan_list.append(Scan(mass_list, intensity_list))
            mass_list = []
            intensity_list = []
        mass_list.append(mass)
        intensity_list.append(intensity_values[idx])
    # store final scan (nothing to store when the file holds no points)
    if len(mass_list) > 0:
        scan_list.append(Scan(mass_list, intensity_list))

    # sanity check
    if not len(time_list) == len(scan_list):
        raise RuntimeError(
            "number of time points does not equal the number of scans")

    return GCMS_data(time_list, scan_list)
def ANDI_reader(file_name):
    """
    @summary: A reader for ANDI-MS NetCDF files, returns a GC-MS data
        object

    The file stores all mass/intensity points in two flat arrays. They
    are split into scans by assuming that masses ascend within a scan,
    so a drop in mass marks the start of the next scan.

    @param file_name: The name of the ANDI-MS file
    @type file_name: StringType

    @return: The GCMS_data object built from the times and scans read
    @rtype: GCMS_data

    @author: Qiao Wang
    @author: Andrew Isaac
    @author: Vladimir Likic
    @author: Tony Chen
    """
    # the keys used to retrieve certain data from the NetCDF file
    __MASS_STRING = "mass_values"
    __INTENSITY_STRING = "intensity_values"
    __TIME_STRING = "scan_acquisition_time"

    if not is_str(file_name):
        error("'file_name' must be a string")
    try:
        dataset = Dataset(file_name, 'r')
    except Exception:
        error("Cannot open file '%s'" % file_name)

    print(" -> Reading netCDF file '%s'" % (file_name))

    # copy the variables into arrays so the file can be closed straight
    # away
    try:
        mass_values = np.array(dataset.variables[__MASS_STRING])
        intensity_values = np.array(dataset.variables[__INTENSITY_STRING])
        time_list = np.array(dataset.variables[__TIME_STRING])
    finally:
        dataset.close()

    if not len(mass_values) == len(intensity_values):
        error("length of mass_list is not equal to length of intensity_list !")

    scan_list = list()
    mass_list = list()
    intensity_list = list()
    for idx in range(len(mass_values)):
        mass = mass_values[idx]
        # assume masses in ascending order until new scan
        if len(mass_list) > 0 and not mass_list[-1] <= mass:
            scan_list.append(Scan(mass_list, intensity_list))
            mass_list = list()
            intensity_list = list()
        mass_list.append(mass)
        intensity_list.append(intensity_values[idx])
    # store final scan (nothing to store when the file holds no points)
    if len(mass_list) > 0:
        scan_list.append(Scan(mass_list, intensity_list))

    # sanity check
    if not len(time_list) == len(scan_list):
        error("number of time points does not equal the number of scans")

    return GCMS_data(time_list, scan_list)
def ANDI_reader(file_name):
    """
    @summary: A reader for ANDI-MS NetCDF files, returns a GC-MS data
        object

    The file stores all mass/intensity points in two flat arrays. They
    are split into scans by assuming that masses ascend within a scan,
    so a drop in mass marks the start of the next scan.

    If the file holds more time points than scans (seen with old GC-MS
    data) the surplus time points at the end are dropped.

    @param file_name: The name of the ANDI-MS file
    @type file_name: StringType

    @return: The GCMS_data object built from the times and scans read
    @rtype: GCMS_data

    @author: Qiao Wang
    @author: Andrew Isaac
    @author: Vladimir Likic
    """
    ## TODO: use 'point_count' and allow for zero len scans

    # the keys used to retrieve certain data from the NetCDF file
    __MASS_STRING = "mass_values"
    __INTENSITY_STRING = "intensity_values"
    __TIME_STRING = "scan_acquisition_time"

    if not is_str(file_name):
        error("'file_name' must be a string")
    try:
        cdf_file = CDF(file_name)
    except CDFError:
        error("Cannot open file '%s'" % file_name)

    print(" -> Reading netCDF file '%s'" % (file_name))

    # read everything up front so the file can be closed straight away
    try:
        mass_values = cdf_file.var(__MASS_STRING).get().tolist()
        intensity_values = cdf_file.var(__INTENSITY_STRING).get().tolist()
        time_list = cdf_file.var(__TIME_STRING).get().tolist()
    finally:
        cdf_file.close()

    if not len(mass_values) == len(intensity_values):
        error("length of mass_list is not equal to length of intensity_list !")

    scan_list = []
    mass_list = []
    intensity_list = []
    for idx in range(len(mass_values)):
        mass = mass_values[idx]
        # assume masses in ascending order until new scan
        if len(mass_list) > 0 and not mass_list[-1] <= mass:
            scan_list.append(Scan(mass_list, intensity_list))
            mass_list = []
            intensity_list = []
        mass_list.append(mass)
        intensity_list.append(intensity_values[idx])
    # store final scan (nothing to store when the file holds no points)
    if len(mass_list) > 0:
        scan_list.append(Scan(mass_list, intensity_list))

    # sanity check
    if len(time_list) > len(scan_list):
        #JT: Debug for old gcms data
        #JT: time longer than scans so trim
        print("Time list is")
        print(len(time_list) - len(scan_list))
        print("longer than scan list. Trimming....")
        time_list = time_list[0:len(scan_list)]
        print(len(time_list))
        print(len(scan_list))
    elif len(time_list) < len(scan_list):
        # trimming cannot repair a time list that is too short
        error("number of time points does not equal the number of scans")

    return GCMS_data(time_list, scan_list)