def test_hdf5_before_after(self):
    """Round-trip the master peaklists and peak matrix through HDF5.

    Both objects are saved to the test-results location, loaded back and
    compared with the in-memory originals.
    """
    # Peaklists: same number of lists, identical values in the first one.
    pls_name = "MTBLS79_mzml_triplicates.hdf5"
    save_peaklists_as_hdf5(self.pls_master, to_test_results(pls_name))
    pls = load_peaklists_from_hdf5(to_test_results(pls_name))

    self.assertEqual(len(pls), len(self.pls_master))
    for attr in ("mz", "intensity", "snr"):
        loaded = getattr(pls[0], attr)
        expected = getattr(self.pls_master[0], attr)
        self.assertTrue(np.all(loaded == expected))

    # Peak matrix: same shape, identical per-attribute mean vectors.
    pm_name = "MTBLS79_mzml_peak_matrix.hdf5"
    save_peak_matrix_as_hdf5(self.pm_master, to_test_results(pm_name))
    pm = load_peak_matrix_from_hdf5(to_test_results(pm_name))

    self.assertEqual(pm.shape, self.pm_master.shape)
    for attr in ('mz', 'intensity', 'snr'):
        loaded = pm.attr_mean_vector(attr)
        expected = self.pm_master.attr_mean_vector(attr)
        self.assertTrue(np.all(loaded == expected))
def test_peaklist_portal(self):
    """Save freshly created peaklists to HDF5 and compare them on reload."""
    pkls = self._createPeaklists()
    save_peaklists_as_hdf5(pkls, '.test_peaklist.hdf5')
    npkls = load_peaklists_from_hdf5('.test_peaklist.hdf5')

    sizes = [pl.size for pl in npkls]
    full_sizes = [pl.full_size for pl in npkls]
    self.assertListEqual(sizes, [75] * 6)
    self.assertListEqual(full_sizes, [100] * 6)

    # (original, reloaded) pairs, compared attribute by attribute.
    pairs = list(zip(pkls, npkls))

    self.assertTrue(
        all(np.allclose(old.mz_all, new.mz_all) for old, new in pairs))
    self.assertTrue(
        all(np.allclose(old.intensity, new.intensity)
            for old, new in pairs))
    self.assertTrue(
        all(np.allclose(old.snr, new.snr, atol=1e-30)
            for old, new in pairs))
    self.assertTrue(
        all(np.all(old.quad_flag == new.quad_flag) for old, new in pairs))
    self.assertTrue(
        all(np.all(old.lab == new.lab) for old, new in pairs))
    self.assertTrue(
        all(list(old.metadata.keys()) == list(new.metadata.keys())
            for old, new in pairs))
    self.assertTrue(
        all(old.tags.tag_types == new.tags.tag_types
            for old, new in pairs))
    self.assertTrue(
        all(old.tags.tag_values == new.tags.tag_values
            for old, new in pairs))
def hdf5_peaklists_to_txt(filename: str, path_out: str, delimiter: str = "\t"):
    """Write the peaklists stored in an HDF5 file to delimited text files.

    If the ID of the first peaklist contains a ``#``, peaklists are grouped
    by the part of their ID before the ``#``. Each group is written to one
    file, and the part after the ``#`` is added as a temporary ``event``
    attribute so it appears as a column. Otherwise every peaklist is written
    to its own file. Files are named after the ID with its extension
    replaced by ``.txt``.

    :param filename: Path to the HDF5 file that contains the peaklists.
    :param path_out: Existing directory the text files are written to.
    :param delimiter: Separator forwarded to the peaklist's ``to_str``.
    :raises IOError: If ``filename`` does not exist or is not an HDF5 file,
        or if ``path_out`` is not an existing directory.
    """
    if not os.path.isfile(filename):
        raise IOError('HDF5 database [%s] does not exist' % filename)
    if not h5py.is_hdf5(filename):
        raise IOError('input file [%s] is not a valid HDF5 database' % filename)
    if not os.path.isdir(path_out):
        # The original message had no placeholder, so the path was dropped.
        raise IOError("File or Directory does not exist: {}".format(path_out))

    obj = hdf5_portal.load_peaklists_from_hdf5(filename)
    if not obj:
        # Nothing stored, nothing to write (avoids IndexError on obj[0]).
        return

    if "#" in obj[0].ID:
        # Group on the exact ID prefix; a substring test would also match
        # peaklists of another file whose name merely contains this prefix.
        groups = {}
        for pl in obj:
            parts = pl.ID.split("#")
            groups.setdefault(parts[0], []).append((pl, parts[1]))

        for fn, members in groups.items():
            out_path = os.path.join(path_out, os.path.splitext(fn)[0] + ".txt")
            with open(out_path, "w") as pk_out:
                for position, (pl, event) in enumerate(members):
                    pl.add_attribute("event", pl.full_shape[0] * [event],
                                     flagged_only=False, on_index=3)
                    try:
                        str_out = pl.to_str(delimiter=delimiter)
                    finally:
                        # Leave the loaded peaklist as it was, even on error.
                        pl.drop_attribute("event")
                    if position == 0:
                        # First peaklist of THIS file: keep the header line.
                        pk_out.write(str_out)
                    else:
                        # Later peaklists: skip the header, keep the newline.
                        pk_out.write(str_out[str_out.index('\n'):])
    else:
        for pl in obj:
            out_path = os.path.join(path_out,
                                    os.path.splitext(pl.ID)[0] + ".txt")
            with open(out_path, "w") as pk_out:
                pk_out.write(pl.to_str(delimiter=delimiter))
    return
def test_peaklist_portal(self):
    """Save freshly created peaklists to HDF5 and compare them on reload.

    Lists are built eagerly: on Python 3 ``map`` returns an iterator, which
    ``assertListEqual`` rejects, and dict key views compare like sets, so
    keys are converted to lists to keep the comparison order-sensitive.
    """
    pkls = self._createPeaklists()
    save_peaklists_as_hdf5(pkls, '.test_peaklist.hdf5')
    npkls = load_peaklists_from_hdf5('.test_peaklist.hdf5')

    self.assertListEqual([x.size for x in npkls], [75] * 6)
    self.assertListEqual([x.full_size for x in npkls], [100] * 6)

    # (original, reloaded) pairs, compared attribute by attribute.
    pairs = list(zip(pkls, npkls))

    self.assertTrue(
        all(np.allclose(x[0].mz_all, x[1].mz_all) for x in pairs))
    self.assertTrue(
        all(np.allclose(x[0].intensity, x[1].intensity) for x in pairs))
    self.assertTrue(
        all(np.allclose(x[0].snr, x[1].snr, atol=1e-30) for x in pairs))
    self.assertTrue(
        all(np.all(x[0].quad_flag == x[1].quad_flag) for x in pairs))
    self.assertTrue(
        all(np.all(x[0].lab == x[1].lab) for x in pairs))
    self.assertTrue(
        all(list(x[0].metadata.keys()) == list(x[1].metadata.keys())
            for x in pairs))
    self.assertTrue(
        all(x[0].tags.tag_types == x[1].tags.tag_types for x in pairs))
    self.assertTrue(
        all(x[0].tags.tag_values == x[1].tags.tag_values for x in pairs))
def check_paths(tsv, source):
    """Resolve ``source`` (and an optional file list) into a list of names.

    ``source`` may be a directory, a zip archive, an HDF5 file of peaklists,
    a single ``.mzml``/``.raw`` file, or a list/tuple of either file paths or
    ``PeakList`` objects. When ``tsv`` is given, its first column (which must
    be named ``filename`` or ``sample_id``) lists the entries to use and each
    entry is checked against ``source``.

    :param tsv: Path to a tab-delimited file with a header row, or ``None``.
    :param source: Directory / archive / HDF5 / data-file path, or a
        list/tuple of file paths or ``PeakList`` objects.
    :return: List of file paths or peaklist IDs.
    :raises IOError: If a path does not exist, a file has an unsupported
        extension, an archive contains ``.raw`` files, the tsv header is
        wrong, or a tsv entry is missing from ``source``.
    """
    data_exts = (".mzml", ".raw")

    if tsv is None:
        if isinstance(source, str):
            if os.path.isdir(source):
                filenames = [
                    os.path.join(source, fn) for fn in os.listdir(source)
                    if fn.lower().endswith(data_exts)
                ]
            elif zipfile.is_zipfile(source):
                with zipfile.ZipFile(source) as zf:
                    names = zf.namelist()
                if any(fn.lower().endswith(".raw") for fn in names):
                    raise IOError(
                        "Archive with *.raw files not yet supported. Convert to mzML"
                    )
                filenames = [fn for fn in names if fn.lower().endswith(".mzml")]
            elif h5py.is_hdf5(source):
                peaklists = hdf5_portal.load_peaklists_from_hdf5(source)
                # IDs are resolved relative to the directory of the HDF5 file.
                base_dir = os.path.abspath(os.path.dirname(source))
                filenames = [os.path.join(base_dir, pl.ID) for pl in peaklists]
            elif os.path.isfile(source):
                if source.lower().endswith(data_exts):
                    filenames = [source]
                else:
                    raise IOError(
                        "Incorrect file format, provide .mzml or .raw files: {}"
                        .format(source))
            else:
                raise IOError(
                    "[Errno 2] No such file or directory: {}".format(source))
        elif isinstance(source, (list, tuple)):
            # `source and ...` keeps an empty sequence from raising IndexError.
            if source and isinstance(source[0], PeakList):
                filenames = [pl.ID for pl in source]
            else:
                filenames = []
                for fn in source:
                    # Report the offending entry, not the whole list.
                    if not os.path.isfile(fn):
                        raise IOError(
                            "[Errno 2] No such file or directory: {}".format(fn))
                    if not fn.lower().endswith(data_exts):
                        raise IOError(
                            "Incorrect file format, provide .mzml or .raw files: {}"
                            .format(fn))
                    filenames.append(fn)
        else:
            raise IOError(
                "[Errno 2] No such file or directory: {}".format(source))

    elif os.path.isfile(tsv):
        fm = np.genfromtxt(tsv, dtype=None, delimiter="\t", names=True)
        if len(fm.shape) == 0:
            # A single data row comes back 0-dimensional; make it iterable.
            fm = np.array([fm])

        first_col = fm.dtype.names[0]
        if first_col not in ("filename", "sample_id"):
            raise IOError(
                "Incorrect header for first column. Use filename or sample_id")
        listed = fm[first_col]

        filenames = []
        if isinstance(source, (list, tuple)):
            if source and isinstance(source[0], PeakList):
                known_ids = {pl.ID for pl in source}
                for filename in listed:
                    if filename not in known_ids:
                        raise IOError(
                            "{} does not exist in list with Peaklist objects".
                            format(filename))
                    filenames.append(filename)
            else:
                basenames = {os.path.basename(fn) for fn in source}
                # The loop stops at the first missing entry, so the running
                # row number equals the index of its first occurrence.
                for row, filename in enumerate(listed, start=1):
                    if filename not in basenames:
                        raise IOError(
                            "{} (row {}) does not exist in source provided".
                            format(filename, row))
                for fn in source:
                    if not os.path.isfile(fn):
                        raise IOError(
                            "[Errno 2] No such file or directory: {}".format(fn))
                    filenames.append(fn)
        elif isinstance(source, str):
            if os.path.isdir(source):
                present = set(os.listdir(source))
                for fn in listed:
                    if os.path.basename(fn) not in present:
                        raise IOError(
                            "{} does not exist in directory provided".format(
                                os.path.basename(fn)))
                    filenames.append(os.path.join(source, fn))
            elif zipfile.is_zipfile(source):
                with zipfile.ZipFile(source) as zf:
                    names = zf.namelist()
                if any(fn.lower().endswith(".raw") for fn in names):
                    raise IOError(
                        "Archive with *.raw files not yet supported. Convert to mzML"
                    )
                members = set(names)
                for fn in listed:
                    if fn not in members:
                        raise IOError(
                            "{} does not exist in .zip file".format(fn))
                    filenames.append(fn)
            elif h5py.is_hdf5(source):
                peaklists = hdf5_portal.load_peaklists_from_hdf5(source)
                filenames = [pl.ID for pl in peaklists]
            else:
                raise IOError(
                    "[Errno 2] No such file or directory: {} or {}".format(
                        source, tsv))
        # NOTE(review): any other type of `source` falls through and yields
        # an empty list here, unlike the `tsv is None` branch which raises.
    else:
        raise IOError("[Errno 2] No such file or directory: {} or {}".format(
            source, tsv))

    return filenames
def main():  # pragma: no cover
    """Command-line entry point: parse arguments and run the chosen step.

    Sub-commands: ``group-scans``, ``process-scans``,
    ``create-spectral-trees``, ``annotate-spectral-trees``,
    ``rank-spectral-trees`` and ``convert-spectral-trees``. Exactly one step
    runs per invocation; nothing runs when no sub-command is given.
    """
    print("Executing msnpy version %s." % __version__)

    parser = argparse.ArgumentParser(
        description=
        'Python package to process and annotate MSn fragmentation data',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    subparsers = parser.add_subparsers(dest='step')

    parser_g = subparsers.add_parser(
        'group-scans', help='Group fragmentation events and/or experiments.')
    parser_ps = subparsers.add_parser('process-scans',
                                      help='Process and filter scans.')
    parser_cst = subparsers.add_parser(
        'create-spectral-trees',
        help='Create spectral trees from processed scan (fragmentation) data.')
    parser_ast = subparsers.add_parser(
        'annotate-spectral-trees',
        help='Annotate and/or filter spectral trees.')
    parser_rst = subparsers.add_parser('rank-spectral-trees',
                                       help='Rank annotated spectral trees.')
    parser_cvst = subparsers.add_parser(
        'convert-spectral-trees',
        help=
        'Convert spectral trees to either dimspy peaklists, MSP files or both')

    ##################################################################
    # GROUP SCANS
    ##################################################################
    parser_g.add_argument('-i', '--input', type=str, required=True,
                          help="Mzml or Thermo Scientific raw file")
    parser_g.add_argument('-o', '--output', type=str, required=True, help="")
    parser_g.add_argument('-u', '--report', type=str, required=False,
                          default=None, help="Summary/Report of groups")
    parser_g.add_argument('-n', '--number-of-headers', default=2, type=int,
                          required=False, help="")
    parser_g.add_argument('-r', '--min-replicates', default=None, type=int,
                          required=False, help="")
    parser_g.add_argument('-t', '--max-injection-time', default=None,
                          type=float, required=False, help="")
    parser_g.add_argument('-s', '--split', action='store_true',
                          required=False, help="")
    parser_g.add_argument('-m', '--merge-ms1', action='store_true',
                          required=False, help="")

    ##################################################################
    # PROCESS SCANS
    ##################################################################
    parser_ps.add_argument('-i', '--input', type=str, required=True,
                           help="Mzml or Thermo Scientific raw file")
    parser_ps.add_argument(
        '-g', '--groups', type=str, required=True,
        help="Json or gml file that includes the groups of scans")
    parser_ps.add_argument('-o', '--output', type=str, required=True,
                           help="HDF5 file to save the peaklist objects to.")
    parser_ps.add_argument('-m', '--function-noise',
                           choices=["median", "mean", "mad", "noise_packets"],
                           required=True,
                           help="Select function to calculate noise.")
    # NOTE(review): required=True makes the default below unreachable;
    # confirm which of the two is intended.
    parser_ps.add_argument('-s', '--snr-threshold', default=3.0, type=float,
                           required=True, help="Signal-to-noise threshold")
    parser_ps.add_argument(
        '-p', '--ppm', default=2.0, type=float, required=False,
        help=
        "Mass tolerance in Parts per million to group peaks across scans / mass spectra."
    )
    parser_ps.add_argument(
        '-a', '--min-fraction', default=0.5, type=float, required=False,
        help=
        "Minimum fraction a peak has to be present. Use 0.0 to not apply this filter."
    )
    parser_ps.add_argument(
        '-d', '--rsd-threshold', default=None, type=float, required=False,
        help=
        "Maximum threshold - relative standard deviation (Calculated for peaks that have been measured across a minimum of two scans)."
    )
    parser_ps.add_argument('-n', '--normalise', default=None, type=float,
                           required=False,
                           help="Normalise scans by Total Ion Current (TIC)")
    parser_ps.add_argument(
        '-e', '--exclusion-list', nargs='+', default=None, required=False,
        help=
        "List of mz values to exclude from processing (e.g. from electrical noise)"
    )
    parser_ps.add_argument('-r', '--ringing-threshold', default=None,
                           type=float, required=False,
                           help="Remove ringing artifacts.")
    parser_ps.add_argument('-u', '--report', type=str, required=False,
                           default=None,
                           help="Summary/Report of processed mass spectra")
    parser_ps.add_argument(
        '-b', '--block-size', default=5000, type=int, required=False,
        help="The size of each block of peaks to perform clustering on.")
    parser_ps.add_argument('-c', '--ncpus', default=None, type=int,
                           required=False,
                           help="Number of central processing units (CPUs).")

    ##################################################################
    # CREATE SPECTRAL TREES
    ##################################################################
    parser_cst.add_argument(
        '-i', '--input', type=str, required=True,
        help="HDF5 file (Peaklist objects) from step 'process-scans'.")
    parser_cst.add_argument('-g', '--groups', type=str, required=True,
                            help="")
    parser_cst.add_argument('-o', '--output', type=str, required=True,
                            help="")

    ##################################################################
    # ANNOTATE SPECTRAL TREES
    ##################################################################
    parser_ast.add_argument('-i', '--input', type=str, required=True,
                            help="Json file containing spectral trees")
    parser_ast.add_argument('-p', '--ppm', default=2.0, type=float,
                            required=False,
                            help="Mass tolerance in Parts per million.")
    parser_ast.add_argument('-r', '--rules', action='store_true',
                            required=False, help="")
    parser_ast.add_argument('-m', '--mf-db', type=str, required=False,
                            default="http://multiomics-int.cs.bham.ac.uk",
                            help="Molecular formulae database")
    parser_ast.add_argument(
        '-d', '--output-db', type=str, required=True,
        help=
        "Sqlite database file to store information regarding the annotations.")
    parser_ast.add_argument(
        '-o', '--output-trees', type=str, required=True,
        help="Json file containing the annotated spectral trees.")
    # NOTE(review): required=True makes the default below unreachable;
    # confirm which of the two is intended.
    parser_ast.add_argument(
        '-a', '--adducts', nargs='+', required=True,
        help="Adducts e.g. [M+H]+ [M+NH4]+ [M+Na]+ [M+(39K)]+",
        default=['[M+H]+', '[M+Na]+', '[M+NH4]+'])
    parser_ast.add_argument('-f', '--filter', action='store_true',
                            required=False,
                            help="Filter the spectral tree annotations")

    #################################
    # RANK SPECTRAL TREES
    #################################
    parser_rst.add_argument(
        '-i', '--input', type=str, required=True,
        help="Json file containing annotated spectral trees")
    parser_rst.add_argument('-o', '--output', type=str, required=True,
                            help="Summary of the rankings")

    ##################################################################
    # CONVERT SPECTRA TREES - TO DIMSPY.PEAKLISTS AND MSP FILES
    ##################################################################
    parser_cvst.add_argument(
        '-i', '--input', type=str, required=True,
        help=
        "Json file containing annotated spectral trees or dimspy peaklist hdf5 file"
    )
    parser_cvst.add_argument('-o', '--output', type=str, required=True,
                             help="Out folder containing spectra")
    parser_cvst.add_argument(
        '-x', '--input_type', default="json", type=str, required=False,
        help="If input is either a dimspy peaklist or a msnpy json")
    parser_cvst.add_argument('-n', '--name', type=str, required=False,
                             help="Name to use for suffixing files")
    # NOTE(review): the help text of --adjust_mz, --merge and --msp looks
    # copy-pasted from '--filter'; confirm the intended wording.
    parser_cvst.add_argument('-a', '--adjust_mz', action='store_true',
                             required=False,
                             help="Filter the spectral tree annotations")
    parser_cvst.add_argument('-m', '--merge', action='store_true',
                             required=False,
                             help="Filter the spectral tree annotations")
    parser_cvst.add_argument('-p', '--ppm', default=5.0, type=float,
                             required=False,
                             help="Mass tolerance in Parts per million.")
    parser_cvst.add_argument('-s', '--msp', action='store_true',
                             required=False,
                             help="Filter the spectral tree annotations")
    parser_cvst.add_argument(
        '-t', '--msp_type', default="massbank", type=str, required=False,
        help="If MSP file is to be created what type (massbank, msp)")
    parser_cvst.add_argument(
        '-z', '--polarity', type=str, required=False, default='NA',
        help="Polarity to add to the MSP file (positive or negative)")
    parser_cvst.add_argument(
        '-y', '--ms1', action='store_true', required=False,
        help=
        "Output ms1 spectra (creates spectra for the precursors in the MS1 spectra"
    )

    args = parser.parse_args()
    print(args)

    # Steps are mutually exclusive, so dispatch with a single if/elif chain.
    # (The original ran 'create-spectral-trees' twice via a duplicated block.)
    if args.step == "group-scans":
        groups = group_scans(filename=args.input,
                             nh=args.number_of_headers,
                             min_replicates=args.min_replicates,
                             report=args.report,
                             max_injection_time=args.max_injection_time,
                             merge_ms1=args.merge_ms1,
                             split=args.split)
        save_groups(groups=groups, filename=args.output, format="json")

    elif args.step == "process-scans":
        peaklists = process_scans(
            filename=args.input,
            groups=load_groups(args.groups, format="json")
            if args.groups else None,
            function_noise=args.function_noise,
            snr_thres=args.snr_threshold,
            ppm=args.ppm,
            min_fraction=args.min_fraction,
            rsd_thres=args.rsd_threshold,
            normalise=args.normalise,
            ringing_thres=args.ringing_threshold,
            exclusion_list=args.exclusion_list,
            report=args.report,
            block_size=args.block_size,
            ncpus=args.ncpus)
        hdf5_portal.save_peaklists_as_hdf5(peaklists, args.output)

    elif args.step == "create-spectral-trees":
        groups = load_groups(args.groups, format="json")
        pls = hdf5_portal.load_peaklists_from_hdf5(args.input)
        spectral_trees = create_spectral_trees(groups, pls)
        save_trees(spectral_trees, args.output, format="json")

    elif args.step == "annotate-spectral-trees":
        spectral_trees = load_trees(args.input, format="json")
        # Brackets may arrive encoded as __ob__/__cb__; decode them.
        adducts = [
            a.replace('__ob__', '[').replace('__cb__', ']')
            for a in args.adducts
        ]
        st = annotate_mf(spectral_trees=spectral_trees,
                         db_out=args.output_db,
                         ppm=args.ppm,
                         adducts=adducts,
                         rules=args.rules,
                         mf_db=args.mf_db)
        if args.filter:
            st = filter_mf(st, args.output_db)
        save_trees(st, args.output_trees, format="json")

    elif args.step == "rank-spectral-trees":
        st = load_trees(args.input, format="json")
        ranks = rank_mf(st)
        ranks.to_csv(args.output, sep="\t", index=False)

    elif args.step == "convert-spectral-trees":
        print('converting trees to dimspy peaklists')
        if args.input_type == 'json':
            non_merged_pls, merged_pls, ms1_precursor_pl = tree2peaklist(
                tree_pth=args.input,
                out_pth=args.output,
                name=args.name,
                adjust_mz=args.adjust_mz,
                merge=args.merge,
                ppm=args.ppm)
            if args.msp:
                print('Converting dimspy peaklists to MSP files')
                if non_merged_pls:
                    peaklist2msp(non_merged_pls,
                                 os.path.join(
                                     args.output,
                                     '{}_non_merged.msp'.format(args.name)),
                                 msp_type=args.msp_type,
                                 polarity=args.polarity)
                if merged_pls:
                    peaklist2msp(merged_pls,
                                 os.path.join(
                                     args.output,
                                     '{}_merged.msp'.format(args.name)),
                                 msp_type=args.msp_type,
                                 polarity=args.polarity)
                if ms1_precursor_pl:
                    peaklist2msp(ms1_precursor_pl,
                                 os.path.join(
                                     args.output,
                                     '{}_ms1_precursors.msp'.format(
                                         args.name)),
                                 msp_type=args.msp_type,
                                 polarity=args.polarity,
                                 include_ms1=True)
        else:
            # Any other input type is read as an HDF5 file of peaklists.
            pls = hdf5_portal.load_peaklists_from_hdf5(args.input)
            peaklist2msp(pls,
                         os.path.join(args.output,
                                      '{}.msp'.format(args.name)),
                         msp_type=args.msp_type,
                         polarity=args.polarity)
def main():
    """Command-line entry point: parse arguments and run the chosen step.

    Sub-commands: ``process-scans`` (align scans within each well and save
    the resulting peaklists to HDF5), ``process-samples`` (align peaklists
    into a filtered peak matrix), ``hdf5-pls-to-txt`` and ``hdf5-pm-to-txt``
    (export HDF5 content as text). Nothing runs when no sub-command is given.
    """

    def _str_to_bool(value):
        # argparse's type=bool turns every non-empty string, including
        # "False", into True; interpret the usual "false" spellings instead.
        return str(value).strip().lower() not in ("", "false", "f", "0",
                                                  "no", "n")

    # Create ArgumentParser object
    parser = argparse.ArgumentParser(
        description=
        'Python package for processing acoustic mist ionisation-mass spectrometry -based metabolomics and lipidomics data',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # subparsers
    subparsers = parser.add_subparsers(dest='step')

    parser_scans = subparsers.add_parser(
        'process-scans', help='Process and align scans within samples.')
    parser_samples = subparsers.add_parser('process-samples',
                                           help='Process and align samples.')
    parser_hpmt = subparsers.add_parser(
        'hdf5-pm-to-txt',
        help='Write HDF5 output (peak matrix) to text format.')
    parser_hplt = subparsers.add_parser(
        'hdf5-pls-to-txt',
        help='Write HDF5 output (peak lists) to text format.')

    #####################
    # Process Scans
    #####################
    parser_scans.add_argument(
        "-i", "--input", type=str, nargs='+', required=True,
        metavar='source',
        help=
        "Absolute or relative path to the *.mzml file(s). Must be in same order as 'metascans *txt files'"
    )
    parser_scans.add_argument(
        '-ms', '--metascans', type=str, nargs='+', required=True,
        metavar='source',
        help=
        "Absolute or relative path to the comma-delimited *.txt metadata file. Must be in same order and 'input' *mzml files. Header names must contain and be in the following order names =['barcode', 'date/time', 'row', 'col', 'scan', 'ejection time', 'NA'] as output by MS-Parser tool"
    )
    parser_scans.add_argument(
        "-o", "--output",
        help="Absolute or relative path to the output file",
        action="store", type=str, required=True)
    parser_scans.add_argument(
        "-f", "--failed-wells",
        help=
        "Absolute or relative path to the *.txt output of which well failed",
        action="store", type=str, required=True)
    parser_scans.add_argument(
        "-pr", "--processed_scans",
        help=
        "Absolute or relative path to the tab-delimited *.txt output listing the scans processed for each well",
        action="store", type=str, required=True)
    # "on_scans_no_edge" (the default) was missing from the choices, so it
    # could not be passed explicitly; the old spelling is kept so existing
    # command lines still parse.
    parser_scans.add_argument(
        "-m", "--method",
        help=
        "Method to define which scans to extract data from. DEFAULT = on_scans_no_edge",
        action="store", type=str,
        choices=["all_scans", "on_scans", "off_scans", "on_scans_no_edge",
                 "on_scan_no_edge"],
        default="on_scans_no_edge")
    parser_scans.add_argument(
        "-d", "--id-snr",
        help=
        "For identifying on/off scans: Hard SNR threshold for differentiating between on/off scans. DEFAULT = 15",
        action="store", type=int, default=15)
    parser_scans.add_argument(
        "-t", "--id-tol",
        help=
        "For identifying on/off scans: Number of features with SNR > threshold to tolerate in off scans. DEFAULT = 3",
        action="store", type=int, default=3)
    # Help text now states the default that is actually applied (3).
    parser_scans.add_argument(
        "-s", "--snr-threshold",
        help="SNR threshold to remove noise features. DEFAULT = 3",
        action="store", type=int, default=3)
    parser_scans.add_argument(
        "-n", "--min-scans",
        help=
        "Minimum number of scans required to be labelled on within a well for sample to be taken forward. DEFAULT = 0",
        action="store", type=int, default=0)
    parser_scans.add_argument(
        "-r", "--rsd-threshold",
        help=
        "RSD filter (scan level): Threshold of RSD of features across scans in sample for it to be retained. DEFAULT = None",
        action="store", type=int, default=None)
    parser_scans.add_argument(
        "-fr", "--min-fraction",
        help=
        "Minimum fraction a peak has to be present. Use 0.0 to not apply this filter.",
        action="store", type=float, default=None)
    parser_scans.add_argument(
        "-p", "--ppm",
        help=
        "Aligning scans: m/z precision (ppm) to align scans in sample - REQUIRED PARAMETER!",
        action="store", type=int, required=True)
    parser_scans.add_argument(
        '-l', '--metalist', type=str, required=False,
        help=
        "Absolute or relative path to the tab-delimited *.txt file that include the name of the data files (*.mzml) and meta data. "
        "Column names: filename, replicate, batch, injectionOrder, classLabel."
    )

    #################################
    # Process Samples
    #################################
    parser_samples.add_argument(
        "-i", "--input",
        help=
        "Absolute or relative path to the *.hdf5 file containing all peaklists from process scans",
        action="store", type=str, required=True)
    parser_samples.add_argument(
        "-o", "--output",
        help="Absolute or relative path to the output file",
        action="store", type=str, required=True)
    parser_samples.add_argument(
        "-p", "--ppm",
        help=
        "Aligning samples: m/z precision (ppm) to align samples in study - REQUIRED PARAMETER!",
        action="store", type=int, required=True)
    parser_samples.add_argument(
        "-b", "--block-size",
        help=
        "Aligning samples: Number peaks in each centre clustering block for alignment of samples. DEFAULT = 5000 (should increase for large studies)",
        action="store", type=int, default=5000)
    parser_samples.add_argument(
        "-fr", "--min-fraction",
        help="Minimum percentage of samples a peak has to be present.",
        action="store", type=float, required=False, default=None)
    parser_samples.add_argument(
        '-r', '--rsd-threshold', default=None, type=float, required=False,
        help=
        "Peaks where the associated QC peaks are above this threshold will be removed."
    )
    parser_samples.add_argument(
        '-w', '--within', type=_str_to_bool, nargs='?', const=True,
        default=False,
        help="Apply sample filter within each sample class.")
    parser_samples.add_argument('-q', '--qc-label', default=None, type=str,
                                required=False, help="Class label for QCs")

    #################################
    # HDF5 peaklists to text
    #################################
    parser_hplt.add_argument(
        '-i', '--input', type=str, required=True,
        help=
        "Absolute or relative path to the HDF5 file that contains a list of peaklist objects from one of the processing steps."
    )
    parser_hplt.add_argument("-o", "--output", help="Directory to write to.",
                             action="store", type=str, default=os.getcwd())
    parser_hplt.add_argument(
        '-d', '--delimiter', default="tab", choices=["tab", "comma"],
        help="Values on each line of the file are separated by this character."
    )

    #################################
    # HDF5 peak matrix to text
    #################################
    parser_hpmt.add_argument(
        '-i', '--input', type=str, required=True,
        help=
        "Absolute or relative path to the HDF5 file that contains a peak matrix object from one of the processing steps."
    )
    parser_hpmt.add_argument('-o', '--output', type=str, required=True,
                             help="Directory to write to.")
    parser_hpmt.add_argument('-a', '--attribute_name', default="intensity",
                             choices=["intensity", "mz", "snr"],
                             required=False, help="Type of matrix to print.")
    parser_hpmt.add_argument(
        '-l', '--class-label-rsd', action='append', required=False,
        default=(),
        help="Class label to select samples for RSD calculatons (e.g. QC).")
    parser_hpmt.add_argument(
        '-d', '--delimiter', default="tab", choices=["tab", "comma"],
        help="Values on each line of the file are separated by this character."
    )
    parser_hpmt.add_argument(
        '-s', '--representation-samples', default="rows",
        choices=["rows", "columns"],
        help="Should the rows or columns respresent the samples?")
    parser_hpmt.add_argument(
        '-c', '--comprehensive', action='store_true', required=False,
        help=
        "Whether to output simple or comprehensive version of the peak matrix. Do not use argument if want simple output, use -c or --comprehensive for comprehensive output"
    )

    args = parser.parse_args()
    print(args)

    if args.step == "process-scans":
        peaklists = []
        failed_wells = []
        scans_processed = {}
        alphabet = list(string.ascii_uppercase)

        for i, mzml_path in enumerate(args.input):
            print("Acquisition; {}".format(mzml_path))

            # Store spectral data
            run = Mzml(mzml_path)

            # Define which wells scans are associated with
            df = pd.read_csv(args.metascans[i], header=None,
                             names=["barcode", "date/time", "row", "col",
                                    "scan", "ejection time", "NA"])
            df = df[["barcode", "row", "col", "scan"]]
            # Label is "<barcode>_<row letter><two-digit column>".
            df['well_label'] = df.apply(
                lambda row: "%s_%s%02d" %
                (row.barcode, alphabet[row.row - 1], row.col),
                axis=1)

            if args.metalist is not None:
                metadata = validate_metadata(args.metalist)

            for index, well in df[["well_label"]].drop_duplicates().iterrows():
                well_label = well["well_label"]
                well_scans = list(df[df["well_label"] == well_label]["scan"])

                wellInfo = Scans(run, well, well_scans, args.id_snr,
                                 args.id_tol)
                scan_ids = wellInfo.extract(args.method)

                # Recorded for every well, whether extract() returned a
                # string or a collection of scan ids (both original branches
                # stored the same value). Keyed by label rather than the
                # positional well[0], which recent pandas versions reject.
                scans_processed[well_label] = scan_ids

                if len(scan_ids) < args.min_scans:
                    line = "Well: {}, failed due to: < {} scans in well taken forward. Scan_ids for well: {}".format(
                        well, args.min_scans, scan_ids)
                    failed_wells.append(line)
                    continue

                # Regenerates peak lists for each well (pl is individual
                # scan) with user defined snr, rsd and min fraction
                # thresholds.
                # pls is the spectral data (mz, intensity, snr, flags) for
                # all scans
                pls = run.peaklists(scan_ids, function_noise="median")
                # Filters out noise using SNR
                pls = [
                    filter_attr(pl, "snr", min_threshold=args.snr_threshold)
                    if len(pl.mz) > 0 else pl for pl in pls
                ]
                # Keep only the extracted scans/peaklists
                pls = [pl for pl in pls if int(pl.ID) in scan_ids]

                try:
                    # Forms aligned peak matrix from peakLists
                    pm = align_peaks(pls, ppm=args.ppm, block_size=5000,
                                     edge_extend=(2 * args.ppm))
                except ValueError as e:
                    line = "Well: {}, failed due to: {}.".format(well, e)
                    failed_wells.append(line)
                    continue

                # Generates peakLists from aligned peak matrix
                pl_aligned = pm.to_peaklist(ID="{}".format(well_label))

                # NOTE(review): nesting was reconstructed from flattened
                # source; only the "snr" attribute is assumed to depend on
                # this check - confirm against the original file.
                if "snr" in pm.attributes:
                    pl_aligned.add_attribute("snr",
                                             pm.attr_mean_vector("snr"),
                                             on_index=2)
                pl_aligned.add_attribute("rsd", pm.rsd(flagged_only=False),
                                         on_index=4)
                pl_aligned.add_attribute('snr_flag',
                                         np.ones(pl_aligned.full_size),
                                         flagged_only=False, is_flag=True)

                if args.rsd_threshold is not None:
                    # Materialised list: a lazy map object is not a usable
                    # attribute vector on Python 3.
                    rsd_flag = [
                        not np.isnan(x) and x < args.rsd_threshold
                        for x in pl_aligned.get_attribute(
                            "rsd", flagged_only=False)
                    ]
                    pl_aligned.add_attribute("rsd_flag", rsd_flag,
                                             flagged_only=False,
                                             is_flag=True)

                if args.min_fraction is not None:
                    pl_aligned.add_attribute(
                        "internal_fraction_flag",
                        (pm.present / float(pm.shape[0])) >=
                        args.min_fraction,
                        flagged_only=False, is_flag=True)

                if args.metalist is not None:
                    pl_aligned = update_metadata_and_labels([pl_aligned],
                                                            metadata)
                    peaklists.append(pl_aligned[0])
                else:
                    peaklists.append(pl_aligned)

        with open(args.failed_wells, "w") as out:
            for well in failed_wells:
                out.write("{}\n".format(well))

        out_df = pd.DataFrame.from_dict(scans_processed, orient='index')
        out_df.to_csv(args.processed_scans, sep='\t')

        hdf5_portal.save_peaklists_as_hdf5(peaklists,
                                           "{}.hdf5".format(args.output))

    elif args.step == "process-samples":
        peaklists = hdf5_portal.load_peaklists_from_hdf5(args.input)

        # align peaks into mz bins... ppm = ppm_precision
        peakmatrix = align_peaks(peaklists, ppm=args.ppm,
                                 block_size=args.block_size,
                                 edge_extend=(2 * args.ppm))
        peakmatrix = sample_filter(peakmatrix,
                                   min_fraction=args.min_fraction,
                                   within=args.within,
                                   qc_label=args.qc_label,
                                   rsd_thres=args.rsd_threshold)
        hdf5_portal.save_peak_matrix_as_hdf5(peakmatrix, args.output)

    elif args.step == 'hdf5-pls-to-txt':
        hdf5_peaklists_to_txt(args.input, path_out=args.output,
                              delimiter=map_delimiter(args.delimiter))

    elif args.step == 'hdf5-pm-to-txt':
        samples_in_rows = args.representation_samples == "rows"
        hdf5_peak_matrix_to_txt(args.input, path_out=args.output,
                                attr_name=args.attribute_name,
                                delimiter=map_delimiter(args.delimiter),
                                rsd_tags=args.class_label_rsd,
                                samples_in_rows=samples_in_rows,
                                comprehensive=args.comprehensive)