コード例 #1
0
    def test_hdf5_before_after(self):
        """Check that peaklists and a peak matrix survive an HDF5 round trip.

        The master peaklists and the master peak matrix are each saved to the
        path returned by ``to_test_results``, loaded back, and compared with
        the in-memory originals: first on length/shape, then on the ``mz``,
        ``intensity`` and ``snr`` values.
        """
        compared_attrs = ("mz", "intensity", "snr")

        # --- peaklists: only the first peaklist is compared value by value ---
        save_peaklists_as_hdf5(
            self.pls_master, to_test_results("MTBLS79_mzml_triplicates.hdf5"))
        reloaded_pls = load_peaklists_from_hdf5(
            to_test_results("MTBLS79_mzml_triplicates.hdf5"))

        self.assertEqual(len(reloaded_pls), len(self.pls_master))
        for attr in compared_attrs:
            self.assertTrue(
                np.all(getattr(reloaded_pls[0], attr) ==
                       getattr(self.pls_master[0], attr)))

        # --- peak matrix: compared through its per-attribute mean vectors ---
        save_peak_matrix_as_hdf5(
            self.pm_master, to_test_results("MTBLS79_mzml_peak_matrix.hdf5"))
        reloaded_pm = load_peak_matrix_from_hdf5(
            to_test_results("MTBLS79_mzml_peak_matrix.hdf5"))

        self.assertEqual(reloaded_pm.shape, self.pm_master.shape)
        for attr in compared_attrs:
            self.assertTrue(
                np.all(reloaded_pm.attr_mean_vector(attr) ==
                       self.pm_master.attr_mean_vector(attr)))
コード例 #2
0
ファイル: test_hdf5_portal.py プロジェクト: RJMW/dimspy
    def test_peaklist_portal(self):
        """Round-trip generated peaklists through HDF5 and compare the result.

        The peaklists from ``_createPeaklists`` are saved to
        ``.test_peaklist.hdf5`` and loaded back. The loaded peaklists must
        report the expected sizes and must match the originals, pair by pair,
        on m/z values, intensity, SNR, the ``quad_flag`` and ``lab``
        attributes, metadata keys, and tag types/values.
        """
        originals = self._createPeaklists()

        save_peaklists_as_hdf5(originals, '.test_peaklist.hdf5')
        restored = load_peaklists_from_hdf5('.test_peaklist.hdf5')

        def assert_each_pair(matches):
            # Build a full list (not a generator) so every pair is evaluated
            # before the overall result is asserted.
            self.assertTrue(
                all([
                    matches(before, after)
                    for before, after in zip(originals, restored)
                ]))

        self.assertListEqual([pl.size for pl in restored], [75] * 6)
        self.assertListEqual([pl.full_size for pl in restored], [100] * 6)

        # Numeric attributes are compared with a tolerance.
        assert_each_pair(
            lambda before, after: np.allclose(before.mz_all, after.mz_all))
        assert_each_pair(
            lambda before, after: np.allclose(before.intensity,
                                              after.intensity))
        assert_each_pair(
            lambda before, after: np.allclose(
                before.snr, after.snr, atol=1e-30))

        # Flags and labels must match element for element.
        assert_each_pair(
            lambda before, after: np.all(before.quad_flag == after.quad_flag))
        assert_each_pair(
            lambda before, after: np.all(before.lab == after.lab))

        # Metadata keys (including their order) and tags must be identical.
        assert_each_pair(
            lambda before, after: list(before.metadata.keys()) == list(
                after.metadata.keys()))
        assert_each_pair(
            lambda before, after: before.tags.tag_types == after.tags.
            tag_types)
        assert_each_pair(
            lambda before, after: before.tags.tag_values == after.tags.
            tag_values)
コード例 #3
0
def hdf5_peaklists_to_txt(filename: str, path_out: str, delimiter: str = "\t"):
    """
    Write the peaklists stored in an HDF5 file to delimited text files.

    If the ID of the first peaklist contains ``#``, every ID is split on
    ``#`` into a file name and a sub-identifier. Peaklists sharing a file
    name are written to one text file, each with an extra ``event``
    attribute holding its sub-identifier. Otherwise each peaklist is written
    to its own text file. Output files are named after the ID (or its file
    name part) with the extension replaced by ``.txt``.

    :param filename: Path to the HDF5 file that contains the peaklists.
    :param path_out: Existing directory in which the text files are created.
    :param delimiter: Delimiter passed to the peaklist ``to_str`` method.
    :raises IOError: If ``filename`` does not exist or is not an HDF5 file,
        or if ``path_out`` is not an existing directory.
    """

    if not os.path.isfile(filename):
        raise IOError('HDF5 database [%s] does not exist' % filename)
    if not h5py.is_hdf5(filename):
        raise IOError('input file [%s] is not a valid HDF5 database' %
                      filename)

    if not os.path.isdir(path_out):
        # The placeholder was missing, so the path never made it into the
        # message.
        raise IOError(
            "File or Directory does not exist: {}".format(path_out))

    obj = hdf5_portal.load_peaklists_from_hdf5(filename)
    if len(obj) == 0:
        # Nothing to export; avoids indexing into an empty list below.
        return

    if "#" in obj[0].ID:
        # Group on the exact file-name part of the ID. A substring test
        # would also pick up peaklists of other files with similar names.
        grouped = {}
        for pl in obj:
            id_parts = pl.ID.split("#")
            grouped.setdefault(id_parts[0], []).append((id_parts[1], pl))

        for fn, members in grouped.items():
            out_name = os.path.join(path_out,
                                    os.path.splitext(fn)[0] + ".txt")
            with open(out_name, "w") as pk_out:
                for position, (sub_id, pl) in enumerate(members):
                    pl.add_attribute("event",
                                     pl.full_shape[0] * [sub_id],
                                     flagged_only=False,
                                     on_index=3)
                    str_out = pl.to_str(delimiter=delimiter)
                    if position == 0:
                        # First peaklist of THIS file keeps its header line.
                        pk_out.write(str_out)
                    else:
                        # Later peaklists are appended without their first
                        # (header) line.
                        pk_out.write(str_out[str_out.index('\n'):])
                    pl.drop_attribute("event")
    else:
        for pl in obj:
            out_name = os.path.join(path_out,
                                    os.path.splitext(pl.ID)[0] + ".txt")
            with open(out_name, "w") as pk_out:
                pk_out.write(pl.to_str(delimiter=delimiter))
    return
コード例 #4
0
    def test_peaklist_portal(self):
        """Round-trip generated peaklists through HDF5 and compare the result.

        The peaklists from ``_createPeaklists`` are saved to
        ``.test_peaklist.hdf5`` and loaded back. The loaded peaklists must
        report the expected sizes and must match the originals, pair by pair,
        on m/z values, intensity, SNR, the ``quad_flag`` and ``lab``
        attributes, metadata keys, and tag types/values.
        """
        pkls = self._createPeaklists()

        save_peaklists_as_hdf5(pkls, '.test_peaklist.hdf5')
        npkls = load_peaklists_from_hdf5('.test_peaklist.hdf5')

        # assertListEqual requires real lists; on Python 3 map() returns an
        # iterator, so the sizes are collected with list comprehensions.
        self.assertListEqual([pl.size for pl in npkls], [75] * 6)
        self.assertListEqual([pl.full_size for pl in npkls], [100] * 6)

        # Numeric attributes are compared with a tolerance.
        self.assertTrue(
            all(
                np.allclose(old.mz_all, new.mz_all)
                for old, new in zip(pkls, npkls)))
        self.assertTrue(
            all(
                np.allclose(old.intensity, new.intensity)
                for old, new in zip(pkls, npkls)))
        self.assertTrue(
            all(
                np.allclose(old.snr, new.snr, atol=1e-30)
                for old, new in zip(pkls, npkls)))

        # Flags and labels must match element for element.
        self.assertTrue(
            all(
                np.all(old.quad_flag == new.quad_flag)
                for old, new in zip(pkls, npkls)))
        self.assertTrue(
            all(np.all(old.lab == new.lab) for old, new in zip(pkls, npkls)))

        # Compare the keys as lists so the check is order-sensitive on both
        # Python 2 and Python 3 (key views compare like sets on Python 3).
        self.assertTrue(
            all(
                list(old.metadata.keys()) == list(new.metadata.keys())
                for old, new in zip(pkls, npkls)))
        self.assertTrue(
            all(
                old.tags.tag_types == new.tags.tag_types
                for old, new in zip(pkls, npkls)))
        self.assertTrue(
            all(
                old.tags.tag_values == new.tags.tag_values
                for old, new in zip(pkls, npkls)))
コード例 #5
0
ファイル: paths.py プロジェクト: arpankbasak/dimspy
def check_paths(tsv, source):
    """Resolve the list of file names (or peaklist IDs) described by the inputs.

    Without a ``tsv`` file the names are derived from ``source`` alone:

    * a directory: every ``.mzml`` / ``.raw`` file in it (case-insensitive);
    * a zip archive: every ``.mzml`` member (archives with ``.raw`` members
      are rejected);
    * an HDF5 file: the peaklist IDs, joined to the HDF5 file's directory;
    * a single ``.mzml`` / ``.raw`` file: that file;
    * a list/tuple of ``PeakList`` objects: their IDs;
    * a list/tuple of paths: the paths, each of which must be an existing
      ``.mzml`` / ``.raw`` file.

    With a ``tsv`` file, its first column (which must be named ``filename``
    or ``sample_id``) is checked against ``source`` and used to build the
    result.

    :param tsv: Path to a tab-delimited file with a header row, or ``None``.
    :param source: Directory, zip archive, HDF5 file or data file path, or a
        list/tuple of paths or ``PeakList`` objects.
    :return: List of file names / peaklist IDs.
    :raises IOError: If a path does not exist, has an unsupported format, or
        an entry of the tsv file cannot be found in ``source``.
    """
    data_exts = (".mzml", ".raw")
    raw_in_zip_msg = \
        "Archive with *.raw files not yet supported. Convert to mzML"

    if tsv is None:
        if isinstance(source, str):
            if os.path.isdir(source):
                filenames = [
                    os.path.join(source, fn) for fn in os.listdir(source)
                    if fn.lower().endswith(data_exts)
                ]
            elif zipfile.is_zipfile(source):
                with zipfile.ZipFile(source) as zf:
                    members = zf.namelist()
                if any(fn.lower().endswith(".raw") for fn in members):
                    raise IOError(raw_in_zip_msg)
                filenames = [
                    fn for fn in members if fn.lower().endswith(".mzml")
                ]
            elif h5py.is_hdf5(source):
                peaklists = hdf5_portal.load_peaklists_from_hdf5(source)
                base_dir = os.path.abspath(os.path.dirname(source))
                filenames = [os.path.join(base_dir, pl.ID) for pl in peaklists]
            elif os.path.isfile(source):
                if not source.lower().endswith(data_exts):
                    raise IOError(
                        "Incorrect file format, provide .mzml or .raw files: {}"
                        .format(source))
                filenames = [source]
            else:
                raise IOError(
                    "[Errno 2] No such file or directory: {}".format(source))

        elif isinstance(source, (list, tuple)):
            # Guard the [0] lookup so an empty sequence yields an empty
            # result instead of an IndexError.
            if source and isinstance(source[0], PeakList):
                filenames = [pl.ID for pl in source]
            else:
                filenames = []
                for fn in source:
                    # Report the offending entry, not the whole sequence.
                    if not os.path.isfile(fn):
                        raise IOError(
                            "[Errno 2] No such file or directory: {}".format(
                                fn))
                    if not fn.lower().endswith(data_exts):
                        raise IOError(
                            "Incorrect file format, provide .mzml or .raw files: {}"
                            .format(fn))
                    filenames.append(fn)
        else:
            raise IOError(
                "[Errno 2] No such file or directory: {}".format(source))

    elif os.path.isfile(tsv):
        # NOTE(review): no ``encoding`` is passed; depending on the NumPy
        # version string columns may come back as bytes -- confirm.
        fm = np.genfromtxt(tsv, dtype=None, delimiter="\t", names=True)
        if len(fm.shape) == 0:
            # A single data row is returned as a 0-d array; make it iterable.
            fm = np.array([fm])
        first_column = fm.dtype.names[0]
        if first_column != "filename" and first_column != "sample_id":
            raise IOError(
                "Incorrect header for first column. Use filename or sample_id")
        listed = fm[first_column]

        filenames = []
        if isinstance(source, (list, tuple)):
            if source and isinstance(source[0], PeakList):
                known_ids = [pl.ID for pl in source]
                for filename in listed:
                    if filename not in known_ids:
                        raise IOError(
                            "{} does not exist in list with Peaklist objects".
                            format(filename))
                    filenames.append(filename)
            else:
                basenames = [os.path.basename(fn) for fn in source]
                for row, filename in enumerate(listed, start=1):
                    if filename not in basenames:
                        raise IOError(
                            "{} (row {}) does not exist in source provided".
                            format(filename, row))
                for fn in source:
                    if not os.path.isfile(fn):
                        raise IOError(
                            "[Errno 2] No such file or directory: {}".format(
                                fn))
                    filenames.append(fn)

        elif isinstance(source, str):
            if os.path.isdir(source):
                dir_entries = os.listdir(source)
                for fn in listed:
                    if os.path.basename(fn) not in dir_entries:
                        raise IOError(
                            "{} does not exist in directory provided".format(
                                os.path.basename(fn)))
                    filenames.append(os.path.join(source, fn))

            elif zipfile.is_zipfile(source):
                with zipfile.ZipFile(source) as zf:
                    members = zf.namelist()
                if any(fn.lower().endswith(".raw") for fn in members):
                    raise IOError(raw_in_zip_msg)
                for fn in listed:
                    if fn not in members:
                        raise IOError(
                            "{} does not exist in .zip file".format(fn))
                    filenames.append(fn)

            elif h5py.is_hdf5(source):
                peaklists = hdf5_portal.load_peaklists_from_hdf5(source)
                filenames = [pl.ID for pl in peaklists]
            else:
                raise IOError(
                    "[Errno 2] No such file or directory: {} or {}".format(
                        source, tsv))
    else:
        raise IOError("[Errno 2] No such file or directory: {} or {}".format(
            source, tsv))

    return filenames
コード例 #6
0
def main():  # pragma: no cover
    """Command-line entry point for msnpy.

    Builds an argument parser with one sub-command per processing step
    (``group-scans``, ``process-scans``, ``create-spectral-trees``,
    ``annotate-spectral-trees``, ``rank-spectral-trees`` and
    ``convert-spectral-trees``), parses the command line, prints the parsed
    arguments and runs the selected step. If no sub-command is given,
    nothing is executed after the arguments are printed.
    """

    print("Executing msnpy version %s." % __version__)

    parser = argparse.ArgumentParser(
        description=
        'Python package to process and annotate MSn fragmentation data',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # The chosen sub-command name is stored in ``args.step``.
    subparsers = parser.add_subparsers(dest='step')

    parser_g = subparsers.add_parser(
        'group-scans', help='Group fragmentation events and/or experiments.')
    parser_ps = subparsers.add_parser('process-scans',
                                      help='Process and filter scans.')

    parser_cst = subparsers.add_parser(
        'create-spectral-trees',
        help='Create spectral trees from processed scan (fragmentation) data.')
    parser_ast = subparsers.add_parser(
        'annotate-spectral-trees',
        help='Annotate and/or filter spectral trees.')
    parser_rst = subparsers.add_parser('rank-spectral-trees',
                                       help='Rank annotated spectral trees.')
    parser_cvst = subparsers.add_parser(
        'convert-spectral-trees',
        help=
        'Convert spectral trees to either dimspy peaklists, MSP files or both')

    ##################################################################
    # GROUP SCANS
    ##################################################################

    parser_g.add_argument('-i',
                          '--input',
                          type=str,
                          required=True,
                          help="Mzml or Thermo Scientific raw file")

    parser_g.add_argument('-o', '--output', type=str, required=True, help="")

    parser_g.add_argument('-u',
                          '--report',
                          type=str,
                          required=False,
                          default=None,
                          help="Summary/Report of groups")

    parser_g.add_argument('-n',
                          '--number-of-headers',
                          default=2,
                          type=int,
                          required=False,
                          help="")

    parser_g.add_argument('-r',
                          '--min-replicates',
                          default=None,
                          type=int,
                          required=False,
                          help="")

    parser_g.add_argument('-t',
                          '--max-injection-time',
                          default=None,
                          type=float,
                          required=False,
                          help="")

    parser_g.add_argument('-s',
                          '--split',
                          action='store_true',
                          required=False,
                          help="")

    parser_g.add_argument('-m',
                          '--merge-ms1',
                          action='store_true',
                          required=False,
                          help="")

    ##################################################################
    # PROCESS SCANS
    ##################################################################

    parser_ps.add_argument('-i',
                           '--input',
                           type=str,
                           required=True,
                           help="Mzml or Thermo Scientific raw file")

    parser_ps.add_argument(
        '-g',
        '--groups',
        type=str,
        required=True,
        help="Json or gml file that includes the groups of scans")

    parser_ps.add_argument('-o',
                           '--output',
                           type=str,
                           required=True,
                           help="HDF5 file to save the peaklist objects to.")

    parser_ps.add_argument('-m',
                           '--function-noise',
                           choices=["median", "mean", "mad", "noise_packets"],
                           required=True,
                           help="Select function to calculate noise.")

    # NOTE(review): the option is required, so the default of 3.0 is never
    # applied; it only shows up in the generated help text.
    parser_ps.add_argument('-s',
                           '--snr-threshold',
                           default=3.0,
                           type=float,
                           required=True,
                           help="Signal-to-noise threshold")

    parser_ps.add_argument(
        '-p',
        '--ppm',
        default=2.0,
        type=float,
        required=False,
        help=
        "Mass tolerance in Parts per million to group peaks across scans / mass spectra."
    )

    parser_ps.add_argument(
        '-a',
        '--min-fraction',
        default=0.5,
        type=float,
        required=False,
        help=
        "Minimum fraction a peak has to be present. Use 0.0 to not apply this filter."
    )

    parser_ps.add_argument(
        '-d',
        '--rsd-threshold',
        default=None,
        type=float,
        required=False,
        help=
        "Maximum threshold - relative standard deviation (Calculated for peaks that have been measured across a minimum of two scans)."
    )

    parser_ps.add_argument('-n',
                           '--normalise',
                           default=None,
                           type=float,
                           required=False,
                           help="Normalise scans by Total Ion Current (TIC)")

    parser_ps.add_argument(
        '-e',
        '--exclusion-list',
        nargs='+',
        default=None,
        required=False,
        help=
        "List of mz values to exclude from processing (e.g. from electrical noise)"
    )

    parser_ps.add_argument('-r',
                           '--ringing-threshold',
                           default=None,
                           type=float,
                           required=False,
                           help="Remove ringing artifacts.")

    parser_ps.add_argument('-u',
                           '--report',
                           type=str,
                           required=False,
                           default=None,
                           help="Summary/Report of processed mass spectra")

    parser_ps.add_argument(
        '-b',
        '--block-size',
        default=5000,
        type=int,
        required=False,
        help="The size of each block of peaks to perform clustering on.")

    parser_ps.add_argument('-c',
                           '--ncpus',
                           default=None,
                           type=int,
                           required=False,
                           help="Number of central processing units (CPUs).")

    ##################################################################
    # CREATE SPECTRAL TREES
    ##################################################################

    parser_cst.add_argument(
        '-i',
        '--input',
        type=str,
        required=True,
        help="HDF5 file (Peaklist objects) from step 'process-scans'.")

    parser_cst.add_argument('-g', '--groups', type=str, required=True, help="")

    parser_cst.add_argument('-o', '--output', type=str, required=True, help="")

    ##################################################################
    # ANNOTATE SPECTRAL TREES
    ##################################################################
    parser_ast.add_argument('-i',
                            '--input',
                            type=str,
                            required=True,
                            help="Json file containing spectral trees")

    parser_ast.add_argument('-p',
                            '--ppm',
                            default=2.0,
                            type=float,
                            required=False,
                            help="Mass tolerance in Parts per million.")

    parser_ast.add_argument('-r',
                            '--rules',
                            action='store_true',
                            required=False,
                            help="")

    parser_ast.add_argument('-m',
                            '--mf-db',
                            type=str,
                            required=False,
                            default="http://multiomics-int.cs.bham.ac.uk",
                            help="Molecular formulae database")

    parser_ast.add_argument(
        '-d',
        '--output-db',
        type=str,
        required=True,
        help=
        "Sqlite database file to store information regarding the annotations.")

    parser_ast.add_argument(
        '-o',
        '--output-trees',
        type=str,
        required=True,
        help="Json file containing the annotated spectral trees.")

    # NOTE(review): the option is required, so the default adduct list is
    # never applied; it only shows up in the generated help text.
    parser_ast.add_argument(
        '-a',
        '--adducts',
        nargs='+',
        required=True,
        help="Adducts e.g. [M+H]+ [M+NH4]+ [M+Na]+ [M+(39K)]+",
        default=['[M+H]+', '[M+Na]+', '[M+NH4]+'])

    parser_ast.add_argument('-f',
                            '--filter',
                            action='store_true',
                            required=False,
                            help="Filter the spectral tree annotations")

    #################################
    # RANK SPECTRAL TREES
    #################################
    parser_rst.add_argument(
        '-i',
        '--input',
        type=str,
        required=True,
        help="Json file containing annotated spectral trees")

    parser_rst.add_argument('-o',
                            '--output',
                            type=str,
                            required=True,
                            help="Summary of the rankings")

    ##################################################################
    # CONVERT SPECTRA TREES - TO DIMSPY.PEAKLISTS AND MSP FILES
    ##################################################################
    parser_cvst.add_argument(
        '-i',
        '--input',
        type=str,
        required=True,
        help=
        "Json file containing annotated spectral trees or dimspy peaklist hdf5 file"
    )

    parser_cvst.add_argument('-o',
                             '--output',
                             type=str,
                             required=True,
                             help="Out folder containing spectra")

    parser_cvst.add_argument(
        '-x',
        '--input_type',
        default="json",
        type=str,
        required=False,
        help="If input is either a dimspy peaklist or a msnpy json")

    parser_cvst.add_argument('-n',
                             '--name',
                             type=str,
                             required=False,
                             help="Name to use for suffixing files")

    # NOTE(review): the help text below looks copy-pasted from --filter;
    # the flag is forwarded as ``adjust_mz`` to tree2peaklist -- confirm the
    # intended wording.
    parser_cvst.add_argument('-a',
                             '--adjust_mz',
                             action='store_true',
                             required=False,
                             help="Filter the spectral tree annotations")

    # NOTE(review): same copy-pasted help text; the flag is forwarded as
    # ``merge`` to tree2peaklist -- confirm the intended wording.
    parser_cvst.add_argument('-m',
                             '--merge',
                             action='store_true',
                             required=False,
                             help="Filter the spectral tree annotations")

    parser_cvst.add_argument('-p',
                             '--ppm',
                             default=5.0,
                             type=float,
                             required=False,
                             help="Mass tolerance in Parts per million.")

    # This flag switches on the peaklist -> MSP conversion further down.
    parser_cvst.add_argument(
        '-s',
        '--msp',
        action='store_true',
        required=False,
        help="Also convert the dimspy peaklists to MSP files")

    parser_cvst.add_argument(
        '-t',
        '--msp_type',
        default="massbank",
        type=str,
        required=False,
        help="If MSP file is to be created what type (massbank, msp)")

    parser_cvst.add_argument(
        '-z',
        '--polarity',
        type=str,
        required=False,
        default='NA',
        help="Polarity to add to the MSP file (positive or negative)")

    # NOTE(review): --ms1 is parsed but not read anywhere in this function.
    parser_cvst.add_argument(
        '-y',
        '--ms1',
        action='store_true',
        required=False,
        help=
        "Output ms1 spectra (creates spectra for the precursors in the MS1 spectra)"
    )

    args = parser.parse_args()

    print(args)

    # Exactly one step runs per invocation, so the steps are dispatched with
    # an if/elif chain. 'create-spectral-trees' used to appear twice, which
    # made that step run (and write its output) twice.
    if args.step == "group-scans":
        groups = group_scans(filename=args.input,
                             nh=args.number_of_headers,
                             min_replicates=args.min_replicates,
                             report=args.report,
                             max_injection_time=args.max_injection_time,
                             merge_ms1=args.merge_ms1,
                             split=args.split)

        save_groups(groups=groups, filename=args.output, format="json")

    elif args.step == "process-scans":
        peaklists = process_scans(
            filename=args.input,
            groups=load_groups(args.groups, format="json")
            if args.groups else None,
            function_noise=args.function_noise,
            snr_thres=args.snr_threshold,
            ppm=args.ppm,
            min_fraction=args.min_fraction,
            rsd_thres=args.rsd_threshold,
            normalise=args.normalise,
            ringing_thres=args.ringing_threshold,
            exclusion_list=args.exclusion_list,
            report=args.report,
            block_size=args.block_size,
            ncpus=args.ncpus)

        hdf5_portal.save_peaklists_as_hdf5(peaklists, args.output)

    elif args.step == "create-spectral-trees":
        groups = load_groups(args.groups, format="json")
        pls = hdf5_portal.load_peaklists_from_hdf5(args.input)
        spectral_trees = create_spectral_trees(groups, pls)
        save_trees(spectral_trees, args.output, format="json")

    elif args.step == "annotate-spectral-trees":
        spectral_trees = load_trees(args.input, format="json")

        # Square brackets may arrive encoded as __ob__ / __cb__; turn them
        # back into '[' and ']' before the adducts are used.
        adducts = [
            a.replace('__ob__', '[').replace('__cb__', ']')
            for a in args.adducts
        ]

        st = annotate_mf(spectral_trees=spectral_trees,
                         db_out=args.output_db,
                         ppm=args.ppm,
                         adducts=adducts,
                         rules=args.rules,
                         mf_db=args.mf_db)

        if args.filter:
            st = filter_mf(st, args.output_db)
        save_trees(st, args.output_trees, format="json")

    elif args.step == "rank-spectral-trees":
        st = load_trees(args.input, format="json")
        ranks = rank_mf(st)
        ranks.to_csv(args.output, sep="\t", index=False)

    elif args.step == "convert-spectral-trees":
        print('converting trees to dimspy peaklists')
        if args.input_type == 'json':
            non_merged_pls, merged_pls, ms1_precursor_pl = tree2peaklist(
                tree_pth=args.input,
                out_pth=args.output,
                name=args.name,
                adjust_mz=args.adjust_mz,
                merge=args.merge,
                ppm=args.ppm)
            if args.msp:
                print('Converting dimspy peaklists to MSP files')
                # Each non-empty peaklist collection gets its own MSP file.
                if non_merged_pls:
                    peaklist2msp(non_merged_pls,
                                 os.path.join(
                                     args.output,
                                     '{}_non_merged.msp'.format(args.name)),
                                 msp_type=args.msp_type,
                                 polarity=args.polarity)
                if merged_pls:
                    peaklist2msp(merged_pls,
                                 os.path.join(
                                     args.output,
                                     '{}_merged.msp'.format(args.name)),
                                 msp_type=args.msp_type,
                                 polarity=args.polarity)
                if ms1_precursor_pl:
                    peaklist2msp(ms1_precursor_pl,
                                 os.path.join(
                                     args.output,
                                     '{}_ms1_precursors.msp'.format(
                                         args.name)),
                                 msp_type=args.msp_type,
                                 polarity=args.polarity,
                                 include_ms1=True)
        else:
            # Any other input type is treated as a dimspy peaklist HDF5 file
            # and converted straight to a single MSP file.
            pls = hdf5_portal.load_peaklists_from_hdf5(args.input)
            peaklist2msp(pls,
                         os.path.join(args.output, '{}.msp'.format(args.name)),
                         msp_type=args.msp_type,
                         polarity=args.polarity)
コード例 #7
0
def _str2bool(value):
    """Convert a command-line token to a boolean.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is
    ``True`` because every non-empty string is truthy.

    :param value: Raw token from the command line, or an actual ``bool``
        (argparse does not pass ``const``/non-string defaults through
        ``type``, but booleans are accepted for safety).
    :return: The parsed boolean.
    :raises argparse.ArgumentTypeError: If the token is not a recognised
        boolean spelling.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ("true", "t", "yes", "y", "1"):
        return True
    # The empty string stays falsy, as it was under ``type=bool``.
    if token in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value (true/false), got {!r}".format(value))


def _build_arg_parser():
    """Create the command-line parser with its four sub-commands.

    Sub-commands (stored in ``args.step``): ``process-scans``,
    ``process-samples``, ``hdf5-pm-to-txt`` and ``hdf5-pls-to-txt``.

    :return: The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        description=
        'Python package for processing acoustic mist ionisation-mass spectrometry -based metabolomics and lipidomics data',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    subparsers = parser.add_subparsers(dest='step')

    parser_scans = subparsers.add_parser(
        'process-scans', help='Process and align scans within samples.')
    parser_samples = subparsers.add_parser(
        'process-samples', help='Process and align samples.')
    parser_hpmt = subparsers.add_parser(
        'hdf5-pm-to-txt',
        help='Write HDF5 output (peak matrix) to text format.')
    parser_hplt = subparsers.add_parser(
        'hdf5-pls-to-txt',
        help='Write HDF5 output (peak lists) to text format.')

    #####################
    # Process Scans
    #####################

    parser_scans.add_argument(
        "-i",
        "--input",
        type=str,
        nargs='+',
        required=True,
        metavar='source',
        help=
        "Absolute or relative path to the *.mzml file(s). Must be in same order as 'metascans *txt files'"
    )

    parser_scans.add_argument(
        '-ms',
        '--metascans',
        type=str,
        nargs='+',
        required=True,
        metavar='source',
        help=
        "Absolute or relative path to the comma-delimited *.txt metadata file. Must be in same order and 'input' *mzml files. Header names must contain and be in the following order names =['barcode', 'date/time', 'row', 'col', 'scan', 'ejection time', 'NA'] as output by MS-Parser tool"
    )

    parser_scans.add_argument(
        "-o",
        "--output",
        help="Absolute or relative path to the output file",
        action="store",
        type=str,
        required=True)

    parser_scans.add_argument(
        "-f",
        "--failed-wells",
        help=
        "Absolute or relative path to the *.txt output of which well failed",
        action="store",
        type=str,
        required=True)

    parser_scans.add_argument(
        "-pr",
        "--processed_scans",
        help=
        "Absolute or relative path to the tab-delimited *.txt output listing the scans processed for each well",
        action="store",
        type=str,
        required=True)

    # "on_scans_no_edge" is the default and therefore has to be selectable
    # explicitly as well; the historical spelling "on_scan_no_edge" is kept
    # so existing command lines keep parsing.
    parser_scans.add_argument(
        "-m",
        "--method",
        help=
        "Method to define which scans to extract data from. DEFAULT = on_scans_no_edge",
        action="store",
        type=str,
        choices=[
            "all_scans", "on_scans", "off_scans", "on_scans_no_edge",
            "on_scan_no_edge"
        ],
        default="on_scans_no_edge")

    parser_scans.add_argument(
        "-d",
        "--id-snr",
        help=
        "For identifying on/off scans: Hard SNR threshold for differentiating between on/off scans. DEFAULT = 15",
        action="store",
        type=int,
        default=15)

    parser_scans.add_argument(
        "-t",
        "--id-tol",
        help=
        "For identifying on/off scans: Number of features with SNR > threshold to tolerate in off scans. DEFAULT = 3",
        action="store",
        type=int,
        default=3)

    parser_scans.add_argument(
        "-s",
        "--snr-threshold",
        help="SNR threshold to remove noise features. DEFAULT = 3",
        action="store",
        type=int,
        default=3)

    parser_scans.add_argument(
        "-n",
        "--min-scans",
        help=
        "Minimum number of scans required to be labelled on within a well for sample to be taken forward. DEFAULT = 0",
        action="store",
        type=int,
        default=0)

    parser_scans.add_argument(
        "-r",
        "--rsd-threshold",
        help=
        "RSD filter (scan level): Threshold of RSD of features across scans in sample for it to be retained. DEFAULT = None",
        action="store",
        type=int,
        default=None)

    parser_scans.add_argument(
        "-fr",
        "--min-fraction",
        help=
        "Minimum fraction a peak has to be present. Use 0.0 to not apply this filter.",
        action="store",
        type=float,
        default=None)

    parser_scans.add_argument(
        "-p",
        "--ppm",
        help=
        "Aligning scans: m/z precision (ppm) to align scans in sample - REQUIRED PARAMETER!",
        action="store",
        type=int,
        required=True)

    parser_scans.add_argument(
        '-l',
        '--metalist',
        type=str,
        required=False,
        help=
        "Absolute or relative path to the tab-delimited *.txt file that include the name of the data files (*.mzml) and meta data. "
        "Column names: filename, replicate, batch, injectionOrder, classLabel."
    )

    #################################
    # Process Samples
    #################################

    parser_samples.add_argument(
        "-i",
        "--input",
        help=
        "Absolute or relative path to the *.hdf5 file containing all peaklists from process scans",
        action="store",
        type=str,
        required=True)

    parser_samples.add_argument(
        "-o",
        "--output",
        help="Absolute or relative path to the output file",
        action="store",
        type=str,
        required=True)

    parser_samples.add_argument(
        "-p",
        "--ppm",
        help=
        "Aligning samples: m/z precision (ppm) to align samples in study - REQUIRED PARAMETER!",
        action="store",
        type=int,
        required=True)

    parser_samples.add_argument(
        "-b",
        "--block-size",
        help=
        "Aligning samples: Number peaks in each centre clustering block for alignment of samples. DEFAULT = 5000 (should increase for large studies)",
        action="store",
        type=int,
        default=5000)

    parser_samples.add_argument(
        "-fr",
        "--min-fraction",
        help="Minimum percentage of samples a peak has to be present.",
        action="store",
        type=float,
        required=False,
        default=None)

    parser_samples.add_argument(
        '-r',
        '--rsd-threshold',
        default=None,
        type=float,
        required=False,
        help=
        "Peaks where the associated QC peaks are above this threshold will be removed."
    )

    # A bare "-w" yields True (const); "-w false" now really yields False.
    parser_samples.add_argument(
        '-w',
        '--within',
        type=_str2bool,
        nargs='?',
        const=True,
        default=False,
        help="Apply sample filter within each sample class.")

    parser_samples.add_argument('-q',
                                '--qc-label',
                                default=None,
                                type=str,
                                required=False,
                                help="Class label for QCs")

    #################################
    # HDF5 peaklists to text
    #################################

    parser_hplt.add_argument(
        '-i',
        '--input',
        type=str,
        required=True,
        help=
        "Absolute or relative path to the HDF5 file that contains a list of peaklist objects from one of the processing steps."
    )

    parser_hplt.add_argument("-o",
                             "--output",
                             help="Directory to write to.",
                             action="store",
                             type=str,
                             default=os.getcwd())

    parser_hplt.add_argument(
        '-d',
        '--delimiter',
        default="tab",
        choices=["tab", "comma"],
        help="Values on each line of the file are separated by this character."
    )

    #################################
    # HDF5 peak matrix to text
    #################################

    parser_hpmt.add_argument(
        '-i',
        '--input',
        type=str,
        required=True,
        help=
        "Absolute or relative path to the HDF5 file that contains a peak matrix object from one of the processing steps."
    )

    parser_hpmt.add_argument('-o',
                             '--output',
                             type=str,
                             required=True,
                             help="Directory to write to.")

    parser_hpmt.add_argument('-a',
                             '--attribute_name',
                             default="intensity",
                             choices=["intensity", "mz", "snr"],
                             required=False,
                             help="Type of matrix to print.")

    # The default must be a list: the 'append' action copies the default and
    # calls .append() on it, which raises AttributeError for a tuple.
    parser_hpmt.add_argument(
        '-l',
        '--class-label-rsd',
        action='append',
        required=False,
        default=[],
        help="Class label to select samples for RSD calculatons (e.g. QC).")

    parser_hpmt.add_argument(
        '-d',
        '--delimiter',
        default="tab",
        choices=["tab", "comma"],
        help="Values on each line of the file are separated by this character."
    )

    parser_hpmt.add_argument(
        '-s',
        '--representation-samples',
        default="rows",
        choices=["rows", "columns"],
        help="Should the rows or columns respresent the samples?")

    parser_hpmt.add_argument(
        '-c',
        '--comprehensive',
        action='store_true',
        required=False,
        help=
        "Whether to output simple or comprehensive version of the peak matrix. Do not use argument if want simple output, use -c or --comprehensive for comprehensive output"
    )

    return parser


def _process_scans(args):
    """Run the ``process-scans`` step.

    For every (mzML, metascans) pair the scans are grouped by well, the scans
    selected by ``Scans.extract`` are SNR-filtered and aligned into one
    peaklist per well, and optional RSD / minimum-fraction flags are added.

    Three outputs are written: the failure messages (``args.failed_wells``),
    a tab-separated table of the scan ids recorded per well
    (``args.processed_scans``) and the peaklists (``<args.output>.hdf5``).

    :param args: Parsed ``process-scans`` namespace.
    """
    peaklists = []
    failed_wells = []
    scans_processed = {}

    # The metadata file does not depend on the acquisition, so validate it
    # once instead of once per input file.
    metadata = None
    if args.metalist is not None:
        metadata = validate_metadata(args.metalist)

    row_letters = string.ascii_uppercase

    for mzml_path, metascans_path in zip(args.input, args.metascans):

        print("Acquisition; {}".format(mzml_path))
        # Store spectral data
        run = Mzml(mzml_path)

        # Define which wells scans are associated with
        df = pd.read_csv(metascans_path,
                         header=None,
                         names=[
                             "barcode", "date/time", "row", "col", "scan",
                             "ejection time", "NA"
                         ])
        df = df[["barcode", "row", "col", "scan"]]
        # Label is "<barcode>_<row letter><two-digit column>"; the row number
        # is 1-based (1 -> "A").
        df['well_label'] = df.apply(
            lambda row: "%s_%s%02d" %
            (row.barcode, row_letters[row.row - 1], row.col),
            axis=1)

        for _, well in df[["well_label"]].drop_duplicates().iterrows():

            # Label-based access; positional ``well[0]`` on a label-indexed
            # Series is deprecated in pandas.
            well_label = well["well_label"]

            well_scans = list(df[df["well_label"] == well_label]["scan"])

            wellInfo = Scans(run, well, well_scans, args.id_snr, args.id_tol)

            # NOTE(review): the original code special-cased a ``str`` return
            # here (with identical branches), so ``extract`` may return a
            # message string instead of scan ids - confirm and handle it.
            scan_ids = wellInfo.extract(args.method)
            scans_processed[well_label] = scan_ids

            if len(scan_ids) < args.min_scans:
                failed_wells.append(
                    "Well: {}, failed due to: < {} scans in well taken forward. Scan_ids for well: {}"
                    .format(well_label, args.min_scans, scan_ids))
                continue

            # Peaklists for the selected scans (one peaklist per scan)
            pls = run.peaklists(scan_ids, function_noise="median")

            # Filters out noise using SNR; empty peaklists are passed through
            pls = [
                filter_attr(pl, "snr", min_threshold=args.snr_threshold)
                if len(pl.mz) > 0 else pl for pl in pls
            ]
            # Keep only the extracted scans/peaklists
            pls = [pl for pl in pls if int(pl.ID) in scan_ids]

            try:
                # Forms aligned peak matrix from peaklists
                pm = align_peaks(pls,
                                 ppm=args.ppm,
                                 block_size=5000,
                                 edge_extend=(2 * args.ppm))
            except ValueError as e:
                failed_wells.append("Well: {}, failed due to: {}.".format(
                    well_label, e))
                continue

            # Generates peaklist from aligned peak matrix
            pl_aligned = pm.to_peaklist(ID="{}".format(well_label))

            if "snr" in pm.attributes:
                pl_aligned.add_attribute("snr",
                                         pm.attr_mean_vector("snr"),
                                         on_index=2)

            pl_aligned.add_attribute("rsd",
                                     pm.rsd(flagged_only=False),
                                     on_index=4)

            pl_aligned.add_attribute('snr_flag',
                                     np.ones(pl_aligned.full_size),
                                     flagged_only=False,
                                     is_flag=True)

            if args.rsd_threshold is not None:
                rsd_values = pl_aligned.get_attribute("rsd",
                                                      flagged_only=False)
                # Materialised list: on Python 3 ``map`` returns a lazy
                # iterator, not a sequence of flags.
                rsd_flag = [
                    not np.isnan(x) and x < args.rsd_threshold
                    for x in rsd_values
                ]
                pl_aligned.add_attribute("rsd_flag",
                                         rsd_flag,
                                         flagged_only=False,
                                         is_flag=True)

            if args.min_fraction is not None:
                pl_aligned.add_attribute(
                    "internal_fraction_flag",
                    (pm.present / float(pm.shape[0])) >= args.min_fraction,
                    flagged_only=False,
                    is_flag=True)

            if args.metalist is not None:
                pl_aligned = update_metadata_and_labels([pl_aligned],
                                                        metadata)[0]
            peaklists.append(pl_aligned)

    with open(args.failed_wells, "w") as out:
        for line in failed_wells:
            out.write("{}\n".format(line))

    out_df = pd.DataFrame.from_dict(scans_processed, orient='index')
    out_df.to_csv(args.processed_scans, sep='\t')

    hdf5_portal.save_peaklists_as_hdf5(peaklists,
                                       "{}.hdf5".format(args.output))


def main():
    """Command-line entry point: parse ``sys.argv`` and run the chosen step.

    Prints the parsed namespace, then dispatches on ``args.step``. When no
    sub-command is given nothing further is done.
    """
    parser = _build_arg_parser()
    args = parser.parse_args()

    print(args)

    if args.step == "process-scans":
        # The two lists are paired by position, so a length mismatch would
        # otherwise fail mid-run or silently ignore the extra files.
        if len(args.input) != len(args.metascans):
            parser.error(
                "process-scans: the number of --input files ({}) must match "
                "the number of --metascans files ({})".format(
                    len(args.input), len(args.metascans)))
        _process_scans(args)

    elif args.step == "process-samples":

        peaklists = hdf5_portal.load_peaklists_from_hdf5(args.input)

        # align peaks into mz bins... ppm = ppm_precision
        peakmatrix = align_peaks(peaklists,
                                 ppm=args.ppm,
                                 block_size=args.block_size,
                                 edge_extend=(2 * args.ppm))

        peakmatrix = sample_filter(peakmatrix,
                                   min_fraction=args.min_fraction,
                                   within=args.within,
                                   qc_label=args.qc_label,
                                   rsd_thres=args.rsd_threshold)

        hdf5_portal.save_peak_matrix_as_hdf5(peakmatrix, args.output)

    elif args.step == 'hdf5-pls-to-txt':
        hdf5_peaklists_to_txt(args.input,
                              path_out=args.output,
                              delimiter=map_delimiter(args.delimiter))

    elif args.step == 'hdf5-pm-to-txt':
        hdf5_peak_matrix_to_txt(
            args.input,
            path_out=args.output,
            attr_name=args.attribute_name,
            delimiter=map_delimiter(args.delimiter),
            # Passed as a tuple, matching the previous "no labels" default ().
            rsd_tags=tuple(args.class_label_rsd),
            samples_in_rows=(args.representation_samples == "rows"),
            comprehensive=args.comprehensive)