Example #1
    def __init__(self, path_to_db_files, path_to_output_db_file=None):
        """Initialize V1BaseDBHandler.

        Args:
            path_to_db_files (list): list of db-files with extension '.arrow'
            path_to_output_db_file (str): output database file for saving

        """
        if isinstance(path_to_db_files, str):
            path_to_db_files = [path_to_db_files]

        # Check file extension
        for file in path_to_db_files:
            if not file.endswith('.arrow'):
                raise ValueError('DB file path must end with ".arrow"')

        self.path_to_db_file = path_to_db_files[0]
        if path_to_output_db_file is not None:
            self.path_to_db_file = path_to_output_db_file
        self._cursor = 0
        self.logger = logging.getLogger(__name__)

        # load config
        self._config = load_config(self.__version__)
        self.columns = self._config[self.df_name]['columns']

        # initialize dataframe
        try:
            self._df = vaex.open_many(path_to_db_files)
        except IOError:
            self._df = None
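A minimal usage sketch for the handler above (the subclass and file names are hypothetical): a concrete subclass must define __version__ and df_name, which __init__ uses to look up its column schema.

class DriverDBHandler(V1BaseDBHandler):  # hypothetical subclass
    __version__ = 'v1'
    df_name = 'driver'

handler = DriverDBHandler(['part_0.arrow', 'part_1.arrow'],
                          path_to_output_db_file='merged.arrow')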
Example #2
def _consolidate_folder(self, folder):
    abs_path = os.path.abspath(os.path.join(self.folder_path, folder))
    contents = os.listdir(abs_path)
    concatenated_df = vaex.open_many(
        [os.path.join(abs_path, content) for content in contents])
    # Export the concatenated frame before deleting the source chunks;
    # removing memory-mapped inputs first is unsafe on some platforms.
    concatenated_df.export_hdf5(
        os.path.join(abs_path, folder.lower() + '.hdf5'))
    for content in contents:
        os.remove(os.path.join(abs_path, content))
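A hypothetical call (the object and folder name are assumptions), assuming self.folder_path contains a subdirectory of vaex-readable chunk files:

handler._consolidate_folder('batch_01')
# merges every file under <folder_path>/batch_01 into batch_01/batch_01.hdf5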
Example #3
def h5pandas_to_vaex_combine(tmp_search_pattern: str,
                             path_out_str: str,
                             check_files_number: int = None,
                             del_found_tmp_files: bool = False,
                             **export_hdf5_args) -> int:
    """
    Combine many vaex.hdf5 files to one
    :param tmp_search_pattern:
    :param path_out_str: path argument of vaex.dataframe.export_hdf5()
    :param check_files_number: if not None must be equl to number of found files
    :param del_found_tmp_files: not implemented feature
    :param export_hdf5_args, dict, optional. Note that here default of 'progress' is set to True
    :return: number of tmp files found
    """
    # Find files
    hdf5_list = glob.glob(tmp_search_pattern)
    hdf5_list.sort()

    # Check files existence
    if Path(path_out_str).is_file():
        lf.warning('Overwriting {:s}!', path_out_str)
    if check_files_number:
        assert len(hdf5_list) == check_files_number, "Incorrect number of files"
        lf.info('Combining {:d} files found by {:s} into {:s}', check_files_number, tmp_search_pattern, PurePath(path_out_str).name)
    else:
        check_files_number = len(hdf5_list)
        lf.info('Combining {:s} into {:s}', tmp_search_pattern, PurePath(path_out_str).name)
    master_df = vaex.open_many(hdf5_list)
    try:
        master_df.export_hdf5(**{'path': path_out_str, 'progress': True, **export_hdf5_args})
    except AttributeError:
        # progress=True can raise AttributeError: 'ProgressBar' object has no
        # attribute 'stime0'; installing progressbar2 may fix it.
        lf.debug('Try installing progressbar2')

    # delete tmp files found
    if del_found_tmp_files:
        # will not work, todo: do only when export finished (use custom progress func?)
        try:
            i = 0
            for i, path_tmp in enumerate(hdf5_list):
                Path(path_tmp).unlink()  # remove file
        except Exception:
            lf.exception('Combined {0:d} files but removed only {i:d} temporary vaex hdf5 files:', check_files_number, i=i)
        else:
            lf.info('Combined and removed {0:d} files.', check_files_number)
    else:
        lf.info('Combined {:d} files ({:s}); they remain on disk', check_files_number, tmp_search_pattern)
    return check_files_number
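A usage sketch under assumed file names (the pattern and output path are hypothetical):

n_parts = h5pandas_to_vaex_combine(
    tmp_search_pattern='out/tmp_*.vaex.hdf5',  # hypothetical temp files
    path_out_str='out/combined.hdf5',
    check_files_number=None)  # skip the file-count assertion
print(f'combined {n_parts} files')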
Example #4
def read_dataframes_from_file(
        path_list: List[str]) -> Optional[vaex.dataframe.DataFrame]:
    """Read only the dataframes present in data/processed.

    Args:
        path_list: list of paths relative to data/processed.

    Returns:
        vaex dataframe.
    """

    base_path = os.path.join(get_base_data_path(), "processed")

    file_path_list: List[str] = []

    for path in path_list:
        file_path = os.path.join(base_path, f"{path}.arrow")
        if not os.path.exists(file_path):
            raise OSError(f"File not found: {file_path}")
        file_path_list.append(file_path)

    return vaex.open_many(file_path_list)
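A usage sketch with hypothetical dataset names; each name resolves to <base>/processed/<name>.arrow before opening:

df = read_dataframes_from_file(['trips_2019', 'trips_2020'])  # hypothetical names
print(df.get_column_names())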
Example #5
def main(argv):
    import argparse
    parser = argparse.ArgumentParser(argv[0])
    parser.add_argument('--verbose', '-v', action='count', default=0)
    parser.add_argument('--quiet',
                        '-q',
                        default=False,
                        action='store_true',
                        help="do not output anything")
    parser.add_argument('--list',
                        '-l',
                        default=False,
                        action='store_true',
                        help="list columns of input")
    parser.add_argument('--progress',
                        help="show progress (default: %(default)s)",
                        default=True,
                        action='store_true')
    parser.add_argument('--no-progress', dest="progress", action='store_false')
    parser.add_argument('--shuffle',
                        "-s",
                        dest="shuffle",
                        action='store_true',
                        default=False)
    parser.add_argument('--sort', dest="sort", default=None)
    parser.add_argument('--virtual',
                        dest="virtual",
                        action='store_true',
                        default=False,
                        help="Also export virtual columns")
    parser.add_argument('--fraction',
                        "-f",
                        dest="fraction",
                        type=float,
                        default=1.0,
                        help="fraction of input dataset to export")
    parser.add_argument('--filter',
                        dest="filter",
                        default=None,
                        help="filter to apply before exporting")

    subparsers = parser.add_subparsers(help='type of input source',
                                       dest="task")

    parser_soneira = subparsers.add_parser(
        'soneira', help='create soneira peebles dataset')
    parser_soneira.add_argument('output', help='output file')
    parser_soneira.add_argument(
        "columns",
        help="list of columns to export (or all when empty)",
        nargs="*")
    parser_soneira.add_argument('--dimension',
                                '-d',
                                type=int,
                                help='dimensions',
                                default=4)
    # parser_soneira.add_argument('--eta','-e', type=int, help='dimensions', default=3)
    parser_soneira.add_argument('--max-level',
                                '-m',
                                type=int,
                                help='maximum level',
                                default=28)
    parser_soneira.add_argument('--lambdas',
                                '-l',
                                type=float,
                                nargs='+',
                                help='lambda values for fractal',
                                default=[1.1, 1.3, 1.6, 2.])

    parser_tap = subparsers.add_parser(
        'tap', help='use TAP (Table Access Protocol) as source')
    parser_tap.add_argument("tap_url", help="input source or file")
    parser_tap.add_argument("table_name", help="input source or file")
    parser_tap.add_argument("output",
                            help="output file (ends in .fits or .hdf5)")
    parser_tap.add_argument(
        "columns",
        help="list of columns to export (or all when empty)",
        nargs="*")

    parser_file = subparsers.add_parser(
        'file',
        help='use a file as source (e.g. .hdf5, .fits, .vot (VO table), .asc (ascii))')
    parser_file.add_argument(
        "input",
        help=
        "input source or file, when prefixed with @ it is assumed to be a text file with a file list (one file per line)"
    )
    parser_file.add_argument("output",
                             help="output file (ends in .fits or .hdf5)")
    parser_file.add_argument(
        "columns",
        help="list of columns to export (or all when empty)",
        nargs="*")

    parser_csv = subparsers.add_parser(
        'csv', help='use a csv file as source')
    parser_csv.add_argument(
        "input",
        help="input source or file, when prefixed with @ it is assumed to be a text file with a file list (one file per line)")
    parser_csv.add_argument("output", help="output file (ends in .hdf5)")
    parser_csv.add_argument(
        "columns",
        help="list of columns to export (or all when empty)",
        nargs="*")

    args = parser.parse_args(argv[1:])

    verbosity = ["ERROR", "WARNING", "INFO", "DEBUG"]
    logging.getLogger("vaex").setLevel(verbosity[min(3, args.verbose)])
    dataset = None
    if args.task == "soneira":
        if vaex.utils.check_memory_usage(4 * 8 * 2**args.max_level,
                                         vaex.utils.confirm_on_console):
            if not args.quiet:
                print("generating soneira peebles dataset...")
            dataset = vaex.dataset_misc.SoneiraPeebles(args.dimension, 2,
                                                       args.max_level,
                                                       args.lambdas)
            dataset = vaex.dataframe.DataFrameLocal(dataset)
        else:
            return 1
    if args.task == "tap":
        dataset = vaex.dataset.DatasetTap(args.tap_url, args.table_name)
        if not args.quiet:
            print(
                "exporting from {tap_url} table name {table_name} to {output}".
                format(tap_url=args.tap_url,
                       table_name=args.table_name,
                       output=args.output))
    if args.task == "csv":
        # dataset = vaex.dataset.DatasetTap(args.tap_url, args.table_name)
        if not args.quiet:
            print("exporting from {input} to {output}".format(
                input=args.input, output=args.output))
    if args.task == "file":
        if args.input[0] == "@":
            with open(args.input[1:]) as f:
                # strip newlines so each entry is a clean filename
                inputs = [line.strip() for line in f if line.strip()]
            dataset = vaex.open_many(inputs)
        else:
            dataset = vaex.open(args.input)
        if not args.quiet:
            print("exporting from {input} to {output}".format(
                input=args.input, output=args.output))

    if dataset is None and args.task not in ["csv"]:
        if not args.quiet:
            print("Cannot open input")
        return 1
    if dataset:
        dataset.set_active_fraction(args.fraction)
    if args.list:
        if not args.quiet:
            print("columns names: " + " ".join(dataset.get_column_names()))
    else:
        if args.task == "csv":
            row_count = -1  # the header does not count
            with open(args.input) as lines:
                for line in lines:
                    row_count += 1
            logger.debug("row_count: %d", row_count)
            with open(args.input) as lines:
                line = next(lines).strip()
                names = line.strip().split(",")
                line = next(lines).strip()
                values = line.strip().split(",")
                numerics = []
                for value in values:
                    try:
                        float(value)
                        numerics.append(True)
                    except ValueError:
                        numerics.append(False)
                names_numeric = [
                    name for name, numeric in zip(names, numerics) if numeric
                ]
                print(names_numeric)
                output = vaex.dataset_misc.Hdf5MemoryMapped.create(
                    args.output, row_count, names_numeric)
                Ncols = len(names)
                cols = [
                    output.columns[name] if numeric else None
                    for name, numeric in zip(names, numerics)
                ]

                def copy(line, row_index):
                    values = line.strip().split(",")
                    for column_index in range(Ncols):
                        if numerics[column_index]:
                            value = float(values[column_index])
                            cols[column_index][row_index] = value

                row = 0
                copy(line, row)
                row += 1
                progressbar = vaex.utils.progressbar(
                    title="exporting") if args.progress else None
                for line in lines:
                    copy(line, row)
                    row += 1
                    if progressbar and row % 1000 == 0:
                        progressbar.update(row / float(row_count))
                if progressbar:
                    progressbar.finish()
        else:
            if args.columns:
                columns = args.columns
            else:
                columns = dataset.get_column_names(strings=True,
                                                   virtual=args.virtual)
            for column in columns:
                if column not in dataset.get_column_names(strings=True,
                                                          virtual=True):
                    if not args.quiet:
                        print(
                            "column %r does not exist, run with --list or -l to list all columns"
                            % column)
                    return 1

            base, output_ext = os.path.splitext(args.output)
            if output_ext not in [".hdf5", ".fits", ".arrow"]:
                if not args.quiet:
                    print(
                        "extension %s not supported, only .hdf5, .arrow and .fits are"
                        % output_ext)
                return 1

            if not args.quiet:
                print("exporting %d rows and %d columns" %
                      (len(dataset), len(columns)))
                print("columns: " + " ".join(columns))
            progressbar = vaex.utils.progressbar(
                title="exporting") if args.progress else None

            def update(p):
                if progressbar:
                    progressbar.update(p)
                return True

            if args.filter:
                dataset.select(args.filter, name='export')
                selection = 'export'
            else:
                selection = None
            if output_ext == ".hdf5":
                export_hdf5(dataset,
                            args.output,
                            column_names=columns,
                            progress=update,
                            shuffle=args.shuffle,
                            sort=args.sort,
                            selection=selection)
            elif output_ext == ".arrow":
                from vaex.arrow.export import export as export_arrow
                export_arrow(dataset,
                             args.output,
                             column_names=columns,
                             progress=update,
                             shuffle=args.shuffle,
                             sort=args.sort,
                             selection=selection)
            elif output_ext == ".fits":
                export_fits(dataset,
                            args.output,
                            column_names=columns,
                            progress=update,
                            shuffle=args.shuffle,
                            sort=args.sort,
                            selection=selection)
            if progressbar:
                progressbar.finish()
            if not args.quiet:
                print("\noutput to %s" % os.path.abspath(args.output))
            dataset.close()
    return 0
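The CLI above is typically wired to an entry point like this sketch:

if __name__ == '__main__':
    import sys
    sys.exit(main(sys.argv))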
Example #6
def open(self):
    # Open a single file directly; otherwise let open_many concatenate them.
    if len(self.filenames_vaex) == 1:
        return vx.open(self.filenames_vaex[0])
    return vx.open_many(self.filenames_vaex)
Example #7
def main(argv):
    import argparse
    parser = argparse.ArgumentParser(argv[0])
    parser.add_argument('--verbose', '-v', action='count', default=0)
    parser.add_argument('--quiet', '-q', default=False, action='store_true', help="do not output anything")
    parser.add_argument('--list', '-l', default=False, action='store_true', help="list columns of input")
    parser.add_argument('--progress', help="show progress (default: %(default)s)", default=True, action='store_true')
    parser.add_argument('--no-progress', dest="progress", action='store_false')
    parser.add_argument('--shuffle', "-s", dest="shuffle", action='store_true', default=False)
    parser.add_argument('--sort', dest="sort", default=None)
    parser.add_argument('--virtual', dest="virtual", action='store_true', default=False, help="Also export virtual columns")
    parser.add_argument('--fraction', "-f", dest="fraction", type=float, default=1.0, help="fraction of input dataset to export")
    parser.add_argument('--filter', dest="filter", default=None, help="filter to apply before exporting")

    subparsers = parser.add_subparsers(help='type of input source', dest="task")

    parser_soneira = subparsers.add_parser('soneira', help='create soneira peebles dataset')
    parser_soneira.add_argument('output', help='output file')
    parser_soneira.add_argument("columns", help="list of columns to export (or all when empty)", nargs="*")
    parser_soneira.add_argument('--dimension', '-d', type=int, help='dimensions', default=4)
    # parser_soneira.add_argument('--eta','-e', type=int, help='dimensions', default=3)
    parser_soneira.add_argument('--max-level', '-m', type=int, help='maximum level', default=28)
    parser_soneira.add_argument('--lambdas', '-l', type=float, nargs='+', help='lambda values for fractal', default=[1.1, 1.3, 1.6, 2.])

    parser_tap = subparsers.add_parser('tap', help='use TAP (Table Access Protocol) as source')
    parser_tap.add_argument("tap_url", help="input source or file")
    parser_tap.add_argument("table_name", help="input source or file")
    parser_tap.add_argument("output", help="output file (ends in .fits or .hdf5)")
    parser_tap.add_argument("columns", help="list of columns to export (or all when empty)", nargs="*")

    parser_file = subparsers.add_parser('file', help='use a file as source (e.g. .hdf5, .fits, .vot (VO table), .asc (ascii))')
    parser_file.add_argument("input", help="input source or file, when prefixed with @ it is assumed to be a text file with a file list (one file per line)")
    parser_file.add_argument("output", help="output file (ends in .fits or .hdf5)")
    parser_file.add_argument("columns", help="list of columns to export (or all when empty)", nargs="*")

    parser_csv = subparsers.add_parser('csv', help='use a csv file as source')
    parser_csv.add_argument("input", help="input source or file, when prefixed with @ it is assumed to be a text file with a file list (one file per line)")
    parser_csv.add_argument("output", help="output file (ends in .hdf5)")
    parser_csv.add_argument("columns", help="list of columns to export (or all when empty)", nargs="*")

    args = parser.parse_args(argv[1:])

    verbosity = ["ERROR", "WARNING", "INFO", "DEBUG"]
    logging.getLogger("vaex").setLevel(verbosity[min(3, args.verbose)])
    dataset = None
    if args.task == "soneira":
        if vaex.utils.check_memory_usage(4 * 8 * 2**args.max_level, vaex.utils.confirm_on_console):
            if not args.quiet:
                print("generating soneira peebles dataset...")
            dataset = vaex.file.other.SoneiraPeebles(args.dimension, 2, args.max_level, args.lambdas)
        else:
            return 1
    if args.task == "tap":
        dataset = vaex.dataset.DatasetTap(args.tap_url, args.table_name)
        if not args.quiet:
            print("exporting from {tap_url} table name {table_name} to {output}".format(tap_url=args.tap_url, table_name=args.table_name, output=args.output))
    if args.task == "csv":
        # dataset = vaex.dataset.DatasetTap(args.tap_url, args.table_name)
        if not args.quiet:
            print("exporting from {input} to {output}".format(input=args.input, output=args.output))
    if args.task == "file":
        if args.input[0] == "@":
            with open(args.input[1:]) as f:
                inputs = [line.strip() for line in f if line.strip()]
            dataset = vaex.open_many(inputs)
        else:
            dataset = vaex.open(args.input)
        if not args.quiet:
            print("exporting from {input} to {output}".format(input=args.input, output=args.output))

    if dataset is None and args.task not in ["csv"]:
        if not args.quiet:
            print("Cannot open input")
        return 1
    if dataset:
        dataset.set_active_fraction(args.fraction)
    if args.list:
        if not args.quiet:
            print("columns names: " + " ".join(dataset.get_column_names()))
    else:
        if args.task == "csv":
            row_count = -1  # the header does not count
            with open(args.input) as lines:
                for line in lines:
                    row_count += 1
            logger.debug("row_count: %d", row_count)
            with open(args.input) as lines:
                line = next(lines).strip()
                names = line.strip().split(",")
                line = next(lines).strip()
                values = line.strip().split(",")
                numerics = []
                for value in values:
                    try:
                        float(value)
                        numerics.append(True)
                    except ValueError:
                        numerics.append(False)
                names_numeric = [name for name, numeric in zip(names, numerics) if numeric]
                print(names_numeric)
                output = vaex.file.other.Hdf5MemoryMapped.create(args.output, row_count, names_numeric)
                Ncols = len(names)
                cols = [output.columns[name] if numeric else None for name, numeric in zip(names, numerics)]

                def copy(line, row_index):
                    values = line.strip().split(",")
                    for column_index in range(Ncols):
                        if numerics[column_index]:
                            value = float(values[column_index])
                            cols[column_index][row_index] = value
                row = 0
                copy(line, row)
                row += 1
                progressbar = vaex.utils.progressbar(title="exporting") if args.progress else None
                for line in lines:
                    copy(line, row)
                    row += 1
                    if progressbar and row % 1000 == 0:
                        progressbar.update(row / float(row_count))
                if progressbar:
                    progressbar.finish()
        else:
            if args.columns:
                columns = args.columns
            else:
                columns = dataset.get_column_names(strings=True, virtual=args.virtual)
            for column in columns:
                if column not in dataset.get_column_names(strings=True, virtual=True):
                    if not args.quiet:
                        print("column %r does not exist, run with --list or -l to list all columns" % column)
                    return 1

            base, output_ext = os.path.splitext(args.output)
            if output_ext not in [".hdf5", ".fits", ".arrow"]:
                if not args.quiet:
                    print("extension %s not supported, only .hdf5, .arrow and .fits are" % output_ext)
                return 1

            if not args.quiet:
                print("exporting %d rows and %d columns" % (len(dataset), len(columns)))
                print("columns: " + " ".join(columns))
            progressbar = vaex.utils.progressbar(title="exporting") if args.progress else None

            def update(p):
                if progressbar:
                    progressbar.update(p)
                return True
            if args.filter:
                dataset.select(args.filter, name='export')
                selection = 'export'
            else:
                selection = None
            if output_ext == ".hdf5":
                export_hdf5(dataset, args.output, column_names=columns, progress=update, shuffle=args.shuffle, sort=args.sort, selection=selection)
            elif output_ext == ".arrow":
                from vaex_arrow.export import export as export_arrow
                export_arrow(dataset, args.output, column_names=columns, progress=update, shuffle=args.shuffle, sort=args.sort, selection=selection)
            elif output_ext == ".fits":
                export_fits(dataset, args.output, column_names=columns, progress=update, shuffle=args.shuffle, sort=args.sort, selection=selection)
            if progressbar:
                progressbar.finish()
            if not args.quiet:
                print("\noutput to %s" % os.path.abspath(args.output))
            dataset.close_files()
    return 0
Example #8
def main(argv):
    import argparse
    parser = argparse.ArgumentParser(argv[0])
    parser.add_argument('--verbose', '-v', action='count', default=0)
    parser.add_argument('--quiet',
                        '-q',
                        default=False,
                        action='store_true',
                        help="do not output anything")
    parser.add_argument('--list',
                        '-l',
                        default=False,
                        action='store_true',
                        help="list columns of input")
    parser.add_argument('--progress',
                        help="show progress (default: %(default)s)",
                        default=True,
                        action='store_true')
    parser.add_argument('--no-progress', dest="progress", action='store_false')
    parser.add_argument('--no-delete',
                        help="do not delete the output file on failure (default: delete it)",
                        dest='delete',
                        default=True,
                        action='store_false')
    parser.add_argument('--shuffle',
                        "-s",
                        dest="shuffle",
                        action='store_true',
                        default=False)
    parser.add_argument('--sort', dest="sort", default=None)
    parser.add_argument('--fraction',
                        "-f",
                        dest="fraction",
                        type=float,
                        default=1.0,
                        help="fraction of input dataset to export")
    parser.add_argument('--filter',
                        dest="filter",
                        default=None,
                        help="filter to apply before exporting")
    parser.add_argument(
        "input",
        help=
        "input source or file, when prefixed with @ it is assumed to be a text file with a file list (one file per line)"
    )
    parser.add_argument("output", help="output file (ends in .hdf5)")
    parser.add_argument("columns",
                        help="list of columns to export (or all when empty)",
                        nargs="*")

    args = parser.parse_args(argv[1:])

    verbosity = ["ERROR", "WARNING", "INFO", "DEBUG"]
    logging.getLogger("vaex").setLevel(verbosity[min(3, args.verbose)])
    if args.input[0] == "@":
        with open(args.input[1:]) as f:
            inputs = [line.strip() for line in f if line.strip()]
        df = vaex.open_many(inputs)
    else:
        df = vaex.open(args.input)

    if df:
        df.set_active_fraction(args.fraction)
    if args.list:
        print("\n".join(df.get_column_names()))
    else:
        if args.columns:
            all_columns = df.get_column_names()
            columns = args.columns
            for column in columns:
                if column not in all_columns:
                    print(
                        "column %r does not exist, run with --list or -l to list all columns"
                        % column)
                    return 1
            df = df[columns]
        else:
            columns = df.get_column_names()

        if not args.quiet:
            print("exporting %d rows and %d columns" % (len(df), len(columns)))
            print("columns: " + " ".join(columns))

        if args.filter:
            df = df.filter(args.filter)
        if args.sort:
            df = df.sort(args.sort)
        try:
            df.export(args.output, progress=args.progress)
            if not args.quiet:
                print("\noutput to %s" % os.path.abspath(args.output))
            df.close()
        except BaseException:
            if not args.quiet:
                print("\nfailed to write to %s" % os.path.abspath(args.output))
            if args.delete:
                os.remove(args.output)
                print(
                    "\ndeleted output %s (pass --no-delete to avoid that)"
                    % os.path.abspath(args.output))
            raise

    return 0
Example #9
# Assumed context: D is a TimsDIA dataset, S a spectrum selected earlier in
# the script, and plot_spectrum a plotting helper defined elsewhere.
from pathlib import Path

import matplotlib.pyplot as plt
import vaex as vx
from timspy.timspy import TimsDIA

# To convert mz_idx to m/z there are two possibilities:
# 1. use the built-in method
MZ = D.mzIdx2mz(S.mz_idx)
I = S.i
plot_spectrum(MZ, I)

# 2. or use a fitted model
MS2 = D.mzIdx2mz_model(S.mz_idx)
D.plot_models()

# making it all faster
hdf5 = Path("/mnt/samsung/bruker/testHDF5/prec_prec_100ms")
hdf5_files = [str(f) for f in hdf5.glob('*.hdf5')]
p = Path('/home/matteo/Projects/bruker/BrukerMIDIA/MIDIA_CE10_precursor/20190912_HeLa_Bruker_TEN_MIDIA_200ng_CE10_100ms_Slot1-9_1_488.d')
D = TimsDIA(p)
R = vx.open_many(hdf5_files)

R['rt'] = D.frame2rt_model(R.frame)
R['im'] = D.scan2im_model(R.scan)
R['mz'] = D.tof2mz_model(R.tof)
R.plot(R.mz, R.im, shape=(1000,919))
plt.show()

D.plot_models()

Example #10
from pathlib import Path
from time import time

import matplotlib.pyplot as plt
import numpy as np
import vaex as vx

from timspy.timspy import TimsDIA
from timspy.iterators import ranges

p = Path('/home/matteo/Projects/bruker/BrukerMIDIA/MIDIA_CE10_precursor/20190912_HeLa_Bruker_TEN_MIDIA_200ng_CE10_100ms_Slot1-9_1_488.d')
D = TimsDIA(p)
# output_folder = Path("/home/matteo/Projects/bruker/data_dumps/prec_prec_100ms")
output_folder = Path('/mnt/samsung/bruker/testHDF5/prec_prec_100ms')
# D.to_hdf5(output_folder)

D.tof2mz_model.plot()
D.tof2mz_model.params

df = vx.open_many([str(p) for p in output_folder.glob("*.hdf5")])
df.plot(df.tof, df.scan, what=vx.stat.sum(df.i))
plt.tight_layout()
plt.show()

# check if all stats are there
x = df.count(df.i, binby=[df.scan], shape=1000)
S = df.groupby(df.scan).agg({'i':'sum'})
np.sort(S.scan.values)
Example #11
import glob
import re

import numpy as np
import vaex

def tryint(s):
    try:
        return int(s)
    except ValueError:
        return s

def alphanum_key(s):
    """ Turn a string into a list of string and number chunks.
        "z23a" -> ["z", 23, "a"]
    """
    return [ tryint(c) for c in re.split('([0-9]+)', s) ]
hdf5_list = glob.glob('./data_output/*.hdf5')
hdf5_list.sort(key=alphanum_key)
hdf5_list = np.array(hdf5_list)
print(hdf5_list)

# df = vaex.open(hdf5_list[0])
# print(df)


# This is an important step
master_df = vaex.open_many(hdf5_list)

# exporting
master_df.export_hdf5(path='./green_tripdata.hdf5')
df = vaex.open('./green_tripdata.hdf5')
print(df)
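The natural-sort key matters because plain lexicographic sorting would put part10 before part2; a quick check:

print(sorted(['part10.hdf5', 'part2.hdf5'], key=alphanum_key))
# -> ['part2.hdf5', 'part10.hdf5']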
Example #12
def perform_update():
    # Print that database is being updated
    print(f"Updating micro-lensing database in {ARGS.dir!r}.")

    # Open the master HDF5-file, creating it if it does not exist yet
    with h5py.File(ARGS.master_file, mode='a') as m_file:
        # Set the version of MLDatabase
        m_file.attrs['version'] = __version__

        # Obtain what exposures the database knows about
        n_expnums_known = m_file.attrs.setdefault('n_expnums', 0)
        expnums_dset = m_file.require_dataset(
            'expnums',
            shape=(n_expnums_known,),
            dtype=[*list(XTR_HEADER.items())[:-1],
                   ('last_modified', int)],
            maxshape=(None,))
        expnums_known = expnums_dset[:]

    # Obtain sorted string of all files available
    filenames = str(sorted(next(os.walk(ARGS.dir))[2]))

    # Create a regex iterator
    re_iter = re.finditer(EXP_REGEX, filenames)

    # Create dict with up to ARGS.n_expnums exposure files
    exp_dict = {int(m['expnum']): (path.join(ARGS.dir, m['exp_file']),
                                   path.join(ARGS.dir, m['xtr_file']))
                for m in islice(re_iter, ARGS.n_expnums)}

    # Add the required flat exposure files (REGEX above explicitly ignores it)
#    exp_dict[0] = (path.join(ARGS.dir, REQ_FILES[0]),
#                   path.join(ARGS.dir, REQ_FILES[1]))

    # Initialize the number of exposures found and their types
    n_expnums = len(exp_dict)
    expnums_outdated = []

    # Create empty list of temporary HDF5-files
    temp_files = []

    # Determine which ones require updating
    for expnum, *_, mtime in expnums_known:
        # Try to obtain the exp_files of this expnum
        exp_files = exp_dict.get(expnum)

        # If this is not None, it is already known
        if exp_files is not None:
            # Check if it requires updating by comparing last-modified times
            if path.getmtime(exp_files[0]) > mtime:
                # If so, add to expnums_outdated
                expnums_outdated.append(expnum)
                continue
            else:
                # If not, remove from dict
                exp_dict.pop(expnum)

        # Determine path to temporary HDF5-file of exposure
        temp_hdf5 = path.join(ARGS.mld, TEMP_EXP_FILE.format(expnum))

        # If it already exists, add it to temp_files
        if path.exists(temp_hdf5):
            temp_files.append(temp_hdf5)

    # Print the number of exposure files found
    n_expnums_outdated = len(expnums_outdated)
    n_expnums_new = len(exp_dict)-n_expnums_outdated
    n_expnums_temp = len(temp_files)
    print(f"\nFound {n_expnums:,} exposure files, of which {n_expnums_new:,} "
          f"are new and {n_expnums_outdated:,} are outdated. Also found "
          f"{n_expnums_temp:,} processed exposure files that require merging.")

    # If exp_dict contains at least 1 item
    if exp_dict:
        # Create tqdm iterator for processing
        exp_iter = tqdm(exp_dict.items(), desc="Processing exposure files",
                        dynamic_ncols=True)

        # Process all exposure files
        try:
            for expnum, exp_files in exp_iter:
                # Set which exposure is being processed in exp_iter
                exp_iter.set_postfix_str(path.basename(exp_files[0]))

                # Process this exposure
                temp_files.append(process_exp_files(expnum, exp_files))

        # If a KeyboardInterrupt is raised, update database with progress
        except KeyboardInterrupt:
            print("WARNING: Processing has been interrupted. Updating "
                  "database with currently processed exposures.")

        # Open master file
        with h5py.File(ARGS.master_file, 'r+') as m_file:
            # Obtain the total number of exposures now
            n_expnums_known = m_file.attrs['n_expnums']

    # If temp_files contains at least 1 item
    if temp_files:
        # Import vaex
        import vaex

        # Update database
        print("\nUpdating database with processed exposures (NOTE: This may "
              "take a while for large databases).")

        # Divide temp_files up into lists of length 100 with last of length 150
        n_temp = len(temp_files)
        temp_files = [temp_files[slc] for slc in dyn_range(len(temp_files))]

        # If the master exposure file exists and there are outdated exposures
        if path.exists(ARGS.master_exp_file) and expnums_outdated:
            # Wrap in try-statement to ensure file is closed
            try:
                # Open the master exposure file
                master_df = vaex.open(ARGS.master_exp_file)

                # Solely select the exposures that were not outdated
                for expnum in expnums_outdated:
                    master_df = master_df.filter(master_df.expnum != expnum,
                                                 'and')

                # Extract the master DataFrame
                master_df = master_df.extract()

                # Export to HDF5
                master_temp_file = path.join(ARGS.mld, 'temp.hdf5')
                master_df.export_hdf5(master_temp_file)

            # Close master exposure file
            finally:
                master_df.close()

            # Remove original master file
            os.remove(ARGS.master_exp_file)

            # Rename master_temp_file to master exposure file name
            os.rename(master_temp_file, ARGS.master_exp_file)

        # Create tqdm iterator for merging
        temp_iter = tqdm(desc="Merging processed exposure files", total=n_temp,
                         dynamic_ncols=True)

        # Loop over all temporary exposure HDF5-files
        # TODO: Figure out how to avoid copying over all the data every time
        for temp_files_list in temp_files:
            # Determine number of files in this list
            n_temp_list = len(temp_files_list)

            # Wrap in try-statement to ensure files are closed
            try:
                # Open all temporary exposure HDF5-files in this list
                temp_df = vaex.open_many(temp_files_list)

                # Add to master_df if it exists
                if path.exists(ARGS.master_exp_file):
                    # Open the master exposure file
                    master_df = vaex.open(ARGS.master_exp_file)
                    master_df = master_df.concat(temp_df)
                    temp_files_list.append(ARGS.master_exp_file)
                else:
                    master_df = temp_df

                # Export to HDF5
                master_temp_file = path.join(ARGS.mld, 'temp.hdf5')
                master_df.export_hdf5(master_temp_file)

            # Close all temporary HDF5-files
            finally:
                master_df.close_files()

            # Remove all temporary files
            for temp_file in temp_files_list:
                os.remove(temp_file)

            # Rename master_temp_file to master exposure file name
            os.rename(master_temp_file, ARGS.master_exp_file)

            # Update tqdm iterator
            temp_iter.update(n_temp_list)

        # Close the tqdm iterator
        temp_iter.close()

        # Determine all objids that are known
        print("\nDetermining all objects in the database.")
        try:
            master_df = vaex.open(ARGS.master_exp_file)
            objids, counts = np.unique(master_df['objid'].values,
                                       return_counts=True)
        finally:
            master_df.close()

        # Open master file
        with h5py.File(ARGS.master_file, 'r+') as m_file:
            # Obtain previously known objids
            n_objids_known = m_file.attrs.setdefault('n_objids', 0)
            objids_dset = m_file.require_dataset('objids',
                                                 shape=(n_objids_known,),
                                                 dtype=[('objid', int),
                                                        ('count', int)],
                                                 maxshape=(None,))

            # Save currently known objids
            n_objids = len(objids)
            objids_dset.resize(n_objids, axis=0)
            objids_dset['objid'] = objids
            objids_dset['count'] = counts
            m_file.attrs['n_objids'] = n_objids

            # Obtain the total number of exposures now
            n_expnums = m_file.attrs['n_expnums']

        # Print that processing is finished
        print(f"The database now contains {n_expnums:,} exposures with "
              f"{n_objids:,} objects.")

    # If no new exposure files are found, database is already up-to-date
    else:
        print("Database is already up-to-date.")