Example 1
def _test():
    import sys
    import os
    import time
    import mgzip
    # Act like gzip; with -d, act like gunzip.
    # The input file is not deleted, however, nor are any other gzip
    # options or features supported.
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        arg = args[1]
    else:
        arg = args[0]
    # if not args:
    #     args = ["-"]
    if decompress:
        tsize = 0
        if arg != "-":
            # outf = arg + ".dcp"
            outf = "/dev/null"
            fh = open(outf, "wb")
            gh = mgzip.open(arg, "rb")
            t0 = time.time()
            # gh.show_index()
            # data = b"AAA"
            chunk_size = 10**7
            while True:
                data = gh.read(chunk_size)
                # data = gh.readline()
                if not data:
                    break
                fh.write(data)
                tsize += len(data)
            # data = gh.readline()
            t1 = time.time()
            fh.close()
            gh.close()
            size = tsize / (1024**2)
            seconds = t1 - t0
            speed = size / seconds
            nsize = os.stat(arg).st_size
            print(
                "Decompressed {:.2f} MB data in {:.2f} S, Speed: {:.2f} MB/s, Rate: {:.2f} %"
                .format(size, seconds, speed, nsize / tsize * 100))
    else:
        if arg != "-":
            outf = arg + ".gz"
            fh = open(arg, "rb")
            gh = mgzip.open(outf, "wb", compresslevel=6)
            data = fh.read()
            t0 = time.time()
            gh.write(data)
            gh.close()
            t1 = time.time()
            size = len(data) / (1024**2)
            seconds = t1 - t0
            speed = size / seconds
            nsize = os.stat(outf).st_size
            print(
                "Compressed {:.2f} MB data in {:.2f} S, Speed: {:.2f} MB/s, Rate: {:.2f} %"
                .format(size, seconds, speed, nsize / len(data) * 100))
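A minimal entry-point sketch, assuming the benchmark above is meant to run as a standalone script (the guard itself is not shown in the example):

if __name__ == "__main__":
    _test()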
Example 2
def compress_gzip(target_fullpath_w_filename):
    output_fullpath_w_filename = target_fullpath_w_filename + ".gz"
    with open(target_fullpath_w_filename, "rb") as target_filestream, \
            mgzip.open(output_fullpath_w_filename, "wb",
                       compresslevel=9) as output_gz:
        data = target_filestream.read()
        output_gz.write(data)
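For larger inputs, the same copy can be parallelized with the thread and blocksize keywords that the later examples pass to mgzip.open; a hedged sketch, assuming an 8-thread pool and 20 MB blocks:

import mgzip


def compress_gzip_parallel(target_fullpath_w_filename, threads=8):
    # Same as above, but each ~20 MB block is compressed by a worker thread.
    output_fullpath_w_filename = target_fullpath_w_filename + ".gz"
    with open(target_fullpath_w_filename, "rb") as target_filestream, \
            mgzip.open(output_fullpath_w_filename, "wb", compresslevel=9,
                       thread=threads, blocksize=2 * 10**7) as output_gz:
        output_gz.write(target_filestream.read())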
Example 3
    def writeOutPredictionDict(self, dumping_data, outfilename):
        if not str(outfilename).endswith('.bin.gz'):
            outfilename = os.path.splitext(outfilename)[0] + '.bin.gz'

        with mgzip.open(outfilename, 'wb', thread=8,
                        blocksize=2 * 10**7) as f2:
            pickle.dump(dumping_data, f2)
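The pickled payload can be read back through the matching mgzip call, as the analysis examples further down do; a minimal sketch, assuming the same .bin.gz file name:

import pickle

import mgzip


def read_prediction_dict(outfilename):
    # Inverse of the dump above: decompress with a thread pool and unpickle.
    with mgzip.open(outfilename, 'rb', thread=8) as f2:
        return pickle.load(f2)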
Example 4
def test_read_rb(tmpdir):

    filename = os.path.join(tmpdir, "test.gz")
    with gzip.open(filename, "wb") as f1:
        f1.write(DATA1 * 500)

    with mgzip.open(filename, "rb") as f2:
        file_content = f2.read()
    assert file_content == DATA1 * 500
Example 5
def _cmp(fnm, lvl):
    #print('>>>'+fnm)
    fh = open(fnm, "rb")
    #gh = mgzip.open(fnm + ".gz", "wb", compresslevel=lvl)
    #thread=8, blocksize=2*10**8
    #gh = gzip.open(fnm + ".gz", "wb", compresslevel=lvl)
    gh = mgzip.open(fnm + ".gz", "wb", compresslevel=lvl, blocksize=10**6)
    data = fh.read()
    fh.close()
    gh.write(data)
    gh.close()
Example 6
def main(files, pdfpath, dumppath, soft, database_table_prefix, run_for=-1):
    global dataset_analysis_dict, fake_max_iou_values
    file_index = 0

    t1 = time.time()
    for file in files:
        print("\nFILE\n", file_index)
        with gzip.open(file, 'rb') as f:
            data_loaded = pickle.load(f)
            # print("XYZ", len(data_dict['features']), len(data_dict['predicted']), len(data_dict['truth']))
            file_results = analyse_multiple_endcaps_multi_cpu(
                data_loaded,
                soft=soft,
                beta_threshold=beta_threshold,
                distance_threshold=distance_threshold,
                iou_threshold=iou_threshold)
            for r in file_results:
                append_endcap_dict_to_dataset_dict(dataset_analysis_dict, r)
            # analyse_one_file(data_loaded, soft=soft)
            if file_index == run_for - 1:
                break
            file_index += 1

    print("It took", time.time() - t1, "seconds")

    if len(dumppath) > 0:
        print("Dumping analysis to bin file", dumppath)

        with mgzip.open(dumppath, 'wb', thread=8, blocksize=2 * 10**7) as f:
            pickle.dump(dataset_analysis_dict, f)
    else:
        print(
            "WARNING: No analysis output path specified. Skipped dumping of analysis."
        )

    # print("Number of total fakes is ", num_total_fakes)

    # np.savetxt('max_fake_iou.txt', fake_max_iou_values, delimiter=',')
    # 0/0

    plotter = HGCalAnalysisPlotter()

    plotter.add_data_from_analysis_dict(dataset_analysis_dict)
    if len(pdfpath) != 0:
        plotter.write_to_pdf(pdfpath)

    if len(database_table_prefix) != 0:
        print("Will write plots to database")
        database_manager = ExperimentDatabaseManager(
            mysql_credentials=sql_credentials.credentials, cache_size=40)
        database_manager.set_experiment('analysis_plotting_experiments')
        plotter.write_data_to_database(database_manager, database_table_prefix)
        database_manager.close()
Example 7
    def compress(file_path, np=1):
        """Compress a file in .gz."""
        # Note: this will open the whole file in memory
        #  this might not be the best idea
        with open(file_path, 'r') as fr:
            file_string = fr.read()

        with mgzip.open(f'{file_path}.gz', "wt", thread=np) as fw:
            fw.write(file_string)

        os.remove(file_path)
Example 8
def gunzip_file(
    input_filename: str,
    output_filename: str,
    blocksize: int = 5 * 1024 * 1024,
    threads: Optional[int] = None,
) -> str:
    """Gzip a file using mgzip for multithreading."""
    with open(output_filename, mode="wb") as f_out:
        with mgzip.open(input_filename,
                        mode="rb",
                        blocksize=blocksize,
                        thread=threads) as f_in:
            shutil.copyfileobj(f_in, f_out, length=blocksize // 2)
    return output_filename
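The compression direction can reuse the same streaming pattern; a hedged sketch (gzip_file is a hypothetical counterpart, with the keyword names taken from the call above):

import shutil
from typing import Optional

import mgzip


def gzip_file(
    input_filename: str,
    output_filename: str,
    blocksize: int = 5 * 1024 * 1024,
    threads: Optional[int] = None,
) -> str:
    """Gzip a file using mgzip for multithreading (sketch)."""
    with open(input_filename, mode="rb") as f_in:
        with mgzip.open(output_filename,
                        mode="wb",
                        blocksize=blocksize,
                        thread=threads) as f_out:
            shutil.copyfileobj(f_in, f_out, length=blocksize // 2)
    return output_filename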
Example 9
def test_write_wb(tmpdir):

    filename = os.path.join(tmpdir, "test.gz")
    with mgzip.open(filename, "wb", compresslevel=6) as f1:
        f1.write(DATA1 * 50)
        # Try flush and fileno.
        f1.flush()
        f1.fileno()
        if hasattr(os, "fsync"):
            os.fsync(f1.fileno())
        f1.close()
    f1.close()

    assert os.path.exists(filename)
    with gzip.open(filename, "rb") as f2:
        file_content = f2.read()
    assert file_content == DATA1 * 50
Example 10
    def make_map(self, file_):
        """construct a dictionary of pid - (batchnum, line#)"""
        file_, allowed_pids = file_
        use_allowed_pids = len(allowed_pids) > 0
        batchnum = int(file_.name.replace(".jsonl.gz", ""))

        pid2idx = {}
        result = set()
        with gzip.open(str(file_), "r") as f:
            for idx, line in enumerate(f):
                pid = re.search(PID_PAT, line.decode("utf8")).group(1)
                result.add(pid)
                pid2idx[pid] = idx
            if use_allowed_pids:
                result = list(result.intersection(allowed_pids))
        for pid in result:
            self.results[pid] = (batchnum, pid2idx[pid])
Example 11
    def make_map(self, file_):
        """construct a dictionary of pid - (batchnum, line#)"""
        file_, allowed_pids = file_
        use_allowed_pids = len(allowed_pids) > 0
        batchnum = int(
            file_.name.replace(".jsonl.gz", "").replace("pdf_parses_", ""))

        pid2idx = {}
        result = set()
        with gzip.open(str(file_), "r") as f:
            for idx, line in enumerate(f):
                obj = json.loads(line)
                result.add(obj["paper_id"])
                pid2idx[obj["paper_id"]] = idx
            if use_allowed_pids:
                result = list(result.intersection(allowed_pids))
        for pid in result:
            self.results[pid] = (batchnum, pid2idx[pid])
Example 12
def backup(path: str, callback=None, block_size=1024 * 1024):
    """
    Performs a gzipped copy of the device containing / and /boot to an
    external drive.

    :param path: Path to the destination
    :param callback: Optional callback function for progress reporting
    :param block_size: Amount of data in bytes to read/write at a time
    :return: Tuple of 2 strings: destination-path/file and elapsed
            backup time (HH:MM:SS)
    """
    device_size = shutil.disk_usage('/')[0] + shutil.disk_usage('/boot')[0]
    # device_size = 100*1024*1024
    now = datetime.datetime.now().strftime('%Y%m%dT%H%M%S')
    zip_file = f"{os.uname()[1]}_{round(device_size / 1000000000)}GB_{now}.gz"
    copied = 0
    start = int(time.time())
    try:
        with open(disk, 'rb') as file_in, \
                mgzip.open(f"{path}/{zip_file}", 'wb', blocksize=block_size) as file_out:
            while run_backup:
                block = file_in.read(block_size)
                # if not block:
                if not block or copied >= device_size:
                    break
                file_out.write(block)
                copied += block_size
                if callback:
                    # Make sure copied doesn't exceed device_size,
                    # which will likely happen on the last block.
                    callback(min(max(copied, 0), device_size),
                             total=device_size)
    except IOError:
        # print("I/O ERROR({0}): {1}".format(e.errno, e.strerror),
        #       file=sys.stderr)
        return f"{path}/{zip_file}", None
    # except: #handle other exceptions such as attribute errors
    #     print("Unexpected error:", sys.exc_info()[0])
    if os.path.isfile(f"{path}/{zip_file}") and not run_backup:
        os.remove(f"{path}/{zip_file}")
        return f"{path}/{zip_file}", None
    end = int(time.time())
    elapsed = end - start
    return f"{path}/{zip_file}", f"{datetime.timedelta(seconds=elapsed)}"
Example 13
    def __createTensileBenchmarkContainer(baseImage, dockerFilePath, tag, outDir, logDir, tensileFork, tensileBranch, tensileCommit):
        """
        Build a docker container with a specific
        ROCm image, Tensile branch and tag. Docker
        will pre-build the Tensile Client and
        configure the container to run the benchmark.
        """
        # Save stdout and stderr to file
        with open(os.path.join(logDir, "dockerBuildLog.log"), 'w') as logFile:

            # Docker build command
            buildCmd = str("\
                docker build \
                -t {0} \
                --pull -f {1} \
                --build-arg user_uid=$UID \
                --build-arg base_image={2} \
                --build-arg tensile_fork={3} \
                --build-arg tensile_branch={4} \
                --build-arg tensile_commit={5} \
                . ").format(tag, dockerFilePath, baseImage, tensileFork, tensileBranch, tensileCommit)

            # Build container and save output streams
            print("Building docker image: {0} ...".format(tag))
            print(buildCmd)
            subprocess.check_call(shlex.split(buildCmd), stdout=logFile, stderr=logFile)
            print("Done building docker image!")

            # Docker save command
            imageBaseName = tag.split(':')[0]
            archivePath = os.path.join(outDir, imageBaseName + str(".tar.gz"))
            saveCmd = str("docker save {0}").format(tag)

            # Docker will save .tar binary to stdout as long as it's not attached to console.
            # Pipe stdout into gzip to get smaller .tar.gz archive
            print("Saving docker image: {0}  to {1} ...".format(tag, archivePath))

            with gzip.open(archivePath, 'wb') as zipFile:
                with subprocess.Popen(shlex.split(saveCmd), stdout=subprocess.PIPE, stderr=logFile) as proc:
                    zipFile.write(proc.stdout.read())

            print("Done saving docker image!")
Example 14
    def filter_ids_complete(self, file_: Path):
        """go over the metadata and get the in/out-bound citation data for each VALID
        paper. Validity is determined by whether the paper has a proper grobid parse,
        plus the existence of at least one of in/out citations.

        This method accumulates data into the state, as it won't take so much space.

        Used by filtering/filter_ids.py.
        """
        file_, valid_pids, min_cite, max_cite, seed = file_
        # make sure the shuffling below is reproducible
        random.seed(seed)

        with gzip.open(str(file_), "r") as f:
            # metadata_ID.jsonl.gz
            fname = int(
                file_.name.replace(".jsonl.gz", "").replace("metadata_", ""))
            self.results[fname] = []
            for line in f:
                obj = json.loads(line)
                # has_pdf_parse, has_pdf_parsed_(bib_entries, abstract, body_text)
                if (valid_pids.get(obj["paper_id"], False)
                        and obj["has_inbound_citations"]
                        and obj["has_outbound_citations"]
                        and len(obj["inbound_citations"]) > min_cite
                        and len(obj["outbound_citations"]) > min_cite):
                    # filter valid citations
                    ibc = [
                        pid for pid in obj["inbound_citations"]
                        if valid_pids.get(pid, False)
                    ]
                    obc = [
                        pid for pid in obj["outbound_citations"]
                        if valid_pids.get(pid, False)
                    ]
                    random.shuffle(ibc)
                    random.shuffle(obc)
                    if len(ibc) > 0 and len(obc) > 0:
                        self.results[fname].append(
                            (obj["paper_id"], ibc[:max_cite], obc[:max_cite]))
Example 15
    def filter_ids_text(self, file_: Path):
        """go over the metadata and get the list of paper_ids for each paper with
        pdf_parse text. Validity is determined by whether the paper has a proper grobid parse.
        Used to create a list of parseable papers.

        This method accumulates data into the state, as it won't take so much space.

        Used by filtering/filter_ids.py.
        """
        # ignore irrelevant entry (cf. filter_ids.py)
        file_, _ = file_

        with gzip.open(str(file_), "r") as f:
            # metadata_ID.jsonl.gz
            fname = int(
                file_.name.replace(".jsonl.gz", "").replace("metadata_", ""))
            self.results[fname] = []
            for line in f:
                obj = json.loads(line)
                if (obj["has_pdf_parse"] and obj["has_pdf_parsed_abstract"]
                        and obj["has_pdf_parsed_bib_entries"]
                        and obj["has_pdf_parsed_body_text"]):
                    self.results[fname].append(obj["paper_id"])
Example 16
def test_pool_close(tmpdir):

    filename = os.path.join(tmpdir, "test.gz")
    fh = mgzip.open(filename, "wb", compresslevel=6, thread=4, blocksize=128)
    fh.write(DATA1 * 500)
    if sys.version_info >= (3, 8):
        assert (repr(fh.pool) ==
                "<multiprocessing.pool.ThreadPool state=RUN pool_size=4>")
    fh.close()
    assert fh.fileobj is None
    assert fh.myfileobj is None
    assert fh.pool_result == []
    if sys.version_info >= (3, 8):
        assert (repr(fh.pool) ==
                "<multiprocessing.pool.ThreadPool state=CLOSE pool_size=4>")
    if sys.version_info >= (3, 7):
        with pytest.raises(ValueError) as excinfo:
            fh.pool.apply(print, ("x", ))
        assert "Pool not running" in str(excinfo.value)
    else:
        with pytest.raises(AssertionError) as excinfo:
            fh.pool.apply(print, ("x", ))
        assert "" == str(excinfo.value)
Example 17
    def open(cls,
             column_names: typing.List[str],
             file_path: typing.Optional[Path],
             who: str = "output",
             require_all_columns: bool = True,
             prohibit_extra_columns: bool = True,
             fill_missing_columns: bool = False,
             error_file: typing.TextIO = sys.stderr,
             header_error_action: ValidationAction = ValidationAction.EXIT,
             use_mgzip: bool = False,
             mgzip_threads: int = MGZIP_THREAD_COUNT_DEFAULT,
             gzip_in_parallel: bool = False,
             gzip_queue_size: int = GZIP_QUEUE_SIZE_DEFAULT,
             column_separator: str = KgtkFormat.COLUMN_SEPARATOR,
             mode: Mode = Mode.AUTO,
             output_format: typing.Optional[str] = None,
             output_column_names: typing.Optional[typing.List[str]] = None,
             old_column_names: typing.Optional[typing.List[str]] = None,
             new_column_names: typing.Optional[typing.List[str]] = None,
             verbose: bool = False,
             very_verbose: bool = False) -> "KgtkWriter":

        if file_path is None or str(file_path) == "-":
            if verbose:
                print("KgtkWriter: writing stdout",
                      file=error_file,
                      flush=True)

            if output_format is None:
                output_format = cls.OUTPUT_FORMAT_DEFAULT

            return cls._setup(
                column_names=column_names,
                file_path=None,
                who=who,
                file_out=sys.stdout,
                require_all_columns=require_all_columns,
                prohibit_extra_columns=prohibit_extra_columns,
                fill_missing_columns=fill_missing_columns,
                error_file=error_file,
                header_error_action=header_error_action,
                use_mgzip=use_mgzip,
                mgzip_threads=mgzip_threads,
                gzip_in_parallel=gzip_in_parallel,
                gzip_queue_size=gzip_queue_size,
                column_separator=column_separator,
                mode=mode,
                output_format=output_format,
                output_column_names=output_column_names,
                old_column_names=old_column_names,
                new_column_names=new_column_names,
                verbose=verbose,
                very_verbose=very_verbose,
            )

        if str(file_path).startswith(">"):
            fd: int = int(str(file_path)[1:])
            if verbose:
                print("%s: writing file descriptor %d" % (who, fd),
                      file=error_file,
                      flush=True)

            if output_format is None:
                output_format = cls.OUTPUT_FORMAT_DEFAULT

            return cls._setup(
                column_names=column_names,
                file_path=file_path,
                who=who,
                file_out=open(fd, "w"),
                require_all_columns=require_all_columns,
                prohibit_extra_columns=prohibit_extra_columns,
                fill_missing_columns=fill_missing_columns,
                error_file=error_file,
                header_error_action=header_error_action,
                use_mgzip=use_mgzip,
                mgzip_threads=mgzip_threads,
                gzip_in_parallel=gzip_in_parallel,
                gzip_queue_size=gzip_queue_size,
                column_separator=column_separator,
                mode=mode,
                output_format=output_format,
                output_column_names=output_column_names,
                old_column_names=old_column_names,
                new_column_names=new_column_names,
                verbose=verbose,
                very_verbose=very_verbose,
            )

        if verbose:
            print("File_path.suffix: %s" % file_path.suffix,
                  file=error_file,
                  flush=True)

        if file_path.suffix in [".gz", ".bz2", ".xz", ".lz4"]:
            # TODO: find a better way to coerce typing.IO[Any] to typing.TextIO
            gzip_file: typing.TextIO
            if file_path.suffix == ".gz":
                if use_mgzip:
                    if verbose:
                        print("KgtkWriter: writing gzip with %d threads: %s" %
                              (mgzip_threads, str(file_path)),
                              file=error_file,
                              flush=True)
                    import mgzip
                    gzip_file = mgzip.open(
                        str(file_path), mode="wt",
                        thread=mgzip_threads)  # type: ignore
                else:
                    if verbose:
                        print("KgtkWriter: writing gzip %s" % str(file_path),
                              file=error_file,
                              flush=True)
                    import gzip
                    gzip_file = gzip.open(file_path, mode="wt")  # type: ignore

            elif file_path.suffix == ".bz2":
                if verbose:
                    print("KgtkWriter: writing bz2 %s" % str(file_path),
                          file=error_file,
                          flush=True)
                import bz2
                gzip_file = bz2.open(file_path, mode="wt")  # type: ignore

            elif file_path.suffix == ".xz":
                if verbose:
                    print("KgtkWriter: writing lzma %s" % str(file_path),
                          file=error_file,
                          flush=True)
                import lzma
                gzip_file = lzma.open(file_path, mode="wt")  # type: ignore

            elif file_path.suffix == ".lz4":
                if verbose:
                    print("KgtkWriter: writing lz4 %s" % str(file_path),
                          file=error_file,
                          flush=True)
                import lz4.frame  # type: ignore
                gzip_file = lz4.frame.open(file_path,
                                           mode="wt")  # type: ignore
            else:
                # TODO: throw a better exception.
                raise ValueError("Unexpected file_path.suffiz = '%s'" %
                                 file_path.suffix)

            if output_format is None:
                if len(file_path.suffixes) < 2:
                    output_format = cls.OUTPUT_FORMAT_DEFAULT
                else:
                    format_suffix: str = file_path.suffixes[-2]
                    if format_suffix == ".md":
                        output_format = cls.OUTPUT_FORMAT_MD
                    elif format_suffix == ".csv":
                        output_format = cls.OUTPUT_FORMAT_CSV
                    elif format_suffix == ".json":
                        output_format = cls.OUTPUT_FORMAT_JSON
                    elif format_suffix == ".jsonl":
                        output_format = cls.OUTPUT_FORMAT_JSONL
                    else:
                        output_format = cls.OUTPUT_FORMAT_DEFAULT

            return cls._setup(
                column_names=column_names,
                file_path=file_path,
                who=who,
                file_out=gzip_file,
                require_all_columns=require_all_columns,
                prohibit_extra_columns=prohibit_extra_columns,
                fill_missing_columns=fill_missing_columns,
                error_file=error_file,
                header_error_action=header_error_action,
                use_mgzip=use_mgzip,
                mgzip_threads=mgzip_threads,
                gzip_in_parallel=gzip_in_parallel,
                gzip_queue_size=gzip_queue_size,
                column_separator=column_separator,
                mode=mode,
                output_format=output_format,
                output_column_names=output_column_names,
                old_column_names=old_column_names,
                new_column_names=new_column_names,
                verbose=verbose,
                very_verbose=very_verbose,
            )

        else:
            if output_format is None:
                if file_path.suffix == ".md":
                    output_format = cls.OUTPUT_FORMAT_MD
                elif file_path.suffix == ".csv":
                    output_format = cls.OUTPUT_FORMAT_CSV
                elif file_path.suffix == ".json":
                    output_format = cls.OUTPUT_FORMAT_JSON
                elif file_path.suffix == ".jsonl":
                    output_format = cls.OUTPUT_FORMAT_JSONL
                else:
                    output_format = cls.OUTPUT_FORMAT_DEFAULT

            if verbose:
                print("KgtkWriter: writing file %s" % str(file_path),
                      file=error_file,
                      flush=True)
            return cls._setup(
                column_names=column_names,
                file_path=file_path,
                who=who,
                file_out=open(file_path, "w"),
                require_all_columns=require_all_columns,
                prohibit_extra_columns=prohibit_extra_columns,
                fill_missing_columns=fill_missing_columns,
                error_file=error_file,
                header_error_action=header_error_action,
                use_mgzip=use_mgzip,
                mgzip_threads=mgzip_threads,
                gzip_in_parallel=gzip_in_parallel,
                gzip_queue_size=gzip_queue_size,
                column_separator=column_separator,
                mode=mode,
                output_format=output_format,
                output_column_names=output_column_names,
                old_column_names=old_column_names,
                new_column_names=new_column_names,
                verbose=verbose,
                very_verbose=very_verbose,
            )
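A hedged usage sketch of the factory above: with a .gz suffix and use_mgzip=True, the branch shown routes output through mgzip.open with the requested thread count (the column names and path below are placeholders):

from pathlib import Path

# Hypothetical call; columns and file name are illustrative only.
writer = KgtkWriter.open(
    column_names=["node1", "label", "node2"],
    file_path=Path("edges.tsv.gz"),   # .gz suffix selects the gzip branch
    use_mgzip=True,                   # ...and mgzip within that branch
    mgzip_threads=4,
)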
Example 18
def analyse(preddir, pdfpath, beta_threshold, distance_threshold, iou_threshold, matching_mode, analysisoutpath, nfiles,
            local_distance_scaling, is_soft, op, de_e_cut, angle_cut, kill_pu=True):
    hits2showers = OCHits2Showers(beta_threshold, distance_threshold, is_soft, local_distance_scaling, op=op)
    showers_matcher = ShowersMatcher(matching_mode, iou_threshold, de_e_cut, angle_cut)

    files_to_be_tested = [os.path.join(preddir, x) for x in os.listdir(preddir) if x.endswith('.bin.gz')]
    if nfiles!=-1:
        files_to_be_tested = files_to_be_tested[0:min(nfiles, len(files_to_be_tested))]

    showers_dataframe = pd.DataFrame()
    event_id = 0

    for i, file in enumerate(files_to_be_tested):
        print("Analysing file", i, file)
        with mgzip.open(file, 'rb') as f:
            file_data = pickle.load(f)
            for j, endcap_data in enumerate(file_data):
                print("Analysing endcap",j)
                stopwatch = time.time()
                features_dict, truth_dict, predictions_dict = endcap_data
                processed_pred_dict, pred_shower_alpha_idx = hits2showers.call(features_dict, predictions_dict)
                print('took',time.time()-stopwatch,'s for inference clustering')
                stopwatch = time.time()
                showers_matcher.set_inputs(
                    features_dict=features_dict,
                    truth_dict=truth_dict,
                    predictions_dict=processed_pred_dict,
                    pred_alpha_idx=pred_shower_alpha_idx
                )
                showers_matcher.process()
                print('took',time.time()-stopwatch,'s to match')
                stopwatch = time.time()
                dataframe = showers_matcher.get_result_as_dataframe()
                print('took',time.time()-stopwatch,'s to make data frame')
                dataframe['event_id'] = event_id
                event_id += 1
                if kill_pu:
                    from globals import pu
                    if len(dataframe[dataframe['truthHitAssignementIdx']>=pu.t_idx_offset]):
                        print('\nWARNING REMOVING PU TRUTH MATCHED SHOWERS, HACK.\n')
                        dataframe = dataframe[dataframe['truthHitAssignementIdx']<pu.t_idx_offset]
                showers_dataframe = pd.concat((showers_dataframe, dataframe))

    # This is only to write to pdf files
    scalar_variables = {
        'beta_threshold': str(beta_threshold),
        'distance_threshold': str(distance_threshold),
        'iou_threshold': str(iou_threshold),
        'matching_mode': str(matching_mode),
        'is_soft': str(is_soft),
        'de_e_cut': str(de_e_cut),
        'angle_cut': str(angle_cut),
    }

    if len(analysisoutpath) > 0:
        analysis_data = {
            'showers_dataframe' : showers_dataframe,
            'events_dataframe' : None,
            'scalar_variables' : scalar_variables,
        }
        with gzip.open(analysisoutpath, 'wb') as f:
            print("Writing dataframes to pickled file",analysisoutpath)
            pickle.dump(analysis_data,f)

    if len(pdfpath)>0:
        plotter = HGCalAnalysisPlotter()
        plotter.set_data(showers_dataframe, None, '', pdfpath, scalar_variables=scalar_variables)
        plotter.process()
Example 19
    def _fetch(self):
        training_performance_metrics = None
        if not self.ignore_cache:
            if os.path.exists(self.cache_path):
                with mgzip.open(self.cache_path, 'rb') as f:
                    dumping_data = pickle.load(f)
                    print(dumping_data['experiment_name'])
                    if dumping_data['experiment_name'] == self.experiment_name:
                        training_performance_metrics = dumping_data['data']
                        print("Loaded data from cache...")
                    else:
                        print(
                            "Cache doesn't contain this experiment, will have to re-fetch."
                        )

        condition_string = None
        if training_performance_metrics is not None:
            old_exp_names = np.unique(
                training_performance_metrics['experiment_name']).tolist()
            old_max_iterations = [
                np.max(
                    np.array(training_performance_metrics['iteration'])[
                        np.char.equal(
                            training_performance_metrics['experiment_name'],
                            expn)]) for expn in old_exp_names
            ]

            condition_string = '(%s)' % ' OR '.join([
                "(experiment_name='%s' and iteration > '%d')" %
                (exp_n, iteration)
                for exp_n, iteration in zip(old_exp_names, old_max_iterations)
            ])
            # condition_string = '(%s)' % condition_string

        if self.experiment_name is not None:
            experiment_name = str(self.experiment_name).split(',')
            if len(experiment_name) == 1:
                experiment_name = experiment_name[0]
        else:
            experiment_name = self.experiment_name

        if _debug:
            print("Going to fetch from server")
        new_data = self.reading_manager.get_data(
            '%s' % self.database_table_name,
            experiment_names=experiment_name,
            condition_string=condition_string)
        if _debug:
            print("Fetch from server complete")

        if new_data is not None and training_performance_metrics is not None:
            training_performance_metrics = self._combine(
                training_performance_metrics, new_data)
        elif new_data is not None and training_performance_metrics is None:
            training_performance_metrics = new_data
        if not self.ignore_cache:
            with mgzip.open(self.cache_path, 'wb') as f:
                dumping_data = {
                    'experiment_name': self.experiment_name,
                    'data': training_performance_metrics
                }
                pickle.dump(dumping_data, f)

        if training_performance_metrics is None:
            print(
                "Experiment not found in your configured database; the following experiments were found:"
            )

            available_experiment_names = self.reading_manager.get_data_from_query(
                'SELECT DISTINCT(experiment_name) FROM %s' %
                self.database_table_name)
            available_experiment_names = [
                x[0] for x in available_experiment_names
            ]
            print(available_experiment_names)
            raise TrainingMetricPlots.ExperimentNotFoundError(
                "Experiment not found in your configured database")

        self.training_performance_metrics = training_performance_metrics
Example 20
    def bundle(self,
               firmware_hash: str,
               datasets: List[Dataset],
               *,
               file: Union[str, BinaryIO, IO[bytes]],
               shard_spec: ShardSpec = None,
               delta_to: Dict[str, str] = None,
               overwrite: bool = False) -> List[ObjectInfo]:
        """
        Builds a data bundle (*.tar.gz) for a firmware hash, including content from the
        specified datasets (FWAN plugin output locations)

        :param firmware_hash: The firmware hash to bundle
        :param datasets: The datasets to include in the bundle
        :param file: The output path or file-like-object to which the *.tar.gz output should be written
        :param shard_spec: If provided, only the specified shard of file hashes will appear in the bundle
        :param delta_to: A dictionary of path->etag values, which if supplied will cause the bundle to be built as a
        delta to that set, meaning only new objects or objects with modified etags will appear in the bundle.
        :param overwrite: The output path will not be overwritten, to prevent accidental data loss, unless this is set
        :return: A list of the object included in the bundle
        """
        if not firmware_hash:
            raise ValueError('firmware_hash must be specified')

        if not datasets:
            raise ValueError('datasets must be specified, and non-empty')

        if delta_to is None:
            delta_to = {}

        logger.info(
            f"Building {'delta' if delta_to else ''} bundle for {firmware_hash}"
        )

        contents: List[ObjectInfo] = []

        with mgzip.open(filename=file,
                        mode='w' if overwrite else 'x') as gz, tarfile.open(
                            fileobj=gz,
                            mode='w',
                            bufsize=tarfile.RECORDSIZE * 4) as tar:
            # Fetch and process the file tree, using that as the basis for all other paths that need to be bundled.

            file_tree_path = f'file_tree/{firmware_hash}.jsonl'

            with CodeTimer('Read firmware file tree from object storage'):
                try:
                    file_tree_result = fetch_object(
                        bucket=self.firmware_metadata_bucket,
                        key=file_tree_path)
                except ClientError as e:
                    raise Exception(
                        'Firmware file tree could not be read') from e

            with CodeTimer('Extract file hashes from file tree'):
                try:
                    file_hashes = extract_file_hashes(file_tree_result.payload)
                except json.JSONDecodeError as e:
                    raise Exception(
                        'Firmware file tree could not be parsed') from e

            if not file_hashes:
                raise Exception('Firmware file tree is empty')

            file_tree_in_bundle = False

            if is_dataset_in_shard(
                    dataset=FILE_TREE_DATASET, shard_spec=shard_spec
            ) and file_tree_result.info.etag != delta_to.get(file_tree_path):
                with CodeTimer('Add file tree to bundle'):
                    add_to_tarfile(tar, file_tree_result)
                    contents.append(file_tree_result.info)
                    file_tree_in_bundle = True
            else:
                logger.info(
                    'File tree is unchanged or not part of this shard and will not be included in the bundle'
                )

            file_tree_size = file_tree_result.info.size

            logger.info(
                'File tree num distinct file hashes = {hash_count}; size = {size}'
                .format(hash_count=len(file_hashes),
                        size=naturalsize(file_tree_size)))

            if shard_spec:
                with CodeTimer(
                        f'Limiting file hashes to shard {shard_spec.index}'):

                    def is_in_shard(file_hash: str) -> bool:
                        return int(file_hash,
                                   16) % shard_spec.count == shard_spec.index

                    file_hashes = [
                        file_hash for file_hash in file_hashes
                        if is_in_shard(file_hash)
                    ]
                logger.info(f'Sharded num file hashes = {len(file_hashes)}')

            file_tree_result = None

            # Build paths to be bundled

            with CodeTimer('Build paths for bundle'):
                bundle_datasets = [
                    ds for ds in datasets
                    if is_dataset_in_shard(dataset=ds, shard_spec=shard_spec)
                ]
                paths = self.build_paths(firmware_hash=firmware_hash,
                                         datasets=bundle_datasets,
                                         file_hashes=file_hashes,
                                         delta_to=delta_to) or []
                path_count = len(paths)

            # Validate the paths (check for duplicates)

            with CodeTimer('Validate paths'):
                duplicates = [
                    path for path, count in collections.Counter(paths).items()
                    if count > 1
                ]
                if duplicates:
                    raise Exception(
                        f'Bundle paths contained {len(duplicates)} duplicates: {duplicates}'
                    )

            total_path_count = path_count + (1 if file_tree_in_bundle else 0)

            logger.info(
                f'Bundle will include at most {total_path_count} paths from object storage'
            )

            fetch_count = 0
            miss_count = 0
            skip_count = 0
            fetch_bytes = 0

            with CodeTimer('Bundle objects'):
                with concurrent.futures.ThreadPoolExecutor(
                        max_workers=self.max_workers) as executor:
                    fetch_start = datetime.datetime.now()

                    with CodeTimer(
                            'Submit object storage path retrieval tasks'):
                        # Randomize path ordering to improve the performance of fetches from object storage,
                        # so that a diversity of object key prefixes is being fetched at any one time.

                        random.shuffle(paths)

                        futures = [
                            executor.submit(
                                fetch_object,
                                bucket=self.firmware_metadata_bucket,
                                key=path,
                                compare_etag=delta_to.get(path),
                            ) for path in paths
                        ]

                    for future in concurrent.futures.as_completed(futures):
                        try:
                            result = future.result()

                            if result:
                                add_to_tarfile(tar, result)
                                result.payload = None
                                fetch_count += 1
                                fetch_bytes += result.info.size
                                contents.append(result.info)
                            else:
                                skip_count += 1

                            if fetch_count % 1000 == 0:
                                logger.info(
                                    'Bundled {} objects ({}) in {}'.format(
                                        fetch_count, naturalsize(fetch_bytes),
                                        naturaldelta(datetime.datetime.now() -
                                                     fetch_start)))
                        except ClientError as e:
                            error_code = e.response.get('Error',
                                                        {}).get('Code')
                            if 'NoSuchKey' in error_code:
                                miss_count += 1
                            elif '304' == error_code:
                                # The ETag on this object was not modified, so it was not returned
                                skip_count += 1
                            else:
                                raise e

        if skip_count:
            logger.info(f'Skipped {skip_count} unmodified objects')

        if miss_count:
            logger.info(
                f"Made {miss_count} attempts to access object storage paths that didn't exist"
            )

        logger.info('Bundled {} objects ({})'.format(
            fetch_count + (1 if file_tree_in_bundle else 0),
            naturalsize(fetch_bytes +
                        (file_tree_size if file_tree_in_bundle else 0))))

        # Validate fetched paths (check that each path was uniquely processed)

        with CodeTimer('Validate fetched paths'):
            fetched_path_counter = collections.Counter(
                [obj.path for obj in contents])
            duplicates = [
                path for path, count in fetched_path_counter.items()
                if count > 1
            ]
            if duplicates:
                raise Exception(
                    f'Bundle paths contained {len(duplicates)} duplicates: {duplicates}'
                )

        with CodeTimer('Finalize output'):
            contents = sorted(contents, key=lambda obj: obj.path)

        return contents
Example 21
def get_vasp_dirs():
    ctx = click.get_current_context()
    run = ctx.parent.parent.params["run"]
    nmax = ctx.parent.params["nmax"]
    pattern = ctx.parent.params["pattern"]
    reorg = ctx.parent.params["reorg"]

    base_path = ctx.parent.params["directory"].rstrip(os.sep)
    base_path_index = len(base_path.split(os.sep))
    if pattern:
        pattern_split = pattern.split(os.sep)
        pattern_split_len = len(pattern_split)

    counter = 0
    for root, dirs, files in os.walk(base_path, topdown=True):
        if counter == nmax:
            break

        level = len(root.split(os.sep)) - base_path_index
        if pattern and dirs and pattern_split_len > level:
            p = pattern_split[level]
            dirs[:] = [d for d in dirs if fnmatch(d, p)]

        for d in dirs:
            dn = os.path.join(root, d)
            st = os.stat(dn)
            if not bool(st.st_mode & perms):
                raise EmmetCliError(
                    f"Insufficient permissions {st.st_mode} for {dn}.")

        if is_vasp_dir(files):
            gzipped = False
            for f in files:
                fn = os.path.join(root, f)
                if os.path.islink(fn):
                    if run:
                        os.unlink(fn)
                        logger.warning(f"Unlinked {fn}.")
                    else:
                        logger.warning(f"Would unlink {fn}.")
                    continue

                st = os.stat(fn)
                if not bool(st.st_mode & perms):
                    raise EmmetCliError(
                        f"Insufficient permissions {st.st_mode} for {fn}.")

                if run and not f.endswith(".gz"):
                    fn_gz = fn + ".gz"
                    if os.path.exists(fn_gz):
                        os.remove(fn_gz)  # remove left-over gz (cancelled job)

                    with open(fn, "rb") as fo, mgzip.open(fn_gz,
                                                          "wb",
                                                          thread=0) as fw:
                        fw.write(fo.read())

                    os.remove(fn)  # remove original
                    shutil.chown(fn_gz, group="matgen")
                    gzipped = True

            # NOTE skip symlink'ing on MP calculations from the early days
            vasp_dir = get_symlinked_path(root,
                                          base_path_index) if reorg else root
            create_orig_inputs(vasp_dir)
            dirs[:] = []  # don't descend further (i.e. ignore relax1/2)
            logger.log(logging.INFO if gzipped else logging.DEBUG, vasp_dir)
            yield vasp_dir
            counter += 1

    return counter
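The gzip-in-place step inside the walk above can be factored into a small helper; a sketch with the same mgzip call (the shutil.chown to the matgen group is site-specific and omitted here):

import os

import mgzip


def gzip_in_place(fn: str, threads: int = 0) -> str:
    """Compress fn to fn.gz with mgzip and remove the original (sketch)."""
    fn_gz = fn + ".gz"
    if os.path.exists(fn_gz):
        os.remove(fn_gz)  # remove left-over gz (cancelled job)
    with open(fn, "rb") as fo, mgzip.open(fn_gz, "wb", thread=threads) as fw:
        fw.write(fo.read())
    os.remove(fn)  # remove original
    return fn_gz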
Example 22
def run(
    input_file: KGTKFiles,
    output_file: KGTKFiles,
    entity_ids: typing.List[str],
    input_limit: int,
    output_limit: int,
    use_mgzip_for_input: bool,
    use_mgzip_for_output: bool,
    mgzip_threads_for_input: int,
    mgzip_threads_for_output: int,
):

    import simplejson as json
    import sys

    in_path = KGTKArgumentParser.get_input_file(input_file)
    out_path = KGTKArgumentParser.get_output_file(output_file)

    from gzip import GzipFile
    print("Processing.", file=sys.stderr, flush=True)

    # Open the input file first to make it easier to monitor with "pv".
    input_f: typing.Union[GzipFile, typing.IO[typing.Any]]
    if str(in_path) == "-":
        print('Processing wikidata from standard input',
              file=sys.stderr,
              flush=True)
        # It is not well documented, but this is how you read binary data
        # from stdin in Python 3.
        input_f = sys.stdin.buffer

    else:
        print('Processing wikidata file %s' % str(in_path),
              file=sys.stderr,
              flush=True)
        input_f = open(in_path, mode='rb')

        if str(in_path).endswith(".bz2"):
            import bz2
            print('Decompressing (bz2)', file=sys.stderr, flush=True)
            # TODO: Optionally use a system decompression program.
            input_f = bz2.open(input_f)

        elif str(in_path).endswith(".gz"):
            # TODO: Optionally use a system decompression program.
            if use_mgzip_for_input:
                import mgzip
                print('Decompressing (mgzip)', file=sys.stderr, flush=True)
                input_f = mgzip.open(input_f, thread=mgzip_threads_for_input)
            else:
                import gzip
                print('Decompressing (gzip)', file=sys.stderr, flush=True)
                input_f = gzip.open(input_f)

    # Now open the output file.
    output_f: typing.Union[GzipFile, typing.IO[typing.Any]]
    if str(out_path) == "-":
        print('Sending wikidata JSON to standard output',
              file=sys.stderr,
              flush=True)
        # It is not well documented, but this is how you write binary data
        # to stdout in Python 3.
        output_f = sys.stdout.buffer

    else:
        print('Writing wikidata file %s' % str(out_path),
              file=sys.stderr,
              flush=True)
        output_f = open(out_path, mode='wb')

        if str(out_path).endswith(".bz2"):
            import bz2
            print('Compressing (bz2)', file=sys.stderr, flush=True)
            # TODO: Optionally use a system compression program.
            output_f = bz2.open(output_f, "wb")

        elif str(out_path).endswith(".gz"):
            # TODO: Optionally use a compression program.
            if use_mgzip_for_output:
                import mgzip
                print('Compressing (mgzip)', file=sys.stderr, flush=True)
                output_f = mgzip.open(output_f,
                                      "wb",
                                      thread=mgzip_threads_for_output)
            else:
                import gzip
                print('Compressing (gzip)', file=sys.stderr, flush=True)
                output_f = gzip.open(output_f, "wb")

    entity_id_set: typing.Set[str] = set(entity_ids)

    output_count: int = 0
    input_count: int
    line: bytes
    for input_count, line in enumerate(input_f):
        if input_limit and input_count >= input_limit:
            break
        clean_line = line.strip()
        if clean_line.endswith(b","):
            clean_line = clean_line[:-1]
        if len(clean_line) > 1:
            obj = json.loads(clean_line)
            entity = obj["id"]
            if entity in entity_id_set:
                if output_count == 0:
                    output_f.write(b"[\n")
                else:
                    output_f.write(b",\n")
                output_f.write(clean_line)
                output_count += 1
                if output_limit is not None and output_count >= output_limit:
                    break

    print('Done processing {}'.format(str(in_path)),
          file=sys.stderr,
          flush=True)
    input_f.close()

    if output_count > 0:
        output_f.write(b"\n]\n")
    output_f.close()

    print('Wrote {} records'.format(output_count), file=sys.stderr, flush=True)
Example 23
        rate = orig_size / duration

        logger.info("Archived %s files (%sB) in %.3fs: %sB/s" %
                    (file_count, numToReadable(orig_size), duration,
                     numToReadable(rate)))

        write_manifest_file(out_manifest_file, manifest)

        start_time = time.time()
        blocksize = int(min(100 * 2**20, orig_size / threads))
        logger.info(
            "Compressing (%s threads, blocksize %sB) %s to %s" %
            (threads, numToReadable(blocksize), tarfile, destination_file))

        with mgzip.open(destination_file,
                        "wb",
                        thread=threads,
                        blocksize=blocksize) as my_gzip:
            with open(tarfile, "rb") as my_tar:
                my_gzip.write(my_tar.read())

        end_time = time.time()
        duration = end_time - start_time

        out_size = os.path.getsize(destination_file)
        rate = orig_size / duration
        logger.info("Compressed %sB into %sB in %.3fs: %sB/s" %
                    (numToReadable(orig_size), numToReadable(out_size),
                     duration, numToReadable(rate)))

        os.unlink(tarfile)
Example 24
    def gather_abstracts(self, file_):
        file_, paper_ids, output_dir, valid_lines, pid2loc = file_
        target_batchnum = int(
            file_.name.replace(".jsonl.gz", "").replace("pdf_parses_", ""))

        # load citations
        bnum_bucketed_citations = {i: [] for i in range(100)}
        for pid, (bnum, lnum) in pid2loc.items():
            bnum_bucketed_citations[bnum].append((lnum, pid))

        # batch load citations instead of reading files on the fly for each citation
        pat = re.compile(r"paper_id\":\s+\"(\d+)\"")
        pid2citation = {}
        for bnum in range(100):
            cite_valid_lines = sorted(bnum_bucketed_citations[bnum],
                                      key=lambda x: x[0])
            pdf_parse = file_.parent / file_.name.replace(
                f"{target_batchnum}", f"{bnum}")
            f_idx, l_idx = 0, 0
            with gzip.open(str(pdf_parse), "rb", thread=8) as f:
                for f_idx, line in enumerate(f):
                    if f_idx > cite_valid_lines[-1][0]:
                        break

                    if f_idx != cite_valid_lines[l_idx][0]:
                        continue
                    else:
                        l_idx += 1

                    pid = re.search(pat, line.decode("utf8")).group(1)
                    pid2citation[pid] = line

        result = []
        f_idx, l_idx = 0, 0
        with gzip.open(str(file_), "r", thread=8) as f:
            for f_idx, line in enumerate(f):
                if f_idx > valid_lines[-1]:
                    break

                if f_idx != valid_lines[l_idx]:
                    continue
                else:
                    l_idx += 1

                # json parse takes a lot of time
                obj = json.loads(line)
                pid = obj["paper_id"]

                inb_pids, outb_pids = paper_ids[pid]

                # random pick strategy
                out_data = []
                for out_pid in outb_pids:
                    out_obj = pid2citation.get(out_pid, None)
                    if out_obj is not None:
                        out_obj = json.loads(out_obj)
                    out_abs = load_bg_info(out_pid,
                                           out_obj,
                                           content="abstract")
                    # target is the paper itself, not the citation
                    bginfo = load_bg_info(out_pid, obj, content="cite_context")
                    if bginfo is None:
                        bginfo = []
                    out_data.append((out_pid, bginfo, out_abs))

                # sort by number of times a paper is cited (i.e., number of matches.)
                out_data = sorted(out_data,
                                  key=lambda x: len(x[1]),
                                  reverse=True)

                # max 10, look up 20 in case there are bunch of None's
                in_data = []
                # TODO: better inbound citation selection strategy
                for in_pid in inb_pids:
                    in_obj = pid2citation.get(in_pid, None)
                    if in_obj is not None:
                        in_obj = json.loads(in_obj)
                    cite_context = load_bg_info(pid,
                                                in_obj,
                                                content="cite_context")
                    if cite_context != []:
                        in_data.append((in_pid, cite_context))

                result.append((in_data, out_data, obj))

        with (output_dir / f"{target_batchnum}.pkl").open("wb") as f:
            pickle.dump(result, f)
Example 25
    else:
        raise Exception('Error: couldn\'t locate output folder/file')

    print(files_to_be_tested)
    pdfpath = ''
    if len(args.p) != 0:
        pdfpath = args.p

    # TODO: Remove this
    files_to_be_tested = files_to_be_tested[0:100]

    if False:
        all_data = []
        for file in files_to_be_tested:
            print("Reading", file)
            with mgzip.open(file, 'rb') as f:
                data_loaded = pickle.load(f)
                all_data.append(data_loaded)
        analysed_graphs, metadata = matching_and_analysis.OCAnlayzerWrapper(
            metadata).analyse_from_data(all_data)
    else:
        analysed_graphs, metadata = matching_and_analysis.OCAnlayzerWrapper(
            metadata).analyse_from_files(files_to_be_tested)
    plotter = hp.TrackMLPlotter()
    plotter.add_data_from_analysed_graph_list(analysed_graphs, metadata)
    if len(pdfpath) > 0:
        plotter.write_to_pdf(pdfpath=pdfpath)

    if len(args.analysisoutpath) != 0:
        with gzip.open(args.analysisoutpath, 'wb') as f:
            pickle.dump((analysed_graphs, metadata), f)
Example 26
def ff():
    with open(FN, 'rb') as f_in:
        with mgzip.open(FN+'.mpgz', 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
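The reverse operation follows the same shutil.copyfileobj pattern, reading through mgzip instead of writing; a sketch (FN is the same module-level constant assumed above, and the .restored suffix is only a placeholder to avoid overwriting the original):

def unff():
    # Stream the .mpgz archive back into a plain copy of the original file.
    with mgzip.open(FN + '.mpgz', 'rb') as f_in:
        with open(FN + '.restored', 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)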