Exemple #1
0
def test_nan():
    """Print a (values, edges) pair in which one of the values is NaN."""

    edges = np.arange(8)
    values = np.arange(7, dtype=float)
    # Poison a single entry so the printer has to cope with a NaN.
    values[5] = np.nan
    hp.print_hist((values, edges))
def print1d(h1d, title=""):
    """Print the ``msd`` projection of a histogram as a 100-column text plot.

    Args:
        h1d: Histogram object providing ``project``; the projection must
            offer ``values()`` (indexable with the empty tuple) and
            ``axis("msd").edges()``.
        title: Title forwarded to ``print_hist``.
    """
    # Project once and reuse the result for both the contents and the edges.
    projected = h1d.project('msd')
    vals = projected.values()[()]
    edges = projected.axis("msd").edges()
    print_hist((vals, edges), title=title, columns=100)
Exemple #3
0
def test_boost():
    """Print a boost-histogram object; do nothing if the package is missing."""

    try:
        import boost_histogram as bh
    except ModuleNotFoundError:
        # Optional dependency is not installed, so there is nothing to test.
        return

    axis = bh.axis.Regular(20, -3, 3)
    boost_hist = bh.Histogram(axis)
    samples = np.random.randn(1000)
    boost_hist.fill(samples)
    hp.print_hist(boost_hist, title="Boost Histogram")
Exemple #4
0
def main() -> None:
    """Run the classic ``hist`` command-line interface.

    Parses the command line, reads one number per line from ``--input`` (or
    from stdin when no input file is given), fills a histogram with
    ``--buckets`` bins and then either saves a matplotlib plot to
    ``--output-image`` or prints the histogram to the terminal.

    Raises:
        ValueError: If a non-blank input line cannot be converted to float.
    """
    # Imported locally so the function does not rely on a module-level import.
    import shutil

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-i",
        "--input",
        type=str,
        help="input file to read from (stdin by default)",
    )
    parser.add_argument(
        "-b",
        "--buckets",
        type=int,
        help="number of bins",
        default=20,
    )
    parser.add_argument(
        "-s",
        "--screen-width",
        type=int,
        help="maximum screen width",
        # os.get_terminal_size() raises OSError when stdout is not attached
        # to a terminal (pipes, redirects, CI); shutil's variant falls back
        # to a default size instead.
        default=shutil.get_terminal_size().columns,
    )
    parser.add_argument("-t", "--label", type=str, help="label for plot")
    parser.add_argument(
        "-o",
        "--output-image",
        type=str,
        help="save image to file",
    )
    args = parser.parse_args()

    print(
        "Classic hist interface - please use histoprint instead; this supports multiple file formats and much more!"
    )

    def parse_values(lines):
        # Blank lines (e.g. a trailing newline) carry no value and are skipped.
        return [float(line) for line in lines if line.strip()]

    if args.input:
        with open(args.input) as f:
            values = parse_values(f)
    else:
        # Read stdin without a ``with`` block so the shared stream stays open.
        values = parse_values(sys.stdin)

    h = bh.numpy.histogram(values, bins=args.buckets, histogram=hist.Hist)

    if args.output_image:
        import matplotlib.pyplot as plt

        h.plot()
        plt.savefig(args.output_image)
    else:
        # NOTE(review): other call sites pass ``title=``/``labels=`` to
        # print_hist; confirm that ``label=`` is an accepted keyword.
        print_hist(
            h,
            label=args.label,
            summary=True,
            # Honour --screen-width, which was parsed but previously ignored.
            columns=args.screen_width,
        )
Exemple #5
0
    def show(self, **kwargs: Any) -> Any:
        """
        Print the contained histograms to the console as one stacked plot.

        Unless the caller passes ``labels``, the histograms' names are used
        as labels, provided that every histogram has a name.
        """
        if "labels" not in kwargs:
            names = [member.name for member in self]
            if all(name is not None for name in names):
                kwargs["labels"] = names

        members = list(self)
        return histoprint.print_hist(members, stack=True, **kwargs)
Exemple #6
0
def test_uproot():
    """Print a histogram read with uproot; do nothing if uproot or awkward is missing."""

    try:
        # awkward is imported purely as an availability probe
        import awkward  # noqa: F401
        import uproot
    except ModuleNotFoundError:
        return

    with uproot.open("tests/data/histograms.root") as root_file:
        th1 = root_file["one"]

    try:
        # Works with uproot 3
        th1.show()
    except Exception:
        # Best effort only: any failure of show() is ignored here.
        pass
    hp.print_hist(th1, title="uproot TH1")
Exemple #7
0
def _histoprint_csv(infile, **kwargs):
    """Interpret file as a CSV file and print a histogram per column.

    Args:
        infile: Passed unchanged to ``pandas.read_csv``.
        **kwargs: The entries ``cut`` (row filter evaluated with
            ``DataFrame.eval``) and ``fields`` (column names to keep) are
            consumed here. ``labels`` defaults to the column names when it
            is missing or equal to ``("",)``. The remaining entries are
            handed to ``_bin_edges`` and then to ``hp.print_hist``.

    Raises:
        SystemExit: With status 1 if the cut cannot be evaluated or a
            requested column does not exist.
    """

    import pandas as pd

    # Read the data
    data = pd.read_csv(infile)

    # Apply the optional row filter
    cut = kwargs.pop("cut", "")
    if cut:
        try:
            data = data[data.eval(cut)]
        except Exception as e:
            click.echo("Error interpreting the cut string:", err=True)
            click.echo(e, err=True)
            # SystemExit instead of the interactive-only ``exit`` helper,
            # which is not guaranteed to exist (e.g. under ``python -S``).
            raise SystemExit(1)

    # Interpret field numbers/names
    fields = list(kwargs.pop("fields", []))
    if fields:
        try:
            data = data[fields]
        except KeyError:
            click.echo("Unknown column name.", err=True)
            raise SystemExit(1)

    # Get default columns labels
    if kwargs.get("labels", ("", )) == ("", ):
        kwargs["labels"] = data.columns

    # Convert to array: one row per CSV column after transposing
    data = data.to_numpy().T

    # Interpret bins
    bins = _bin_edges(kwargs, data)

    # Create the histogram(s): one count array per column, shared edges
    counts = [np.histogram(column, bins=bins)[0] for column in data]

    # Print the histogram
    hp.print_hist((counts, bins), **kwargs)
Exemple #8
0
def _histoprint_txt(infile, **kwargs):
    """Interpret file as a simple whitespace separated table and print it.

    Args:
        infile: Passed unchanged to ``numpy.loadtxt``.
        **kwargs: The entries ``cut`` (Python expression used as a column
            selector on the transposed table) and ``fields`` (integer
            column numbers to keep) are consumed here. The remaining
            entries are handed to ``_bin_edges`` and then to
            ``hp.print_hist``.

    Raises:
        SystemExit: With status 1 if the cut cannot be evaluated, a field
            is not an integer, or a field index is out of bounds.
    """

    # Read the data; after transposing, each row is one column of the file
    data = np.loadtxt(infile, ndmin=2)
    data = data.T
    cut = kwargs.pop("cut", "")
    if cut:
        try:
            # SECURITY NOTE: the cut string is executed as Python code with
            # this function's locals (notably ``data``) in scope. Only use
            # it with trusted input.
            data = data[:, eval(cut)]
        except Exception as e:
            click.echo("Error interpreting the cut string:", err=True)
            click.echo(e, err=True)
            # SystemExit instead of the interactive-only ``exit`` helper.
            raise SystemExit(1)

    # Interpret field numbers
    fields = list(kwargs.pop("fields", []))
    if fields:
        try:
            fields = [int(f) for f in fields]
        except ValueError:
            click.echo("Fields for a TXT file must be integers.", err=True)
            raise SystemExit(1)
        try:
            data = data[fields]
        except IndexError:
            # NumPy reports out-of-range integer indices as IndexError,
            # never KeyError, so this is the exception to handle here.
            click.echo("Field out of bounds.", err=True)
            raise SystemExit(1)

    # Interpret bins
    bins = _bin_edges(kwargs, data)

    # Create the histogram(s): one count array per selected column
    counts = [np.histogram(column, bins=bins)[0] for column in data]

    # Print the histogram
    hp.print_hist((counts, bins), **kwargs)
Exemple #9
0
def show_hists(h, title='', scaleH=False):
    """Print every category of ``h`` as one stacked text histogram of ``msd``.

    Args:
        h: Histogram whose first sparse axis enumerates the categories and
            which can be indexed by category and projected onto ``msd``.
        title: Title forwarded to ``print_hist``.
        scaleH: If truthy, categories whose name starts with ``"GluGlu"``
            are scaled: by ``scaleH`` itself when it is a real number, and
            by 100 otherwise (in particular for ``scaleH=True``).
    """
    hall = []
    names = []
    bins = None
    for name in h.axis(h.sparse_axes()[0]).identifiers():
        category = h[name]
        _h = category.project('msd').values()[()]
        if scaleH and name.name.startswith("GluGlu"):
            # bool is excluded explicitly: np.isreal(True) is True, which
            # used to turn ``scaleH=True`` into a multiplication by 1.
            is_number = not isinstance(scaleH, bool) and np.isreal(scaleH)
            factor = scaleH if is_number else 100
            # Not in place: leaves the array returned by values() untouched
            # and allows a float factor on integer contents.
            _h = _h * factor
        bins = category.axis('msd').edges()
        hall.append(_h)
        names.append(name.name)

    if not hall:
        # No categories, hence no bin edges either: nothing to print.
        return

    print_hist(
        (hall, bins),
        title=title,
        labels=names,
        stack=True,
        symbols=" ",
        bg_colors="rgbcmy",
    )
Exemple #10
0
def test_hist():
    """Smoke-test the text histogram output with several option sets."""

    # Four normal samples, drawn in this order so the random stream is
    # consumed exactly as before.
    sample_a = np.random.randn(1000) - 2
    sample_b = np.random.randn(1000)
    sample_c = np.random.randn(1000) + 2
    sample_d = np.random.randn(500) * 2

    hp.text_hist(sample_b)
    hp.text_hist(
        sample_b,
        bins=[-5, -3, -2, -1, -0.5, 0, 0.5, 1, 2, 3, 5],
        title="Variable bin widths",
    )

    # Bin every sample identically; the edges are shared by all of them.
    binned = [
        np.histogram(sample, bins=15, range=(-5, 5))
        for sample in (sample_a, sample_b, sample_c, sample_d)
    ]
    counts = [result[0] for result in binned]
    edges = binned[0][1]
    combined = (counts, edges)

    hp.print_hist(combined, title="Overlays", labels="ABCDE")
    hp.print_hist(
        combined,
        title="Stacks",
        stack=True,
        symbols="      ",
        bg_colors="rgbcmy",
        labels="ABCDE",
    )
    hp.print_hist(
        combined,
        title="Summaries",
        symbols=r"=|\/",
        fg_colors="0",
        bg_colors="0",
        labels=["AAAAAAAAAAAAAAAA", "B", "CCCCCCCCCCCCC", "D"],
        summary=True,
    )
    # Only the first three histograms
    first_three = (counts[:3], edges)
    hp.print_hist(
        first_three,
        title="No composition",
        labels=["A", "B", "C"],
    )
Exemple #11
0
    def show(self, **kwargs: Any) -> Any:
        """
        Print this histogram to the console.

        All keyword arguments are forwarded to ``histoprint.print_hist``,
        whose return value is returned unchanged.
        """
        printer = histoprint.print_hist
        return printer(self, **kwargs)
Exemple #12
0
def _histoprint_root(infile, **kwargs):
    """Interpret file as a ROOT file and print the requested histogram(s).

    If exactly one field is given and the object stored under that key has
    a ``to_numpy`` method, it is printed directly. Otherwise every field is
    resolved, key by key, to the first object offering ``arrays`` (treated
    as a tree) plus the remaining path inside that tree; the selected
    arrays are read, optionally cut, flattened and histogrammed.

    Args:
        infile: Passed unchanged to ``uproot.open``.
        **kwargs: The entries ``fields`` (required, ``/``-separated keys,
            optionally followed by an index expression in brackets),
            ``labels`` and ``cut`` are consumed here. The remaining entries
            are handed to ``_bin_edges`` and then to ``hp.print_hist``.

    Raises:
        ImportError: If uproot or awkward cannot be imported.
        SystemExit: With status 1 if no field is given, a key cannot be
            found, or the cut cannot be evaluated.
    """

    # Import uproot
    try:
        import uproot as up
    except ImportError:
        click.echo("Cannot try ROOT file format. Uproot module not found.",
                   err=True)
        raise
    # Import awkward
    try:
        import awkward as ak
    except ImportError:
        click.echo("Cannot try ROOT file format. Awkward module not found.",
                   err=True)
        raise

    # Open root file
    F = up.open(infile)

    # Interpret field names
    fields = list(kwargs.pop("fields", []))
    if not fields:
        click.echo("Must specify at least one field for ROOT files.", err=True)
        click.echo(F.keys(), err=True)
        # SystemExit instead of the interactive-only ``exit`` helper.
        raise SystemExit(1)

    # Get default columns labels: last path component of each field
    if kwargs.get("labels", ("", )) == ("", ):
        kwargs["labels"] = [field.split("/")[-1] for field in fields]
    labels = kwargs.pop("labels")

    # Get possible cut expression
    cut = kwargs.pop("cut", "")

    # Possibly a single histogram
    if len(fields) == 1:
        try:
            candidate = F[fields[0]]
        except KeyError:
            pass  # Deal with key error further down
        else:
            try:
                hist = candidate.to_numpy()
            except AttributeError:
                pass  # Not a histogram-like object; treat it as a tree path
            else:
                kwargs.pop("bins", None)  # Get rid of useless parameter
                hp.print_hist(hist, **kwargs)
                return

    data = []
    # Find TTrees
    trees = []  # type: List[up.models.TTree.Model_TTree_v19]
    tree_fields = []  # type: List[List[Dict[str, Any]]]
    for field, label in zip(fields, labels):
        branch = F
        splitfield = field.split("/")
        for i, key in enumerate(splitfield):
            try:
                branch = branch[key]
            except KeyError:
                click.echo(
                    "Could not find key '%s'. Possible values: %s" %
                    (key, branch.keys()),
                    err=True,
                )
                raise SystemExit(1)
            # Keep descending until we reach something with an `arrays` method
            if not hasattr(branch, "arrays"):
                continue

            # Found it: the rest of the field is the path inside the tree
            entry = {"label": label, "path": "/".join(splitfield[i + 1:])}
            if branch in trees:
                tree_fields[trees.index(branch)].append(entry)
            else:
                trees.append(branch)
                tree_fields.append([entry])
            break

    # Reassign labels in correct order (grouped by tree)
    labels = []
    # Get and flatten the data
    for tree, entries in zip(trees, tree_fields):
        d = []
        for entry in entries:
            labels.append(entry["label"])
            # Split "branch[index]..." into the branch path and the trailing
            # index expression (empty if there is no bracket)
            path, bracket, remainder = entry["path"].partition("[")
            slic = bracket + remainder
            # Get the branches
            try:
                # SECURITY NOTE: the index expression is executed as Python
                # code with ``tree`` and ``path`` in scope. Only use it with
                # trusted input.
                d.append(eval("tree[path].array()" + slic))
            except up.KeyInFileError as e:
                click.echo(e, err=True)
                click.echo("Possible keys: %s" % tree.keys(), err=True)
                raise SystemExit(1)

        # Cut on values; an empty cut (the default) means "no cut", exactly
        # as in the CSV and TXT readers
        if cut:
            try:
                index = tree.arrays("cut", aliases={"cut": cut}).cut
            except up.KeyInFileError as e:
                click.echo(e, err=True)
                click.echo("Possible keys: %s" % tree.keys(), err=True)
                raise SystemExit(1)
            except Exception as e:
                click.echo("Error interpreting the cut string:", err=True)
                click.echo(e, err=True)
                raise SystemExit(1)

            d = [values[index] for values in d]

        # Flatten if necessary
        for i, values in enumerate(d):
            try:
                values = ak.flatten(values)
            except ValueError:
                pass  # Could not be flattened; use the array as it is

            # Turn into flat numpy array
            d[i] = ak.to_numpy(values)

        data.extend(d)

    # Assign new label order
    kwargs["labels"] = labels

    # Interpret bins
    bins = _bin_edges(kwargs, data)

    # Create the histogram(s): one count array per field, shared edges
    counts = [np.histogram(values, bins=bins)[0] for values in data]

    # Print the histogram
    hp.print_hist((counts, bins), **kwargs)
def main(
    in_vcf: str,
    out_vcf: str,
    overwrite: bool,
    verbose: bool,
    min_qual: float,
    min_depth: int,
    min_fed: float,
    max_depth: int,
    min_strand_bias: int,
    min_bqb: float,
    min_mqb: float,
    min_rpb: float,
    min_rpbz: Optional[float],
    max_rpbz: Optional[float],
    max_scbz: Optional[float],
    max_sgb: float,
    min_vdb: float,
    hist: bool,
    min_frs: float,
    min_mq: int,
):
    """Apply the following filters to a VCF:\n
    - Minimum proportion of the expected (median) depth\n
    - Maximum proportion of the expected (median) depth\n
    - Minimum QUAL threshold\n
    - Minimum Strand bias percentage
    """
    # Overview: the input is read twice. The first pass collects depth and
    # QUAL of every record to derive the expected (median) depth; the second
    # pass assigns a FILTER value to every record and writes it to `out_vcf`.
    # Raises DepthTagError if the depth tag is missing from the header.
    log_level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(
        format="%(asctime)s [%(levelname)s]: %(message)s", level=log_level
    )

    vcf_reader = VCF(in_vcf)
    try:
        if not vcf_reader.contains(Tags.Depth.value):
            raise DepthTagError(f"Depth tag {Tags.Depth} not found in header")

        if (not vcf_reader.contains(str(Tags.StrandDepth))) and min_strand_bias:
            logging.warning(
                f"Strand depth tag {Tags.StrandDepth} not found in header. "
                f"Turning off strand bias filter..."
            )
            min_strand_bias = 0

        logging.info("Calculating expected (median) depth...")
        depths = []
        quals = []
        for v in vcf_reader:
            depths.append(get_depth(v))
            quals.append(v.QUAL or 0)  # a missing QUAL counts as 0
    finally:
        # Release the first handle before the file is opened a second time.
        vcf_reader.close()

    # The median of an empty list is NaN, which int() below cannot convert;
    # with no records the expected depth is irrelevant, so fall back to 0.
    median_depth = np.median(depths) if depths else 0
    logging.info(f"Expected depth: {median_depth}")

    if hist:
        import histoprint

        tick_format = "% .1f"
        stderr = click.get_text_stream("stderr")

        logging.info("Depth histogram:")
        histoprint.print_hist(
            np.histogram(depths, bins=HIST_BINS),
            title="Depth histogram",
            summary=True,
            tick_format=tick_format,
            file=stderr,
        )

        logging.info("QUAL histogram")
        histoprint.print_hist(
            np.histogram(quals, bins=HIST_BINS),
            title="QUAL histogram",
            summary=True,
            tick_format=tick_format,
            file=stderr,
        )

    assessor = Filter(
        expected_depth=int(median_depth),
        min_qual=min_qual,
        min_depth=min_depth,
        min_fed=min_fed,
        max_depth=max_depth,
        min_strand_bias=min_strand_bias,
        min_bqb=min_bqb,
        min_mqb=min_mqb,
        min_rpb=min_rpb,
        max_sgb=max_sgb,
        min_vdb=min_vdb,
        min_frs=min_frs,
        min_mq=min_mq,
        min_rpbz=min_rpbz,
        max_rpbz=max_rpbz,
        max_scbz=max_scbz,
    )

    # Second pass: a fresh reader, because the first one has been consumed.
    vcf_reader = VCF(in_vcf)
    assessor.add_filters_to_header(vcf_reader)

    # exist_ok/parents make a prior existence check unnecessary.
    Path(out_vcf).parent.mkdir(exist_ok=True, parents=True)

    vcf_writer = Writer(out_vcf, tmpl=vcf_reader)

    stats = Counter()
    pass_status = str(Tags.Pass)
    logging.info("Filtering variants...")
    try:
        for variant in vcf_reader:
            filter_status = assessor.filter_status(variant)

            # NOTE(review): with overwrite=False an existing FILTER value is
            # still replaced when the new status is the pass tag - confirm
            # that this is intended.
            if (
                (not overwrite)
                and variant.FILTER is not None
                and filter_status != pass_status
            ):
                # Append the new status to the filters already present.
                current_filter = variant.FILTER.rstrip(";")
                variant.FILTER = f"{current_filter};{filter_status}"
            else:
                variant.FILTER = filter_status

            vcf_writer.write_record(variant)

            # A status may hold several ';'-separated filters; count each.
            stats.update(filter_status.split(";"))
    finally:
        # Close both handles even if filtering or writing fails midway.
        vcf_reader.close()
        vcf_writer.close()

    logging.info("FILTER STATISTICS")
    logging.info("=================")
    for filt, count in stats.items():
        logging.info(f"Filter: {filt}\tCount: {count}")

    logging.info("Done!")