Example #1
0
    def get_shingles(self, input_text, prefix=None):
        """Return the shingles extracted from a source text.

        The input is passed through ``self._normalizer`` when one is set and
        through ``self._tokenize`` unless ``isiterable`` already accepts it.
        When duplicates are kept (``self._unique`` is false) and
        ``self._kmin`` is positive, the tokens are cycled until ``kmin``
        shingles can be taken.

        :param input_text: Input sequence
        :type input_text: collections.Iterable
        :param prefix: an object to prepend to token sequence
        :type prefix: object
        :return: shingles (tuples); a set when ``self._unique`` is true,
            otherwise a list
        :rtype: set, list

        """
        normalizer = self._normalizer
        text = input_text \
            if normalizer is None \
            else normalizer.normalize(input_text)
        tokens = text if isiterable(text) else self._tokenize(text)
        span = self._span
        unique = self._unique
        kmin = self._kmin
        if not unique and kmin > 0:
            # The padding arithmetic below needs a length, which one-shot
            # iterables (generators, iterators) do not have; materialize
            # them instead of failing in len().
            if not hasattr(tokens, '__len__'):
                tokens = list(tokens)
            # cycle tokens until we can take kmin shingles
            token_count = len(tokens)
            prefix_token_count = 0 if prefix is None else 1
            num_shingles = token_count - span + prefix_token_count + 1
            append_num = kmin - num_shingles
            if append_num > 0:
                tokens = take(token_count + append_num, cycle(tokens))
        final_it = tokens if prefix is None else chain([prefix], tokens)
        shingles = self._shinglify(final_it, span, skip=self._skip)
        return set(shingles) if unique else list(shingles)
Example #2
0
    def get_shingles(self, input_text, prefix=None):
        """Return the shingles extracted from a source text.

        The input is passed through ``self._normalizer`` when one is set and
        through ``self._tokenize`` unless ``isiterable`` already accepts it.
        When duplicates are kept (``self._unique`` is false) and
        ``self._kmin`` is positive, the tokens are cycled until ``kmin``
        shingles can be taken.

        :param input_text: Input sequence
        :type input_text: collections.Iterable
        :param prefix: an object to prepend to token sequence
        :type prefix: object
        :return: shingles (tuples); a set when ``self._unique`` is true,
            otherwise a list
        :rtype: set, list
        """
        normalizer = self._normalizer
        text = input_text \
            if normalizer is None \
            else normalizer.normalize(input_text)
        tokens = text if isiterable(text) else self._tokenize(text)
        span = self._span
        unique = self._unique
        kmin = self._kmin
        if not unique and kmin > 0:
            # The padding arithmetic below needs a length, which one-shot
            # iterables (generators, iterators) do not have; materialize
            # them instead of failing in len().
            if not hasattr(tokens, '__len__'):
                tokens = list(tokens)
            # cycle tokens until we can take kmin shingles
            token_count = len(tokens)
            prefix_token_count = 0 if prefix is None else 1
            num_shingles = token_count - span + prefix_token_count + 1
            append_num = kmin - num_shingles
            if append_num > 0:
                tokens = take(token_count + append_num, cycle(tokens))
        final_it = tokens if prefix is None else chain([prefix], tokens)
        shingles = self._shinglify(final_it, span, skip=self._skip)
        return set(shingles) if unique else list(shingles)
Example #3
0
def create_plots(args, df):
    """Render one figure and one CSV file per group, plus two HTML indexes.

    ``df`` is grouped by ``args.group_by``.  For every group, the columns
    named in ``args.metrics`` that exist in ``df`` are plotted against
    ``args.x_axis``, ordered by descending ``auc_xscaled`` score; the score
    is appended to each legend label.  The figure is saved as
    ``fig-<group>.<args.fig_format>`` and the plotted data as
    ``fig-<group>.csv`` inside ``args.output``.  Finally
    ``fig_interactive.html`` and ``fig_static.html`` are written to
    ``args.output`` from templates loaded from its parent directory.

    :param args: options object providing ``x_axis``, ``group_by``,
        ``metrics``, ``output``, ``fig_title``, ``fig_format`` and
        ``legend_loc``
    :param df: data to plot
    :type df: pandas.DataFrame
    :raises ValueError: if a group is plotted but none of the requested
        metrics is a column of ``df``
    """
    import jinja2
    import matplotlib.pyplot as plt
    from palettable import colorbrewer
    from matplotlib.font_manager import FontProperties

    fontP = FontProperties()
    fontP.set_size("xx-small")

    groups = df.groupby([args.group_by])
    metrics = list(set(args.metrics) & set(df.keys()))
    # One fixed color per metric.  Materialized as a list because it is
    # re-read for every group and ``take`` may hand back a one-shot iterator.
    metric_colors = list(
        take(
            len(metrics),
            cycle(chain(colorbrewer.qualitative.Dark2_8.mpl_colors, colorbrewer.qualitative.Set2_8.mpl_colors)),
        )
    )

    template_loader = jinja2.FileSystemLoader(os.path.join(args.output, ".."))
    template_env = jinja2.Environment(loader=template_loader)
    template_interactive = template_env.get_template("template_fig_interactive.html")
    template_static = template_env.get_template("template_fig_static.html")

    table_interactive = []
    table_static = []

    for group_name, group in groups:

        # always sort by X values
        if hasattr(group, "sort_values"):
            group = group.sort_values([args.x_axis])
        else:
            # older pandas releases only provide DataFrame.sort
            group = group.sort([args.x_axis])

        if args.fig_title is None:
            fig_title = "%s=%s" % (args.group_by, group_name)
        else:
            fig_title = args.fig_title

        # compute AUC scores
        ys = []
        for metric, color in zip(metrics, metric_colors):
            series = group[metric]
            score = auc_xscaled(group[args.x_axis].values, series.values)
            label = "%s (%.4f)" % (metric, score)
            ys.append((score, metric, label, color))
        if not ys:
            raise ValueError("none of the requested metrics are present in the data")
        # best-scoring metric first
        ys.sort(reverse=True)

        # Bind the per-figure ordering to its own name: reusing the palette
        # variable here would make the next group's metric-to-color mapping
        # depend on this group's score order.  list() keeps the slice valid
        # where zip() returns an iterator.
        lbls_old, lbls_new, line_colors = list(zip(*ys))[1:4]
        group = (
            group[[args.x_axis] + list(lbls_old)].set_index(args.x_axis).rename(columns=dict(zip(lbls_old, lbls_new)))
        )

        # create plots
        fig, ax = plt.subplots()
        group.plot(ax=ax, title=fig_title, color=list(line_colors))
        ax.set_xlim(*minmaxr(group.index.values))
        ax.set_ylim(0.4, 1.0)
        ax.legend(loc=args.legend_loc, prop=fontP)
        fig_name = "fig-%s.%s" % (group_name, args.fig_format)
        fig_path = os.path.join(args.output, fig_name)
        csv_name = "fig-%s.csv" % group_name
        csv_path = os.path.join(args.output, csv_name)
        group.to_csv(csv_path)

        table_interactive.append((csv_name, args.x_axis, "%s=%s" % (args.group_by, group_name)))
        table_static.append(fig_name)

        fig.savefig(fig_path, format=args.fig_format)
        plt.close(fig)

    with open(os.path.join(args.output, "fig_interactive.html"), "w") as fh:
        fh.write(template_interactive.render(table=table_interactive))

    with open(os.path.join(args.output, "fig_static.html"), "w") as fh:
        fh.write(template_static.render(table=table_static))
        cat_filter=cat_filter)
    data = dataset.data[:n_samples]

    samples = data

elif args.input is not None:
    if args.ground_tag is not None:
        get_ground_truth = partial(has_common_tags, TAG_MAP[args.ground_tag])
    elif args.ground_attr is not None:
        get_ground_truth = ATTR_MAP[args.ground_attr]
    else:
        raise ValueError("neither ground_tag nor ground_attr specified")

    with open(args.input, 'r') as fh:
        dataset = imap(json.loads, fh)
        data = take(n_samples, dataset)
    samples = [s['object'] for s in data]

else:
    raise ValueError("No input sources specified.")


# An infinite n_samples is replaced by the number of records actually
# loaded; at least two records are required in that case.
if n_samples == float('inf'):
    n_samples = len(data)
    assert n_samples >= 2

# Fall back to two topics when no topic count was given.
if n_topics is None:
    n_topics = 2

# Without an explicit feature count, use one more feature than topics.
if args.n_features is None:
    n_features = n_topics + 1
def create_plots(args, df):
    """Render one figure and one CSV file per group, plus two HTML indexes.

    ``df`` is grouped by ``args.group_by``.  For every group, the columns
    named in ``args.metrics`` that exist in ``df`` are plotted against
    ``args.x_axis``, ordered by descending ``auc_xscaled`` score; the score
    is appended to each legend label.  The figure is saved as
    ``fig-<group>.<args.fig_format>`` and the plotted data as
    ``fig-<group>.csv`` inside ``args.output``.  Finally
    ``fig_interactive.html`` and ``fig_static.html`` are written to
    ``args.output`` from templates loaded from its parent directory.

    :param args: options object providing ``x_axis``, ``group_by``,
        ``metrics``, ``output``, ``fig_title``, ``fig_format`` and
        ``legend_loc``
    :param df: data to plot
    :type df: pandas.DataFrame
    :raises ValueError: if a group is plotted but none of the requested
        metrics is a column of ``df``
    """
    import jinja2
    import matplotlib.pyplot as plt
    from palettable import colorbrewer
    from matplotlib.font_manager import FontProperties

    fontP = FontProperties()
    fontP.set_size('xx-small')

    groups = df.groupby([args.group_by])
    metrics = list(set(args.metrics) & set(df.keys()))
    # One fixed color per metric.  Materialized as a list because it is
    # re-read for every group and ``take`` may hand back a one-shot iterator.
    metric_colors = list(take(
        len(metrics),
        cycle(
            chain(
                colorbrewer.qualitative.Dark2_8.mpl_colors,
                colorbrewer.qualitative.Set2_8.mpl_colors,
            ))))

    template_loader = jinja2.FileSystemLoader(os.path.join(args.output, '..'))
    template_env = jinja2.Environment(loader=template_loader)
    template_interactive = template_env.get_template(
        'template_fig_interactive.html')
    template_static = template_env.get_template('template_fig_static.html')

    table_interactive = []
    table_static = []

    for group_name, group in groups:

        # always sort by X values
        if hasattr(group, 'sort_values'):
            group = group.sort_values([args.x_axis])
        else:
            # older pandas releases only provide DataFrame.sort
            group = group.sort([args.x_axis])

        if args.fig_title is None:
            fig_title = '%s=%s' % (args.group_by, group_name)
        else:
            fig_title = args.fig_title

        # compute AUC scores
        ys = []
        for metric, color in zip(metrics, metric_colors):
            series = group[metric]
            score = auc_xscaled(group[args.x_axis].values, series.values)
            label = "%s (%.4f)" % (metric, score)
            ys.append((score, metric, label, color))
        if not ys:
            raise ValueError(
                'none of the requested metrics are present in the data')
        # best-scoring metric first
        ys.sort(reverse=True)

        # Bind the per-figure ordering to its own name: reusing the palette
        # variable here would make the next group's metric-to-color mapping
        # depend on this group's score order.  list() keeps the slice valid
        # where zip() returns an iterator.
        lbls_old, lbls_new, line_colors = list(zip(*ys))[1:4]
        group = group[[args.x_axis] + list(lbls_old)] \
            .set_index(args.x_axis) \
            .rename(columns=dict(zip(lbls_old, lbls_new)))

        # create plots
        fig, ax = plt.subplots()
        group.plot(ax=ax, title=fig_title, color=list(line_colors))
        ax.set_xlim(*minmaxr(group.index.values))
        ax.set_ylim(0.4, 1.0)
        ax.legend(loc=args.legend_loc, prop=fontP)
        fig_name = 'fig-%s.%s' % (group_name, args.fig_format)
        fig_path = os.path.join(args.output, fig_name)
        csv_name = 'fig-%s.csv' % group_name
        csv_path = os.path.join(args.output, csv_name)
        group.to_csv(csv_path)

        table_interactive.append((
            csv_name,
            args.x_axis,
            "%s=%s" % (args.group_by, group_name),
        ))
        table_static.append(fig_name)

        fig.savefig(fig_path, format=args.fig_format)
        plt.close(fig)

    with open(os.path.join(args.output, 'fig_interactive.html'), 'w') as fh:
        fh.write(template_interactive.render(table=table_interactive))

    with open(os.path.join(args.output, 'fig_static.html'), 'w') as fh:
        fh.write(template_static.render(table=table_static))