def dataset_stat_latex_print(dataset_name):
    """
    Print the avg precision, recall and F1 score of every evaluated
    extractor as LaTeX table rows to the console.

    NOTE(review): this function is defined twice in this file; the later
    definition shadows this one at import time — confirm which copy is wanted.
    """
    # load previously stored evaluation results for this dataset
    txt_results = TextBasedResults()
    txt_results.load(dataset_name)
    txt_results.print_results()

    # package results: one (name, avg precision, avg recall, avg F1) row
    # per extractor; index [0] of each *_statistics tuple is the average
    elist = extractor_list_filter(txt_results.text_eval_results.keys())
    extractor_slugs = tuple(e.SLUG for e in elist)

    result_list = []
    for slug in extractor_slugs:
        result_list.append((
            get_extractor_cls(slug).NAME,
            txt_results.precision_statistics(slug)[0],
            txt_results.recall_statistics(slug)[0],
            txt_results.f1score_statistics(slug)[0],
        ))
    result_list.sort(key=lambda row: row[3])
    result_list.reverse()  # highest F1 score first

    for row in result_list:
        # print() is valid on both Python 2 and 3; the original Python 2
        # `print expr` statement is a SyntaxError under Python 3
        print("\\texttt{%s} & %.4f & %.4f & %.4f \\\\ \\hline" % row)
def dataset_stat_latex_print(dataset_name):
    '''
    Print the avg precision, recall and F1 score for every extractor on
    *dataset_name* as LaTeX table rows.

    NOTE(review): duplicates an earlier definition of the same name in this
    file; this later copy is the one that wins at import time.
    '''
    # get previously stored evaluation results
    txt_results = TextBasedResults()
    txt_results.load(dataset_name)
    txt_results.print_results()

    # package results; the first element of each *_statistics tuple is
    # the average value
    elist = extractor_list_filter(txt_results.text_eval_results.keys())
    extractor_slugs = tuple(e.SLUG for e in elist)

    result_list = []
    for slug in extractor_slugs:
        result_list.append((
            get_extractor_cls(slug).NAME,
            txt_results.precision_statistics(slug)[0],
            txt_results.recall_statistics(slug)[0],
            txt_results.f1score_statistics(slug)[0],
        ))
    result_list.sort(key=lambda row: row[3])
    result_list.reverse()  # best F1 first

    for row in result_list:
        # print() works under both Python 2 and 3; the original Python 2
        # `print expr` statement does not parse on Python 3
        print('\\texttt{%s} & %.4f & %.4f & %.4f \\\\ \\hline' % row)
def local_evaluate(dataset_type, dataset_name, update_ext_slug=None):
    """Evaluate extractors against a local dataset and persist the results.

    If *update_ext_slug* is truthy, previously saved results are loaded and
    only that one extractor is re-evaluated; otherwise every extractor in
    ``extractor_list`` is evaluated from scratch.
    """
    results = TextBasedResults()

    if not update_ext_slug:
        # full run over every registered extractor
        for extractor_cls in extractor_list:
            single_evaluation(extractor_cls, results, dataset_type, dataset_name)
    else:
        # incremental run: refresh a single extractor's stored results
        results.load(dataset_name)
        ex_cls = get_extractor_cls(update_ext_slug)
        single_evaluation(ex_cls, results, dataset_type, dataset_name)

    # record the dataset size, then save and report
    results.dataset_len = len(LocalDatasetLoader(dataset_name))
    results.save(dataset_name)
    results.print_results()
def local_evaluate(dataset_type, dataset_name, update_ext_slug=None):
    """Run text-based evaluation over a local dataset and save the outcome.

    A truthy *update_ext_slug* restricts the run to that single extractor,
    merged on top of the results previously stored for the dataset.
    """
    results = TextBasedResults()

    if update_ext_slug:
        results.load(dataset_name)
        single_evaluation(get_extractor_cls(update_ext_slug), results,
                          dataset_type, dataset_name)
    else:
        for cls in extractor_list:
            single_evaluation(cls, results, dataset_type, dataset_name)

    # remember how many documents the dataset holds, then persist + report
    results.dataset_len = len(LocalDatasetLoader(dataset_name))
    results.save(dataset_name)
    results.print_results()
def local_extract(dataset_name, extractor_slug, timeout, retry_failed, skip_existing):
    """Run one extractor over a local dataset and store its raw output.

    *timeout* is an optional pause (seconds) between documents;
    *retry_failed* re-processes documents that previously failed for this
    extractor, and *skip_existing* skips documents already extracted.
    """
    # resolve the extractor class, then init loader and storage
    ex = get_extractor_cls(extractor_slug)

    loader = LocalDatasetLoader(
        dataset_name,
        # both flags are passed to the loader as the extractor slug or None
        load_failed=extractor_slug if retry_failed else None,
        skip_existing=extractor_slug if skip_existing else None,
    )
    storage = LocalResultStorage(dataset_name, ex)

    logger.info('started extracting content from %s dataset using %s',
                dataset_name, ex.NAME)
    for doc in loader:
        storage.push_result(doc)
        if timeout:
            # throttle between documents
            time.sleep(timeout)

    storage.dump_summary()
    logger.info('finished with %s dataset', dataset_name)
def dataset_stat_plot(dataset_name, img_name):
    """
    Plot the avg precision, recall and F1 score bar chart for the given dataset
    name and save it to *img_name* in the plot-output directory.

    NOTE(review): this function is defined twice in this file; the later
    definition shadows this one at import time.
    """
    # get results
    txt_results = TextBasedResults()
    txt_results.load(dataset_name)
    txt_results.print_results()

    # package results: each metric maps to [( (avg, stddev), slug ), ...]
    elist = extractor_list_filter(txt_results.text_eval_results.keys())
    extractor_slugs = tuple(e.SLUG for e in elist)
    packaged_data = (
        ("Precision", [(txt_results.precision_statistics(e), e) for e in extractor_slugs]),
        ("Recall", [(txt_results.recall_statistics(e), e) for e in extractor_slugs]),
        ("F1 score", [(txt_results.f1score_statistics(e), e) for e in extractor_slugs]),
    )

    # plotting values shared by all three subplots (hoisted: loop-invariant)
    num_of_extractors = len(extractor_slugs)
    ind = np.arange(num_of_extractors)  # the x locations for the groups
    width = 0.6  # the width of the bars

    bar_color = ("b", "c", "m")
    for i, pdata in enumerate(packaged_data):
        # sort best-average-first; the lambda parameter is named `entry`
        # (the original shadowed the enumerate index `i`)
        result_list = pdata[1]
        result_list.sort(key=lambda entry: entry[0][0])
        result_list.reverse()

        avg = [x[0][0] for x in result_list]
        stddev = [x[0][1] for x in result_list]

        # plot
        plt.subplot(3, 1, i + 1)
        plt.grid(True, alpha=0.5)

        rects_avg = plt.bar(ind, avg, width, color=bar_color[i], ecolor="g",
                            yerr=stddev, linewidth=0.5, alpha=0.8)

        # labels and titles
        extractor_names = [get_extractor_cls(r[1]).NAME for r in result_list]
        plt.title(pdata[0])
        plt.xticks(ind + width / 2.0, extractor_names, size="xx-small", rotation="vertical")
        plt.legend((rects_avg[0],), ("avg",), fancybox=True, prop=dict(size="x-small"), loc=4)  # lower right

        # annotate each bar with its value just above the top
        for rect in rects_avg:
            height = rect.get_height()
            plt.text(
                rect.get_x() + rect.get_width() / 2.25,
                height + 0.01,
                "%1.2f" % height,
                ha="center",
                va="bottom",
                size="x-small",
            )

    # subplots adjusting
    plt.subplots_adjust(wspace=0.5, hspace=0.9)

    # adjust figure height so the vertical tick labels fit
    fig = plt.gcf()
    w, h = fig.get_size_inches()
    fig.set_size_inches(w, h * 1.6)

    # output
    out_path = os.path.join(settings.PATH_LOCAL_DATA, "plot-output", img_name)
    plt.savefig(out_path)
def dataset_contents_plot(dataset_name, img_name):
    """Plot the error case analysis.

    Two stacked bar charts: the left one includes the "successful"
    category, the right one shows only the failure/boundary categories.
    """
    # get results
    txt_results = TextBasedResults()
    txt_results.load(dataset_name)
    txt_results.print_results()

    # package data: one (legend label, bar color, per-extractor counts)
    # triple per outcome category
    elist = extractor_list_filter(txt_results.text_eval_results.keys())
    extractor_slugs = tuple(e.SLUG for e in elist)
    package = [
        ("|rel| = 0", "#9DFADE",
         [txt_results.result_contents(ex).rel_empty for ex in extractor_slugs]),
        ("|rel intersect ret| = 0", "#3C70A3",
         [txt_results.result_contents(ex).rel_ret_empty for ex in extractor_slugs]),
        ("|ret| = 0", "#5CCBED",
         [txt_results.result_contents(ex).ret_empty for ex in extractor_slugs]),
        ("mismatch", "#A76CF5",
         [txt_results.result_contents(ex).missmatch for ex in extractor_slugs]),
        ("failed", "#C43156",
         [txt_results.result_contents(ex).fail for ex in extractor_slugs]),
        ("successful", "#31C460",
         [txt_results.result_contents(ex).succ for ex in extractor_slugs]),
    ]
    num_of_extractors = len(extractor_slugs)
    ind = np.arange(num_of_extractors)  # the x locations for the groups
    width = 0.6

    fig = plt.gcf()
    legend_keys = [plt.Rectangle((0, 0), 1, 1, fc=color)
                   for _, color, _ in package]
    fig.legend(legend_keys, [label for label, _, _ in package],
               fancybox=True, prop=dict(size="x-small"))

    def draw_stack(axis, series):
        # one stacked bar per extractor, one segment per category
        bottom_y = np.zeros(num_of_extractors)
        for _, color, counts in series:
            axis.bar(ind, counts, width, bottom=bottom_y, color=color,
                     ecolor="g", linewidth=0.2, alpha=0.95)
            bottom_y += counts

    # with successful instances
    ax1 = plt.subplot(121)
    draw_stack(ax1, package)

    # without the "successful" category
    ax2 = plt.subplot(122)
    draw_stack(ax2, package[:-1])

    # xticks labels
    extractor_names = [get_extractor_cls(e).NAME for e in extractor_slugs]
    ax1.set_xticks(ind + width / 2.0)
    ax1.set_xticklabels(extractor_names, size="xx-small", rotation="vertical")
    ax2.set_xticks(ind + width / 2.0)
    ax2.set_xticklabels(extractor_names, size="xx-small", rotation="vertical")

    # grid settings
    fig.suptitle("Boundary cases")
    ax1.grid(True, alpha=0.5)
    ax2.grid(True, alpha=0.5)

    # adjustment
    w, h = fig.get_size_inches()
    fig.set_size_inches(w * 1.5, h * 1.5)
    fig.subplots_adjust(bottom=0.2)

    # output
    out_path = os.path.join(settings.PATH_LOCAL_DATA, "plot-output", img_name)
    fig.savefig(out_path, bbox_inches="tight")
def dataset_stat_plot(dataset_name, img_name):
    '''
    Draw a three-panel bar chart (avg precision, recall and F1 score per
    extractor) for the given dataset and save it as *img_name* in the
    plot-output directory.
    '''
    # load stored evaluation results
    txt_results = TextBasedResults()
    txt_results.load(dataset_name)
    txt_results.print_results()

    # one (title, [((avg, stddev), slug), ...]) pair per metric
    elist = extractor_list_filter(txt_results.text_eval_results.keys())
    extractor_slugs = tuple(e.SLUG for e in elist)
    metrics = (
        ('Precision', txt_results.precision_statistics),
        ('Recall', txt_results.recall_statistics),
        ('F1 score', txt_results.f1score_statistics),
    )
    packaged_data = tuple(
        (title, [(stat_fn(e), e) for e in extractor_slugs])
        for title, stat_fn in metrics
    )

    bar_color = ('b', 'c', 'm')
    for plot_idx, (title, stats) in enumerate(packaged_data):
        num_of_extractors = len(extractor_slugs)
        ind = np.arange(num_of_extractors)  # x locations for the groups
        width = 0.6  # bar width

        # order extractors best-to-worst by their average value
        stats.sort(key=lambda entry: entry[0][0])
        stats.reverse()

        avg = [s[0][0] for s in stats]
        stddev = [s[0][1] for s in stats]

        # draw this metric's subplot
        plt.subplot(3, 1, plot_idx + 1)
        plt.grid(True, alpha=0.5)
        rects_avg = plt.bar(ind, avg, width, color=bar_color[plot_idx],
                            ecolor='g', yerr=stddev, linewidth=0.5, alpha=0.8)

        # titles, tick labels and legend
        names = [get_extractor_cls(s[1]).NAME for s in stats]
        plt.title(title)
        plt.xticks(ind + width / 2., names, size='xx-small',
                   rotation='vertical')
        plt.legend((rects_avg[0], ), ('avg', ), fancybox=True,
                   prop=dict(size='x-small'), loc=4)  # lower right

        # annotate every bar with its height
        for rect in rects_avg:
            height = rect.get_height()
            plt.text(rect.get_x() + rect.get_width() / 2.25,
                     rect.get_height() + 0.01,
                     '%1.2f' % height,
                     ha='center', va='bottom', size='x-small')

    # spacing between the three subplots
    plt.subplots_adjust(wspace=0.5, hspace=0.9)

    # stretch the figure vertically so the rotated labels fit
    fig = plt.gcf()
    w, h = fig.get_size_inches()
    fig.set_size_inches(w, h * 1.6)

    # write the image
    out_path = os.path.join(settings.PATH_LOCAL_DATA, 'plot-output', img_name)
    plt.savefig(out_path)
def dataset_contents_plot(dataset_name, img_name):
    '''Render the error-case analysis as two stacked bar charts.'''
    # load stored evaluation results
    txt_results = TextBasedResults()
    txt_results.load(dataset_name)
    txt_results.print_results()

    # (label, color, per-extractor count list) for every outcome category
    elist = extractor_list_filter(txt_results.text_eval_results.keys())
    extractor_slugs = tuple(e.SLUG for e in elist)
    package = [
        ('|rel| = 0', '#9DFADE',
         [txt_results.result_contents(ex).rel_empty for ex in extractor_slugs]),
        ('|rel intersect ret| = 0', '#3C70A3',
         [txt_results.result_contents(ex).rel_ret_empty for ex in extractor_slugs]),
        ('|ret| = 0', '#5CCBED',
         [txt_results.result_contents(ex).ret_empty for ex in extractor_slugs]),
        ('mismatch', '#A76CF5',
         [txt_results.result_contents(ex).missmatch for ex in extractor_slugs]),
        ('failed', '#C43156',
         [txt_results.result_contents(ex).fail for ex in extractor_slugs]),
        ('successful', '#31C460',
         [txt_results.result_contents(ex).succ for ex in extractor_slugs]),
    ]
    num_of_extractors = len(extractor_slugs)
    ind = np.arange(num_of_extractors)  # x locations for the groups
    width = 0.6

    fig = plt.gcf()
    fig.legend([plt.Rectangle((0, 0), 1, 1, fc=entry[1]) for entry in package],
               [entry[0] for entry in package],
               fancybox=True,
               prop=dict(size='x-small'))

    # left subplot: all categories incl. "successful";
    # right subplot: boundary/error cases only
    ax1 = plt.subplot(121)
    ax2 = plt.subplot(122)
    for axis, series in ((ax1, package), (ax2, package[:-1])):
        bottom_y = np.zeros(num_of_extractors)
        for label, color, counts in series:
            axis.bar(ind, counts, width, bottom=bottom_y, color=color,
                     ecolor='g', linewidth=0.2, alpha=0.95)
            bottom_y += counts

    # x tick labels and grids shared by both subplots
    extractor_names = [get_extractor_cls(e).NAME for e in extractor_slugs]
    for axis in (ax1, ax2):
        axis.set_xticks(ind + width / 2.)
        axis.set_xticklabels(extractor_names, size='xx-small',
                             rotation='vertical')
        axis.grid(True, alpha=0.5)

    fig.suptitle('Boundary cases')

    # enlarge the figure and leave room for the vertical labels
    w, h = fig.get_size_inches()
    fig.set_size_inches(w * 1.5, h * 1.5)
    fig.subplots_adjust(bottom=0.2)

    # write the image
    out_path = os.path.join(settings.PATH_LOCAL_DATA, 'plot-output', img_name)
    fig.savefig(out_path, bbox_inches='tight')