コード例 #1
0
def main():
    """Print correlations of markable statistics with topics and categories.

    Parses the command line, reads the source corpus and the basedata,
    computes per-tweet markable counts and agreement values for the two
    annotation directories, and prints to stdout, for every markable in
    REL_MARKABLES, the coefficients obtained from `_compute_cat_stat` for
    each topic and each category.

    Returns:
    0 on success; a usage error (exit via `argparser.error`) is issued if
    one of the annotation directories is missing or inaccessible

    """
    global TWEETTOK_CNT

    argparser = argparse.ArgumentParser(description="""Script for computing
correlation coefficients of sentiments and emotional expressions on the one
hand and topic and formal categories on the other hand.""")
    argparser.add_argument("src_corpus",
                           help="source corpus file")
    argparser.add_argument("src_dir",
                           help="directory containing XML corpus files")
    argparser.add_argument("basedata_dir",
                           help="directory containing basedata (tokens) "
                           "for MMAX project")
    argparser.add_argument("directory1",
                           help="directory containing markables"
                           " from the first annotator")
    argparser.add_argument("directory2",
                           help="directory containing markables"
                           " from the second annotator")
    # agreement schemes for spans
    argparser.add_argument("-b", "--binary-overlap",
                           help="consider two spans to agree on all"
                           " of tokens of their respective spans if"
                           " they overlap by at least one token"
                           " (default comparison mode)",
                           action="store_const", const=BINARY_OVERLAP,
                           default=0)
    argparser.add_argument("-p", "--proportional-overlap",
                           help="count as agreement only tokens that actually"
                           " overlap in two spans", action="store_const",
                           const=PROPORTIONAL_OVERLAP, default=0)
    argparser.add_argument("--pattern",
                           help="shell pattern for files with markables",
                           type=str, default="*.xml")
    args = argparser.parse_args()

    # Validate the annotation directories up front.  This is user input, so
    # report it as a usage error rather than with `assert`, which is
    # stripped when Python runs with -O.
    dir1, dir2 = args.directory1, args.directory2
    for idir in (dir1, dir2):
        if not (os.path.isdir(idir) and os.access(idir, os.X_OK)):
            argparser.error(
                "Directory '{:s}' does not exist or cannot be accessed."
                .format(idir))

    # process raw corpus (populate TWEETID2CAT)
    read_src_corpus(args.src_corpus)

    # process basedata (populate TOKID2TWEETID)
    read_basedata(args.basedata_dir, args.src_dir, args.pattern)
    # compute the number of tokens pertaining to a tweet
    TWEETTOK_CNT = Counter(TOKID2TWEETID.values())

    # fall back to binary overlap if no comparison scheme was specified
    cmp_scheme = args.binary_overlap | args.proportional_overlap
    if cmp_scheme == 0:
        cmp_scheme = BINARY_OVERLAP

    compute_agr_stat(args.basedata_dir, dir1, dir2,
                     a_ptrn=args.pattern,
                     a_cmp=cmp_scheme)

    # do some sanity check and compute per-tweet statistics
    for t_id, tstats in TWEETID2MSTAT.items():
        for mname, (m1, a1, m2, a2, marks) in tstats.items():
            # internal invariants: matches can never outnumber annotations
            assert m1 <= a1, (
                "Number of matching annotations exceeds the "
                "number of annotated tokens (1): tweet {:s} "
                "markables {:s}".format(t_id, repr(marks)))
            assert m2 <= a2, (
                "Number of matching annotations exceeds the "
                "number of annotated tokens (2): tweet {:s} "
                "markables {:s}".format(t_id, repr(marks)))
            TWEETID2CNT_KAPPA[t_id][mname] = \
                (len(marks), _compute_kappa(m1, a1, m2, a2,
                                            max(DBL_ANNO[t_id][mname]) +
                                            TWEETTOK_CNT[t_id],
                                            cmp_scheme))

    # compute the statistics per each category
    topic2idx, type2idx, rho_cnt, rho_kappa = \
        _compute_cat_stat(TWEETID2CNT_KAPPA)
    n = len(topic2idx) + len(type2idx)
    for mname in REL_MARKABLES:
        print("{:s}".format(mname))
        cnt_stat, kappa_stat = rho_cnt[mname], rho_kappa[mname]
        # the same table is printed first for topics, then for categories
        for title, name2idx in (("Topic", topic2idx),
                                ("Category", type2idx)):
            print("{:20s}{:>15s}{:>25s}".format(title, "$\\rho_{cnt}$",
                                                "$\\rho_{\\kappa}$"))
            for iname, idx in name2idx.items():
                icnt, ikappa = cnt_stat[idx][n], kappa_stat[idx][n]
                # sanity check: column `n` must be the last one in the row
                assert icnt == cnt_stat[idx][-1]
                assert ikappa == kappa_stat[idx][-1]
                print("{:20s}{:15.4f}{:25.4f}".format(iname, icnt,
                                                      ikappa))
            print()
    return 0
コード例 #2
0
def main():
    """Plot per-category markable statistics and report agreement.

    Parses the command line, reads the source corpus and the basedata,
    gathers statistics for the two annotation directories, and then, for
    every markable in REL_MARKABLES, plots one matrix of element counts and
    one matrix of kappa values (rows come from CAT2ROW, columns from
    TOP2COL).  Finally, the overall kappa and alpha values of every
    markable are printed to stderr.

    Args:
    (void)

    """
    argparser = argparse.ArgumentParser(
        description="Script for plotting corpus statistics and agreement.")
    argparser.add_argument("src_corpus", help="XML corpus of source files")
    argparser.add_argument("src_dir",
                           help="directory containing XML corpus files")
    argparser.add_argument("basedata_dir",
                           help="directory containing basedata (tokens) "
                           "for MMAX project")
    argparser.add_argument("directory1",
                           help="directory containing markables"
                           " from the first annotator")
    argparser.add_argument("directory2",
                           help="directory containing markables"
                           " from the second annotator")
    # agreement schemes for spans
    argparser.add_argument("-b",
                           "--binary-overlap",
                           help="consider two spans to agree on all"
                           " of tokens of their respective spans if"
                           " they overlap by at least one token"
                           " (default comparison mode)",
                           action="store_const",
                           const=BINARY_OVERLAP,
                           default=0)
    argparser.add_argument("-p",
                           "--proportional-overlap",
                           help="count as agreement only tokens that actually"
                           " overlap in two spans",
                           action="store_const",
                           const=PROPORTIONAL_OVERLAP,
                           default=0)
    argparser.add_argument("--pattern",
                           help="shell pattern for files with markables",
                           type=str,
                           default="*.xml")
    args = argparser.parse_args()

    # Validate the annotation directories up front.  This is user input, so
    # report it as a usage error rather than with `assert`, which is
    # stripped when Python runs with -O.
    dir1, dir2 = args.directory1, args.directory2
    for idir in (dir1, dir2):
        if not (os.path.isdir(idir) and os.access(idir, os.X_OK)):
            argparser.error(
                "Directory '{:s}' does not exist or cannot be accessed."
                .format(idir))

    # process raw corpus
    read_src_corpus(args.src_corpus)
    # process basedata
    read_basedata(args.basedata_dir, args.src_dir, args.pattern)

    # fall back to binary overlap if no comparison scheme was specified
    cmp_scheme = args.binary_overlap | args.proportional_overlap
    if cmp_scheme == 0:
        cmp_scheme = BINARY_OVERLAP

    compute_stat(args.basedata_dir,
                 dir1,
                 dir2,
                 a_ptrn=args.pattern,
                 a_cmp=cmp_scheme)

    # plot statistics
    for mname in REL_MARKABLES:
        stat_mtx = np.zeros(MTX_DIM)
        agr_mtx = np.zeros(MTX_DIM)
        for icat, irow in iteritems(CAT2ROW):
            for itopic, icol in iteritems(TOP2COL):
                cat_stat = STATISTICS[(itopic, icat)]
                n_toks = cat_stat[TOK]
                # statistics of the first and of the last annotator
                stat1 = cat_stat[ANNOTATOR][0][mname]
                stat2 = cat_stat[ANNOTATOR][-1][mname]
                agr_mtx[irow, icol] = _compute_kappa(
                    stat1[OVERLAP_MTOK_IDX], stat1[TOTAL_MTOK_IDX],
                    stat2[OVERLAP_MTOK_IDX], stat2[TOTAL_MTOK_IDX],
                    n_toks, cmp_scheme)
                # element counts are taken from the last annotator only
                stat_mtx[irow, icol] = stat2[TOTAL_MARK_IDX]
        print("mname = {:s}".format(mname), file=sys.stderr)
        print(repr(stat_mtx), file=sys.stderr)
        plot_mtx(stat_mtx, mname + STAT_MTX_SFX, "# of Elements")
        plot_mtx(agr_mtx, mname + AGR_MTX_SFX, "Agreement")

    # report overall agreement; kappa is always computed with the binary
    # overlap scheme here, regardless of the command-line choice
    for mname in REL_MARKABLES:
        ipol_stat = POL_STAT[mname]
        n_toks = ipol_stat[TOTAL]
        stat1, stat2 = ipol_stat[ANNOTATOR]
        ikappa = _compute_kappa(stat1[OVERLAP_MTOK_IDX],
                                stat1[TOTAL_MTOK_IDX],
                                stat2[OVERLAP_MTOK_IDX],
                                stat2[TOTAL_MTOK_IDX],
                                n_toks, BINARY_OVERLAP)
        ialpha = _compute_alpha(INT_STAT[mname])
        print("Kappa ({:s}): {:f}".format(mname, ikappa), file=sys.stderr)
        print("Alpha ({:s}): {:f}".format(mname, ialpha), file=sys.stderr)
コード例 #3
0
def main():
    """Print correlations of markable statistics with topics and categories.

    Parses the command line, reads the source corpus and the basedata,
    computes per-tweet markable counts and agreement values for the two
    annotation directories, and prints to stdout, for every markable in
    REL_MARKABLES, the coefficients obtained from `_compute_cat_stat` for
    each topic and each category.

    Returns:
    0 on success; a usage error (exit via `argparser.error`) is issued if
    one of the annotation directories is missing or inaccessible

    """
    global TWEETTOK_CNT

    argparser = argparse.ArgumentParser(description="""Script for computing
correlation coefficients of sentiments and emotional expressions on the one
hand and topic and formal categories on the other hand.""")
    argparser.add_argument("src_corpus", help="source corpus file")
    argparser.add_argument("src_dir",
                           help="directory containing XML corpus files")
    argparser.add_argument("basedata_dir",
                           help="directory containing basedata (tokens) "
                           "for MMAX project")
    argparser.add_argument("directory1",
                           help="directory containing markables"
                           " from the first annotator")
    argparser.add_argument("directory2",
                           help="directory containing markables"
                           " from the second annotator")
    # agreement schemes for spans
    argparser.add_argument("-b",
                           "--binary-overlap",
                           help="consider two spans to agree on all"
                           " of tokens of their respective spans if"
                           " they overlap by at least one token"
                           " (default comparison mode)",
                           action="store_const",
                           const=BINARY_OVERLAP,
                           default=0)
    argparser.add_argument("-p",
                           "--proportional-overlap",
                           help="count as agreement only tokens that actually"
                           " overlap in two spans",
                           action="store_const",
                           const=PROPORTIONAL_OVERLAP,
                           default=0)
    argparser.add_argument("--pattern",
                           help="shell pattern for files with markables",
                           type=str,
                           default="*.xml")
    args = argparser.parse_args()

    # Validate the annotation directories up front.  This is user input, so
    # report it as a usage error rather than with `assert`, which is
    # stripped when Python runs with -O.
    dir1, dir2 = args.directory1, args.directory2
    for idir in (dir1, dir2):
        if not (os.path.isdir(idir) and os.access(idir, os.X_OK)):
            argparser.error(
                "Directory '{:s}' does not exist or cannot be accessed."
                .format(idir))

    # process raw corpus (populate TWEETID2CAT)
    read_src_corpus(args.src_corpus)

    # process basedata (populate TOKID2TWEETID)
    read_basedata(args.basedata_dir, args.src_dir, args.pattern)
    # compute the number of tokens pertaining to a tweet
    TWEETTOK_CNT = Counter(TOKID2TWEETID.values())

    # fall back to binary overlap if no comparison scheme was specified
    cmp_scheme = args.binary_overlap | args.proportional_overlap
    if cmp_scheme == 0:
        cmp_scheme = BINARY_OVERLAP

    compute_agr_stat(args.basedata_dir,
                     dir1,
                     dir2,
                     a_ptrn=args.pattern,
                     a_cmp=cmp_scheme)

    # do some sanity check and compute per-tweet statistics
    for t_id, tstats in TWEETID2MSTAT.items():
        for mname, (m1, a1, m2, a2, marks) in tstats.items():
            # internal invariants: matches can never outnumber annotations
            assert m1 <= a1, (
                "Number of matching annotations exceeds the "
                "number of annotated tokens (1): tweet {:s} "
                "markables {:s}".format(t_id, repr(marks)))
            assert m2 <= a2, (
                "Number of matching annotations exceeds the "
                "number of annotated tokens (2): tweet {:s} "
                "markables {:s}".format(t_id, repr(marks)))
            TWEETID2CNT_KAPPA[t_id][mname] = \
                (len(marks), _compute_kappa(m1, a1, m2, a2,
                                            max(DBL_ANNO[t_id][mname]) +
                                            TWEETTOK_CNT[t_id],
                                            cmp_scheme))

    # compute the statistics per each category
    topic2idx, type2idx, rho_cnt, rho_kappa = \
        _compute_cat_stat(TWEETID2CNT_KAPPA)
    n = len(topic2idx) + len(type2idx)
    for mname in REL_MARKABLES:
        print("{:s}".format(mname))
        cnt_stat, kappa_stat = rho_cnt[mname], rho_kappa[mname]
        # the same table is printed first for topics, then for categories
        for title, name2idx in (("Topic", topic2idx),
                                ("Category", type2idx)):
            print("{:20s}{:>15s}{:>25s}".format(title, "$\\rho_{cnt}$",
                                                "$\\rho_{\\kappa}$"))
            for iname, idx in name2idx.items():
                icnt, ikappa = cnt_stat[idx][n], kappa_stat[idx][n]
                # sanity check: column `n` must be the last one in the row
                assert icnt == cnt_stat[idx][-1]
                assert ikappa == kappa_stat[idx][-1]
                print("{:20s}{:15.4f}{:25.4f}".format(iname, icnt, ikappa))
            print()
    return 0
コード例 #4
0
ファイル: plot_stat.py プロジェクト: WladimirSidorenko/PotTS
def main():
    """Plot per-category markable statistics and report agreement.

    Parses the command line, reads the source corpus and the basedata,
    gathers statistics for the two annotation directories, and then, for
    every markable in REL_MARKABLES, plots one matrix of element counts and
    one matrix of kappa values (rows come from CAT2ROW, columns from
    TOP2COL).  Finally, the overall kappa and alpha values of every
    markable are printed to stderr.

    Args:
    (void)

    """
    argparser = argparse.ArgumentParser(description=
                                        "Script for plotting corpus statistics"
                                        " and agreement.")
    argparser.add_argument("src_corpus",
                           help="XML corpus of source files")
    argparser.add_argument("src_dir",
                           help="directory containing XML corpus files")
    argparser.add_argument("basedata_dir",
                           help="directory containing basedata (tokens) "
                           "for MMAX project")
    argparser.add_argument("directory1",
                           help="directory containing markables"
                           " from the first annotator")
    argparser.add_argument("directory2",
                           help="directory containing markables"
                           " from the second annotator")
    # agreement schemes for spans
    argparser.add_argument("-b", "--binary-overlap",
                           help="consider two spans to agree on all"
                           " of tokens of their respective spans if"
                           " they overlap by at least one token"
                           " (default comparison mode)",
                           action="store_const", const=BINARY_OVERLAP,
                           default=0)
    argparser.add_argument("-p", "--proportional-overlap",
                           help="count as agreement only tokens that actually"
                           " overlap in two spans", action="store_const",
                           const=PROPORTIONAL_OVERLAP, default=0)
    argparser.add_argument("--pattern",
                           help="shell pattern for files with markables",
                           type=str, default="*.xml")
    args = argparser.parse_args()

    # Validate the annotation directories up front.  This is user input, so
    # report it as a usage error rather than with `assert`, which is
    # stripped when Python runs with -O.
    dir1, dir2 = args.directory1, args.directory2
    for idir in (dir1, dir2):
        if not (os.path.isdir(idir) and os.access(idir, os.X_OK)):
            argparser.error(
                "Directory '{:s}' does not exist or cannot be accessed."
                .format(idir))

    # process raw corpus
    read_src_corpus(args.src_corpus)
    # process basedata
    read_basedata(args.basedata_dir, args.src_dir, args.pattern)

    # fall back to binary overlap if no comparison scheme was specified
    cmp_scheme = args.binary_overlap | args.proportional_overlap
    if cmp_scheme == 0:
        cmp_scheme = BINARY_OVERLAP

    compute_stat(args.basedata_dir, dir1, dir2, a_ptrn=args.pattern,
                 a_cmp=cmp_scheme)

    # plot statistics
    for mname in REL_MARKABLES:
        stat_mtx = np.zeros(MTX_DIM)
        agr_mtx = np.zeros(MTX_DIM)
        for icat, irow in CAT2ROW.items():
            for itopic, icol in TOP2COL.items():
                cat_stat = STATISTICS[(itopic, icat)]
                n_toks = cat_stat[TOK]
                # statistics of the first and of the last annotator
                stat1 = cat_stat[ANNOTATOR][0][mname]
                stat2 = cat_stat[ANNOTATOR][-1][mname]
                agr_mtx[irow, icol] = _compute_kappa(
                    stat1[OVERLAP_MTOK_IDX], stat1[TOTAL_MTOK_IDX],
                    stat2[OVERLAP_MTOK_IDX], stat2[TOTAL_MTOK_IDX],
                    n_toks, cmp_scheme)
                # element counts are taken from the last annotator only
                stat_mtx[irow, icol] = stat2[TOTAL_MARK_IDX]
        print("mname = {:s}".format(mname), file=sys.stderr)
        print(repr(stat_mtx), file=sys.stderr)
        plot_mtx(stat_mtx, mname + STAT_MTX_SFX)
        plot_mtx(agr_mtx, mname + AGR_MTX_SFX)

    # report overall agreement; kappa is always computed with the binary
    # overlap scheme here, regardless of the command-line choice
    for mname in REL_MARKABLES:
        ipol_stat = POL_STAT[mname]
        n_toks = ipol_stat[TOTAL]
        stat1, stat2 = ipol_stat[ANNOTATOR]
        ikappa = _compute_kappa(stat1[OVERLAP_MTOK_IDX],
                                stat1[TOTAL_MTOK_IDX],
                                stat2[OVERLAP_MTOK_IDX],
                                stat2[TOTAL_MTOK_IDX],
                                n_toks, BINARY_OVERLAP)
        ialpha = _compute_alpha(INT_STAT[mname])
        print("Kappa ({:s}): {:f}".format(mname, ikappa), file=sys.stderr)
        print("Alpha ({:s}): {:f}".format(mname, ialpha), file=sys.stderr)