def main():
    """Print correlation of tweet categories with annotation statistics.

    Parses the command line, reads the source corpus and the MMAX basedata,
    computes per-tweet agreement statistics for two annotators, and prints,
    for every markable in ``REL_MARKABLES``, one table for topics and one
    table for formal categories with the coefficients obtained from
    ``_compute_cat_stat``.

    Side effects:
      Rebinds the module-level ``TWEETTOK_CNT`` and fills
      ``TWEETID2CNT_KAPPA``.

    Returns:
      None

    Raises:
      SystemExit: if the command line is invalid or one of the annotator
        directories does not exist or cannot be accessed.
      AssertionError: if the collected statistics are inconsistent.

    """
    argparser = argparse.ArgumentParser(
        description="Script for computing correlation coefficients of"
        " sentiments and emotional expressions on the one hand and topic"
        " and formal categories on the other hand.")
    argparser.add_argument("src_corpus", help="source corpus file")
    argparser.add_argument("src_dir",
                           help="directory containing XML corpus files")
    argparser.add_argument("basedata_dir",
                           help="directory containing basedata (tokens) "
                           "for MMAX project")
    argparser.add_argument("directory1",
                           help="directory containing markables"
                           " from the first annotator")
    argparser.add_argument("directory2",
                           help="directory containing markables"
                           " from the second annotator")
    # agreement schemes for spans
    argparser.add_argument("-b", "--binary-overlap",
                           help="consider two spans to agree on all"
                           " of tokens of their respective spans if"
                           " they overlap by at least one token"
                           " (default comparison mode)",
                           action="store_const", const=BINARY_OVERLAP,
                           default=0)
    argparser.add_argument("-p", "--proportional-overlap",
                           help="count as agreement only tokens that actually"
                           " overlap in two spans",
                           action="store_const", const=PROPORTIONAL_OVERLAP,
                           default=0)
    argparser.add_argument("--pattern",
                           help="shell pattern for files with markables",
                           type=str, default="*.xml")
    args = argparser.parse_args()

    # process raw corpus (populate TWEETID2CAT)
    read_src_corpus(args.src_corpus)
    # process basedata (populate TOKID2TWEETID)
    read_basedata(args.basedata_dir, args.src_dir, args.pattern)
    # compute the number of tokens pertaining to a tweet
    global TWEETTOK_CNT
    TWEETTOK_CNT = Counter(TOKID2TWEETID.values())

    # fall back to binary overlap if no comparison scheme was specified
    cmp_scheme = args.binary_overlap | args.proportional_overlap
    if cmp_scheme == 0:
        cmp_scheme = BINARY_OVERLAP

    # check existence and accessibility of the annotator directories; this
    # is validation of user input, so it must not rely on `assert`, which
    # is removed when Python runs with -O
    dir1 = args.directory1
    dir2 = args.directory2
    for idir in (dir1, dir2):
        if not (os.path.isdir(idir) and os.access(idir, os.X_OK)):
            argparser.error("Directory '{:s}' does not exist or cannot be"
                            " accessed.".format(idir))

    compute_agr_stat(args.basedata_dir, dir1, dir2,
                     a_ptrn=args.pattern, a_cmp=cmp_scheme)

    # do some sanity check and compute per-tweet statistics
    for t_id, tstats in TWEETID2MSTAT.items():
        for mname, (m1, a1, m2, a2, marks) in tstats.items():
            assert m1 <= a1, (
                "Number of matching annotations exceeds the "
                "number of annotated tokens (1): tweet {:s} "
                "markables {:s}".format(t_id, repr(marks)))
            assert m2 <= a2, (
                "Number of matching annotations exceeds the "
                "number of annotated tokens (2): tweet {:s} "
                "markables {:s}".format(t_id, repr(marks)))
            total = max(DBL_ANNO[t_id][mname]) + TWEETTOK_CNT[t_id]
            TWEETID2CNT_KAPPA[t_id][mname] = (
                len(marks),
                _compute_kappa(m1, a1, m2, a2, total, cmp_scheme))

    # compute statistics per each category
    topic2idx, type2idx, rho_cnt, rho_kappa = \
        _compute_cat_stat(TWEETID2CNT_KAPPA)
    # the coefficients of interest are read from column `n`, which is
    # expected to be the last column of every row (verified below)
    n = len(topic2idx) + len(type2idx)
    tables = (("Topic", topic2idx), ("Category", type2idx))
    for mname in REL_MARKABLES:
        print("{:s}".format(mname))
        cnt_stat, kappa_stat = rho_cnt[mname], rho_kappa[mname]
        for title, name2idx in tables:
            print("{:20s}{:>15s}{:>25s}".format(title, "$\\rho_{cnt}$",
                                                "$\\rho_{\\kappa}$"))
            for iname, idx in name2idx.items():
                icnt, ikappa = cnt_stat[idx][n], kappa_stat[idx][n]
                assert icnt == cnt_stat[idx][-1]
                assert ikappa == kappa_stat[idx][-1]
                print("{:20s}{:15.4f}{:25.4f}".format(iname, icnt, ikappa))
            print()
def main():
    """Plot corpus statistics and inter-annotator agreement.

    Parses the command line, reads the source corpus and the MMAX basedata,
    collects statistics for two annotators via ``compute_stat``, and then,
    for every markable in ``REL_MARKABLES``:

    * fills one matrix with ``_compute_kappa`` values and one with the
      ``TOTAL_MARK_IDX`` statistic of the last annotator, indexed by
      ``CAT2ROW`` x ``TOP2COL``, and passes both to ``plot_mtx``;
    * prints a kappa value computed from ``POL_STAT`` and an alpha value
      computed from ``INT_STAT`` to stderr.

    Returns:
      None

    Raises:
      SystemExit: if the command line is invalid or one of the annotator
        directories does not exist or cannot be accessed.

    """
    argparser = argparse.ArgumentParser(
        description="Script for plotting corpus statistics and agreement.")
    argparser.add_argument("src_corpus", help="XML corpus of source files")
    argparser.add_argument("src_dir",
                           help="directory containing XML corpus files")
    argparser.add_argument("basedata_dir",
                           help="directory containing basedata (tokens) "
                           "for MMAX project")
    argparser.add_argument("directory1",
                           help="directory containing markables"
                           " from the first annotator")
    argparser.add_argument("directory2",
                           help="directory containing markables"
                           " from the second annotator")
    # agreement schemes for spans
    argparser.add_argument("-b", "--binary-overlap",
                           help="consider two spans to agree on all"
                           " of tokens of their respective spans if"
                           " they overlap by at least one token"
                           " (default comparison mode)",
                           action="store_const", const=BINARY_OVERLAP,
                           default=0)
    argparser.add_argument("-p", "--proportional-overlap",
                           help="count as agreement only tokens that actually"
                           " overlap in two spans",
                           action="store_const", const=PROPORTIONAL_OVERLAP,
                           default=0)
    argparser.add_argument("--pattern",
                           help="shell pattern for files with markables",
                           type=str, default="*.xml")
    args = argparser.parse_args()

    # process raw corpus
    read_src_corpus(args.src_corpus)
    # process basedata
    read_basedata(args.basedata_dir, args.src_dir, args.pattern)

    # fall back to binary overlap if no comparison scheme was specified
    cmp_scheme = args.binary_overlap | args.proportional_overlap
    if cmp_scheme == 0:
        cmp_scheme = BINARY_OVERLAP

    # check existence and accessibility of the annotator directories; this
    # is validation of user input, so it must not rely on `assert`, which
    # is removed when Python runs with -O
    dir1 = args.directory1
    dir2 = args.directory2
    for idir in (dir1, dir2):
        if not (os.path.isdir(idir) and os.access(idir, os.X_OK)):
            argparser.error("Directory '{:s}' does not exist or cannot be"
                            " accessed.".format(idir))

    compute_stat(args.basedata_dir, dir1, dir2,
                 a_ptrn=args.pattern, a_cmp=cmp_scheme)

    # plot statistics
    for mname in REL_MARKABLES:
        stat_mtx = np.zeros(MTX_DIM)
        agr_mtx = np.zeros(MTX_DIM)
        for icat, irow in iteritems(CAT2ROW):
            for itopic, icol in iteritems(TOP2COL):
                cat_stat = STATISTICS[(itopic, icat)]
                n_toks = cat_stat[TOK]
                annotators = cat_stat[ANNOTATOR]
                # first and last entries are taken as the two annotators
                stat1 = annotators[0][mname]
                stat2 = annotators[-1][mname]
                agr_mtx[irow, icol] = _compute_kappa(
                    stat1[OVERLAP_MTOK_IDX], stat1[TOTAL_MTOK_IDX],
                    stat2[OVERLAP_MTOK_IDX], stat2[TOTAL_MTOK_IDX],
                    n_toks, cmp_scheme)
                stat_mtx[irow, icol] = stat2[TOTAL_MARK_IDX]
        print("mname = {:s}".format(mname), file=sys.stderr)
        print(repr(stat_mtx), file=sys.stderr)
        plot_mtx(stat_mtx, mname + STAT_MTX_SFX, "# of Elements")
        plot_mtx(agr_mtx, mname + AGR_MTX_SFX, "Agreement")

    # report overall agreement per markable
    for mname in REL_MARKABLES:
        ipol_stat = POL_STAT[mname]
        n_toks = ipol_stat[TOTAL]
        stat1, stat2 = ipol_stat[ANNOTATOR]
        # NOTE(review): this kappa always uses BINARY_OVERLAP rather than
        # the scheme chosen on the command line -- confirm this is intended
        ikappa = _compute_kappa(
            stat1[OVERLAP_MTOK_IDX], stat1[TOTAL_MTOK_IDX],
            stat2[OVERLAP_MTOK_IDX], stat2[TOTAL_MTOK_IDX],
            n_toks, BINARY_OVERLAP)
        ialpha = _compute_alpha(INT_STAT[mname])
        print("Kappa ({:s}): {:f}".format(mname, ikappa), file=sys.stderr)
        print("Alpha ({:s}): {:f}".format(mname, ialpha), file=sys.stderr)
def main():
    """Plot corpus statistics and inter-annotator agreement.

    Parses the command line, reads the source corpus and the MMAX basedata,
    collects statistics for two annotators via ``compute_stat``, and then,
    for every markable in ``REL_MARKABLES``:

    * fills one matrix with ``_compute_kappa`` values and one with the
      ``TOTAL_MARK_IDX`` statistic of the last annotator, indexed by
      ``CAT2ROW`` x ``TOP2COL``, and passes both to ``plot_mtx``;
    * prints a kappa value computed from ``POL_STAT`` and an alpha value
      computed from ``INT_STAT`` to stderr.

    Returns:
      None

    Raises:
      SystemExit: if the command line is invalid or one of the annotator
        directories does not exist or cannot be accessed.

    """
    argparser = argparse.ArgumentParser(
        description="Script for plotting corpus statistics"
        " and agreement.")
    argparser.add_argument("src_corpus", help="XML corpus of source files")
    argparser.add_argument("src_dir",
                           help="directory containing XML corpus files")
    argparser.add_argument("basedata_dir",
                           help="directory containing basedata (tokens) "
                           "for MMAX project")
    argparser.add_argument("directory1",
                           help="directory containing markables"
                           " from the first annotator")
    argparser.add_argument("directory2",
                           help="directory containing markables"
                           " from the second annotator")
    # agreement schemes for spans
    argparser.add_argument("-b", "--binary-overlap",
                           help="consider two spans to agree on all"
                           " of tokens of their respective spans if"
                           " they overlap by at least one token"
                           " (default comparison mode)",
                           action="store_const", const=BINARY_OVERLAP,
                           default=0)
    argparser.add_argument("-p", "--proportional-overlap",
                           help="count as agreement only tokens that actually"
                           " overlap in two spans",
                           action="store_const", const=PROPORTIONAL_OVERLAP,
                           default=0)
    argparser.add_argument("--pattern",
                           help="shell pattern for files with markables",
                           type=str, default="*.xml")
    args = argparser.parse_args()

    # process raw corpus
    read_src_corpus(args.src_corpus)
    # process basedata
    read_basedata(args.basedata_dir, args.src_dir, args.pattern)

    # fall back to binary overlap if no comparison scheme was specified
    cmp_scheme = args.binary_overlap | args.proportional_overlap
    if cmp_scheme == 0:
        cmp_scheme = BINARY_OVERLAP

    # check existence and accessibility of the annotator directories; this
    # is validation of user input, so it must not rely on `assert`, which
    # is removed when Python runs with -O
    dir1 = args.directory1
    dir2 = args.directory2
    for idir in (dir1, dir2):
        if not (os.path.isdir(idir) and os.access(idir, os.X_OK)):
            argparser.error("Directory '{:s}' does not exist or cannot be"
                            " accessed.".format(idir))

    compute_stat(args.basedata_dir, dir1, dir2,
                 a_ptrn=args.pattern, a_cmp=cmp_scheme)

    # plot statistics
    for mname in REL_MARKABLES:
        stat_mtx = np.zeros(MTX_DIM)
        agr_mtx = np.zeros(MTX_DIM)
        # `.items()` iterates the same pairs as `.iteritems()` and is
        # available on both Python 2 and Python 3
        for icat, irow in CAT2ROW.items():
            for itopic, icol in TOP2COL.items():
                cat_stat = STATISTICS[(itopic, icat)]
                n_toks = cat_stat[TOK]
                annotators = cat_stat[ANNOTATOR]
                # first and last entries are taken as the two annotators
                stat1 = annotators[0][mname]
                stat2 = annotators[-1][mname]
                agr_mtx[irow, icol] = _compute_kappa(
                    stat1[OVERLAP_MTOK_IDX], stat1[TOTAL_MTOK_IDX],
                    stat2[OVERLAP_MTOK_IDX], stat2[TOTAL_MTOK_IDX],
                    n_toks, cmp_scheme)
                stat_mtx[irow, icol] = stat2[TOTAL_MARK_IDX]
        print("mname = {:s}".format(mname), file=sys.stderr)
        print(repr(stat_mtx), file=sys.stderr)
        plot_mtx(stat_mtx, mname + STAT_MTX_SFX)
        plot_mtx(agr_mtx, mname + AGR_MTX_SFX)

    # report overall agreement per markable
    for mname in REL_MARKABLES:
        ipol_stat, iint_stat = POL_STAT[mname], INT_STAT[mname]
        n_toks = ipol_stat[TOTAL]
        stat1, stat2 = ipol_stat[ANNOTATOR]
        # NOTE(review): this kappa always uses BINARY_OVERLAP rather than
        # the scheme chosen on the command line -- confirm this is intended
        ikappa = _compute_kappa(
            stat1[OVERLAP_MTOK_IDX], stat1[TOTAL_MTOK_IDX],
            stat2[OVERLAP_MTOK_IDX], stat2[TOTAL_MTOK_IDX],
            n_toks, BINARY_OVERLAP)
        ialpha = _compute_alpha(iint_stat)
        print("Kappa ({:s}): {:f}".format(mname, ikappa), file=sys.stderr)
        print("Alpha ({:s}): {:f}".format(mname, ialpha), file=sys.stderr)