from collections import Counter


def get_vocab_label(corpus, vocab_label_tmp=None, cut_label=0):
    # _get_iob_labels, _get_label_set, _create_vocab_label and say are
    # project-internal helpers. Only labels occurring more than cut_label
    # times are kept, sorted by descending frequency.
    iob_labels = _get_iob_labels(corpus)
    cnt = Counter(_get_label_set(iob_labels))
    labels = [
        (w, c)
        for w, c in sorted(cnt.items(), key=lambda x: x[1], reverse=True)
        if c > cut_label
    ]
    say(str(labels))
    return _create_vocab_label(vocab_label_tmp, iob_labels, labels)
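All of the examples in this listing report progress through a say helper that is not shown here. A minimal sketch of what it presumably does (an unbuffered write to stderr, with the callers supplying their own newlines) is:

import sys

def say(message, stream=sys.stderr):
    # Hypothetical stand-in for the project's say() helper: the callers
    # above pass their own '\n', so nothing is appended here.
    stream.write(message)
    stream.flush()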
Example #2
from collections import defaultdict


def show_adr_chance_level(samples):
    # The chance level reported here is the number of samples divided by
    # the total number of agents appearing in the samples' contexts.
    total = float(len(samples))
    total_agents = 0.
    stats = defaultdict(int)

    for sample in samples:
        stats[sample.n_agents_in_ctx] += 1

    for n_agents, n_samples in stats.items():
        assert n_agents > 0
        total_agents += n_agents * n_samples

    say('\n\t  SAMPLES: {:>8}'.format(int(total)))
    say('\n\t  ADDRESSEE DETECTION CHANCE LEVEL: {:>7.2%}'.format(total / total_agents))
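A quick way to exercise show_adr_chance_level without the full preprocessing pipeline is to feed it lightweight stand-in samples. The Sample namedtuple below is an assumption for illustration only; it exposes just the one attribute the function reads, and both the function and the say stub sketched above are expected to be in scope:

from collections import namedtuple

# Hypothetical minimal sample: only the attribute the function reads.
Sample = namedtuple('Sample', 'n_agents_in_ctx')

toy_samples = [Sample(2), Sample(2), Sample(4), Sample(5)]
show_adr_chance_level(toy_samples)
# With 4 samples and 2 + 2 + 4 + 5 = 13 agents in total, the reported
# chance level is 4 / 13, about 30.77% (written to stderr by say).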
Example #3
import gzip


def load_ubuntu_corpus(path):
    # Each line is expected to hold three tab-separated fields:
    # question id, title, body. Questions with an empty title are skipped.
    empty_cnt = 0
    raw_corpus = {}
    fopen = gzip.open if path.endswith(".gz") else open
    with fopen(path, "rt") as fin:
        for line in fin:
            q_id, title, body = line.split("\t")
            if len(title) == 0:
                print(q_id)
                empty_cnt += 1
                continue
            title = title.strip().split()
            body = body.strip().split()
            raw_corpus[q_id] = (title, body)
    say("{} empty titles ignored.\n".format(empty_cnt))
    return raw_corpus
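The loader expects one tab-separated record per line (question id, title, body) and handles gzip transparently. A hedged usage sketch, assuming load_ubuntu_corpus and the say stub above are in scope; the file name and its contents are made up for illustration:

import gzip

# Hypothetical two-question corpus; real Ubuntu corpus files are much larger.
lines = [
    "q1\thow to mount an iso\tuse the mount command with loop ...\n",
    "q2\tupgrade from 14.04\trun do-release-upgrade ...\n",
]
with gzip.open("toy_corpus.txt.gz", "wt") as fout:
    fout.writelines(lines)

corpus = load_ubuntu_corpus("toy_corpus.txt.gz")
title, body = corpus["q1"]
print(title)   # ['how', 'to', 'mount', 'an', 'iso']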
Example #5
from collections import defaultdict


def show_adr_upper_bound(samples, max_n_agents):
    # Upper bound on addressee detection accuracy: the fraction of samples
    # whose true addressee actually appears in the limited context.
    true_adr_stats = defaultdict(int)
    non_adr_stats = defaultdict(int)

    # sample.n_agents_in_lctx = agents appearing in the limited context (including the speaker of the response)
    for sample in samples:
        if sample.true_adr > -1:
            true_adr_stats[sample.n_agents_in_lctx] += 1
        else:
            non_adr_stats[sample.n_agents_in_lctx] += 1

    say('\n\t  ADDRESSEE DETECTION UPPER BOUND:')
    for n_agents in range(1, max_n_agents + 1):
        ttl1 = true_adr_stats.get(n_agents, 0)
        ttl2 = non_adr_stats.get(n_agents, 0)
        total = float(ttl1 + ttl2)
        ub = ttl1 / total if total > 0 else 0.

        say('\n\t\t# Cands {:>2}: {:>7.2%} | Total: {:>8} | Including true-adr: {:>8} | Not including: {:>8}'.format(
            n_agents, ub, int(total), ttl1, ttl2))
    say('\n')
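To see the upper-bound report on toy data, stand-in samples with just the two attributes the function reads are enough. The AdrSample namedtuple and the numbers below are illustrative assumptions, not part of the original code:

from collections import namedtuple

# Hypothetical minimal sample: only the two attributes the function reads.
AdrSample = namedtuple('AdrSample', 'true_adr n_agents_in_lctx')

toy_samples = [
    AdrSample(true_adr=0,  n_agents_in_lctx=2),   # true addressee is in the limited context
    AdrSample(true_adr=1,  n_agents_in_lctx=2),
    AdrSample(true_adr=-1, n_agents_in_lctx=2),   # addressee fell outside the limited context
    AdrSample(true_adr=3,  n_agents_in_lctx=4),
]
show_adr_upper_bound(toy_samples, max_n_agents=4)
# For 2 candidate agents, 2 of 3 samples contain the true addressee,
# so the reported upper bound for that row is about 66.67%.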
Example #6
def dataset_statistics(dataset):
    """
    :param dataset: 1D: n_docs, 2D: n_utterances, 3D: elem=(time, speaker_id, addressee_id, response1, ... , label)
    """
    n_docs = len(dataset)
    n_utterances = 0
    n_words = 0
    n_agents = 0
    max_n_agents = 0

    for thread in dataset:
        agents = set()
        n_utterances += len(thread)
        for sent in thread:
            # The last element is the label; it indexes which candidate
            # response (starting at sent[3]) is the ground-truth one.
            label = sent[-1]
            if label > -1:
                sent_len = len(sent[3 + label])
            else:
                sent_len = len(sent[3])
            n_words += sent_len
            agents.add(sent[1])

        n_agents_tm = len(agents)
        n_agents += n_agents_tm
        max_n_agents = max(max_n_agents, n_agents_tm)

    say('\nDATASET STATS\n# Docs: {:>4} | # Utterances: {:>8} | # Words: {:>8}\n'.format(n_docs, n_utterances, n_words))
    say('# Agents: {:>8} | # Max agents/Doc: {:>3}\n'.format(n_agents, max_n_agents))
    say('Words/Utter: {:3.2f} | Agents/Doc: {:3.2f}\n'.format(n_words/float(n_utterances), n_agents/float(n_docs)))
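A tiny hand-built dataset following the documented layout makes the counting logic easy to verify by hand. The toy_dataset below is an illustrative assumption; speaker ids and tokens are made up, and dataset_statistics plus the say stub above are expected to be in scope:

# Hypothetical two-document toy dataset matching the documented layout:
# (time, speaker_id, addressee_id, response_1, ..., response_k, label)
toy_dataset = [
    [  # document 1: two utterances, two distinct speakers
        (0, 'alice', 'bob',   ['hi', 'bob'],              -1),
        (1, 'bob',   'alice', ['hi'], ['hello', 'alice'],  1),
    ],
    [  # document 2: one utterance, one speaker
        (0, 'carol', 'alice', ['any', 'luck', '?'],       -1),
    ],
]
dataset_statistics(toy_dataset)
# Reports 2 docs, 3 utterances, 7 words, 3 agents, and at most 2 agents per doc.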
Example #7
from collections import defaultdict


def show_n_samples_binned_ctx(samples):
    # Histogram of samples per bin, where each sample carries a precomputed
    # bin index for the number of agents in its context.
    ctx_stats = defaultdict(int)
    for sample in samples:
        ctx_stats[sample.binned_n_agents_in_ctx] += 1

    say('\n\t  THE BINNED NUMBER OF AGENTS IN CONTEXT:')
    for n_agents, ttl in sorted(ctx_stats.items(), key=lambda x: x[0]):
        say('\n\t\tBin {:>2}: {:>8}'.format(n_agents, ttl))
    say('\n')
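As with the other reporting helpers, a namedtuple stand-in is enough to run this on toy data; the bin indices below are assumed to have been computed by the preprocessing step that builds the samples:

from collections import namedtuple

# Hypothetical minimal sample: only the precomputed bin index is needed.
BinnedSample = namedtuple('BinnedSample', 'binned_n_agents_in_ctx')

toy_samples = [BinnedSample(0), BinnedSample(0), BinnedSample(1), BinnedSample(2)]
show_n_samples_binned_ctx(toy_samples)
# Prints one row per bin: bin 0 -> 2 samples, bin 1 -> 1, bin 2 -> 1.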