Example #1
def get_dialog_acts(dset_root):
    """Count the dialog act tags across all utterances in the corpus."""
    cr = CorpusReader(dset_root)
    act_tags = Counter()
    for utt in cr.iter_utterances():
        act_tags.update([utt.act_tag])
    return act_tags
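The snippet assumes Counter (from collections) and CorpusReader (from the swda module) are already imported. A minimal usage sketch, with the corpus assumed to be unpacked in a local swda/ directory:

# Sketch only: the swda/ path is an assumption about where the corpus lives.
from collections import Counter
from swda import CorpusReader

act_tags = get_dialog_acts('swda')
# Counter.most_common() yields (tag, count) pairs, most frequent first.
for tag, count in act_tags.most_common(10):
    print(tag, count)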
Example #2
def tag_counts():
    """Gather and print counts of the tags."""
    d = defaultdict(int)
    corpus = CorpusReader('swda')
    # Loop, counting tags:
    for utt in corpus.iter_utterances(display_progress=True):
        d[utt.act_tag] += 1
    # Print the results sorted by count, largest to smallest:
    for key, val in sorted(d.items(), key=itemgetter(1), reverse=True):
        print(key, val)
Example #4
def Atag():
    """Map each DAMSL act tag in the corpus to a distinct 1-based integer index."""
    corpus = CorpusReader('swda')
    actTag = defaultdict(int)
    for utt in corpus.iter_utterances(display_progress=True):
        actTag[utt.damsl_act_tag()] += 1
    # Replace the counts with a 1-based index for each tag.
    for i, key in enumerate(actTag.keys(), start=1):
        actTag[key] = i
    print(actTag)
    return actTag
Example #5
def count_matches():
    """Determine how many utterances have a single precisely matching tree."""
    d = defaultdict(int)
    corpus = CorpusReader('swda')
    for utt in corpus.iter_utterances():
        if len(utt.trees) == 1:
            if utt.tree_is_perfect_match():
                d['match'] += 1
            else: 
                d['mismatch'] += 1
    print "match: %s (%s percent)" % (d['match'], d['match']/float(sum(d.values())))
Example #7
def preprocess_data():
    """Count DAMSL act tags, write them to tags_file sorted by frequency,
    and return a dict mapping each tag to its rank index."""
    act_tags = defaultdict(int)
    corpus = CorpusReader(swda_path)
    for utt in corpus.iter_utterances():
        act_tags[utt.damsl_act_tag()] += 1
    # Sort (tag, count) pairs by count, largest first.
    act_tags = sorted(act_tags.items(), key=itemgetter(1), reverse=True)
    with open(tags_file, 'w') as f:
        for k, v in act_tags:
            f.write('%s %d\n' % (k, v))
    return {tag: i for i, (tag, _) in enumerate(act_tags)}
Example #9
def act_tags_and_text():
    """
    Create a CSV file named swda-acttags-and-text.csv in
    which each utterance utt has its own row consisting of

      utt.damsl_act_tag() and the clean text of the utterance.

    This data can be used for training a speech-act classifier.
    """
    csvwriter = csv.writer(open('swda-acttags-and-text.csv', 'wt'))
    csvwriter.writerow(['DamslActTag', 'Text'])
    corpus = CorpusReader('swda')
    for utt in corpus.iter_utterances(display_progress=True):
        clean_words = utt.text_words(filter_disfluency=True)
        csvwriter.writerow([utt.damsl_act_tag(), " ".join(clean_words)])
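One plausible way to consume the CSV written above for a simple speech-act classifier, sketched here with scikit-learn; the library choice, the bag-of-words features, and the train/test split are assumptions, not part of the example:

# Sketch only: scikit-learn is assumed to be installed; the CSV is the one written above.
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

with open('swda-acttags-and-text.csv') as f:
    rows = list(csv.DictReader(f))
labels = [r['DamslActTag'] for r in rows]
texts = [r['Text'] for r in rows]

X = CountVectorizer().fit_transform(texts)
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print('held-out accuracy:', clf.score(X_test, y_test))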
Example #10
def process_data(tags):
    """Convert each utterance into a list of word vectors (x) and its DAMSL
    act-tag index (y), using the tag-to-index dict from preprocess_data.
    Every word is mapped to a cached random vector of size vector_size."""
    x = []
    y = []
    model = {}
    corpus = CorpusReader(swda_path)
    for utt in corpus.iter_utterances():
        words = [w.lower() for w in utt.pos_words() if w not in except_words]
        # Assign a (cached) random vector to each word the first time it is seen.
        for word in words:
            if word not in model:
                model[word] = random_vector(vector_size)
        words = [model[w] for w in words]
        tag = tags[utt.damsl_act_tag()]
        x.append(words)
        y.append(tag)
    return (x, y)
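A hedged sketch of chaining the two functions above. swda_path, tags_file, except_words, vector_size, and random_vector are module-level names the snippets use without defining, so every concrete value below is an assumption for illustration only:

# Sketch only: these stand in for configuration the original module defines elsewhere.
import random

swda_path = 'swda'        # assumed corpus location
tags_file = 'tags.txt'    # assumed output file for the tag counts
except_words = set()      # assumed: no words excluded
vector_size = 50          # assumed embedding dimensionality

def random_vector(size):
    # Assumed helper: a random stand-in for a word embedding.
    return [random.uniform(-1.0, 1.0) for _ in range(size)]

tags = preprocess_data()   # tag -> index, ordered by corpus frequency
x, y = process_data(tags)  # x: lists of word vectors per utterance, y: tag indices
print(len(x), 'utterances,', len(tags), 'distinct tags')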
Example #12
def act_tags_and_rootlabels():
    """
    Create a CSV file named swda-actags-and-rootlabels.csv in
    which each utterance utt has its own row consisting of just

      utt.act_tag, utt.damsl_act_tag(), and utt.trees[0].node

    restricting attention to cases in which utt has a single,
    perfectly matching tree associated with it.
    """
    csvwriter = csv.writer(open('swda-actags-and-rootlabels.csv', 'w'))
    csvwriter.writerow(['ActTag', 'DamslActTag', 'RootNode'])
    corpus = CorpusReader('swda')    
    for utt in corpus.iter_utterances(display_progress=True):
        if utt.tree_is_perfect_match():
            csvwriter.writerow([utt.act_tag, utt.damsl_act_tag(), utt.trees[0].node])
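A small follow-up sketch that reads the file back and tabulates which root labels occur with each DAMSL tag; only csv and Counter are used, and the column names are the ones written above:

# Sketch only: consumes the CSV produced by act_tags_and_rootlabels().
import csv
from collections import Counter, defaultdict

roots_per_tag = defaultdict(Counter)
with open('swda-actags-and-rootlabels.csv') as f:
    for row in csv.DictReader(f):
        roots_per_tag[row['DamslActTag']][row['RootNode']] += 1

for tag, roots in sorted(roots_per_tag.items()):
    print(tag, roots.most_common(3))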
Example #14
def preprocess():
    """Tokenize, stem, and stopword-filter each utterance, then keep only
    tokens that occur more than 10 times across the corpus."""
    stemmer = PorterStemmer()
    corpus = CorpusReader('swda')
    stoplist = set(line.strip() for line in open("corpus/stopword", 'r'))
    # Translation table that strips basic punctuation (Python 3 str.translate API).
    punct = str.maketrans('', '', "?.,-")
    frequency = defaultdict(int)
    # Each entry pairs the cleaned token list with the utterance's DAMSL act tag.
    corpusDict = [[[stemmer.stem(word.translate(punct).strip())
                    for word in utt.text.lower().split()
                    if word.translate(punct) not in stoplist],
                   utt.damsl_act_tag()]
                  for utt in corpus.iter_utterances(display_progress=True)]
    texts = [entry[0] for entry in corpusDict]
    # Count token frequencies and drop rare tokens.
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 10] for text in texts]
    return texts
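The frequency-thresholded token lists returned above match the usual bag-of-words preparation; one plausible next step, sketched with gensim (the library is an assumption about the intended downstream use):

# Sketch only: gensim is assumed to be the intended consumer of the token lists.
from gensim import corpora

texts = preprocess()
dictionary = corpora.Dictionary(texts)                       # token <-> id mapping
bow_corpus = [dictionary.doc2bow(text) for text in texts]    # sparse bag-of-words vectors
print(len(dictionary), 'tokens in the vocabulary')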
Example #15
def load_dataset_OLD():
    """Load SwDA utterances and group them by the split whose basename list
    (in the SwDA dict) contains them."""
    corpus = CorpusReader('swda')
    data = defaultdict(list)

    not_found_set = []
    found = []
    skip_count = 0
    for utt in corpus.iter_utterances(display_progress=False):
        d = {
            "basename": get_basename(utt),
            "words": " ".join(utt.pos_words()),
            "label": utt.damsl_act_tag(),
        }

        # Skip utterances that contain no POS words.
        if len(d["words"]) < 1:
            skip_count += 1
            continue

        not_found = True
        for splitname in SwDA:
            if d["basename"] in SwDA[splitname]:
                not_found = False
                data[splitname].append(d)
                found.append(d["basename"])

        if not_found:
            not_found_set.append(d["basename"])

    not_found_set = set(not_found_set)
    print("not found count:", len(not_found_set))
    print("skip count:", skip_count)

    print("split sizes:")
    for k, v in data.items():
        print("\t{} size:".format(k), len(v))

    # 1115 seen dialogs, 19 unseen dialogs.
    size = len(set(found))
    #assert size == 1115 + 19, "{} != 1115 + 19; difference = {}".format(size, 1115 + 19 - size)

    return data
Example #16
def load_dataset():
    """Load all SwDA utterances as dicts of basename, words, and DAMSL label."""
    corpus = CorpusReader('swda')
    data = []
    skip_count = 0

    for utt in corpus.iter_utterances(display_progress=False):
        d = {
            "basename": get_basename(utt),
            "words": " ".join(utt.pos_words()),
            "label": utt.damsl_act_tag(),
        }

        # Skip utterances that contain no POS words.
        if len(d["words"]) < 1:
            skip_count += 1
            continue
        data.append(d)

    print("skip count:", skip_count)
    return data
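A short sketch of inspecting the returned records, for example the act-tag distribution (only the keys set in the dicts above are used):

# Sketch only: summarizes the records returned by load_dataset().
from collections import Counter

data = load_dataset()
label_counts = Counter(d['label'] for d in data)
for label, count in label_counts.most_common(10):
    print(label, count)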
Example #17
def process_data(model, tags):
    """Convert each utterance into a list of pretrained word vectors (x) and
    its DAMSL act-tag index (y); words missing from the embedding model are
    collected in non_modeled and dropped from the input."""
    x = []
    y = []
    model_cache = {}
    non_modeled = set()
    corpus = CorpusReader(swda_path)
    for utt in corpus.iter_utterances():
        wordlist = str2wordlist(utt.text.lower())
        # Cache the embedding lookup so each word is converted to a list only once.
        for word in wordlist:
            if word in model:
                if word not in model_cache:
                    model_cache[word] = model[word].tolist()
            else:
                non_modeled.add(word)
        words = [model_cache[w] for w in wordlist if w in model_cache]
        tag = tags[utt.damsl_act_tag()]
        x.append(words)
        y.append(tag)
    print('Complete. The following words were not converted:')
    print(list(non_modeled))
    return (x, y)
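Because x holds variable-length lists of word vectors, a downstream classifier typically needs fixed-length input; a minimal padding/truncation sketch in plain Python, where max_len and the embedding dimension are assumptions:

# Sketch only: pads or truncates each utterance to max_len word vectors.
def pad_sequences(x, max_len=50, dim=300):
    # dim should match the embedding size of the model; 300 is an assumption.
    zero = [0.0] * dim
    padded = []
    for seq in x:
        seq = seq[:max_len]
        padded.append(seq + [zero] * (max_len - len(seq)))
    return padded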