def get_dialog_acts(dset_root):
    """Count occurrences of each raw act tag in the corpus under *dset_root*.

    Parameters
    ----------
    dset_root : str
        Path to the corpus root handed to CorpusReader.

    Returns
    -------
    collections.Counter
        Mapping of act tag -> number of utterances carrying that tag.
    """
    cr = CorpusReader(dset_root)
    act_tags = Counter()
    for utt in cr.iter_utterances():
        # Increment directly instead of the original one-element
        # Counter.update([tag]) per utterance; dead commented-out code
        # and the unused loop counter `i` were removed.
        act_tags[utt.act_tag] += 1
    return act_tags
def tag_counts():
    """Gather and print counts of the tags.

    Prints each act tag and its utterance count, sorted from most to
    least frequent.
    """
    d = defaultdict(int)
    corpus = CorpusReader('swda')
    # Loop, counting tags:
    for utt in corpus.iter_utterances(display_progress=True):
        d[utt.act_tag] += 1
    # Print the results sorted by count, largest to smallest.
    # (`print key, val` was Python 2-only syntax — a SyntaxError on
    # Python 3 — so it is now a print() call.)
    for key, val in sorted(d.items(), key=itemgetter(1), reverse=True):
        print(key, val)
def tag_counts():
    """Gather and print counts of the tags.

    Prints each act tag and its utterance count, sorted from most to
    least frequent.
    """
    d = defaultdict(int)
    corpus = CorpusReader('swda')
    # Loop, counting tags:
    for utt in corpus.iter_utterances(display_progress=True):
        d[utt.act_tag] += 1
    # Print the results sorted by count, largest to smallest.
    # (`print key, val` was Python 2-only syntax — a SyntaxError on
    # Python 3 — so it is now a print() call.)
    for key, val in sorted(d.items(), key=itemgetter(1), reverse=True):
        print(key, val)
def Atag():
    """Build a mapping from each DAMSL act tag to a 1-based integer id.

    Returns
    -------
    collections.defaultdict
        {damsl_act_tag: index} with indices assigned 1..n in the dict's
        key-iteration order.
    """
    corpus = CorpusReader('swda')
    actTag = defaultdict(int)
    for utt in corpus.iter_utterances(display_progress=True):
        # The original wrote `actTag[tag] +1`, a no-op expression: the
        # intended `+= 1` was missing, so counts never accumulated
        # (the defaultdict access still created the key).
        actTag[utt.damsl_act_tag()] += 1
    # Replace the counts with consecutive 1-based ids.
    for i, key in enumerate(actTag.keys(), start=1):
        actTag[key] = i
    print(actTag)
    return actTag
def count_matches():
    """Determine how many utterances have a single precisely matching tree.

    Prints the number of perfect matches and the percentage of
    single-tree utterances they represent.
    """
    d = defaultdict(int)
    corpus = CorpusReader('swda')
    for utt in corpus.iter_utterances():
        # Only utterances with exactly one associated tree are considered.
        if len(utt.trees) == 1:
            if utt.tree_is_perfect_match():
                d['match'] += 1
            else:
                d['mismatch'] += 1
    # Guard against an empty corpus (the original raised
    # ZeroDivisionError), and scale by 100 so the value printed really
    # is a percentage — the original printed a 0-1 fraction while
    # labelling it "percent". Also: Python 3 print() call.
    total = sum(d.values())
    pct = 100.0 * d['match'] / total if total else 0.0
    print("match: %s (%s percent)" % (d['match'], pct))
def count_matches():
    """Determine how many utterances have a single precisely matching tree.

    Prints the number of perfect matches and the percentage of
    single-tree utterances they represent.
    """
    d = defaultdict(int)
    corpus = CorpusReader('swda')
    for utt in corpus.iter_utterances():
        # Only utterances with exactly one associated tree are considered.
        if len(utt.trees) == 1:
            if utt.tree_is_perfect_match():
                d['match'] += 1
            else:
                d['mismatch'] += 1
    # Guard against an empty corpus (the original raised
    # ZeroDivisionError), and scale by 100 so the value printed really
    # is a percentage — the original printed a 0-1 fraction while
    # labelling it "percent". Also: Python 3 print() call.
    total = sum(d.values())
    pct = 100.0 * d['match'] / total if total else 0.0
    print("match: %s (%s percent)" % (d['match'], pct))
def preprocess_data():
    """Count DAMSL act tags, write "<tag> <count>" lines to *tags_file*,
    and return a mapping from each tag to its frequency rank.

    Returns
    -------
    dict
        {tag: rank} with rank 0 for the most frequent tag.
    """
    act_tags = defaultdict(int)
    corpus = CorpusReader(swda_path)
    for utt in corpus.iter_utterances():
        act_tags[utt.damsl_act_tag()] += 1
    # Sort (tag, count) pairs by count, most frequent first.
    # (`iteritems` / `xrange` were Python 2-only.)
    ranked = sorted(act_tags.items(), key=itemgetter(1), reverse=True)
    # `with` guarantees the file is closed even if a write fails.
    with open(tags_file, 'w') as f:
        for tag, count in ranked:
            f.write('%s %d\n' % (tag, count))
    return {tag: rank for rank, (tag, _) in enumerate(ranked)}
def preprocess_data():
    """Count DAMSL act tags, write "<tag> <count>" lines to *tags_file*,
    and return a mapping from each tag to its frequency rank.

    Returns
    -------
    dict
        {tag: rank} with rank 0 for the most frequent tag.
    """
    act_tags = defaultdict(int)
    corpus = CorpusReader(swda_path)
    for utt in corpus.iter_utterances():
        act_tags[utt.damsl_act_tag()] += 1
    # Sort (tag, count) pairs by count, most frequent first.
    # (`iteritems` / `xrange` were Python 2-only.)
    ranked = sorted(act_tags.items(), key=itemgetter(1), reverse=True)
    # `with` guarantees the file is closed even if a write fails.
    with open(tags_file, 'w') as f:
        for tag, count in ranked:
            f.write('%s %d\n' % (tag, count))
    return {tag: rank for rank, (tag, _) in enumerate(ranked)}
def act_tags_and_text():
    """
    Create a CSV file named swda-acttags-and-text.csv in which each
    utterance utt has its own row consisting of utt.damsl_act_tag() and
    the clean-text utterance. This data can be used for training a
    speechAct classifier.
    """
    # Open via `with` + newline='' — the documented way to write CSV:
    # it closes the handle deterministically (the original leaked the
    # open() handle) and avoids blank rows on Windows.
    with open('swda-acttags-and-text.csv', 'w', newline='') as outfile:
        csvwriter = csv.writer(outfile)
        csvwriter.writerow(['DamslActTag', 'Text'])
        corpus = CorpusReader('swda')
        for utt in corpus.iter_utterances(display_progress=True):
            clean_words = utt.text_words(filter_disfluency=True)
            csvwriter.writerow([utt.damsl_act_tag(), " ".join(clean_words)])
def process_data(tags):
    """Vectorize every utterance in the corpus.

    Each word is mapped to a lazily created random vector; each
    utterance becomes the list of its word vectors, paired with the
    integer id of its DAMSL act tag.

    Returns
    -------
    tuple
        (x, y): per-utterance lists of word vectors, and tag ids.
    """
    x, y = [], []
    embeddings = {}
    reader = CorpusReader(swda_path)
    for utterance in reader.iter_utterances():
        tokens = [
            token.lower()
            for token in utterance.pos_words()
            if token not in except_words
        ]
        # Assign a fresh random embedding to any unseen token.
        for token in tokens:
            if token not in embeddings:
                embeddings[token] = random_vector(vector_size)
        x.append([embeddings[token] for token in tokens])
        y.append(tags[utterance.damsl_act_tag()])
    return (x, y)
def process_data(tags):
    """Vectorize every utterance in the corpus.

    Each word is mapped to a lazily created random vector; each
    utterance becomes the list of its word vectors, paired with the
    integer id of its DAMSL act tag.

    Returns
    -------
    tuple
        (x, y): per-utterance lists of word vectors, and tag ids.
    """
    x, y = [], []
    embeddings = {}
    reader = CorpusReader(swda_path)
    for utterance in reader.iter_utterances():
        tokens = [
            token.lower()
            for token in utterance.pos_words()
            if token not in except_words
        ]
        # Assign a fresh random embedding to any unseen token.
        for token in tokens:
            if token not in embeddings:
                embeddings[token] = random_vector(vector_size)
        x.append([embeddings[token] for token in tokens])
        y.append(tags[utterance.damsl_act_tag()])
    return (x, y)
def act_tags_and_rootlabels():
    """
    Create a CSV file named swda-actags-and-rootlabels.csv in which each
    utterance utt has its own row consisting of just utt.act_tag,
    utt.damsl_act_tag(), and utt.trees[0].node, restricting attention to
    cases in which utt has a single, perfectly matching tree associated
    with it.
    """
    # Open via `with` + newline='' so the handle is flushed/closed even
    # on error (the original leaked the open() handle passed to
    # csv.writer) and blank rows are avoided on Windows.
    with open('swda-actags-and-rootlabels.csv', 'w', newline='') as outfile:
        csvwriter = csv.writer(outfile)
        csvwriter.writerow(['ActTag', 'DamslActTag', 'RootNode'])
        corpus = CorpusReader('swda')
        for utt in corpus.iter_utterances(display_progress=True):
            if utt.tree_is_perfect_match():
                csvwriter.writerow([utt.act_tag, utt.damsl_act_tag(), utt.trees[0].node])
def act_tags_and_rootlabels():
    """
    Create a CSV file named swda-actags-and-rootlabels.csv in which each
    utterance utt has its own row consisting of just utt.act_tag,
    utt.damsl_act_tag(), and utt.trees[0].node, restricting attention to
    cases in which utt has a single, perfectly matching tree associated
    with it.
    """
    # Open via `with` + newline='' so the handle is flushed/closed even
    # on error (the original leaked the open() handle passed to
    # csv.writer) and blank rows are avoided on Windows.
    with open('swda-actags-and-rootlabels.csv', 'w', newline='') as outfile:
        csvwriter = csv.writer(outfile)
        csvwriter.writerow(['ActTag', 'DamslActTag', 'RootNode'])
        corpus = CorpusReader('swda')
        for utt in corpus.iter_utterances(display_progress=True):
            if utt.tree_is_perfect_match():
                csvwriter.writerow([utt.act_tag, utt.damsl_act_tag(), utt.trees[0].node])
def preprocess():
    """Tokenize, stem, and frequency-filter the SwDA utterances.

    Each utterance is lower-cased, stripped of the punctuation "?.,-",
    stop-word-filtered, and Porter-stemmed; tokens appearing 10 times or
    fewer across the corpus are then dropped.

    Returns
    -------
    list[list[str]]
        One list of surviving stemmed tokens per utterance.
    """
    stemmer = PorterStemmer()
    corpus = CorpusReader('swda')
    # `with` closes the stopword file (the original leaked the handle).
    with open("corpus/stopword", 'r') as stopfile:
        stoplist = {line.strip() for line in stopfile}
    # str.maketrans deletion table: the Python 3 replacement for the
    # Python 2-only `word.translate(None, "?.,-")` call (a TypeError on
    # Python 3). Built once and reused instead of translating each word
    # twice as the original did.
    strip_punct = str.maketrans('', '', "?.,-")
    texts = []
    for utt in corpus.iter_utterances(display_progress=True):
        tokens = []
        for word in utt.text.lower().split():
            cleaned = word.translate(strip_punct)
            if cleaned not in stoplist:
                tokens.append(stemmer.stem(cleaned.strip()))
        texts.append(tokens)
    # (The original also paired each token list with its act tag and
    # immediately discarded the tag — that dead work is removed.)
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    # Keep only tokens seen more than 10 times corpus-wide.
    return [[token for token in text if frequency[token] > 10] for text in texts]
def load_dataset_OLD():
    """Load SwDA utterances and bucket them by predefined split.

    Each utterance becomes a dict with its transcript basename, its
    space-joined POS words, and its DAMSL act tag. Utterances with no
    words are skipped; utterances whose basename appears in none of the
    SwDA splits are counted and reported.

    Returns
    -------
    collections.defaultdict(list)
        {split_name: [utterance dicts]}
    """
    corpus = CorpusReader('swda')
    data = defaultdict(list)
    not_found_set = set()
    skipp_count = 0
    for utt in corpus.iter_utterances(display_progress=False):
        d = {
            "basename": get_basename(utt),
            "words": " ".join(utt.pos_words()),
            "label": utt.damsl_act_tag(),
        }
        # Skip utterances with an empty word string.
        if len(d["words"]) < 1:
            skipp_count += 1
            continue
        # A basename may appear in several splits; record every hit.
        in_any_split = False
        for splitname in SwDA:
            if d["basename"] in SwDA[splitname]:
                in_any_split = True
                data[splitname].append(d)
        if not in_any_split:
            not_found_set.add(d["basename"])
    print("not found count:", len(not_found_set))
    print("skipp count:", skipp_count)
    print("label counts:")
    for k, v in data.items():
        print("\t{} size:".format(k), len(v))
    # Removed: unused `N = 221616`, the `found`/`size` bookkeeping that
    # only fed a commented-out assert, and other dead commented code.
    # Historically: 1115 seen dialogs, 19 unseen dialogs.
    return data
def load_dataset():
    """Load the SwDA corpus as a flat list of utterance dicts.

    Each dict holds the transcript basename, the space-joined POS words,
    and the DAMSL act tag. Utterances with no words are skipped, and the
    number of skipped utterances is printed.
    """
    reader = CorpusReader('swda')
    records = []
    skipped = 0
    for utt in reader.iter_utterances(display_progress=False):
        record = {
            "basename": get_basename(utt),
            "words": " ".join(utt.pos_words()),
            "label": utt.damsl_act_tag(),
        }
        # Drop utterances whose word string is empty.
        if not record["words"]:
            skipped += 1
            continue
        records.append(record)
    print("skipp count:", skipped)
    return records
def process_data(model, tags):
    """Convert corpus utterances to word-vector sequences plus tag ids.

    Words present in *model* are converted once via .tolist() and
    cached; words absent from the model are collected and reported at
    the end.

    Parameters
    ----------
    model : mapping
        Word -> vector; values support .tolist() (e.g. numpy arrays).
    tags : dict
        DAMSL act tag -> integer id.

    Returns
    -------
    tuple
        (x, y): per-utterance lists of word vectors, and tag ids.
    """
    x = []
    y = []
    model_cache = {}
    non_modeled = set()
    corpus = CorpusReader(swda_path)
    for utt in corpus.iter_utterances():
        wordlist = str2wordlist(utt.text.lower())
        for word in wordlist:
            if word in model:
                # Cache the list conversion so each word is converted once.
                if word not in model_cache:
                    model_cache[word] = model[word].tolist()
            else:
                non_modeled.add(word)
        words = [model_cache[w] for w in wordlist if w in model_cache]
        tag = tags[utt.damsl_act_tag()]
        x.append(words)
        y.append(tag)
    # Python 2-only print statements (SyntaxError on Python 3) converted
    # to print() calls.
    print('Complete. The following words are not converted: ')
    print(list(non_modeled))
    return (x, y)
def process_data(model, tags):
    """Convert corpus utterances to word-vector sequences plus tag ids.

    Words present in *model* are converted once via .tolist() and
    cached; words absent from the model are collected and reported at
    the end.

    Parameters
    ----------
    model : mapping
        Word -> vector; values support .tolist() (e.g. numpy arrays).
    tags : dict
        DAMSL act tag -> integer id.

    Returns
    -------
    tuple
        (x, y): per-utterance lists of word vectors, and tag ids.
    """
    x = []
    y = []
    model_cache = {}
    non_modeled = set()
    corpus = CorpusReader(swda_path)
    for utt in corpus.iter_utterances():
        wordlist = str2wordlist(utt.text.lower())
        for word in wordlist:
            if word in model:
                # Cache the list conversion so each word is converted once.
                if word not in model_cache:
                    model_cache[word] = model[word].tolist()
            else:
                non_modeled.add(word)
        words = [model_cache[w] for w in wordlist if w in model_cache]
        tag = tags[utt.damsl_act_tag()]
        x.append(words)
        y.append(tag)
    # Python 2-only print statements (SyntaxError on Python 3) converted
    # to print() calls.
    print('Complete. The following words are not converted: ')
    print(list(non_modeled))
    return (x, y)