import argparse
import os
import re

import pandas as pd

# clean_text and get_stopwords are helper functions defined elsewhere in the repo.


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', default='../data/subtitles/subtitlesInTSV/')
    args = parser.parse_args()
    data_dir = args.data_dir
    # collect the per-episode subtitle files, e.g. S1E12.tsv
    dialogue_files = [f for f in os.listdir(data_dir)
                      if re.findall(r'S[1-9]E[0-9]+\.tsv', f)]
    dialogue_files = [os.path.join(data_dir, f) for f in dialogue_files]
    # standard English stop words plus contraction fragments
    stops = get_stopwords('en') + ['will', 'don', 've']
    all_docs = {}
    for f in dialogue_files:
        ep_name = re.findall(r'S[1-9]E[0-9]+', f)[0]
        data = pd.read_csv(f, sep='\t')
        docs = []
        # join each chunk's dialogue lines into one cleaned string
        for chunk, data_group in data.groupby('chunk'):
            clean_dialogue = []
            for d in data_group['dialogue']:
                cleaned = clean_text(str(d))
                try:
                    # clean_text may return bytes; normalize to unicode
                    if isinstance(cleaned, bytes):
                        cleaned = cleaned.decode('utf-8')
                    clean_dialogue.append(cleaned)
                except UnicodeDecodeError as e:
                    print('could not clean text %s because error %s' % (cleaned, e))
            all_dialogue = ' '.join(clean_dialogue)
            docs.append(all_dialogue)
        episode_text = ' '.join(docs)
        all_docs[ep_name] = episode_text
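# Neither helper is shown in this file. A minimal sketch of the kind of
# normalization clean_text is assumed to perform (hypothetical, not the
# repo's actual implementation):
def clean_text_sketch(text):
    # strip subtitle markup tags, lowercase, collapse whitespace
    text = re.sub(r'<[^>]+>', ' ', text.lower())
    return re.sub(r'\s+', ' ', text).strip()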
# inside the per-episode loop: e is the episode file name; episode_data,
# full_chunk_list, LIWC_categories, LIWC_category_wordlists, count_option,
# and the tokenizer TKNZR are defined earlier in the script
print('processing episode %s' % (e))
e_data = episode_data[e]
e_name = e.split('.tsv')[0]
# sort_values returns a copy, so assign the result back
e_data = e_data.sort_values('chunk', ascending=True)
# insert dummy rows for chunks that have no dialogue
empty_chunks = full_chunk_list - set(e_data['chunk'].unique())
if len(empty_chunks) > 0:
    print('filling %s with empty chunks %s' % (e_name, empty_chunks))
    empty_chunk_rows = pd.DataFrame([{'chunk': c, 'dialogue': ''}
                                     for c in empty_chunks])
    e_data = pd.concat([e_data, empty_chunk_rows], axis=0)
chunk_iter = e_data.groupby('chunk')
chunk_text = [clean_text(' '.join(map(str, g['dialogue'].tolist())))
              for _, g in chunk_iter]
chunk_LIWC_counts = {c: [] for c in LIWC_categories}
for t in chunk_text:
    tokens = TKNZR.tokenize(t)
    for c in LIWC_categories:
        counts = get_LIWC_counts(tokens, LIWC_words=LIWC_category_wordlists[c])
        if count_option == 'total':
            # total number of matching tokens
            total_counts = sum(counts.values())
        elif count_option == 'unique':
            # number of distinct matching words
            total_counts = len(counts)
        # TODO: store individual words as well as aggregate counts
        chunk_LIWC_counts[c].append(total_counts)
chunk_LIWC_counts = pd.DataFrame(chunk_LIWC_counts)
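# get_LIWC_counts is defined elsewhere in the repo. Judging from how it is
# called above (a token list plus a LIWC word list, returning per-word counts
# whose values can be summed), a plausible sketch; this is an assumption, not
# the repo's actual implementation:
import re
from collections import Counter

def get_LIWC_counts_sketch(tokens, LIWC_words):
    # count tokens that match any pattern in the category's word list
    counts = Counter()
    for pattern in LIWC_words:
        matcher = re.compile(pattern, re.IGNORECASE)
        counts.update(t for t in tokens if matcher.match(t))
    return counts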
import os
from collections import Counter

from nltk.tokenize import WordPunctTokenizer

tokenizer = WordPunctTokenizer()
# anchor each LIWC lexicon entry so it must match a whole token
LIWC_words = {category: ['^' + l.strip() + '$'
                         for l in open(os.path.join(LIWC_dir, category), 'r')]
              for category in categories}
jsonList = []
for filename in files:
    print(filename)
    with open(filename) as fin:
        categoryCounts = {category: Counter() for category in categories}
        for line in fin:
            # skip header lines
            if 'frameNo' in line:
                continue
            dialogue = line.strip().split('\t')[-1]
            dialogue = dialogue.replace('$', ' ')
            tokens = tokenizer.tokenize(clean_text(dialogue.strip()))
            for category in categories:
                counts = get_LIWC_counts(tokens, LIWC_words=LIWC_words[category])
                if len(counts) > 0:
                    categoryCounts[category].update(counts)
        # aggregate this file's counts into one record
        temp_dict = {'name': os.path.basename(filename)}
        temp_dict.update(categoryCounts)
        jsonList.append(temp_dict)
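# The records in jsonList are presumably serialized afterwards; a minimal
# sketch, assuming a hypothetical output path LIWC_counts.json (Counter is a
# dict subclass, so it serializes as a plain JSON object):
import json

with open('LIWC_counts.json', 'w') as fout:
    json.dump(jsonList, fout, indent=2)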
import os
import re

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

subtitle_dir = '../data/subtitles/subtitlesInTSV/'
all_episodes = [f for f in os.listdir(subtitle_dir)
                if re.findall(r'S[0-9]E[0-9]+', f)]
sorted_episodes = sorted(all_episodes)
episode_data = {e: pd.read_csv(os.path.join(subtitle_dir, e), sep='\t')
                for e in sorted_episodes}
all_counts = {}
min_df = 1
stop_words = []
cv = CountVectorizer(min_df=min_df, encoding='utf-8', stop_words=stop_words)
for e in sorted_episodes:
    print('processing episode %s' % (e))
    e_data = episode_data[e]
    all_dialogue = []
    for d in e_data['dialogue']:
        clean_dialogue = clean_text(str(d)).replace('$', ' ').strip()
        if clean_dialogue != '':
            try:
                # fit on the single line first to catch vectorizer errors early
                cv.fit_transform([clean_dialogue])
                all_dialogue.append(clean_dialogue)
            except Exception as err:
                # named err so the episode name e is not clobbered
                print('clean dialogue %s caused error %s' % (clean_dialogue, err))
    # only care about per-episode counts
    counts = cv.fit_transform(all_dialogue)
    count_sums = list(np.array(counts.sum(axis=0))[0])
    sorted_vocab = sorted(cv.vocabulary_.keys(), key=lambda x: cv.vocabulary_[x])
    # make count dict from sorted vocab and summed counts
    count_dict = dict(zip(sorted_vocab, count_sums))
    all_counts[e] = count_dict
# now combine all counts
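# The combining step is truncated above; one way to do it (a sketch, not
# necessarily the original code): build a term-by-episode matrix from the
# per-episode count dicts, filling missing terms with zero counts.
combined_counts = pd.DataFrame(all_counts).fillna(0).astype(int)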