def process_corpus(corpus_name, to_download=TO_DOWNLOAD, min_wc_source=MIN_WC_SOURCE,
                   max_wc_source=MAX_WC_SOURCE, min_wc_target=MIN_WC_TARGET,
                   max_wc_target=MAX_WC_TARGET, source_filter=SOURCE_FILTER,
                   target_filter=TARGET_FILTER, text_cols=TEXT_COLS, data_dir=DATA_DIR):
    """Load (or download) a corpus, run the text-prep pipeline, and write
    the train-subset source/target tables as TSV files under `data_dir`.

    The output files are named `<corpus name>.source.tsv` and
    `<corpus name>.target.tsv`, where the name comes from corpus metadata.
    """
    # Resolve where the corpus lives: fetch it fresh or use the local copy.
    corpus_root = (download(corpus_name, data_dir=data_dir) if to_download
                   else os.path.join(data_dir, corpus_name))
    corpus = Corpus(corpus_root)

    # From here on, use the canonical name stored in the corpus metadata.
    corpus_name = corpus.get_meta()['name']
    print(corpus_name)
    corpus.print_summary_stats()
    print('processing', corpus.get_meta()['name'])

    # Dependency parses are needed by the text-prep pipeline below.
    corpus.load_info('utterance', ['parsed'])
    corpus = text_prep_pipe().transform(corpus)

    src_frame, tgt_frame = get_train_subset(
        corpus, min_wc_source, max_wc_source, min_wc_target, max_wc_target,
        source_filter, target_filter, text_cols)
    src_frame.to_csv(os.path.join(data_dir, corpus_name + '.source.tsv'), sep='\t')
    tgt_frame.to_csv(os.path.join(data_dir, corpus_name + '.target.tsv'), sep='\t')
import matplotlib.pyplot as plt

BASE_DIR = '/'

# In 7
import warnings

warnings.filterwarnings('ignore')

# In 8
# Load the "conversations gone awry" corpus with its dependency parses.
AWRY_ROOT_DIR = BASE_DIR + '/conversations-gone-awry-corpus'
awry_corpus = Corpus(AWRY_ROOT_DIR)
awry_corpus.load_info('utterance', ['parsed'])

# In 9
# Keep only the conversations annotated in 2018.
kept_conversations = {
    convo.id: convo
    for convo in awry_corpus.iter_conversations()
    if convo.meta['annotation_year'] == "2018"
}

# Collect every utterance belonging to one of the kept conversations.
kept_utterances = {
    utt.id: utt
    for convo in kept_conversations.values()
    for utt in convo.iter_utterances()
}
def indent(text, n=0, label=""):
    """Print `text` wrapped to the default width, each line prefixed with
    `label`, then `n` spaces, then a '| ' gutter marker."""
    print(textwrap.indent(textwrap.fill(text), label + ' ' * n + '| '))


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print(
            'Must specify subreddit name; e.g.:\n\t $ python show_corpus.py coronavirus'
        )
        # sys.exit instead of the site-provided exit() builtin, which is not
        # guaranteed to exist when run without the `site` module.
        sys.exit(1)

    # First try resolving the argument relative to scrape.base_path; if that
    # fails, fall back to treating the argument as a literal path.
    full_path = os.path.join(scrape.base_path, sys.argv[1])
    if not os.path.isdir(full_path):
        print(
            f'Cannot find corpus at path {full_path} \n\t(path constructed using scrape.base_path)'
        )
        full_path = sys.argv[1]
        if not os.path.isdir(full_path):
            print(f'Also cannot find corpus at path {full_path}')
            sys.exit(1)
    # Single success message replaces the two duplicated `else` branches.
    print(f'Found corpus at path {full_path}')

    corpus = Corpus(filename=full_path)
    # NOTE(review): removed an `if False:` dead-code branch that used to call
    #   corpus.load_info('utterance', scrape.SKIP_FIELDS['utterance'], full_path)
    # Re-enable explicitly if the extra utterance fields are ever needed.
    show_corpus(corpus)