def process_corpus(corpus_name,
                   to_download=TO_DOWNLOAD,
                   min_wc_source=MIN_WC_SOURCE,
                   max_wc_source=MAX_WC_SOURCE,
                   min_wc_target=MIN_WC_TARGET,
                   max_wc_target=MAX_WC_TARGET,
                   source_filter=SOURCE_FILTER,
                   target_filter=TARGET_FILTER,
                   text_cols=TEXT_COLS,
                   data_dir=DATA_DIR):
    """Load a corpus, run the text-prep pipeline, and write the filtered
    source/target training subsets to ``<name>.source.tsv`` and
    ``<name>.target.tsv`` under *data_dir*.

    The corpus is downloaded when *to_download* is truthy, otherwise it
    is read from ``data_dir/corpus_name`` on disk.
    """
    # Resolve the corpus location first, then construct it once.
    corpus_path = (download(corpus_name, data_dir=data_dir)
                   if to_download
                   else os.path.join(data_dir, corpus_name))
    corpus = Corpus(corpus_path)

    # From here on, use the name recorded in the corpus metadata.
    corpus_name = corpus.get_meta()['name']
    print(corpus_name)
    corpus.print_summary_stats()
    print('processing', corpus.get_meta()['name'])
    corpus.load_info('utterance', ['parsed'])

    corpus = text_prep_pipe().transform(corpus)

    source_df, target_df = get_train_subset(corpus, min_wc_source,
                                            max_wc_source, min_wc_target,
                                            max_wc_target, source_filter,
                                            target_filter, text_cols)

    # Dump both subsets with identical TSV settings.
    for suffix, frame in (('source', source_df), ('target', target_df)):
        frame.to_csv(os.path.join(data_dir, corpus_name + '.' + suffix + '.tsv'),
                     sep='\t')
# --- Esempio n. 2 --- (paste-artifact separator; commented out so the file parses)
# Plotting import kept for later notebook cells; not used in this chunk.
import matplotlib.pyplot as plt

# Root directory under which the corpora live.
# NOTE(review): '/' looks like a placeholder -- confirm the intended base path.
BASE_DIR = '/'

# In 7

import warnings

# Suppress all warnings for cleaner notebook output.
warnings.filterwarnings('ignore')

# In 8

# Load the Conversations Gone Awry corpus from disk and attach the
# 'parsed' auxiliary field to each utterance.
AWRY_ROOT_DIR = BASE_DIR + '/conversations-gone-awry-corpus'
awry_corpus = Corpus(AWRY_ROOT_DIR)
awry_corpus.load_info('utterance', ['parsed'])

# In 9

# Keep only the conversations whose annotation_year is '2018', then
# collect every utterance that belongs to one of those conversations.
kept_conversations = {}
for convo in awry_corpus.iter_conversations():
    if convo.meta['annotation_year'] == "2018":
        kept_conversations[convo.id] = convo

kept_utterances = {
    utt.id: utt
    for convo in kept_conversations.values()
    for utt in convo.iter_utterances()
}
# --- Esempio n. 3 --- (paste-artifact separator; commented out so the file parses)
def indent(text, n=0, label=""):
    """Print *text* wrapped to the default fill width, with every line
    prefixed by *label*, then *n* levels of three-space indentation,
    then a '| ' marker."""
    prefix = label + '   ' * n + '| '
    wrapped = textwrap.fill(text)
    print(textwrap.indent(wrapped, prefix))


if __name__ == '__main__':
    # Usage: python show_corpus.py <subreddit-name-or-path>
    if len(sys.argv) < 2:
        print(
            'Must specify subreddit name; e.g.:\n\t $ python show_corpus.py coronavirus'
        )
        exit(1)
    else:
        # First try resolving the argument under scrape.base_path; if that
        # directory doesn't exist, fall back to treating the argument as a
        # literal filesystem path.
        full_path = os.path.join(scrape.base_path, sys.argv[1])
        if not os.path.isdir(full_path):
            print(
                f'Cannot find corpus at path {full_path} \n\t(path constructed using scrape.base_path)'
            )
            full_path = sys.argv[1]
            if not os.path.isdir(full_path):
                print(f'Also cannot find corpus at path {full_path}')
                exit(1)
        # Both success branches printed the same message; do it once here.
        print(f'Found corpus at path {full_path}')

        corpus = Corpus(filename=full_path)
        # Removed an unreachable `if False:` block that dead-coded a call to
        # corpus.load_info('utterance', scrape.SKIP_FIELDS['utterance'], full_path);
        # re-enable that call explicitly if loading extra fields is needed.
        show_corpus(corpus)