Example #1
def prepare_word_filling(corpus_data_path: str, words_data_path: str):
    print('Preparing word filling')
    DROP = '__DROP__'
    CONTEXT_SIZE = 3 # Limiting context for char-level encoder

    print('Reading data...')
    lines = read_corpus(corpus_data_path)
    words = set(read_corpus(words_data_path))
    src, trg = [], []

    for s in tqdm(lines):
        tokens = s.split()

        for i, t in enumerate(tokens):
            if t not in words: continue

            context_left = tokens[max(i - CONTEXT_SIZE, 0) : i]
            context_right = tokens[i + 1 : i + CONTEXT_SIZE + 1]
            src.append(' '.join(context_left + [DROP] + context_right))
            trg.append(t)

    print('Saving...')
    save_corpus(src, 'data/generated/classics-word-filling.src')
    save_corpus(trg, 'data/generated/classics-word-filling.trg')

    print('Done!')
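All of these examples call read_corpus/save_corpus helpers whose definitions are not shown, and the signatures differ between repositories (Examples #2 and #10 call a utils.save_corpus(name, corpus) that serializes richer objects). A minimal sketch of the line-per-entry variant the text-processing examples appear to assume, purely as an illustration:

def read_corpus(path: str) -> list:
    # one corpus entry per line, UTF-8 text
    with open(path, encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]

def save_corpus(lines: list, path: str):
    # mirror of read_corpus: write one entry per line
    with open(path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines) + '\n')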
Example #2
def make_pipe01_dataset(name="CORPUS_physorg_articles_pipe01", mini_size=0):
    corpus = pipe01(limit=0)
    utils.save_corpus(name, corpus)
    if mini_size > 0:
        # keep a random subset of `mini_size` documents for the mini corpus
        perm = numpy.random.permutation(corpus.n_docs)
        remove = set(perm[:corpus.n_docs - mini_size])
        corpus.remove(lambda doc: doc.corpus_index in remove)
        utils.save_corpus(name + '_mini' + str(mini_size), corpus)
Example #3
def filter_subs(input_data_path: str, output_data_path: str):
    print('Reading data...')
    subs = read_corpus(input_data_path)
    pattern = re.compile(r'^(?:[A-Za-z]|[А-яЁё]|[\d\s.,!:?\-––\'"%$()`])+$')  # Latin, Cyrillic, digits and basic punctuation only
    print('Filtering...')
    filtered = [s for s in tqdm(subs) if pattern.match(s)]
    print('Removing too long sentences...')
    short = [s for s in tqdm(filtered) if len(s.split()) <= 50 and len(s) <= 250]
    print('Saving...')
    save_corpus(short, output_data_path)
    print('Done!')
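A quick standalone check of what the filter above keeps and drops, using the same pattern (illustrative only):

import re

pattern = re.compile(r'^(?:[A-Za-z]|[А-яЁё]|[\d\s.,!:?\-––\'"%$()`])+$')
assert pattern.match('Привет, как дела?')       # Cyrillic plus basic punctuation passes
assert pattern.match('Hello there, friend!')    # Latin passes
assert not pattern.match('Hello 😊')            # emoji and other symbols are filtered out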
Example #4
def detokenize(input_file, output_file):
    print('Reading data...')
    texts = read_corpus(input_file)
    print('Data reading finished!')

    print('Detokenizing...')
    detokenized = [detokenizer.detokenize(s.split()) for s in tqdm(texts)]  # `detokenizer` is assumed to be defined at module level elsewhere
    print('Detokenized!')

    print('Saving...')
    save_corpus(detokenized, output_file)
    print('Saved!')
Example #5
def cut_long_dialogs(data_path: str, output_path: str, max_len: int = 128):
    print('Reading data...')
    data = read_corpus(data_path)

    print('Splitting dialogs...')
    dialogs = [d.split('|') for d in data]
    print('Cutting data...')
    dialogs = [cut_dialog(d, max_len) for d in tqdm(dialogs)]
    num_dialogs_before = len(dialogs)

    print('Saving data...')
    dialogs = list(filter(len, dialogs))
    save_corpus(dialogs, output_path)
    print('Done! Num dialogs reduced: {} -> {}'.format(num_dialogs_before, len(dialogs)))
Example #6
def dialogs_from_lines(input_data_path: str, output_data_path: str, n_lines: int, eos: str, n_dialogs: int):
    n_lines, n_dialogs = int(n_lines), int(n_dialogs) # TODO: argparse?

    print('Reading data...')
    lines = read_corpus(input_data_path)
    lines = lines[:n_dialogs * n_lines]

    print('Generating dialogs')
    dialogs = [lines[i:i + n_lines] for i in range(0, len(lines), n_lines)]  # non-overlapping chunks of n_lines lines each
    dialogs = [eos.join(d) for d in dialogs]

    print('Saving corpus')
    save_corpus(dialogs, output_data_path)
    print('Done!')
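A toy illustration of the chunking above, assuming n_lines=3 and a hypothetical '|EOS|' separator: six lines produce two non-overlapping dialogs.

lines = ['hi', 'hello', 'how are you?', 'fine', 'great', 'bye']
n_lines, eos = 3, ' |EOS| '
dialogs = [eos.join(lines[i:i + n_lines]) for i in range(0, len(lines), n_lines)]
assert dialogs == ['hi |EOS| hello |EOS| how are you?', 'fine |EOS| great |EOS| bye']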
Example #7
def filter_dialogs_in_classics(input_data_path: str, output_data_path: str):
    print('Reading data...')
    classics = read_corpus(input_data_path)

    print('Finding dialogs...')
    dialogs = [s for s in tqdm(classics) if s.strip().startswith(SPEC_DASH)]

    print('Removing markup chars from dialogs...')
    dialogs = [s.replace('\xa0', ' ') for s in tqdm(dialogs)]

    print('Removing degenerate lines...')
    dialogs = [s for s in tqdm(dialogs) if s != SPEC_DASH]

    print('Filtering direct speech...')
    dialogs = [filter_direct_speech(s) for s in tqdm(dialogs)]

    print('Saving...')
    save_corpus(dialogs, output_data_path)
    print('Done!')
Example #8
def generate_sentiment_words(neg_input_path: str, pos_input_path: str,
                             neg_output_path: str, pos_output_path: str,
                             keep_n_most_popular_words: int = 3000):
    print('Reading data...')
    neg_lines = read_corpus(neg_input_path)
    pos_lines = read_corpus(pos_input_path)

    print('Counting words')
    neg_counter = Counter([w.lower() for s in tqdm(neg_lines) for w in s.split()])
    pos_counter = Counter([w.lower() for s in tqdm(pos_lines) for w in s.split()])

    print('Getting most popular')
    neg_top_words = set(w for w, _ in neg_counter.most_common(keep_n_most_popular_words))
    pos_top_words = set(w for w, _ in pos_counter.most_common(keep_n_most_popular_words))

    only_neg_top_words = neg_top_words - pos_top_words
    only_pos_top_words = pos_top_words - neg_top_words

    print('Saving')
    save_corpus(list(only_neg_top_words), neg_output_path)
    save_corpus(list(only_pos_top_words), pos_output_path)
    print('Done!')
Example #9
def prepare_subs_for_open_nmt(data_path):
    print('Preparing subs for open-nmt')
    print('Reading data...')
    data = read_corpus(data_path)
    # data = [s for s in data if 5 <= len(s.split()) <= 20] # Removing noise
    src = data[:-1]
    trg = data[1:]

    print('Splitting into train/val...')
    splits = train_test_split(src, trg, test_size=5000, random_state=42)
    src_train, src_val, trg_train, trg_val = splits

    print('Saving...')
    save_corpus(src_train, data_path + '.open-nmt.train.src')
    save_corpus(trg_train, data_path + '.open-nmt.train.trg')
    save_corpus(src_val, data_path + '.open-nmt.val.src')
    save_corpus(trg_val, data_path + '.open-nmt.val.trg')

    print('Done!')
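The shifted src/trg pairing above turns each subtitle line and its successor into a (source, target) training pair; a tiny illustration:

data = ['hi', 'how are you?', 'fine, thanks']
src, trg = data[:-1], data[1:]
assert list(zip(src, trg)) == [('hi', 'how are you?'), ('how are you?', 'fine, thanks')]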
Example #10
def build_sift_corpus(args, corpus_filepath):  # hypothetical name and signature, inferred from the body; the original header of this example was lost
    if os.path.exists(corpus_filepath):
        msg = 'The corpus already exists. Overwrite?'
        overwrite = input('{} (y/N) '.format(msg)).lower() in ('y', 'yes')
        if not overwrite: sys.exit(1)

    corpus = {
        'gw_20p_wannot_dirpath': args.gw_20p_wannot_dirpath,
        'contrast_threshold': args.contrast_threshold,
        'n_octave_layers': args.n_octave_layers,
        'pages': list(),
        'keypoints': list(),
        'descriptors': list()
    }

    sift = cv2.xfeatures2d.SIFT_create(contrastThreshold=args.contrast_threshold, nOctaveLayers=args.n_octave_layers)

    pages_filenames = sorted(os.listdir(args.gw_20p_wannot_dirpath))  # assumption: page image filenames; not defined in the original snippet
    for page_filename in pages_filenames:
        # load the page image as grey scale, detect its keypoints and compute its descriptors
        print(f'Processing: {page_filename}')
        page_image = cv2.imread(os.path.join(args.gw_20p_wannot_dirpath, page_filename))
        page_image = cv2.cvtColor(page_image, cv2.COLOR_BGR2GRAY)

        page_keypoints, page_descriptors = sift.detectAndCompute(page_image, None)
        assert len(page_keypoints) > 0
        assert len(page_keypoints) == len(page_descriptors)

        corpus['pages'].append(page_filename)
        corpus['keypoints'].append(utils.cv2_to_namedtuple_keypoints(page_keypoints))
        corpus['descriptors'].append(page_descriptors)

    utils.save_corpus(corpus, corpus_filepath)
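The keypoints are passed through utils.cv2_to_namedtuple_keypoints before saving, presumably because cv2.KeyPoint objects cannot be pickled directly. A hypothetical sketch of what such a helper might look like (the real implementation is not shown here):

from collections import namedtuple

KeyPoint = namedtuple('KeyPoint', ['pt', 'size', 'angle', 'response', 'octave', 'class_id'])

def cv2_to_namedtuple_keypoints(cv2_keypoints):
    # copy the picklable fields of each cv2.KeyPoint into a plain namedtuple
    return [KeyPoint(kp.pt, kp.size, kp.angle, kp.response, kp.octave, kp.class_id)
            for kp in cv2_keypoints]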
Example #11
def main():
    parser = argparse.ArgumentParser(description="Create corpora according to historic or balanced slicing")
    parser.add_argument("--slicing_criterion", "-s", nargs='?', choices=['historic', 'balanced'], help="Whether to apply historic or balanced slicing", required=True)
    args = parser.parse_args()

    # Sort all original Reichstag corpora
    corpus_1 = sort_corpus(1895)
    corpus_2 = sort_corpus(1918)
    corpus_3 = sort_corpus(1933)
    corpus_4 = sort_corpus(1942)

    if args.slicing_criterion == 'historic':
        # Create historically appropriate corpora
        
        # Split up corpus_1 as one part belongs to first historic slice, and the other to the second historic slice
        end_bismarck = 0
        for i in range(len(corpus_1)):
            # 25 January 1890 is the split date, so search for the first protocol dated after it
            if corpus_1[i][0] > datetime(1890,1,25):
                end_bismarck = i
                break

        # Discard the dates
        corpus_1 = [sublist[1] for sublist in corpus_1]
        corpus_2 = [sublist[1] for sublist in corpus_2]
        corpus_3 = [sublist[1] for sublist in corpus_3]
        corpus_4 = [sublist[1] for sublist in corpus_4]

        # Create first and second historic slices
        kaiserreich_1 = corpus_1[:end_bismarck]
        kaiserreich_2 = corpus_1[end_bismarck:] + corpus_2

        save_corpus(kaiserreich_1, 'kaiserreich_1')
        save_corpus(kaiserreich_2, 'kaiserreich_2')
        save_corpus(corpus_3, 'weimar')
        save_corpus(corpus_4, 'ns')

    elif args.slicing_criterion == 'balanced':
        # Create slices balanced by the number of protocols per slice

        # Discard the dates
        corpus_1 = [sublist[1] for sublist in corpus_1]
        corpus_2 = [sublist[1] for sublist in corpus_2]
        corpus_3 = [sublist[1] for sublist in corpus_3]
        corpus_4 = [sublist[1] for sublist in corpus_4]

        # Slice into equal parts 
        full_corpus = corpus_1 + corpus_2 + corpus_3 + corpus_4
        slice_border = round(len(full_corpus) / 5)

        slice_1 = full_corpus[:slice_border]
        slice_2 = full_corpus[slice_border:(2*slice_border)]
        slice_3 = full_corpus[(2*slice_border):(3*slice_border)]
        slice_4 = full_corpus[(3*slice_border):(4*slice_border)]
        slice_5 = full_corpus[(4*slice_border):]

        save_corpus(slice_1, 'rt_slice_1')
        save_corpus(slice_2, 'rt_slice_2')
        save_corpus(slice_3, 'rt_slice_3')
        save_corpus(slice_4, 'rt_slice_4')
        save_corpus(slice_5, 'rt_slice_5')
Example #12
def train():
    phrase = request.form.get('phrase')
    class_name = request.form.get('class')
    save_corpus(learning([{'class': class_name, 'phrase': phrase}]))
    return create_response(200, {"status": "phrase included"})
Example #13
def train_with_examples():
    save_corpus(learning(sample()))
    return create_response(200, {"status": "sample phrases included"})
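Examples #12 and #13 look like Flask view functions whose @app.route decorators were dropped during extraction (request and the create_response/learning/sample helpers come from the surrounding app). A hypothetical sketch of how the two handlers could be registered, with placeholder route paths:

from flask import Flask

app = Flask(__name__)

# Route paths are placeholders; the originals are unknown.
app.add_url_rule('/train', view_func=train, methods=['POST'])
app.add_url_rule('/train_with_examples', view_func=train_with_examples, methods=['POST'])

if __name__ == '__main__':
    app.run()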