def prepare_word_filling(corpus_data_path: str, words_data_path: str):
    print('Preparing word filling')

    DROP = '__DROP__'
    CONTEXT_SIZE = 3  # Limiting context for char-level encoder

    print('Reading data...')
    lines = read_corpus(corpus_data_path)
    words = set(read_corpus(words_data_path))
    src, trg = [], []

    for s in tqdm(lines):
        tokens = s.split()

        for i, t in enumerate(tokens):
            if t not in words:
                continue

            context_left = tokens[max(i - CONTEXT_SIZE, 0):i]
            context_right = tokens[i + 1:i + CONTEXT_SIZE + 1]

            src.append(' '.join(context_left + [DROP] + context_right))
            trg.append(t)

    print('Saving...')
    save_corpus(src, 'data/generated/classics-word-filling.src')
    save_corpus(trg, 'data/generated/classics-word-filling.trg')
    print('Done!')
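# read_corpus() and save_corpus() are used throughout these text-processing helpers
# but are not shown here. From their usage they behave like simple line-based I/O
# helpers. This is a minimal sketch under that assumption, not the repository's
# actual implementation:
def read_corpus(path: str) -> list:
    # One corpus entry per line, newline stripped
    with open(path, encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]

def save_corpus(lines: list, path: str):
    # Write entries back out, one per line
    with open(path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))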
def make_pipe01_dataset(name="CORPUS_physorg_articles_pipe01", mini_size=0):
    corpus = pipe01(limit=0)
    utils.save_corpus(name, corpus)

    if mini_size > 0:
        perm = numpy.random.permutation(range(corpus.n_docs))
        remove = set(perm[:(corpus.n_docs - mini_size)])
        corpus.remove(lambda doc: doc.corpus_index in remove)
        utils.save_corpus(name + '_mini' + str(mini_size), corpus)
def filter_subs(input_data_path: str, output_data_path: str):
    print('Reading data...')
    subs = read_corpus(input_data_path)
    # [A-Za-z] instead of [A-z]: the latter also matches [, \, ], ^ and _
    pattern = re.compile(r'^(?:[A-Za-z]|[А-я]|[ёЁ\d\s.,!:?\-––\'"%$()`])+$')

    print('Filtering...')
    filtered = [s for s in tqdm(subs) if pattern.match(s)]

    print('Removing too long sentences...')
    short = [s for s in tqdm(filtered) if len(s.split()) <= 50 and len(s) <= 250]

    print('Saving...')
    save_corpus(short, output_data_path)
    print('Done!')
def detokenize(input_file, output_file):
    print('Reading data...')
    texts = read_corpus(input_file)
    print('Data reading finished!')

    print('Detokenizing...')
    detokenized = [detokenizer.detokenize(s.split()) for s in tqdm(texts)]
    print('Detokenized!')

    print('Saving...')
    save_corpus(detokenized, output_file)
    print('Saved!')
def cut_long_dialogs(data_path: str, output_path: str, max_len: int = 128):
    print('Reading data...')
    data = read_corpus(data_path)

    print('Splitting dialogs...')
    dialogs = [d.split('|') for d in data]

    print('Cutting data...')
    dialogs = [cut_dialog(d, max_len) for d in tqdm(dialogs)]
    num_dialogs_before = len(dialogs)

    print('Saving data...')
    dialogs = list(filter(len, dialogs))
    save_corpus(dialogs, output_path)
    print('Done! Num dialogs reduced: {} -> {}'.format(num_dialogs_before, len(dialogs)))
def dialogs_from_lines(input_data_path: str, output_data_path: str, n_lines: int, eos: str, n_dialogs: int):
    n_lines, n_dialogs = int(n_lines), int(n_dialogs)  # TODO: argparse?

    print('Reading data...')
    lines = read_corpus(input_data_path)
    lines = lines[:n_dialogs * n_lines]

    print('Generating dialogs')
    dialogs = [lines[i:i + n_lines] for i in range(0, len(lines) - n_lines)]
    dialogs = [eos.join(d) for d in dialogs]

    print('Saving corpus')
    save_corpus(dialogs, output_data_path)
    print('Done!')
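# Worked example of what the comprehension above produces: the windows slide with a
# step of 1, so consecutive dialogs overlap. With n_lines=2, eos='|' and
# lines = ['a', 'b', 'c', 'd'], range(0, len(lines) - n_lines) yields i = 0, 1 and
# the saved dialogs are ['a|b', 'b|c'] (the final window ['c', 'd'] is never emitted).
# Non-overlapping dialogs would instead require range(0, len(lines), n_lines).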
def filter_dialogs_in_classics(input_data_path: str, output_data_path: str):
    print('Reading data...')
    classics = read_corpus(input_data_path)

    print('Finding dialogs...')
    dialogs = [s for s in tqdm(classics) if s.strip().startswith(SPEC_DASH)]

    print('Removing markup chars from dialogs...')
    dialogs = [s.replace('\xa0', ' ') for s in tqdm(dialogs)]

    print('Removing degenerate lines...')
    dialogs = [s for s in tqdm(dialogs) if s != SPEC_DASH]

    print('Filtering direct speech...')
    dialogs = [filter_direct_speech(s) for s in tqdm(dialogs)]

    print('Saving...')
    save_corpus(dialogs, output_data_path)
    print('Done!')
def generate_sentiment_words(neg_input_path: str, pos_input_path: str,
                             neg_output_path: str, pos_output_path: str,
                             keep_n_most_popular_words: int = 3000):
    print('Reading data...')
    neg_lines = read_corpus(neg_input_path)
    pos_lines = read_corpus(pos_input_path)

    print('Counting words')
    neg_counter = Counter([w.lower() for s in tqdm(neg_lines) for w in s.split()])
    pos_counter = Counter([w.lower() for s in tqdm(pos_lines) for w in s.split()])

    print('Getting most popular')
    neg_top_words = set(w for w, _ in neg_counter.most_common(keep_n_most_popular_words))
    pos_top_words = set(w for w, _ in pos_counter.most_common(keep_n_most_popular_words))

    only_neg_top_words = neg_top_words - pos_top_words
    only_pos_top_words = pos_top_words - neg_top_words

    print('Saving')
    save_corpus(list(only_neg_top_words), neg_output_path)
    save_corpus(list(only_pos_top_words), pos_output_path)
    print('Done!')
def prepare_subs_for_open_nmt(data_path):
    print('Preparing subs for open-nmt')
    print('Reading data...')
    data = read_corpus(data_path)
    # data = [s for s in data if 5 <= len(s.split()) <= 20]  # Removing noise
    src = data[:-1]
    trg = data[1:]

    print('Splitting into train/val...')
    splits = train_test_split(src, trg, test_size=5000, random_state=42)
    src_train, src_val, trg_train, trg_val = splits

    print('Saving...')
    save_corpus(src_train, data_path + '.open-nmt.train.src')
    save_corpus(trg_train, data_path + '.open-nmt.train.trg')
    save_corpus(src_val, data_path + '.open-nmt.val.src')
    save_corpus(trg_val, data_path + '.open-nmt.val.trg')
    print('Done!')
msg = 'The corpus already exists. Overwrite?'
overwrite = input('{} (y/N) '.format(msg)).lower() in ('y', 'yes')

if not overwrite:
    sys.exit(1)

corpus = {
    'gw_20p_wannot_dirpath': args.gw_20p_wannot_dirpath,
    'contrast_threshold': args.contrast_threshold,
    'n_octave_layers': args.n_octave_layers,
    'pages': list(),
    'keypoints': list(),
    'descriptors': list()
}

sift = cv2.xfeatures2d.SIFT_create(contrastThreshold=args.contrast_threshold,
                                   nOctaveLayers=args.n_octave_layers)

for page_filename in pages_filenames:
    # Load the page image as grayscale, detect its keypoints and compute its descriptors
    print(f'Processing: {page_filename}')
    page_image = cv2.imread(os.path.join(args.gw_20p_wannot_dirpath, page_filename))
    page_image = cv2.cvtColor(page_image, cv2.COLOR_BGR2GRAY)
    page_keypoints, page_descriptors = sift.detectAndCompute(page_image, None)
    assert len(page_keypoints) > 0
    assert len(page_keypoints) == len(page_descriptors)
    corpus['pages'].append(page_filename)
    corpus['keypoints'].append(utils.cv2_to_namedtuple_keypoints(page_keypoints))
    corpus['descriptors'].append(page_descriptors)

utils.save_corpus(corpus, corpus_filepath)
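# cv2.KeyPoint objects cannot be pickled directly, which is presumably why they are
# converted to plain namedtuples before saving. A minimal sketch of what
# utils.cv2_to_namedtuple_keypoints might look like (an assumption, not the
# repository's actual implementation):
from collections import namedtuple

Keypoint = namedtuple('Keypoint', ['pt', 'size', 'angle', 'response', 'octave', 'class_id'])

def cv2_to_namedtuple_keypoints(keypoints):
    # Copy the standard cv2.KeyPoint attributes into a picklable structure
    return [Keypoint(kp.pt, kp.size, kp.angle, kp.response, kp.octave, kp.class_id)
            for kp in keypoints]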
def main():
    parser = argparse.ArgumentParser(description="Create corpora according to historic or balanced slicing")
    parser.add_argument("--slicing_criterion", "-s", nargs='?', choices=['historic', 'balanced'],
                        help="Whether to apply historic or balanced slicing", required=True)
    args = parser.parse_args()

    # Sort all original Reichstag corpora
    corpus_1 = sort_corpus(1895)
    corpus_2 = sort_corpus(1918)
    corpus_3 = sort_corpus(1933)
    corpus_4 = sort_corpus(1942)

    if args.slicing_criterion == 'historic':
        # Create historically appropriate corpora
        # Split up corpus_1: one part belongs to the first historic slice, the other to the second
        end_bismarck = 0
        for i in range(len(corpus_1)):
            # 25 January 1890 is the split date, so search for the first protocol with a later date
            if corpus_1[i][0] > datetime(1890, 1, 25):
                end_bismarck = i
                break

        # Discard the dates
        corpus_1 = [sublist[1] for sublist in corpus_1]
        corpus_2 = [sublist[1] for sublist in corpus_2]
        corpus_3 = [sublist[1] for sublist in corpus_3]
        corpus_4 = [sublist[1] for sublist in corpus_4]

        # Create first and second historic slices
        kaiserreich_1 = corpus_1[:end_bismarck]
        kaiserreich_2 = corpus_1[end_bismarck:] + corpus_2

        save_corpus(kaiserreich_1, 'kaiserreich_1')
        save_corpus(kaiserreich_2, 'kaiserreich_2')
        save_corpus(corpus_3, 'weimar')
        save_corpus(corpus_4, 'ns')

    elif args.slicing_criterion == 'balanced':
        # Create slices balanced in the number of protocols per slice
        # Discard the dates
        corpus_1 = [sublist[1] for sublist in corpus_1]
        corpus_2 = [sublist[1] for sublist in corpus_2]
        corpus_3 = [sublist[1] for sublist in corpus_3]
        corpus_4 = [sublist[1] for sublist in corpus_4]

        # Slice into equal parts
        full_corpus = corpus_1 + corpus_2 + corpus_3 + corpus_4
        slice_border = round(len(full_corpus) / 5)
        slice_1 = full_corpus[:slice_border]
        slice_2 = full_corpus[slice_border:(2 * slice_border)]
        slice_3 = full_corpus[(2 * slice_border):(3 * slice_border)]
        slice_4 = full_corpus[(3 * slice_border):(4 * slice_border)]
        slice_5 = full_corpus[(4 * slice_border):]

        save_corpus(slice_1, 'rt_slice_1')
        save_corpus(slice_2, 'rt_slice_2')
        save_corpus(slice_3, 'rt_slice_3')
        save_corpus(slice_4, 'rt_slice_4')
        save_corpus(slice_5, 'rt_slice_5')
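# sort_corpus() is not shown above. From its usage it must return protocol entries as
# (date, text) pairs in chronological order, with date a datetime. A minimal sketch
# under that assumption; load_protocols() is a hypothetical loader standing in for the
# real file handling:
def sort_corpus(year):
    protocols = load_protocols(year)               # hypothetical: [(datetime, text), ...]
    return sorted(protocols, key=lambda p: p[0])   # sort by protocol date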
def train():
    phrase = request.form.get('phrase')
    class_name = request.form.get('class')
    save_corpus(learning([{'class': class_name, 'phrase': phrase}]))
    return create_response(200, {"status": "phrase included"})
def train_with_examples():
    save_corpus(learning(sample()))
    return create_response(200, {"status": "sample phrases included"})
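# The two handlers above read request.form and return a create_response() payload,
# which suggests they are Flask view functions. A minimal registration sketch under
# that assumption; the app object and route paths are hypothetical, not taken from
# the source:
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/train', view_func=train, methods=['POST'])
app.add_url_rule('/train-with-examples', view_func=train_with_examples, methods=['POST'])

# Example request against the hypothetical /train route:
#   curl -X POST -d 'phrase=great movie' -d 'class=positive' http://localhost:5000/train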