import codecs
import os

# fh, dirs, ds, and tokenizer are project-local helper modules (file handling,
# directory paths, dataset access, and sentence tokenization), assumed to be
# imported elsewhere in this file.


def write_sentences(f):
    """Split each document in a JSON file into sentences and write one sentence
    per line to <key>.txt in a temp directory under the SEMAFOR data dir."""
    output_dir = fh.makedirs(dirs.data_semafor_dir, 'temp')
    responses = fh.read_json(f)
    keys = sorted(responses.keys())
    #all_items = ds.get_all_documents()
    #unlabeled = list(set(keys) - all_items)
    #print(len(unlabeled))
    for k in keys:
        sentence_filename = os.path.join(output_dir, k + '.txt')
        #index_filename = fh.make_filename(output_dir, fh.get_basename(f), 'json')
        with codecs.open(sentence_filename, 'w', encoding='utf-8') as output_file:
            text = responses[k]
            # Treat blank lines as paragraph breaks, then sentence-split each paragraph.
            paragraphs = [p for p in text.split('\n\n') if p != '']
            for p in paragraphs:
                sentences = tokenizer.split_sentences(p)
                for sent in sentences:
                    sent = sent.strip()
                    if len(sent) > 0:
                        output_file.write(sent + '\n')
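
# Usage sketch (hedged; the path below is hypothetical): write_sentences expects
# the path of a JSON file mapping document keys to raw text, e.g.:
#
#     write_sentences('data/responses.json')
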
def preprocess_for_easysrl():
    """Write each labeled document as one tokenized sentence per line, preceded
    by a '<key> starts here' marker, as plain-text input for EasySRL."""
    input_filename = dirs.data_processed_text_file
    articles = fh.read_json(input_filename)
    labeled = sorted(ds.get_all_documents())
    output_filename = fh.make_filename(dirs.data_easysrl_dir, 'input', 'txt')
    with codecs.open(output_filename, 'w', encoding='utf-8') as output_file:
        for k in labeled:
            # Marker line lets downstream code map sentences back to documents.
            output_file.write(k + ' starts here\n')
            text = articles[k]
            paragraphs = text.split('\n\n')
            for p in paragraphs:
                sentences = tokenizer.split_sentences(p.strip())
                for s in sentences:
                    output_file.write(s.strip() + '\n')
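
if __name__ == '__main__':
    # Minimal driver (a sketch, not part of the original module): since
    # preprocess_for_easysrl() reads its input path from
    # dirs.data_processed_text_file, it can be run with no arguments to
    # regenerate the EasySRL input file.
    preprocess_for_easysrl()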