import os
import codecs

# Project-local helpers (fh, dirs, ds, tokenizer) are assumed to be imported
# elsewhere in this module.


def write_sentences(f):
    """Write each document in the responses JSON file to its own file under the
    SEMAFOR temp directory, one sentence per line."""
    output_dir = fh.makedirs(dirs.data_semafor_dir, 'temp')

    responses = fh.read_json(f)
    keys = sorted(responses.keys())

    #all_items = ds.get_all_documents()
    #unlabeled = list(set(keys) - all_items)
    #print len(unlabeled)

    for k in keys:
        sentence_filename = os.path.join(output_dir, k + '.txt')
        #index_filename = fh.make_filename(output_dir, fh.get_basename(f), 'json')
        with codecs.open(sentence_filename, 'w', encoding='utf-8') as output_file:
            text = responses[k]
            # Split on blank lines into paragraphs, dropping any empty ones.
            paragraphs = [p for p in text.split('\n\n') if p != '']
            for p in paragraphs:
                sentences = tokenizer.split_sentences(p)
                for sent in sentences:
                    sent = sent.strip()
                    if len(sent) > 0:
                        output_file.write(sent + '\n')
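
# Illustrative usage (the path below is hypothetical; it depends on where the
# raw responses JSON is stored in this project):
#
#     write_sentences('/path/to/raw_responses.json')

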
def preprocess_for_easysrl():
    """Concatenate all labeled articles into one sentence-per-line file for EasySRL."""
    input_filename = dirs.data_processed_text_file
    articles = fh.read_json(input_filename)

    labeled = sorted(ds.get_all_documents())

    output_filename = fh.make_filename(dirs.data_easysrl_dir, 'input', 'txt')

    with codecs.open(output_filename, 'w', encoding='utf-8') as output_file:
        for k in labeled:
            # Write a marker line so each document's sentences can be identified in the output.
            output_file.write(k + ' starts here\n')
            text = articles[k]
            paragraphs = [p for p in text.split('\n\n') if p.strip() != '']
            for p in paragraphs:
                sentences = tokenizer.split_sentences(p.strip())
                for s in sentences:
                    s = s.strip()
                    if len(s) > 0:
                        output_file.write(s + '\n')
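

# A minimal, hypothetical entry point for running both preprocessing steps from
# the command line; the responses path passed to write_sentences is a placeholder.
if __name__ == '__main__':
    write_sentences('/path/to/raw_responses.json')
    preprocess_for_easysrl()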