# Build the to-do list for the step that produces the '.frog.out' files:
# ask util.todo_filepaths for the '.txt' files of INPUT_FOLDER (passing
# OUTPUT_FOLDER and the '.frog.out' suffix), sort the result, report it on
# stdout and pickle the sorted list.
import os
import pickle

import util

INPUT_FOLDER = '../data/plaintext/'
OUTPUT_FOLDER = '../data/frogged/'
# Pickle file that receives the sorted to-do list.
TODO_PICKLE_PATH = '../data/_frog_todo.p'

# NOTE(review): presumably todo_filepaths returns the inputs that have no
# matching output file yet -- confirm against util.
files = sorted(
    util.todo_filepaths(INPUT_FOLDER, '.txt', OUTPUT_FOLDER, '.frog.out'))

# One pre-formatted argument, so the output is the same text whether `print`
# is the Python 2 statement or the Python 3 function.
print("N files TODO: {0} {1}".format(len(files), files[:10]))

# Binary mode, as required for pickle data.
with open(TODO_PICKLE_PATH, 'wb') as f:
    pickle.dump(files, f)
# Using a partial function did not work, as that cannot be pickled; a plain
# module-level wrapper that takes one tuple argument is used instead.
def __extract_plaintext_as_tuple(filename_outfolder_tuple):
    """Unpack a ``(filename, out_folder)`` pair and call ``extract_plaintext``.

    Args:
        filename_outfolder_tuple: two-item sequence ``(filename, out_folder)``.

    Returns:
        Whatever ``extract_plaintext(filename, out_folder)`` returns.
    """
    filename, out_folder = filename_outfolder_tuple
    return extract_plaintext(filename, out_folder)


def extract_all_plaintext(filenames, out_folder=PLAINTEXT_FOLDER):
    """Run ``extract_plaintext`` for every filename on a worker pool.

    A progress percentage is written to stderr after each finished task; the
    values returned by the workers are discarded.

    Args:
        filenames: sized sequence of input filenames (``len()`` is called on it).
        out_folder: output folder handed to ``extract_plaintext`` for every
            file; defaults to ``PLAINTEXT_FOLDER``.

    Returns:
        None.
    """
    num_tasks = len(filenames)
    print("EXTRACTING PLAINTEXT FROM {0} FILES INTO {1}".format(
        num_tasks, out_folder))

    # Pair every filename with the (shared) output folder, because the pool
    # passes exactly one argument to the worker function.
    tuple_input = [(filename, out_folder) for filename in filenames]

    # For single-process debugging use Pool(processes=1) instead.
    pool = Pool(processes=util.CPU_COUNT)
    try:
        finished = pool.imap_unordered(__extract_plaintext_as_tuple,
                                       tuple_input)
        for completed, _ in enumerate(finished, 1):
            # float() forces true division: with two ints Python 2 floors the
            # quotient, which pinned the display at 0% until the last task.
            fraction = float(completed) / num_tasks
            sys.stderr.write('\rdone {0:%}'.format(fraction))
    finally:
        # Close the pool even when a worker raised and the loop was aborted.
        pool.close()
    print("\nDONE")


if __name__ == '__main__':
    in_folder = RAW_DATA_FOLDER
    out_folder = PLAINTEXT_FOLDER
    # NOTE(review): presumably the '.xml' inputs that have no '.txt' output
    # yet -- confirm against util.todo_filepaths.
    todo_filenames = util.todo_filepaths(in_folder, '.xml', out_folder, '.txt',
                                         blacklist=BLACKLIST)
    all_filenames = util.todo_filepaths(in_folder, '.xml', blacklist=BLACKLIST)
    # Plaintext extraction is currently switched off; only labels are extracted.
    #extract_all_plaintext(todo_filenames, out_folder)
    extract_all_labels(all_filenames, DATA_FOLDER + 'labels.p')
# Script: collect the '.txt' files that util.todo_filepaths reports for
# INPUT_FOLDER (given OUTPUT_FOLDER and the '.frog.out' suffix), sort them,
# print a short summary and store the sorted list as a pickle.
import os
import pickle

import util

INPUT_FOLDER = '../data/plaintext/'
OUTPUT_FOLDER = '../data/frogged/'
# Destination of the pickled to-do list.
TODO_PICKLE_PATH = '../data/_frog_todo.p'

# NOTE(review): todo_filepaths presumably yields only inputs whose output is
# still missing -- verify in util.
files = sorted(
    util.todo_filepaths(INPUT_FOLDER, '.txt', OUTPUT_FOLDER, '.frog.out'))

# Format first and print one string: identical output for the Python 2 print
# statement and the Python 3 print function (count, then the first 10 paths).
print("N files TODO: {0} {1}".format(len(files), files[:10]))

# Pickle data must be written in binary mode.
with open(TODO_PICKLE_PATH, 'wb') as f:
    pickle.dump(files, f)
# NOTE(review): whitespace-collapsed fragment. The statements before
# `if __name__ == '__main__':` use `filenames` and `out_folder`, which are not
# defined on this line -- presumably they are the body of a function whose
# `def` header is outside this fragment; confirm before restoring indentation.
# What the visible statements do: print a banner, pair each filename with
# out_folder, run __extract_plaintext_as_tuple over the pairs with
# Pool.imap_unordered, write a progress percentage to stderr per finished
# task, close the pool and print "DONE".
# NOTE(review): `i / num_tasks` divides two ints; under Python 2 without
# `from __future__ import division` that is floor division, so the progress
# would read 0% until the final task -- verify.
# The `__main__` part builds two file lists with util.todo_filepaths and calls
# extract_all_labels on the second one; the extract_all_plaintext call is
# commented out, so `todo_filenames` is computed but not used here.
print "EXTRACTING PLAINTEXT FROM {0} FILES INTO {1}".format( len(filenames), out_folder) #Zip the filename input with the output folder tuple_input = zip(filenames, [out_folder] * len(filenames)) pool = Pool(processes=util.CPU_COUNT) #pool = Pool(processes=1) num_tasks = len(filenames) for i, _ in enumerate( pool.imap_unordered(__extract_plaintext_as_tuple, tuple_input), 1): sys.stderr.write('\rdone {0:%}'.format(i / num_tasks)) pool.close() print "\nDONE" if __name__ == '__main__': in_folder = RAW_DATA_FOLDER out_folder = PLAINTEXT_FOLDER todo_filenames = util.todo_filepaths(in_folder, '.xml', out_folder, '.txt', blacklist=BLACKLIST) all_filenames = util.todo_filepaths(in_folder, '.xml', blacklist=BLACKLIST) #extract_all_plaintext(todo_filenames, out_folder) extract_all_labels(all_filenames, DATA_FOLDER + 'labels.p')