def parse_sgml(r, table):
    """Convert the texts extracted from ``r`` into the file ``<table>.madlib``.

    The texts come from ``prepare_reuters21578(r)`` and the word dictionary
    from ``dict_by_dict(r, total_texts, table)``.  Each text is split into
    lines, turned into words with ``utility.get_file_words`` and written as
    ``"<id> : <line>"``, where ``<line>`` is whatever
    ``utility.get_madlib_line`` returns (no extra newline is added here).
    Texts that yield no words are skipped and do not consume an id, so ids
    are consecutive starting at 1.

    Args:
        r: Source handed to ``prepare_reuters21578`` and ``dict_by_dict``
            (presumably the path of the SGML data -- confirm with callers).
        table: Base name of the output; ``'.madlib'`` is appended to it.

    Side effects:
        Creates/overwrites ``<table>.madlib`` and prints the elapsed time.
    """
    start_time = time.time()

    # Only the texts are consumed here; the first return value is unused.
    _topics, total_texts = prepare_reuters21578(r)
    # Not named ``dict`` so the builtin stays usable inside this function.
    word_dict = dict_by_dict(r, total_texts, table)

    doc_id = 1
    # ``with`` guarantees the file is closed even if a helper raises.
    with open(table + '.madlib', 'w') as madlib:
        # ``values()`` works on both Python 2 and 3; the keys were never used.
        for text in total_texts.values():
            words = utility.get_file_words(text.split('\n'))
            if len(words) == 0:
                continue
            line = utility.get_madlib_line(words, word_dict)
            madlib.write('%s : %s' % (doc_id, line))
            doc_id += 1

    elapsed_time = time.time() - start_time
    # Single-argument form prints the same text under Python 2 and 3.
    print('%s parsing time: %s' % (r, elapsed_time))
def parse_normal(r, parse_method, dict_method, table):
    """Convert the files listed for ``r`` into the file ``<table>.madlib``.

    The file list comes from ``utility.get_file_list(r)`` and the word
    dictionary from ``dict_method(r, table, parse_method)``.  Each file is
    read with ``parse_method``, turned into words with
    ``utility.get_file_words`` and written as ``"<id> : <line>"``, where
    ``<line>`` is whatever ``utility.get_madlib_line`` returns (no extra
    newline is added here).

    ``<id>`` is the 1-based position of the file in the list.  Files that
    yield no words are skipped, so ids in the output may have gaps.

    Args:
        r: Source handed to ``utility.get_file_list`` and ``dict_method``
            (presumably a directory path -- confirm with callers).
        parse_method: Callable taking one file path and returning the
            lines passed on to ``utility.get_file_words``.
        dict_method: Callable invoked as ``dict_method(r, table,
            parse_method)``; its result is passed to
            ``utility.get_madlib_line``.
        table: Base name of the output; ``'.madlib'`` is appended to it.

    Side effects:
        Creates/overwrites ``<table>.madlib`` and prints the elapsed time.
    """
    start_time = time.time()

    # The second return value (file names) is not needed here.
    file_path_list, _file_names = utility.get_file_list(r)
    # Not named ``dict`` so the builtin stays usable inside this function.
    word_dict = dict_method(r, table, parse_method)

    # ``with`` guarantees the file is closed even if a helper raises.
    with open(table + '.madlib', 'w') as madlib:
        for doc_id, file_path in enumerate(file_path_list, 1):
            words = utility.get_file_words(parse_method(file_path))
            if len(words) == 0:
                continue
            line = utility.get_madlib_line(words, word_dict)
            madlib.write('%s : %s' % (doc_id, line))

    elapsed_time = time.time() - start_time
    # Single-argument form prints the same text under Python 2 and 3.
    print('%s parsing time: %s' % (r, elapsed_time))