def parse_directory(path, **kwargs):
    """Parse every eligible file in directory ``path`` with ``CRParser``.

    For each entry returned by ``os.listdir(path)``:

    * names containing ``FrontMatter`` or ``PgD`` are skipped;
    * ``.htm`` files are rewritten next to the original as ``.txt`` with
      the ``<title>`` element and the html/head/body/pre wrapper tags
      removed, and the ``.htm`` original is deleted;
    * every resulting ``.txt`` file is handed to ``CRParser`` and
      ``do_parse``.

    After the loop ``<path>/mods.xml`` is removed and the function sleeps
    for 15 seconds before returning.

    Keyword arguments:
        logdir: required; passed to ``initialize_logfile``.
        interactive: optional; when truthy, ask on stdin before each file
            (``n`` skips the file, ``q`` exits the process).
        outdir: required; returned to the caller.
        Everything except ``logdir`` and ``interactive`` is forwarded
        unchanged to ``CRParser``.

    Returns:
        ``kwargs['outdir']``.

    Raises:
        KeyError: if ``logdir`` or ``outdir`` was not supplied.
        OSError: if ``<path>/mods.xml`` does not exist when it is removed.
    """
    # NOTE(review): this module defines parse_directory a second time
    # further down; that later definition is the one bound at import time.

    logfile = initialize_logfile(kwargs['logdir'])

    # Strip the driver-only options once, before the loop.  Removing
    # 'interactive' inside the loop meant the prompt was only ever shown
    # for the first processed file.
    interactive = kwargs.pop('interactive', False)
    kwargs.pop('logdir', None)

    # need to eliminate particular blank lines, should still get the tags
    # out if expected line breaks aren't there.  Variants with a trailing
    # newline come first so the newline is removed together with its tag.
    extras = (
        '<html>\n', '<html>', '</html>',
        '<head>\n', '</head>\n', '<head>', '</head>',
        '<body><pre>\n', '<pre>', '</pre>',
        '<body>', '</body>',
    )

    for name in os.listdir(path):
        print(name)
        print(path)

        # we don't process the daily digest or front matter.
        if 'FrontMatter' in name or 'PgD' in name:
            continue

        # Makes text versions for the parser
        if name.endswith('.htm'):
            old_file = os.path.join(path, name)
            with open(old_file, 'r') as html_doc:
                content = html_doc.read()
            # eliminates extra title and leaves expected space at the top
            content = re.sub(r'<title>.+?</title>', '', content)
            for tag in extras:
                content = content.replace(tag, '')
            # 'foo.htm' -> 'foo.txt'
            name = name[:-3] + 'txt'
            with open(os.path.join(path, name), 'w') as text_doc:
                text_doc.write(content)
            # Only drop the original once the text copy is fully written.
            os.remove(old_file)

        if not name.endswith('.txt'):
            continue

        if interactive:
            resp = raw_input("process file %s? (y/n/q) " % name)
            if resp == 'n':
                print('skipping\n')
                continue
            elif resp == 'q':
                sys.exit()

        parser = CRParser(os.path.join(path, name), **kwargs)
        do_parse(parser, logfile)

    os.remove(os.path.join(path, "mods.xml"))
    time.sleep(15)
    return kwargs['outdir']
def parse_single(infile, **kwargs):
    """Parse the single file ``infile`` with ``CRParser``.

    Keyword arguments:
        logdir: required; passed to ``initialize_logfile``.
        interactive: optional; accepted and discarded.
        outdir: required; directory component of the returned path.
        Everything except ``logdir`` and ``interactive`` is forwarded
        unchanged to ``CRParser``.

    Returns:
        ``outdir`` joined with the base name of ``infile``, with every
        ``.txt`` in that name replaced by ``.xml``.

    Raises:
        KeyError: if ``logdir`` or ``outdir`` was not supplied.
    """
    logfile = initialize_logfile(kwargs['logdir'])

    # These two options are for this driver only; CRParser must not see
    # them.  pop() with a default replaces the bare try/except around del.
    kwargs.pop('interactive', None)
    kwargs.pop('logdir', None)

    parser = CRParser(infile, **kwargs)
    do_parse(parser, logfile)

    xml_name = os.path.basename(infile).replace('.txt', '.xml')
    return os.path.join(kwargs['outdir'], xml_name)
def parse_directory(path, **kwargs):
    """Parse every eligible file in directory ``path`` with ``CRParser``.

    For each entry returned by ``os.listdir(path)``:

    * names containing ``FrontMatter`` or ``PgD`` are skipped;
    * ``.htm`` files are rewritten next to the original as ``.txt`` with
      the ``<title>`` element and the html/head/body/pre wrapper tags
      removed, and the ``.htm`` original is deleted;
    * every resulting ``.txt`` file is handed to ``CRParser`` and
      ``do_parse``.

    After the loop ``<path>/mods.xml`` is removed and the function sleeps
    for 15 seconds before returning.

    Keyword arguments:
        logdir: required; passed to ``initialize_logfile``.
        interactive: optional; when truthy, ask on stdin before each file
            (``n`` skips the file, ``q`` exits the process).
        outdir: required; returned to the caller.
        Everything except ``logdir`` and ``interactive`` is forwarded
        unchanged to ``CRParser``.

    Returns:
        ``kwargs['outdir']``.

    Raises:
        KeyError: if ``logdir`` or ``outdir`` was not supplied.
        OSError: if ``<path>/mods.xml`` does not exist when it is removed.
    """
    logfile = initialize_logfile(kwargs['logdir'])

    # Strip the driver-only options once, before the loop.  Removing
    # 'interactive' inside the loop meant the prompt was only ever shown
    # for the first processed file.
    interactive = kwargs.pop('interactive', False)
    kwargs.pop('logdir', None)

    # need to eliminate particular blank lines, should still get the tags
    # out if expected line breaks aren't there.  Variants with a trailing
    # newline come first so the newline is removed together with its tag.
    extras = (
        '<html>\n', '<html>', '</html>',
        '<head>\n', '</head>\n', '<head>', '</head>',
        '<body><pre>\n', '<pre>', '</pre>',
        '<body>', '</body>',
    )

    for name in os.listdir(path):
        print(name)
        print(path)

        # we don't process the daily digest or front matter.
        if 'FrontMatter' in name or 'PgD' in name:
            continue

        # Makes text versions for the parser
        if name.endswith('.htm'):
            old_file = os.path.join(path, name)
            with open(old_file, 'r') as html_doc:
                content = html_doc.read()
            # eliminates extra title and leaves expected space at the top
            content = re.sub(r'<title>.+?</title>', '', content)
            for tag in extras:
                content = content.replace(tag, '')
            # 'foo.htm' -> 'foo.txt'
            name = name[:-3] + 'txt'
            with open(os.path.join(path, name), 'w') as text_doc:
                text_doc.write(content)
            # Only drop the original once the text copy is fully written.
            os.remove(old_file)

        if not name.endswith('.txt'):
            continue

        if interactive:
            resp = raw_input("process file %s? (y/n/q) " % name)
            if resp == 'n':
                print('skipping\n')
                continue
            elif resp == 'q':
                sys.exit()

        parser = CRParser(os.path.join(path, name), **kwargs)
        do_parse(parser, logfile)

    os.remove(os.path.join(path, "mods.xml"))
    time.sleep(15)
    return kwargs['outdir']