コード例 #1
0
    f = ArchiveIterator(open(options.input, 'rb'))
elif options.input == sys.stdin:
    f = ArchiveIterator(options.input.buffer)
else:
    f = ArchiveIterator(open(options.input, 'rb'))

# Build the WARC writer for the main output.  When the output option is the
# sys.stdout object itself, write to its underlying binary buffer; otherwise
# treat the option as a path and open it for binary writing.  The writer is
# created with gzip enabled in both cases.
fo = WARCWriter(
    options.output.buffer if options.output == sys.stdout
    else open(options.output, 'wb'),
    gzip=True,
)

# Optional extra gzip-enabled WARC writer, created only when a pdfpass path
# was supplied (presumably the destination for PDF records that are passed
# through; its use is outside this excerpt).
if options.pdfpass is not None:
    po = WARCWriter(
        open(options.pdfpass, 'wb'),
        gzip=True,
    )

# The extractor is built only when no pdfpass target is set and PDF
# extraction was requested.
if not options.pdfpass:
    if options.pdfextract:
        extractor = ExtrP()

# HTML cleaner configuration.  The keyword names match those of
# lxml.html.clean.Cleaner; the import is outside this excerpt, so confirm
# that is the class in use.
cleaner = Cleaner(
    style=True,
    links=True,
    add_nofollow=True,
    page_structure=False,
    safe_attrs_only=False,
)

# Choose the name recorded in the warcinfo record: the output path when
# writing to a file, otherwise the input path.  The input option may be the
# sys.stdin object rather than a path (the input handling earlier in this
# script compares it against sys.stdin); a stream object is not a usable
# file name, so fall back to an empty string in that case.
# NOTE(review): warcio is expected to omit WARC-Filename for an empty
# name -- confirm against the installed warcio version.
if options.output != sys.stdout:
    filename = options.output
elif isinstance(options.input, str):
    filename = options.input
else:
    filename = ""

# Write a warcinfo record to the output writer before any other record.
fo.write_record(
    fo.create_warcinfo_record(
        filename=filename,
        info={
            'software': 'bitextor/bitextor-warc2htmlwarc.py',
            'format': 'WARC File Format 1.0',
        },
    )
)

for record in f:
    # Initial checks
    if record.rec_type != 'response' and record.rec_type != 'resource':
        continue
    if record.rec_headers.get_header('WARC-Target-URI')[0] == '<' and record.rec_headers.get_header('WARC-Target-URI')[-1] == '>':
コード例 #2
0
    f = ArchiveIterator(open(options.input, 'rb'))

# Main output writer.  A real path is opened in binary mode and gzipped
# unless disable_output_gzip is set.  Standard output (given either as the
# sys.stdout object or as '-') is written through sys.stdout.buffer and is
# always gzipped.
if options.output not in (sys.stdout, '-'):
    fo = WARCWriter(
        open(options.output, 'wb'),
        gzip=not options.disable_output_gzip,
    )
else:
    fo = WARCWriter(sys.stdout.buffer, gzip=True)

# Optional extra WARC writer, created only when a pdfpass path was supplied
# (presumably the destination for PDF records that are passed through; its
# use is outside this excerpt).  Gzipped unless disable_pdfs_gzip is set.
if options.pdfpass is not None:
    po = WARCWriter(
        open(options.pdfpass, 'wb'),
        gzip=not options.disable_pdfs_gzip,
    )

# The extractor is built only when no pdfpass target is set and PDF
# extraction was requested.
if not options.pdfpass:
    if options.pdfextract:
        # Imported here so pdfextract is only needed when extraction runs.
        from pdfextract.extract import Extractor as ExtrP
        extractor = ExtrP(
            configFile=options.configFile,
            sentenceJoinPath=options.sentenceJoinPath,
            kenlmPath=options.kenlmPath,
        )

# HTML cleaning is opt-in: `cleaner` is an lxml Cleaner when cleanhtml is
# set and None otherwise.
if options.cleanhtml:
    # Imported here so lxml's cleaner is only needed when cleaning is on.
    from lxml.html.clean import Cleaner
    cleaner = Cleaner(
        style=True,               # strip <style> content
        links=True,               # strip <link> tags
        add_nofollow=True,        # add rel="nofollow" to anchors
        page_structure=False,     # keep <html>/<head>/<title>
        safe_attrs_only=False,    # keep all attributes
    )
else:
    cleaner = None

# `filename` is the output path, or an empty string when the output is
# standard output (the sys.stdout object or '-').  Its consumer lies beyond
# the end of this excerpt.
filename = "" if options.output in (sys.stdout, '-') else options.output