def split(program, k, dump):
    """Usage: <k> <dump>

    Splits the dumpfile `dump` into separate files, each with at most `k`
    documents.
    """
    # NOTE(review): the docstring starts with a "Usage:" line and so looks
    # like it doubles as command-line help text; it is deliberately left
    # short and the detailed notes live in comments instead.
    #
    # program -- not used by this function.
    # k       -- maximum number of documents per output file; anything
    #            int() accepts, must be >= 1.
    # dump    -- path of an existing dump file.
    #
    # Output files are created next to `dump` and are named
    # "<basename of dump>-0000", "-0001", ...  The first output file is
    # always created, even when the dump yields no documents.
    #
    # On invalid arguments a message is printed and exit(1) is called
    # (which raises SystemExit).
    #
    # Single-argument print(...) behaves the same as the print statement
    # under Python 2 and is also valid Python 3.
    if not os.path.isfile(dump):
        print("fq-docset: cannot access '%s': no such dump" % dump)
        exit(1)

    try:
        k = int(k)
    except ValueError:
        k = None
    if k is None or k < 1:
        print("fq-docset: must provide positive integer `k`")
        exit(1)

    # Imported only after the arguments are known to be good, so that bad
    # input is reported without loading the WARC machinery.
    from freequery.formats.warc import WARCParser, WARCWriter

    def outfile_name(i):
        # Numbered sibling of `dump`, e.g. "/data/dump" -> "/data/dump-0003".
        return os.path.join(os.path.dirname(dump),
                            "%s-%04d" % (os.path.basename(dump), i))

    outfile_i = 0
    outfile_docs = 0
    outfile = open(outfile_name(outfile_i), 'w+b')
    try:
        writer = WARCWriter(outfile)
        with open(dump, 'rb') as infile:
            for doc in WARCParser(infile):
                if outfile_docs >= k:
                    # Current file is full: rotate to the next one.
                    outfile.close()
                    outfile_docs = 0
                    outfile_i += 1
                    outfile = open(outfile_name(outfile_i), 'w+b')
                    writer = WARCWriter(outfile)
                writer.write(doc)
                outfile_docs += 1
    finally:
        # The last output file used to be left open; close it explicitly so
        # buffered data is flushed even if parsing or writing fails.
        # Closing an already-closed file is a no-op.
        outfile.close()
def test_writes_file1(self):
    # Round trip: documents written through WARCWriter must come back from
    # WARCParser equal to the originals and in the same order.
    expected_docs = [fixtures.example, fixtures.apple]

    buf = StringIO.StringIO()
    warc_out = WARCWriter(buf)
    for written in expected_docs:
        warc_out.write(written)

    # Rewind so the parser reads from the beginning of what was written.
    buf.seek(0)
    warc_in = WARCParser(buf)
    for wanted in expected_docs:
        self.assertEquals(wanted, warc_in.next())
# Print the first document with a given URI from a WARC dump.
#
# usage: <script> <dump> <uri>
#
# The matching document is written to stdout in WARC form; nothing is
# written when no document has that URI.
import sys
from freequery.formats.warc import WARCParser, WARCWriter

if len(sys.argv) < 3:
    # stdout carries the WARC output, so the usage text goes to stderr.
    sys.stderr.write("usage: %s <dump> <uri>\n" % sys.argv[0])
    sys.exit(1)

uri = sys.argv[2]

# `with` closes the dump even if parsing or writing raises.
with open(sys.argv[1], 'rb') as infile:
    for doc in WARCParser(infile):
        if doc.uri == uri:
            WARCWriter(sys.stdout).write(doc)
            # Only the first match is emitted.
            break
# Walk "<wiki-path>/articles" and write every *.html file found there to
# stdout as a WARC document; each processed path is echoed to stderr.
import os
import sys
from freequery.document import Document
from freequery.formats.warc import WARCWriter

if len(sys.argv) != 2:
    print("Usage: %s <wiki-path>" % sys.argv[0])
    exit(1)

articles_dir = os.path.join(sys.argv[1], 'articles')
warc_out = WARCWriter(sys.stdout)

for dirpath, _subdirs, filenames in os.walk(articles_dir):
    for filename in filenames:
        if not filename.endswith('.html'):
            continue
        html_path = os.path.join(dirpath, filename)
        with open(html_path, 'rb') as html_file:
            content = html_file.read()
        # The file path is passed as the document's first argument.
        warc_out.write(Document(html_path, content))
        sys.stderr.write(html_path + "\n")
# Convert a delimiter-separated dump (apparently WebBase format, judging by
# the WB_DELIM / 'webbaseheaders' names -- TODO confirm) into a WARC file.
#
# Input:  the file named by sys.argv[1].
# Output: "<input path>.warc", written through WARCWriter.
import sys
from freequery.document import Document
from freequery.formats.warc import WARCWriter

inpath = sys.argv[1]
outpath = inpath + '.warc'

# NOTE(review): neither file is explicitly closed in the visible code.
infile = open(inpath, 'rb')
outfile = open(outpath, 'w+b')
warcwriter = WARCWriter(outfile)

# A line equal to this string starts a new record.
WB_DELIM = "==P=>>>>=i===<<<<=T===>=A===<=!Junghoo!==>\n"

# uri: value of the current record's "URL: " header line, once seen.
uri = None
# raw: list of body lines collected for the current record; it only
# becomes a list when the first delimiter is seen.
raw = None
state = 'raw'
for line in infile:
    if state == 'raw':
        if line == WB_DELIM:
            # A delimiter closes the previous record: emit it if it had a URI.
            # NOTE(review): in the visible code a record is only written when
            # the *next* delimiter arrives, so the final record of the input
            # does not appear to be flushed after the loop -- confirm.
            if uri:
                doc = Document(uri, "".join(raw))
                warcwriter.write(doc)
            uri = None
            raw = []
            state = 'webbaseheaders'
        else:
            # NOTE(review): raw is still None here if the input does not
            # begin with WB_DELIM, in which case append() would fail.
            raw.append(line)
    elif state == 'webbaseheaders':
        if line.startswith('URL: '):
            # Everything after the 5-character "URL: " prefix, trimmed.
            uri = line[5:].strip()
        # A blank line ends this header section.
        # NOTE(review): no branch handling the 'httpheaders' state is visible
        # here, so nothing shown moves the state back to 'raw'; the remainder
        # of the state machine may live outside this excerpt -- confirm.
        if line == "\r\n" or line == "\n":
            state = 'httpheaders'
# Wrap everything read from stdin in a single Document with the URI given
# on the command line, and write it to stdout through WARCWriter.
import sys
from freequery.document import Document
from freequery.formats.warc import WARCWriter

if len(sys.argv) != 2:
    print("usage: %s <uri>" % sys.argv[0])
    exit(1)

target_uri = sys.argv[1]
payload = sys.stdin.read()

# Build the document before creating the writer, as the original did.
document = Document(target_uri, payload)
WARCWriter(sys.stdout).write(document)