Example #1
0
def split(program, k, dump):
    """Usage: <k> <dump>

    Splits the dumpfile `dump` into separate files, each with at most `k` documents.
    """
    # Arguments:
    #   program -- unused by this function.
    #   k       -- maximum number of documents per output file; anything
    #              `int()` accepts, must be >= 1.
    #   dump    -- path of the dump file to split.
    # Output files are created next to `dump` and are named
    # "<basename>-0000", "<basename>-0001", ...
    # On invalid arguments a message is printed and the process exits with
    # status 1.

    # Validate the arguments before doing any other work.
    if not os.path.isfile(dump):
        print("fq-docset: cannot access '%s': no such dump" % dump)
        exit(1)
    try:
        k = int(k)
    except (TypeError, ValueError):
        print("fq-docset: must provide positive integer `k`")
        exit(1)
    if k < 1:
        print("fq-docset: must provide positive integer `k`")
        exit(1)

    from freequery.formats.warc import WARCParser, WARCWriter

    def outfile_name(i):
        # Path of the i-th chunk, in the same directory as the dump.
        return os.path.join(os.path.dirname(dump),
                            "%s-%04d" % (os.path.basename(dump), i))

    outfile_i = 0
    outfile_docs = 0
    outfile = open(outfile_name(outfile_i), 'w+b')
    try:
        writer = WARCWriter(outfile)
        with open(dump, 'rb') as infile:
            for doc in WARCParser(infile):
                if outfile_docs >= k:
                    # Current chunk is full: roll over to the next file.
                    outfile.close()
                    outfile_docs = 0
                    outfile_i += 1
                    outfile = open(outfile_name(outfile_i), 'w+b')
                    writer = WARCWriter(outfile)
                writer.write(doc)
                outfile_docs += 1
    finally:
        # Always close the chunk that is still open, including when parsing
        # or writing raises; closing an already-closed file is harmless.
        outfile.close()
Example #2
0
 def test_writes_file1(self):
     """Write two fixture documents, then parse them back in order."""
     expected_docs = (fixtures.example, fixtures.apple)

     buf = StringIO.StringIO()
     warc_out = WARCWriter(buf)
     for expected in expected_docs:
         warc_out.write(expected)

     # Rewind so the parser reads from the start of what was written.
     buf.seek(0)
     warc_in = WARCParser(buf)
     for expected in expected_docs:
         self.assertEquals(expected, warc_in.next())
Example #3
0
import sys
from freequery.formats.warc import WARCParser, WARCWriter

# Find the first document in a WARC dump whose URI equals the requested one
# and write it to stdout.
#
# Arguments: <dump> <uri>
if len(sys.argv) < 3:
    # stdout is reserved for the WARC output, so usage goes to stderr.
    sys.stderr.write("usage: %s <dump> <uri>\n" % sys.argv[0])
    sys.exit(1)

uri = sys.argv[2]

# The `with` block closes the dump whether or not a match is found.
with open(sys.argv[1], 'rb') as infile:
    for doc in WARCParser(infile):
        if doc.uri == uri:
            w = WARCWriter(sys.stdout)
            w.write(doc)
            # Only the first matching document is emitted.
            break
import os, sys
from freequery.document import Document
from freequery.formats.warc import WARCWriter

# Walk <wiki-path>/articles and write every '.html' file found there to
# stdout as a WARC record; each processed path is echoed on stderr.
if len(sys.argv) != 2:
    print("Usage: %s <wiki-path>" % sys.argv[0])
    exit(1)

wikipath = os.path.join(sys.argv[1], 'articles')
dump_writer = WARCWriter(sys.stdout)

for dirpath, _subdirs, filenames in os.walk(wikipath):
    for filename in filenames:
        # Only HTML articles are exported.
        if not filename.endswith('.html'):
            continue
        article_path = os.path.join(dirpath, filename)
        with open(article_path, 'rb') as handle:
            contents = handle.read()
        # The file path is passed as the document's first argument.
        dump_writer.write(Document(article_path, contents))
        sys.stderr.write(article_path + "\n")
Example #5
0
import sys
from freequery.document import Document
from freequery.formats.warc import WARCWriter

# Convert a delimiter-separated dump (first command-line argument) into a
# WARC file written next to it, with '.warc' appended to the input path.
inpath = sys.argv[1]
outpath = inpath + '.warc'
infile = open(inpath, 'rb')
outfile = open(outpath, 'w+b')
warcwriter = WARCWriter(outfile)

# Exact line (including the trailing newline) that separates records.
WB_DELIM = "==P=>>>>=i===<<<<=T===>=A===<=!Junghoo!==>\n"

# Parser state:
#   uri   -- URL taken from the current record's 'URL: ' header, if any
#   raw   -- list of body lines collected for the current record
#   state -- which section of a record the next line belongs to
uri = None
raw = None
state = 'raw'
for line in infile:
    if state == 'raw':
        if line == WB_DELIM:
            # A delimiter ends the previous record: write it out if a URL
            # was captured for it, then reset for the next record.
            if uri:
                doc = Document(uri, "".join(raw))
                warcwriter.write(doc)
            uri = None
            raw = []
            state = 'webbaseheaders'
        else:
            # NOTE(review): `raw` is still None until the first delimiter is
            # seen, so input that does not begin with WB_DELIM would raise
            # AttributeError here -- confirm the input always starts with it.
            raw.append(line)
    elif state == 'webbaseheaders':
        # Remember the record's URL; other header lines are ignored.
        if line.startswith('URL: '):
            uri = line[5:].strip()
        # A blank line ends this header section.
        # NOTE(review): no branch handling the 'httpheaders' state is visible
        # in this excerpt; the rest of the state machine (and the flush of
        # the final record) is presumably defined further on -- verify.
        if line == "\r\n" or line == "\n":
            state = 'httpheaders'
Example #6
0
import sys
from freequery.document import Document
from freequery.formats.warc import WARCWriter

# Read all of stdin and write it to stdout as a single WARC record for the
# URI given as the only command-line argument.
if len(sys.argv) != 2:
    print("usage: %s <uri>" % sys.argv[0])
    exit(1)

target_uri = sys.argv[1]
payload = sys.stdin.read()
# Build the document before creating the writer, as the original did.
doc = Document(target_uri, payload)
writer = WARCWriter(sys.stdout)
writer.write(doc)