Ejemplo n.º 1
0
def main(argv):
    """Run the wiki conversion command line.

    Parses ``argv`` with ``getopt``, builds a writer (``WikiDBWriter`` when
    the output name ends in ``.cdb``, otherwise ``WikiFileWriter``) and feeds
    every input path through a ``Converter``.

    Options:
        -d  pass ``sys.stderr`` to the converter as ``errfp``
        -o  output name (default ``'-'``)
        -P  ``pathpat`` handed to ``WikiFileWriter``
        -c  codec (default ``'utf-8'``)
        -T  set ``titleline=True`` for ``WikiFileWriter``
        -Z  use ``ext='.gz'`` for the CDB reader/writer
        -L  use ``WikiLinkExtractor`` instead of ``WikiTextExtractor``

    Args:
        argv: Full argument list; ``argv[0]`` is the program name shown in
            the usage line. With no positional arguments the single input
            path ``'-'`` is used.

    Returns:
        100 after printing the usage line on an option error, else None.
    """
    import getopt

    def usage():
        # Format the message *inside* the call: applying ``%`` to the value
        # returned by print() raises TypeError on Python 3 (it is None).
        print('usage: %s [-L] [-d] [-o output] [-P pathpat] [-c codec] [-T] [-Z] '
              '[file ...]' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'Ldo:P:c:TZ')
    except getopt.GetoptError:
        return usage()
    args = args or ['-']
    errfp = None
    output = '-'
    codec = 'utf-8'
    ext = ''
    pathpat = None
    titleline = False
    klass = WikiTextExtractor
    for (k, v) in opts:
        if k == '-d':
            errfp = sys.stderr
        elif k == '-o':
            output = v
        elif k == '-P':
            pathpat = v
        elif k == '-c':
            codec = v
        elif k == '-T':
            titleline = True
        elif k == '-Z':
            ext = '.gz'
        elif k == '-L':
            klass = WikiLinkExtractor
    if output.endswith('.cdb'):
        writer = WikiDBWriter(output, codec=codec, ext=ext)
    else:
        writer = WikiFileWriter(
            output=output, pathpat=pathpat,
            codec=codec, titleline=titleline)
    try:
        converter = Converter(writer, klass, errfp=errfp)
        for path in args:
            if path.endswith('.cdb'):
                # CDB input: replay every stored revision of every page.
                reader = WikiDBReader(path, codec=codec, ext=ext)
                for pageid in reader:
                    (title, revids) = reader[pageid]
                    converter.add_page(pageid, title)
                    for revid in revids:
                        wiki = reader.get_wiki(pageid, revid)
                        converter.add_revid(pageid, revid)
                        converter.feed_text(pageid, revid, wiki)
            else:
                # getfp() also returns a path; that value (not the raw
                # argument) decides between the XML and plain branches.
                (path, fp) = getfp(path)
                try:
                    if path.endswith('.xml'):
                        parser = MWDump2Text(converter)
                        parser.feed_file(fp)
                        parser.close()
                    else:
                        # Anything else is fed as one page with id 0, rev 0.
                        converter.add_page(0, path)
                        converter.add_revid(0, 0)
                        converter.feed_file(0, 0, fp, codec=codec)
                finally:
                    # Close the input even when parsing/conversion raises.
                    fp.close()
        converter.close()
    finally:
        writer.close()
    return
Ejemplo n.º 2
0
def main(argv):
    """Feed each input path to a fresh ``WikiAgeExtractor``.

    Args:
        argv: Full argument list; ``argv[0]`` is skipped and every remaining
            item is treated as an input path. With no paths given, the
            single path ``'-'`` is used.

    Returns:
        None.
    """
    args = argv[1:] or ['-']
    for path in args:
        # Echo the path being processed on stderr. write() is used because
        # the ``print >>stream`` statement form only works on Python 2.
        sys.stderr.write(str(path) + '\n')
        (_, fp) = getfp(path)
        try:
            parser = WikiAgeExtractor(time.time())
            parser.feed_file(fp)
            parser.close()
        finally:
            # Release the input even if the extractor raises.
            fp.close()
    return
Ejemplo n.º 3
0
def main(argv):
    """Run ``WikiAgeExtractor`` over every path named on the command line.

    Args:
        argv: Full argument list. Everything after ``argv[0]`` is an input
            path; when there are none, ``['-']`` is processed instead.

    Returns:
        None.
    """
    args = argv[1:] or ['-']
    for path in args:
        # Report progress on stderr in a way that runs on both Python 2 and
        # Python 3 (the original ``print >> sys.stderr`` is Python 2 only).
        sys.stderr.write(str(path) + '\n')
        (_, fp) = getfp(path)
        try:
            # A new extractor per file, created with the current time.
            parser = WikiAgeExtractor(time.time())
            parser.feed_file(fp)
            parser.close()
        finally:
            # Always close the input, including on extractor failure.
            fp.close()
    return
Ejemplo n.º 4
0
def main(argv):
    """Write pages stored in CDB files to the object returned by ``getfp``.

    Positional arguments that name existing files are opened with
    ``WikiDBReader``; every other positional argument is kept as a page id.
    When no page ids are given, each reader is iterated instead.

    Options:
        -o  output name passed to ``getfp`` (default ``"-"``)
        -c  codec (default ``"utf-8"``)
        -w  fetch revisions with ``get_wiki`` instead of ``get_text``
        -T  write each page title on its own line before its revisions
        -Z  pass ``ext=".gz"`` to the readers

    Returns:
        100 after printing the usage line (bad option or no positional
        arguments), else None.
    """
    import getopt

    def usage():
        message = "usage: %s {-w} [-c codec] [-o output] [-T] [-Z] cdbfile [pageid ...]"
        print(message % argv[0])
        return 100

    try:
        opts, args = getopt.getopt(argv[1:], "wo:c:TZ")
    except getopt.GetoptError:
        return usage()

    # Table-driven option handling: valued options store their argument,
    # switches store a fixed value. Later occurrences overwrite earlier ones.
    config = {
        "text": True,
        "output": "-",
        "codec": "utf-8",
        "ext": "",
        "titleline": False,
    }
    valued = {"-o": "output", "-c": "codec"}
    switches = {
        "-w": ("text", False),
        "-T": ("titleline", True),
        "-Z": ("ext", ".gz"),
    }
    for flag, value in opts:
        if flag in valued:
            config[valued[flag]] = value
        elif flag in switches:
            key, setting = switches[flag]
            config[key] = setting

    if not args:
        return usage()

    codec = config["codec"]
    (_, outfp) = getfp(config["output"], "w")

    # Split positional arguments into CDB readers and page ids.
    readers = []
    pageids = []
    for arg in args:
        if not os.path.isfile(arg):
            pageids.append(arg)
            continue
        readers.append(WikiDBReader(arg, codec=codec, ext=config["ext"]))

    fetch_name = "get_text" if config["text"] else "get_wiki"
    for reader in readers:
        for pageid in pageids or iter(reader):
            try:
                title, revids = reader[pageid]
            except KeyError:
                # Page not present in this reader: skip it.
                continue
            if config["titleline"]:
                outfp.write(title.encode(codec, "ignore") + "\n")
            for revid in revids:
                try:
                    data = getattr(reader, fetch_name)(pageid, revid)
                except KeyError:
                    # Revision missing: skip it.
                    continue
                outfp.write(data.encode(codec, "ignore"))
    return
Ejemplo n.º 5
0
def main(argv):
    """Command-line entry point that writes out pages read from CDB files.

    Each positional argument that is an existing file becomes a
    ``WikiDBReader``; the remaining ones are collected as page ids. If no
    page ids were collected, every id yielded by iterating the reader is
    used.

    Options:
        -o  output name handed to ``getfp`` (default ``'-'``)
        -c  codec (default ``'utf-8'``)
        -w  read revisions via ``get_wiki`` rather than ``get_text``
        -T  emit the page title plus a newline before its revisions
        -Z  open readers with ``ext='.gz'``

    Returns:
        100 when the usage line is printed (option error or no positional
        arguments); None otherwise.
    """
    import getopt

    def usage():
        template = ('usage: %s {-w} [-c codec] [-o output] [-T] [-Z] '
                    'cdbfile [pageid ...]')
        print(template % argv[0])
        return 100

    try:
        opts, args = getopt.getopt(argv[1:], 'wo:c:TZ')
    except getopt.GetoptError:
        return usage()

    # dict() keeps the last value for a repeated option, matching a
    # sequential "last one wins" scan over the option list.
    chosen = dict(opts)
    output = chosen.get('-o', '-')
    codec = chosen.get('-c', 'utf-8')
    text = '-w' not in chosen
    titleline = '-T' in chosen
    ext = '.gz' if '-Z' in chosen else ''

    if not args:
        return usage()
    (_, outfp) = getfp(output, 'w')

    readers = []
    pageids = []
    for arg in args:
        if os.path.isfile(arg):
            readers.append(WikiDBReader(arg, codec=codec, ext=ext))
            continue
        pageids.append(arg)

    def fetch(reader, pageid, revid):
        # Pick the accessor selected by -w.
        if text:
            return reader.get_text(pageid, revid)
        return reader.get_wiki(pageid, revid)

    def dump_page(reader, pageid):
        # Write one page; unknown pages and revisions are silently skipped.
        try:
            title, revids = reader[pageid]
        except KeyError:
            return
        if titleline:
            outfp.write(title.encode(codec, 'ignore') + '\n')
        for revid in revids:
            try:
                data = fetch(reader, pageid, revid)
            except KeyError:
                continue
            outfp.write(data.encode(codec, 'ignore'))

    for reader in readers:
        for pageid in (pageids or iter(reader)):
            dump_page(reader, pageid)
    return
Ejemplo n.º 6
0
def main(argv):
    """Feed input files to ``MWXMLDump2DB`` backed by a freshly built writer.

    The writer is a ``WikiDBWriter`` when the output name ends in ``.cdb``
    and a ``WikiFileWriter`` otherwise.

    Options:
        -o  output name (default ``'-'``)
        -P  ``pathpat`` for ``WikiFileWriter``
        -c  codec (default ``'utf-8'``)
        -T  ``titleline=True`` for ``WikiFileWriter``
        -Z  ``ext='.gz'`` for ``WikiDBWriter``

    Args:
        argv: Full argument list; with no positional arguments the single
            input path ``'-'`` is used.

    Returns:
        100 after printing the usage line on an option error, else None.
    """
    import getopt

    def usage():
        message = ('usage: %s [-o output] [-P pathpat] [-c codec] [-T] [-Z] '
                   '[file ...]')
        print(message % argv[0])
        return 100

    try:
        opts, paths = getopt.getopt(argv[1:], 'o:P:c:TZ')
    except getopt.GetoptError:
        return usage()
    if not paths:
        paths = ['-']

    # Options that carry a value are collected in a table keyed by flag;
    # the two plain switches are tracked separately.
    values = {'-o': '-', '-P': None, '-c': 'utf-8'}
    titleline = False
    ext = ''
    for flag, value in opts:
        if flag in values:
            values[flag] = value
        elif flag == '-T':
            titleline = True
        elif flag == '-Z':
            ext = '.gz'
    output = values['-o']
    pathpat = values['-P']
    codec = values['-c']

    if not output.endswith('.cdb'):
        writer = WikiFileWriter(
            output=output, pathpat=pathpat,
            codec=codec, titleline=titleline)
    else:
        writer = WikiDBWriter(output, codec=codec, ext=ext)

    parser = MWXMLDump2DB(writer)
    for path in paths:
        (_, fp) = getfp(path)
        try:
            parser.feed_file(fp)
        finally:
            fp.close()
            # NOTE(review): the parser is created once but closed after
            # every file; confirm it may be fed again after close().
            parser.close()
    return
Ejemplo n.º 7
0
def main(argv):
    """Run the wiki conversion command line (text, link or category mode).

    Parses ``argv`` with ``getopt``, builds a writer (``WikiDBWriter`` when
    the output name ends in ``.cdb``, otherwise ``WikiFileWriter``) and feeds
    every input path through a ``Converter``.

    Options:
        -d  pass ``sys.stderr`` to the converter as ``errfp``
        -o  output name (default ``'-'``)
        -P  ``pathpat`` handed to ``WikiFileWriter``
        -c  codec (default ``'utf-8'``)
        -m  ``mode`` handed to ``WikiFileWriter`` (default ``'page'``)
        -T  set ``titleline=True`` for ``WikiFileWriter``
        -Z  use ``ext='.gz'`` for the CDB reader/writer
        -L  use ``WikiLinkExtractor`` as the extractor class
        -C  use ``WikiCategoryExtractor`` as the extractor class

    Args:
        argv: Full argument list; ``argv[0]`` is the program name shown in
            the usage line. With no positional arguments the single input
            path ``'-'`` is used.

    Returns:
        100 after printing the usage line on an option error, else None.
    """
    import getopt

    def usage():
        # The ``%`` must be applied to the string, not to print()'s return
        # value (None on Python 3, which raised TypeError before this fix).
        print(
            'usage: %s [-L|-C] [-d] [-o output] [-P pathpat] [-c codec] [-T] [-Z] '
            '[file ...]' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'LCdo:P:c:m:TZ')
    except getopt.GetoptError:
        return usage()
    args = args or ['-']
    errfp = None
    output = '-'
    codec = 'utf-8'
    ext = ''
    pathpat = None
    mode = 'page'
    titleline = False
    klass = WikiTextExtractor
    for (k, v) in opts:
        if k == '-d':
            errfp = sys.stderr
        elif k == '-o':
            output = v
        elif k == '-P':
            pathpat = v
        elif k == '-c':
            codec = v
        elif k == '-m':
            mode = v
        elif k == '-T':
            titleline = True
        elif k == '-Z':
            ext = '.gz'
        elif k == '-L':
            klass = WikiLinkExtractor
        elif k == '-C':
            klass = WikiCategoryExtractor
    if output.endswith('.cdb'):
        writer = WikiDBWriter(output, codec=codec, ext=ext)
    else:
        writer = WikiFileWriter(output=output,
                                pathpat=pathpat,
                                codec=codec,
                                titleline=titleline,
                                mode=mode)
    try:
        converter = Converter(writer, klass, errfp=errfp)
        for path in args:
            if path.endswith('.cdb'):
                # CDB input: replay every stored revision of every page.
                reader = WikiDBReader(path, codec=codec, ext=ext)
                for pageid in reader:
                    (title, revids) = reader[pageid]
                    converter.add_page(pageid, title)
                    for revid in revids:
                        wiki = reader.get_wiki(pageid, revid)
                        converter.add_revid(pageid, revid)
                        converter.feed_text(pageid, revid, wiki)
            else:
                # getfp() also returns a path; that value (not the raw
                # argument) decides between the XML and plain branches.
                (path, fp) = getfp(path)
                try:
                    if path.endswith('.xml'):
                        parser = MWDump2Text(converter)
                        parser.feed_file(fp)
                        parser.close()
                    else:
                        # Anything else is fed as one page with id 0, rev 0.
                        converter.add_page(0, path)
                        converter.add_revid(0, 0)
                        converter.feed_file(0, 0, fp, codec=codec)
                finally:
                    # Close the input even when parsing/conversion raises.
                    fp.close()
        converter.close()
    finally:
        writer.close()
    return