Exemple #1
0
def main(argv):
    """Parse each input named on the command line and print its parse tree.

    argv[0] is skipped; every remaining element is handed to ``getfp``.
    When no paths are given, the single path '-' is used instead.  Each
    path is echoed to stderr before it is processed, and the tree is
    written to stdout with one space of indentation per nesting level.

    Raises AssertionError if the tree contains a node of an unexpected type.
    Returns None.
    """
    from utils import getfp

    def dump(node, depth=0):
        # Defined once, outside the loop: it does not depend on the
        # per-file state, so there is no need to rebuild it per path.
        pad = ' '*depth
        if isinstance(node, WikiTree):
            print (pad+'('+repr(node))
            for child in node:
                dump(child, depth+1)
            print (pad+')')
        elif isinstance(node, (WikiToken, XMLTagToken, basestring)):
            # All leaf kinds are rendered the same way.
            print (pad+repr(node))
        else:
            # Explicit raise (not ``assert``) so that an unknown node is
            # still reported when Python runs with -O.
            raise AssertionError(node)

    args = argv[1:] or ['-']
    codec = 'utf-8'
    for path in args:
        print >>sys.stderr, path
        (_,fp) = getfp(path)
        try:
            parser = WikiTextParser()
            parser.feed_file(fp, codec=codec)
            parser.close()
        finally:
            # Release the input even when parsing fails part-way.
            fp.close()
        dump(parser.get_root())
    return
Exemple #2
0
 def __init__(self, output=None, pathpat=None, codec="utf-8", titleline=False, mode="page"):
     """Store the output settings and, if requested, open the output.

     At least one of ``output`` and ``pathpat`` must be supplied.  When
     ``output`` is given it is opened for writing through ``getfp`` and
     the second element of the returned pair is kept as the current
     stream.  The page id, title and revision id start out unset.
     """
     # Reject the case where neither destination was supplied.
     assert not (output is None and pathpat is None)
     self.mode = mode
     self.titleline = titleline
     self.codec = codec
     self.pathpat = pathpat
     self._fp = None
     if output is not None:
         opened = getfp(output, mode="w")
         (_unused, self._fp) = opened
     # No page is being processed yet.
     self._pageid = self._title = self._revid = None
Exemple #3
0
def main(argv):
    """Print ``(pageid, title)`` for the revisions found in each input.

    argv[0] is skipped; every remaining element is handed to ``getfp``.
    When no paths are given, the single path '-' is used instead.  Each
    path is echoed to stderr before it is processed.

    Returns 0.
    """
    from utils import getfp

    class TitleExtractor(MWXMLDumpParser):
        """Parser subclass that prints the page id and title it is given."""

        def start_revision(self, pageid, title, revid, timestamp):
            print (pageid, title)
            return

    args = argv[1:] or ['-']
    for path in args:
        print >>sys.stderr, path
        (_,fp) = getfp(path)
        try:
            parser = TitleExtractor()
            parser.feed_file(fp)
            parser.close()
        finally:
            # Release the input even when parsing fails part-way.
            fp.close()
    return 0
Exemple #4
0
 def add_data(self, pageid, revid, data):
     """Write one revision's text to the current output stream.

     ``pageid`` and ``revid`` must match the ids recorded on the
     instance, and a title must already be set.  When ``pathpat`` is
     configured, the current stream (if any) is closed and a new one is
     opened through ``getfp``; the path is ``pathpat`` formatted with
     ``name`` (the title, UTF-8 then quoted-printable encoded) and
     ``pageid``.  The record is the optional title line, the encoded
     text, then a newline followed by a form feed.
     """
     assert self._pageid == pageid
     assert self._title is not None
     assert self._revid == revid
     if self.pathpat is not None:
         # Per-page output: retire the previous stream first.
         if self._fp is not None:
             self._fp.close()
         quoted = self._title.encode('utf-8').encode('quopri_codec')
         fields = {'name': quoted, 'pageid': pageid}
         (_unused, self._fp) = getfp(self.pathpat % fields, 'w')
     assert self._fp is not None
     out = self._fp
     if self.titleline:
         heading = self._title.encode(self.codec, 'ignore')
         out.write(heading+'\n')
     # Characters the codec cannot represent are dropped ('ignore').
     body = data.encode(self.codec, 'ignore')
     out.write(body)
     out.write('\n\f')
Exemple #5
0
 def __init__(self,
              output=None,
              pathpat=None,
              codec='utf-8',
              titleline=False,
              mode='page'):
     """Record the output configuration and open ``output`` if given.

     Either ``output`` or ``pathpat`` (or both) has to be provided.
     ``output`` is opened for writing via ``getfp``; the second element
     of the pair it returns becomes the current stream.  The tracked
     page id, title and revision id are initialised to None.
     """
     has_destination = output is not None or pathpat is not None
     assert has_destination
     # Plain configuration, stored as given.
     self.pathpat, self.codec = pathpat, codec
     self.titleline, self.mode = titleline, mode
     self._fp = None
     if output is not None:
         (_ignored, stream) = getfp(output, mode='w')
         self._fp = stream
     # Nothing has been seen yet.
     self._pageid = None
     self._title = None
     self._revid = None
Exemple #6
0
 def add_data(self, pageid, revid, data):
     """Write one revision's text in the configured output mode.

     ``pageid`` and ``revid`` must match the ids recorded on the
     instance, and a title must already be set.  When ``pathpat`` is
     configured, the current stream (if any) is closed and a new one is
     opened through ``getfp``; the path is ``pathpat`` formatted with
     ``name`` (the title, UTF-8 then quoted-printable encoded) and
     ``pageid``.

     In "page" mode the record is the optional title line, the encoded
     text, then a newline followed by a form feed.  In any other mode
     the record is a single line: title, tab, text, newline.
     """
     assert self._pageid == pageid
     assert self._title is not None
     assert self._revid == revid
     if self.pathpat is not None:
         # Per-page output: retire the previous stream first.
         if self._fp is not None:
             self._fp.close()
         quoted = self._title.encode("utf-8").encode("quopri_codec")
         target = self.pathpat % {"name": quoted, "pageid": pageid}
         (_unused, self._fp) = getfp(target, "w")
     assert self._fp is not None
     out = self._fp
     if self.mode == "page":
         if self.titleline:
             heading = self._title.encode(self.codec, "ignore")
             out.write(heading + "\n")
         out.write(data.encode(self.codec, "ignore"))
         out.write("\n\f")
         return
     # Any other mode: one tab-separated line per record.
     heading = self._title.encode(self.codec, "ignore")
     out.write(heading + "\t")
     body = data.encode(self.codec, "ignore")
     out.write(body + "\n")
Exemple #7
0
 def add_data(self, pageid, revid, data):
     """Emit the text of one revision to the current output stream.

     The given ``pageid`` and ``revid`` are checked against the ids held
     on the instance, and a title must be present.  If ``pathpat`` is
     set, any open stream is closed and ``getfp`` opens a fresh one at
     ``pathpat`` formatted with ``name`` (title encoded as UTF-8, then
     quoted-printable) and ``pageid``.

     With mode 'page': optional title line, encoded text, newline plus
     form feed.  Otherwise: title, tab, text, newline on one line.
     """
     assert self._pageid == pageid
     assert self._title is not None
     assert self._revid == revid
     if self.pathpat is not None:
         previous = self._fp
         if previous is not None:
             previous.close()
         safe_name = self._title.encode('utf-8').encode('quopri_codec')
         values = {'name': safe_name, 'pageid': pageid}
         (_unused, self._fp) = getfp(self.pathpat % values, 'w')
     assert self._fp is not None
     emit = self._fp.write
     if self.mode == 'page':
         # Page layout: optional heading, body, then a form-feed break.
         if self.titleline:
             emit(self._title.encode(self.codec, 'ignore') + '\n')
         emit(data.encode(self.codec, 'ignore'))
         emit('\n\f')
     else:
         # Line layout: heading and body separated by a tab.
         emit(self._title.encode(self.codec, 'ignore') + '\t')
         emit(data.encode(self.codec, 'ignore') + '\n')
Exemple #8
0
def main(argv):
    """Tokenize each input named on the command line and print the events.

    argv[0] is skipped; every remaining element is handed to ``getfp``.
    When no paths are given, the single path "-" is used instead.  Each
    path is echoed to stderr before it is processed.  Every text run and
    token the tokenizer reports is printed together with its position.

    Returns None.
    """
    from utils import getfp

    class Tokenizer(WikiTextTokenizer):
        """Tokenizer subclass that prints whatever its handlers receive."""

        def handle_text(self, pos, text):
            print(pos, text)
            return

        def handle_token(self, pos, token):
            print(pos, token)
            return

    args = argv[1:] or ["-"]
    codec = "utf-8"
    for path in args:
        print >>sys.stderr, path
        (_, fp) = getfp(path)
        try:
            tokenizer = Tokenizer()
            tokenizer.feed_file(fp, codec=codec)
            tokenizer.close()
        finally:
            # Release the input even when tokenizing fails part-way.
            fp.close()
    return
Exemple #9
0
def main(argv):
    """Run the wikitext tokenizer over each input and print what it emits.

    argv[0] is skipped; every remaining element is handed to ``getfp``.
    When no paths are given, the single path '-' is used instead.  Each
    path is echoed to stderr before it is processed.  Both handlers
    print the position alongside the text or token they were given.

    Returns None.
    """
    from utils import getfp

    class Tokenizer(WikiTextTokenizer):
        """Tokenizer subclass whose handlers print their arguments."""

        def handle_text(self, pos, text):
            print(pos, text)
            return

        def handle_token(self, pos, token):
            print(pos, token)
            return

    args = argv[1:] or ['-']
    codec = 'utf-8'
    for path in args:
        print >> sys.stderr, path
        (_, fp) = getfp(path)
        try:
            tokenizer = Tokenizer()
            tokenizer.feed_file(fp, codec=codec)
            tokenizer.close()
        finally:
            # Release the input even when tokenizing fails part-way.
            fp.close()
    return