#!/usr/bin/env python
"""Collect DMOZ pages filed under the Lawrence, Kansas locality into seeds.txt."""
from parser import DmozParser
from handlers import JSONWriter


class LawrenceFilter:
    """Parser handler that keeps pages whose topic mentions Lawrence, Kansas.

    Matching page identifiers are written one per line to ``seeds.txt`` and
    echoed to stdout. The output file stays open until ``finish()`` is called.
    """

    # Topic fragment a page must contain to be kept.
    TOPIC_FRAGMENT = 'United_States/Kansas/Localities/L/Lawrence'

    def __init__(self):
        self._file = open("seeds.txt", 'w')

    def page(self, page, content):
        """Record ``page`` if ``content['topic']`` contains the Lawrence fragment.

        ``page`` is the page identifier (None or an empty string is ignored);
        ``content`` must be indexable by ``'topic'``.
        """
        if not page:
            return
        topic = content['topic']
        # `in` rather than `find(...) > 0`: find() returns 0 for a match at
        # the very start of the topic and -1 for "not found", so `> 0`
        # silently dropped a leading match.
        if self.TOPIC_FRAGMENT in topic:
            self._file.write(page + "\n")
            # Parenthesised form prints identically on Python 2 and 3.
            print("found page %s in topic %s" % (page, topic))

    def finish(self):
        """Close the seeds file."""
        self._file.close()


parser = DmozParser()
parser.add_handler(
    LawrenceFilter()
    # JSONWriter('output.json')
)
parser.run()
#!/usr/bin/env python
"""Register a JSONWriter('output.json') handler on a DmozParser and run it."""
from parser import DmozParser
from handlers import JSONWriter

parser = DmozParser()

# Build the handler separately so the registration call reads on its own.
json_handler = JSONWriter('output.json')
parser.add_handler(json_handler)

parser.run()
#   <d:Title>Animation World Network</d:Title>
#   <d:Description>Provides information resources to the international animation community. Features include searchable database archives, monthly magazine, web animation guide, the Animation Village, discussion forums and other useful resources.</d:Description>
#   <priority>1</priority>
#   <topic>Top/Arts/Animation</topic>
# </ExternalPage>
# This assumption is strictly checked, and processing will abort if it is violated.
# To use this parser, one should unpack the content.rdf.u8.gz first


class Filter:
    """Parser handler that keeps pages whose topic contains a configured category.

    The category is the first line of ``category_path`` (surrounding
    whitespace stripped). Matching page identifiers are written one per line
    to ``output_path`` and echoed to stdout. The output file stays open until
    ``finish()`` is called.
    """

    def __init__(self, category_path="category.txt", output_path="seeds.txt"):
        # Read the category once here instead of re-opening the file for
        # every page the parser reports.
        with open(category_path) as category_file:
            self._category = category_file.readline().strip()
        self._file = open(output_path, 'w')

    def page(self, page, content):
        """Record ``page`` if ``content['topic']`` contains the category.

        ``page`` is the page identifier (None or an empty string is ignored);
        ``content`` must be indexable by ``'topic'``.
        """
        if not page:
            return
        topic = content['topic']
        # `in` rather than `find(...) > 0`: find() returns 0 for a match at
        # the very start of the topic, which `> 0` silently dropped.
        # An empty category is still treated as "match nothing", because the
        # empty string is a substring of every topic.
        if self._category and self._category in topic:
            self._file.write(page + "\n")
            # Parenthesised form prints identically on Python 2 and 3.
            print("found page %s in topic %s" % (page, topic))

    def finish(self):
        """Close the seeds file."""
        self._file.close()


parser = DmozParser()
parser.add_handler(Filter())
parser.run()
# <ExternalPage about="http://www.awn.com/">
#   <d:Title>Animation World Network</d:Title>
#   <d:Description>Provides information resources to the international animation community. Features include searchable database archives, monthly magazine, web animation guide, the Animation Village, discussion forums and other useful resources.</d:Description>
#   <priority>1</priority>
#   <topic>Top/Arts/Animation</topic>
# </ExternalPage>
# This assumption is strictly checked, and processing will abort if it is violated.
# To use this parser, one should unpack the content.rdf.u8.gz first


class Filter:
    """Parser handler that keeps pages whose topic contains a configured category.

    The category is the first line of ``category_path`` (surrounding
    whitespace stripped). Matching page identifiers are written one per line
    to ``output_path`` and echoed to stdout. The output file stays open until
    ``finish()`` is called.
    """

    def __init__(self, category_path="category.txt", output_path="seeds.txt"):
        # Load the category a single time; the parser may report a very
        # large number of pages and the file does not need re-reading.
        with open(category_path) as category_file:
            self._category = category_file.readline().strip()
        self._file = open(output_path, 'w')

    def page(self, page, content):
        """Record ``page`` if ``content['topic']`` contains the category.

        ``page`` is the page identifier (None or an empty string is ignored);
        ``content`` must be indexable by ``'topic'``.
        """
        if not page:
            return
        topic = content['topic']
        # Substring test instead of `find(...) > 0`, which treated a match at
        # index 0 as a miss (find() reports "not found" as -1, not 0).
        # Guard against an empty category so it keeps matching nothing.
        if self._category and self._category in topic:
            self._file.write(page + "\n")
            # Parenthesised form prints identically on Python 2 and 3.
            print("found page %s in topic %s" % (page, topic))

    def finish(self):
        """Close the seeds file."""
        self._file.close()


parser = DmozParser()
parser.add_handler(Filter())
parser.run()
#!/usr/bin/env python
"""Collect DMOZ pages whose topic mentions venture or financial services."""
from parser import DmozParser
from handlers import JSONWriter


class LawrenceFilter:
    """Parser handler that keeps pages filed under finance-related topics.

    Despite its name, this filter matches topics containing ``Venture`` or
    ``Financial_Services``. Each match is written to ``seeds.txt`` as
    ``"<page> <topic>"`` on its own line and echoed to stdout. The output
    file stays open until ``finish()`` is called.
    """

    # Topic fragments; a page is kept when its topic contains any of them.
    TOPIC_KEYWORDS = ('Venture', 'Financial_Services')

    def __init__(self):
        self._file = open("seeds.txt", 'w')

    def page(self, page, content):
        """Record ``page`` if ``content['topic']`` contains a tracked keyword.

        ``page`` is the page identifier (None or an empty string is ignored);
        ``content`` must be indexable by ``'topic'``.
        """
        if not page:
            return
        topic = content['topic']
        # `in` rather than `find(...) > 0`: find() returns 0 for a match at
        # the very start of the topic and -1 for "not found", so `> 0`
        # silently dropped a leading match.
        if any(keyword in topic for keyword in self.TOPIC_KEYWORDS):
            self._file.write(page + " " + topic + "\n")
            print("found page %s in topic %s" % (page, topic))

    def finish(self):
        """Close the seeds file."""
        self._file.close()


parser = DmozParser()
parser.add_handler(
    LawrenceFilter()
    # JSONWriter('output.json')
)
parser.run()
#!/usr/bin/env python
"""Run a DmozParser over ../content.rdf.u8.gz with a TaxonomieWriter handler.

Logging is configured at INFO level before the parser is created.
"""
import logging

from parser import DmozParser
from handlers import JSONWriter
from handlers import TaxonomieWriter

logger = logging.getLogger(__name__)

# Timestamp, level and call site (module:function:line) precede each message.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(module)s:%(funcName)s:%(lineno)d : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

parser = DmozParser()
parser.input_path = '../content.rdf.u8.gz'

taxonomy_handler = TaxonomieWriter('output.json')
parser.add_handler(taxonomy_handler)

parser.run()