コード例 #1
0
ファイル: sample.py プロジェクト: munichong/dmoz-parser
#!/usr/bin/env python

from parser import DmozParser
from handlers import JSONWriter

class LawrenceFilter:
    """Parser handler that collects pages filed under Lawrence, Kansas.

    URLs of matching pages are written, one per line, to ``seeds.txt`` in
    the current working directory. The file is opened in write mode (so any
    previous content is truncated) when the handler is constructed and is
    closed by :meth:`finish`.
    """

    # Topic path fragment a page's topic must contain to be recorded.
    TOPIC_FRAGMENT = 'United_States/Kansas/Localities/L/Lawrence'

    def __init__(self):
        self._file = open("seeds.txt", 'w')

    def page(self, page, content):
        """Record ``page`` if its topic contains :attr:`TOPIC_FRAGMENT`.

        Args:
            page: URL of the page; ``None`` and ``""`` are ignored.
            content: Object supporting ``content['topic']``; the topic is
                expected to be a string.
        """
        if page is None or page == "":
            return
        topic = content['topic']
        # Use `in` rather than `find(...) > 0` so that a fragment located at
        # the very start of the topic is matched as well.
        if self.TOPIC_FRAGMENT in topic:
            self._file.write(page + "\n")
            # Parenthesised single-argument form prints identically on
            # Python 2 and Python 3.
            print("found page %s in topic %s" % (page, topic))

    def finish(self):
        """Close the seeds file."""
        self._file.close()


# Run the DMOZ parser with a LawrenceFilter as its only handler.
# (JSONWriter('output.json') can be passed to add_handler instead.)
parser = DmozParser()
parser.add_handler(LawrenceFilter())
parser.run()
コード例 #2
0
#!/usr/bin/env python

from parser import DmozParser
from handlers import JSONWriter

# Attach a JSONWriter targeting output.json, then run the parser.
parser = DmozParser()
json_writer = JSONWriter('output.json')
parser.add_handler(json_writer)
parser.run()
コード例 #3
0
#   <d:Title>Animation World Network</d:Title>
#   <d:Description>Provides information resources to the international animation community. Features include searchable database archives, monthly magazine, web animation guide, the Animation Village, discussion forums and other useful resources.</d:Description>
#   <priority>1</priority>
#   <topic>Top/Arts/Animation</topic>
# </ExternalPage>
# This assumption is strictly checked, and processing will abort if it is violated.
# To use this parser, first unpack content.rdf.u8.gz.

class Filter:
    """Parser handler that collects pages whose topic contains a category.

    The category is the first line of ``category.txt`` with surrounding
    whitespace stripped. URLs of matching pages are written, one per line,
    to ``seeds.txt``, which is opened in write mode on construction and
    closed by :meth:`finish`.
    """

    # Cached category string; stays None until category.txt has been read.
    _category = None

    def __init__(self):
        self._file = open("seeds.txt", 'w')

    def _load_category(self):
        """Return the category, reading category.txt only on first use."""
        if self._category is None:
            with open("category.txt") as f:
                self._category = f.readline().strip()
        return self._category

    def page(self, page, content):
        """Record ``page`` if its topic contains the configured category.

        Args:
            page: URL of the page; ``None`` and ``""`` are ignored.
            content: Object supporting ``content['topic']``; the topic is
                expected to be a string.
        """
        if page is None or page == "":
            return
        topic = content['topic']
        category = self._load_category()
        # An empty category matches nothing. Use `in` rather than
        # `find(...) > 0` so a category at the very start of the topic
        # (e.g. "Top/Arts") is matched as well.
        if category and category in topic:
            self._file.write(page + "\n")
            # Parenthesised single-argument form prints identically on
            # Python 2 and Python 3.
            print("found page %s in topic %s" % (page, topic))

    def finish(self):
        """Close the seeds file."""
        self._file.close()


# Run the parser with a single Filter handler attached.
parser = DmozParser()
parser.add_handler(Filter())
parser.run()
コード例 #4
0
# <ExternalPage about="http://www.awn.com/">
#   <d:Title>Animation World Network</d:Title>
#   <d:Description>Provides information resources to the international animation community. Features include searchable database archives, monthly magazine, web animation guide, the Animation Village, discussion forums and other useful resources.</d:Description>
#   <priority>1</priority>
#   <topic>Top/Arts/Animation</topic>
# </ExternalPage>
# This assumption is strictly checked, and processing will abort if it is violated.
# To use this parser, first unpack content.rdf.u8.gz.


class Filter:
    """Write to ``seeds.txt`` every page whose topic contains a category.

    The category comes from the first line of ``category.txt`` (stripped of
    surrounding whitespace). ``seeds.txt`` is opened in write mode when the
    handler is created and closed by :meth:`finish`.
    """

    # Category read from category.txt; None means "not loaded yet".
    _category = None

    def __init__(self):
        self._file = open("seeds.txt", 'w')

    def _get_category(self):
        """Read category.txt on the first call and reuse the value afterwards."""
        if self._category is None:
            with open("category.txt") as category_file:
                self._category = category_file.readline().strip()
        return self._category

    def page(self, page, content):
        """Append ``page`` to the seeds file when its topic matches.

        Args:
            page: URL of the page; ``None`` and ``""`` are skipped.
            content: Object supporting ``content['topic']``; the topic is
                expected to be a string.
        """
        if page is None or page == "":
            return
        topic = content['topic']
        wanted = self._get_category()
        # Substring membership also accepts a category that starts the topic,
        # which `find(...) > 0` would miss; an empty category matches nothing.
        if not wanted or wanted not in topic:
            return
        self._file.write(page + "\n")
        # Single-argument parenthesised print works on Python 2 and 3 alike.
        print("found page %s in topic %s" % (page, topic))

    def finish(self):
        """Close the seeds file."""
        self._file.close()


# Build the parser, attach a Filter handler and process the dump.
parser = DmozParser()
seed_filter = Filter()
parser.add_handler(seed_filter)
parser.run()
コード例 #5
0
ファイル: sample.py プロジェクト: tsa87/newscrawl
#!/usr/bin/env python

from parser import DmozParser
from handlers import JSONWriter


class LawrenceFilter:
    """Parser handler that collects pages in venture/financial-services topics.

    For each matching page a ``"<page> <topic>"`` line is written to
    ``seeds.txt``, which is opened in write mode on construction and closed
    by :meth:`finish`.
    """

    # A page is recorded when its topic contains any of these substrings.
    KEYWORDS = ('Venture', 'Financial_Services')

    def __init__(self):
        self._file = open("seeds.txt", 'w')

    def page(self, page, content):
        """Record ``page`` and its topic if the topic contains a keyword.

        Args:
            page: URL of the page; ``None`` and ``""`` are ignored.
            content: Object supporting ``content['topic']``; the topic is
                expected to be a string.
        """
        if page is None or page == "":
            return
        topic = content['topic']
        # Membership tests (rather than `find(...) > 0`) also accept a
        # keyword located at the very start of the topic.
        if any(keyword in topic for keyword in self.KEYWORDS):
            self._file.write(page + " " + topic + "\n")
            print("found page %s in topic %s" % (page, topic))

    def finish(self):
        """Close the seeds file."""
        self._file.close()


# Run the DMOZ parser with a LawrenceFilter as its only handler.
# (JSONWriter('output.json') can be passed to add_handler instead.)
parser = DmozParser()
parser.add_handler(LawrenceFilter())
parser.run()
コード例 #6
0
ファイル: writeTaxomie.py プロジェクト: chris3456/dmoz-parser
#!/usr/bin/env python

import logging

from parser import DmozParser
from handlers import JSONWriter
from handlers import TaxonomieWriter

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Log at INFO and above; each record carries its timestamp, level and the
# module/function/line that emitted it.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(module)s:%(funcName)s:%(lineno)d : %(message)s',
)

# Point the parser at the gzipped dump one directory up, attach a
# TaxonomieWriter targeting output.json, and run it.
parser = DmozParser()
parser.input_path = '../content.rdf.u8.gz'
taxonomy_writer = TaxonomieWriter('output.json')
parser.add_handler(taxonomy_writer)
parser.run()