Exemple #1
0
 def main(self):
     """Crawl starting from the sites listed in the configuration.

     The work list is seeded from ``self.conf['initial']['sites']`` and is
     consumed from the front.  Each address not seen before is fetched;
     when the response carries a ``Content-Type`` header whose media type
     is exactly ``text/html`` the address is parsed, wrapped in a
     ``Document`` and handed to ``self._follow`` together with the work
     list and the visited list.

     Nothing is returned; the collected documents stay local to the call.
     """
     collected = []
     pending = []
     fetcher = URLOpener()
     url_parser = URLParser()
     # Not referenced again in this method; the construction is kept as-is
     # in case BotDB's constructor has effects -- TODO confirm.
     db = BotDB(self.conf)
     visited = []

     pending.extend(self.conf['initial']['sites'])
     print(pending)

     while pending:
         url = pending.pop(0)

         # Skip addresses that were already handled.
         if url in visited:
             continue
         visited.append(url)

         self.logger.info("Parsing site: {0}".format(url))
         self.logger.info("Len of queue: {0}".format(len(pending)))
         headers, data = fetcher.open(url)

         if 'Content-Type' not in headers:
             continue

         # Drop any parameters (e.g. a charset) after the first ';'.
         media_type = headers['Content-Type'].split(';')[0]
         if media_type != 'text/html':
             continue

         quad = url_parser.parse(url)
         page = Document(quad[0], quad[1], quad[2], quad[3], headers, data)
         collected.append(page)
         self._follow(page, url_parser, pending, visited, quad)
Exemple #2
0
def main():
    """Fetch every configured start site and print the text of each HTML page.

    Loads ``../conf/config.yaml`` through ``get_config``, opens each entry
    of ``conf['initial']['sites']`` and, for responses whose Content-Type
    media type is exactly ``text/html``, parses the site, builds a
    ``Document`` from the first four parse results plus the response
    headers and body, and prints ``doc.get_text()``.

    Responses without a Content-Type header are skipped rather than
    aborting the run.  Nothing is returned.
    """
    conf = get_config('../conf/config.yaml')
    documents = []
    opener = URLOpener()
    parser = URLParser()

    for site in conf['initial']['sites']:
        headers, data = opener.open(site)

        # The stdlib message classes return None from getheader() when the
        # header is absent (assumed to be what URLOpener hands back -- TODO
        # confirm); calling .split() on that would raise AttributeError.
        content_type = headers.getheader('Content-Type')
        if not content_type:
            continue

        # Compare only the media type, ignoring parameters such as charset.
        if content_type.split(';')[0] != 'text/html':
            continue

        typ = parser.parse(site)
        doc = Document(typ[0], typ[1], typ[2], typ[3], headers, data)
        documents.append(doc)

        # Parenthesised single-argument form prints the same value under
        # both the Python 2 print statement and the print function.
        print(doc.get_text())