Beispiel #1
0
 def insert_url(self,url):
     """Store a seed url in the data collection and queue its outlinks.

     A depth-0 Page is built for ``url`` and processed with
     ``process(False)``, then inserted into ``self.data``. If the insert
     hits a duplicate key, the existing depth-0 document gets the page's
     ``add_data()`` pushed onto it, unless the two compared dates are equal.
     When the task's "repeat" entry is truthy, the document's ``crawl_nb``
     counter is incremented. If the page status is truthy, every outlink is
     inserted into ``self.queue``.

     Returns:
         The queue collection (``self.queue``).
     """
     seed = Page({"url": url, "source_url": "url", "depth": 0}, self.task)
     seed.process(False)

     try:
         self.data.insert_one(seed.set_data())
     except pymongo.errors.DuplicateKeyError:
         # NOTE(review): only the hour is zeroed before comparing, so this
         # is an exact-timestamp match rather than a "same day" match unless
         # the stored dates are already truncated -- confirm the intent.
         today = self.date.replace(hour=0)
         last_seen = (seed.date[-1]).replace(hour=0)
         if last_seen != today:
             self.data.update_one({"url":url, "depth":0}, {"$push":seed.add_data()})
         else:
             print("Already in processing queue today. No need to update then!")

     if self.task["repeat"]:
         self.data.update_one({"url":url}, {"$inc":{"crawl_nb":1}})

     # Nothing to queue when the page status is falsy.
     if not seed.status:
         return self.queue

     for outlink in seed.outlinks:
         try:
             self.queue.insert_one(outlink)
         except pymongo.errors.DuplicateKeyError:
             # Already queued: skip it silently.
             pass
         except pymongo.errors.WriteError:
             print("Error %s" % (outlink,))
     return self.queue
Beispiel #2
0
 def global_crawl(self):
     """Process queued urls until the queue collection is empty.

     Each pass reads the whole queue ordered by ascending depth. Every item
     is wrapped in a Page, processed, and inserted into ``self.data``. When
     the page status is truthy and ``page.depth + 1`` is below
     ``page.max_depth``, outlinks whose url is not yet stored in
     ``self.data`` are added to the queue. The item is removed from the
     queue whether it was newly stored or was already present (duplicate
     key). A "mail" report is emitted at the start of every pass and once
     more when the queue is empty.

     Returns:
         True once the queue is empty.
     """
     logger.debug("***************CRAWL********")
     while self.queue.count() > 0:
         print("%i urls in process" % self.queue.count())
         print("in which %i sources in process" % self.queue.count({"depth":0}))
         self.report.report("mail")
         cursor = self.queue.find(no_cursor_timeout=True).sort([('depth', pymongo.ASCENDING)])
         try:
             for item in cursor:
                 print("%i urls in process" % self.queue.count())

                 page = Page(item, self.task)
                 # Relevance check; the outcome is read back via page.status.
                 page.process()
                 try:
                     # Create and insert the page document.
                     self.data.insert_one(page.set_data())

                     if page.status:
                         # NOTE(review): the "page" type is only set when the
                         # depth limit is not reached -- confirm this is wanted.
                         if page.depth+1 < page.max_depth:
                             added = 0
                             # One query per page instead of one per outlink;
                             # the data collection is not modified while the
                             # outlinks are being queued.
                             known_urls = self.data.distinct("url")
                             for outlink in page.outlinks:
                                 if outlink["url"] in known_urls:
                                     continue
                                 try:
                                     self.queue.insert_one(outlink)
                                 except pymongo.errors.DuplicateKeyError:
                                     continue
                                 # Count only outlinks that were really queued.
                                 added += 1
                             print("adding %i new urls in queue  with depth %i" % (added, page.depth+1))
                             self.data.update_one({"url":item["url"]}, {"$set":{"type": "page"}})
                     else:
                         self.data.update_one({"url":item["url"]}, {"$set":{"type": "log"}})

                     self.data.update_one({"url":item["url"]}, {"$push":page.add_data()})
                 except pymongo.errors.DuplicateKeyError:
                     # The url is already stored: just drop it from the queue.
                     pass
                 self.queue.delete_one({"url": item["url"]})
         finally:
             # Cursors opened with no_cursor_timeout are never reaped by the
             # server, so they must be closed explicitly.
             cursor.close()
         # No end-of-pass report here: the previous call used an undefined
         # name, and a report is emitted right after this point anyway (at
         # the start of the next pass or after the loop).

     logger.debug("***************END********")
     self.report.report("mail")
     return True