def insert_url(self, url):
    """Insert the url directly into data and its next urls into the seeds."""
    info = Page({"url": url, "source_url": "url", "depth": 0}, self.task)
    info.process(False)
    try:
        self.data.insert_one(info.set_data())
    except pymongo.errors.DuplicateKeyError:
        date = self.date.replace(hour=0)
        p_date = (info.date[-1]).replace(hour=0)
        if p_date == date:
            # already handled today: nothing to update
            print "Already in processing queue today. No need to update then!"
            #self.queue.delete_one({"url": info.url})
            #return self.queue
        else:
            self.data.update_one({"url": url, "depth": 0}, {"$push": info.add_data()})
            if self.task["repeat"]:
                self.data.update_one({"url": url}, {"$inc": {"crawl_nb": 1}})
    if info.status:
        # queue the outgoing links of the page for the next crawl round
        for link in info.outlinks:
            try:
                self.queue.insert_one(link)
            except pymongo.errors.DuplicateKeyError:
                continue
            except pymongo.errors.WriteError:
                print "Error", link
    return self.queue
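# Editor sketch: the DuplicateKeyError handling in insert_url (and in the crawl loops
# below) relies on a unique index on the "url" field of the MongoDB collections. The
# original code does not show that setup; the helper below is a minimal, assumed version
# of it, and the database and collection names are hypothetical.
def ensure_unique_url_index(mongo_uri="mongodb://localhost:27017", db_name="crawl_project"):
    import pymongo
    client = pymongo.MongoClient(mongo_uri)   # assumes a reachable MongoDB instance
    db = client[db_name]
    for coll in ("data", "queue"):
        # a unique index makes insert_one()/insert() raise DuplicateKeyError on repeated urls
        db[coll].create_index("url", unique=True)
    return db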
def controled_crawl(self):
    while self.queue.count() > 0:
        for item in self.queue.find().sort('depth', pymongo.ASCENDING):
            logger.info(item["depth"])
            #logger.info("url %s depth %d" % (item["url"], item['depth']))
            p = Page(item["url"], item["source_url"], item["depth"], item["date"], True)
            if p.fetch():
                a = Article(p.url, p.html, p.source_url, p.depth, p.date, True)
                if a.extract():
                    logging.info("extracted")
                    if a.filter(self.query, self.directory):
                        logging.info("valid")
                        if a.check_depth(a.depth):
                            # queue the article's outgoing links one level deeper
                            a.fetch_links()
                            if len(a.links) > 0:
                                for url, domain in zip(a.links, a.domains):
                                    if url not in self.queue.distinct("url") \
                                            and url not in self.results.distinct("url") \
                                            and url not in self.logs.distinct("url"):
                                        self.queue.insert({"url": url,
                                                           "source_url": item['url'],
                                                           "depth": int(item['depth']) + 1,
                                                           "domain": domain,
                                                           "date": a.date})
                                logging.info("Inserted %d next urls" % len(a.links))
                        try:
                            self.results.insert(a.export())
                        except pymongo.errors.DuplicateKeyError:
                            logging.info("Exists already")
                    else:
                        try:
                            self.logs.insert(a.log())
                        except pymongo.errors.DuplicateKeyError:
                            logging.info("Exists already")
            else:
                try:
                    self.logs.insert(p.log())
                except pymongo.errors.DuplicateKeyError:
                    logging.info("Exists already")
            self.queue.remove(item)
            logging.info("Processing %i urls" % self.queue.count())
            if self.queue.count() == 0:
                break
        if self.queue.count() == 0:
            break
        if self.results.count() > 200000:
            self.queue.drop()
            break
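# For reference, each item read from the queue by controled_crawl and crawler is a
# MongoDB document of the following shape. The field names come from the inserts in
# those functions; the values below are purely illustrative.
EXAMPLE_QUEUE_ITEM = {
    "url": "http://example.com/some-article",   # page to fetch
    "source_url": "http://example.com/",        # page that linked to it
    "depth": 1,                                 # sources sit at depth 0, their links one level deeper
    "domain": "example.com",
    "date": None,                               # inherited from the parent Article's date
}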
def get_albums(artist):
    artist_page = Page(artist)
    discog_section = artist_page.get_section("Discography")
    album_page_names = parse_discog_section(discog_section)
    albums = {}
    for album_name in album_page_names:
        album_page_name = album_page_names[album_name]
        album_page = Page(album_page_name)
        track_section = album_page.get_section("Track listing")
        albums[album_name] = parse_tracklist_section(track_section)
    return albums
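# Editor sketch: a minimal usage example for get_albums, mirroring the step-by-step
# __main__ block at the end of this file. It assumes the argument is a Wikipedia page
# title; the function name print_albums is not part of the original code.
def print_albums(artist):
    albums = get_albums(artist)
    for album_name in albums:
        print album_name.upper()
        for track_name in albums[album_name]:
            print track_name
        print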
def crawler(self):
    logging.info("Crawler activated with query filter %s" % self.target)
    # if self.sources.nb == 0:
    #     sys.exit("Error: no sources found in the project.")
    try:
        self.project.load_sources()
        self.project.load_queue()
        self.project.load_logs()
    except AttributeError:
        self.load_project()
    #logging.info("Begin crawl with %i active urls" % self.sources.active_nb)
    self.push_to_queue()
    logging.info("Processing %i urls" % self.queue.count())
    #print self.queue.list
    while self.queue.count() > 0:
        for item in self.queue.find().sort([("depth", 1)]):
            if item["url"] in self.results.distinct("url"):
                logging.info("in results")
                self.queue.remove(item)
            elif item["url"] in self.logs.distinct("url"):
                logging.info("in logs")
                self.queue.remove(item)
            else:
                #print "Treating", item["url"], item["depth"]
                try:
                    p = Page(item["url"], item["source_url"], item["depth"], item["date"], True)
                except KeyError:
                    p = Page(item["url"], item["source_url"], item["depth"], self.date, True)
                if p.download():
                    a = Article(p.url, p.html, p.source_url, p.depth, p.date, True)
                    if a.extract():
                        # Targeted crawl: filter results for relevance
                        if self.target:
                            if a.filter(self.query, self.directory):
                                if a.check_depth(a.depth):
                                    a.fetch_links()
                                    if len(a.links) > 0:
                                        for url, domain in zip(a.links, a.domains):
                                            if url not in self.queue.distinct("url") and url not in self.results.distinct("url"):
                                                self.queue.insert({"url": url,
                                                                   "source_url": item['url'],
                                                                   "depth": int(item['depth']) + 1,
                                                                   "domain": domain,
                                                                   "date": a.date})
                                        if self.debug:
                                            logging.info("\t- inserted %d next urls" % len(a.links))
                                    try:
                                        self.results.insert(a.export())
                                    except pymongo.errors.DuplicateKeyError:
                                        #self.results.update(a.export())
                                        pass
                                else:
                                    logging.debug("Depth exceeded")
                                    self.logs.insert(a.log())
                            else:
                                logging.debug("Not relevant")
                                self.logs.insert(a.log())
                        else:
                            if a.check_depth(a.depth):
                                a.fetch_links()
                                if len(a.links) > 0:
                                    for url, domain in zip(a.links, a.domains):
                                        try:
                                            self.queue.insert({"url": url,
                                                               "source_url": item['url'],
                                                               "depth": int(item['depth']) + 1,
                                                               "domain": domain,
                                                               "date": a.date})
                                        except pymongo.errors.DuplicateKeyError:
                                            pass
                                    if self.debug:
                                        logging.info("\t- inserted %d next urls" % len(a.links))
                                try:
                                    self.results.insert(a.export())
                                except pymongo.errors.DuplicateKeyError:
                                    pass
                            else:
                                logging.debug("Depth exceeded")
                                try:
                                    self.logs.insert(a.log())
                                except pymongo.errors.DuplicateKeyError:
                                    self.logs.update({"url": a.url}, {"$push": {"msg": a.msg}})
                    else:
                        logging.debug("Error Extracting")
                        try:
                            self.logs.insert(a.log())
                        except pymongo.errors.DuplicateKeyError:
                            self.logs.update({"url": a.url}, {"$push": {"msg": a.msg}})
                else:
                    logging.debug("Error Downloading")
                    self.logs.insert(p.log())
                self.queue.remove(item)
                logging.info("Processing %i urls" % self.queue.count())
            if self.queue.nb == 0:
                break
        if self.queue.nb == 0:
            break
        if self.results.count() > 200000:
            self.queue.drop()
            break
    return sys.exit(1)
def global_crawl(self):
    logger.debug("***************CRAWL********")
    while self.queue.count() > 0:
        print "%i urls in process" % self.queue.count()
        print "in which %i sources in process" % self.queue.count({"depth": 0})
        self.report.report("mail")
        for item in self.queue.find(no_cursor_timeout=True).sort([('depth', pymongo.ASCENDING)]):
            print "%i urls in process" % self.queue.count()
            #~ # Once a day
            #~ if self.task["repeat"] is False:
            #~     date = self.date.replace(hour=0)
            #~     p_date = p.date[-1].replace(hour=0)
            #~     if p_date == date:
            #~         print "Already treated today"
            #~         self.queue.delete_one({"url": p.url})
            #~         continue
            # if it is a source
            #~ if item["depth"] == 0:
            #~     print "is source"
            #~     self.queue.delete_one({"url": item["url"]})
            #~     continue
            #~ else:
            page = Page(item, self.task)
            # relevance
            status = page.process()
            try:
                # create and insert the page
                self.data.insert_one(page.set_data())
                #self.data.update_one({"url": item["url"]}, {"$set": page.set_last(), "$inc": {"crawl_nb": 1}})
                if page.status:
                    cpt = 0
                    if page.depth + 1 < page.max_depth:
                        # queue every outlink that is not already stored
                        for outlink in page.outlinks:
                            if outlink["url"] not in self.data.distinct("url"):
                                try:
                                    cpt = cpt + 1
                                    self.queue.insert_one(outlink)
                                except pymongo.errors.DuplicateKeyError:
                                    continue
                            else:
                                continue
                        print "adding %i new urls in queue with depth %i" % (cpt, page.depth + 1)
                    self.data.update_one({"url": item["url"]}, {"$set": {"type": "page"}})
                else:
                    self.data.update_one({"url": item["url"]}, {"$set": {"type": "log"}})
                self.data.update_one({"url": item["url"]}, {"$push": page.add_data()})
                self.queue.delete_one({"url": item["url"]})
                continue
            except pymongo.errors.DuplicateKeyError:
                #~ if page.status:
                #~     self.data.update_one({"url": item["url"]}, {"$set": {"type": "page"}})
                #~ else:
                #~     self.data.update_one({"url": item["url"]}, {"$set": {"type": "log"}})
                #self.data.update_one({"url": item["url"]}, {"$push": page.add_data()})
                self.queue.delete_one({"url": item["url"]})
                continue
            # check_last_crawl
            #~ date = self.date.replace(hour=0)
            #~ p_date = page.date[-1]
            #~ p_date = (p_date).replace(hour=0, day=p_date.day + 1)
            #~ print p_date, date
            #~ if p_date == date:
            #~     print "Already treated today"
            #~     self.queue.delete_one({"url": item['url']})
            #~     continue
            #~ else:
            # check_last_modif
            #~     #if self.has_modification():
            #~     if page.status:
            #~         # diff btw page.outlinks and last_page.outlinks
            #~         for outlink in page.outlinks:
            #~             try:
            #~                 self.queue.insert_one(outlink)
            #~             except pymongo.errors.DuplicateKeyError:
            #~                 continue
            #~         self.data.update_one({"url": item["url"]}, {"$push": page.add_info(), "$set": page.set_last(), "$inc": {"crawl_nb": 1}})
            #~     else:
            #~         pass
            #~     self.data.update_one({"url": item["url"]}, {"$push": page.add_data(), "$inc": {"crawl_nb": 1}})
            #~     self.queue.delete_one({"url": item["url"]})
            #~     continue
            #~ except Exception as e:
            #~     self.data.update_one({"url": item["url"]}, {"$push": {"msg": str(e), "status": False, "code": 909, "date": self.date}})
            #~     self.queue.delete_one({"url": item["url"]})
            #~     continue
    #s.report("mail")  # disabled: `s` is never defined, see the commented Stats lines below
    logger.debug("***************END********")
    #s = Stats(self.name)
    #s.show(self)
    self.report.report("mail")
    return True
        if is_list_item(line):
            album_page, album_name = handle_list_item(line)
            if album_page:
                album_pages.append(album_page)
    return album_pages


if __name__ == "__main__":
    import article
    article.DEBUG = True
    # name = "Godsmack"
    name = "OSI_(band)"
    # name = "Depswa"
    artist_page = Page(name)
    discog_section = artist_page.get_section("Discography")
    album_page_names = parse_discog_section(discog_section)
    albums = {}
    for album_page_name in album_page_names:
        album_page = Page(album_page_name)
        track_section = album_page.get_section("Track listing")
        albums[album_page_name] = parse_track_names(track_section)
    for album in albums:
        print album.upper()
        for track_name in albums[album]:
            print track_name
        print