Code example #1
File: crawtext.py  Project: gollivier/crawtext
 def insert_url(self,url):
     "insert url directly into data and next_url to seeds"
     info = Page({"url": url, "source_url": "url", "depth": 0}, self.task)
     info.process(False)
     
     try:
         self.data.insert_one(info.set_data())
     except pymongo.errors.DuplicateKeyError:
         date = self.date.replace(hour=0)
         p_date = (info.date[-1]).replace(hour=0)
         if p_date == date:
             print "Already in processing queue today. No need to update then!"
             #self.queue.delete_one({"url":info.url})
             #return self.queue
             pass
         else:
             self.data.update_one({"url":url, "depth":0}, {"$push":info.add_data()})
     
     if self.task["repeat"]:
         self.data.update_one({"url":url}, {"$inc":{"crawl_nb":1}})
         
     if info.status:
         for link in info.outlinks:
             try:
                 self.queue.insert_one(link)
             except pymongo.errors.DuplicateKeyError:
                 continue
             except pymongo.errors.WriteError:
                 print "Error", link
                 pass
     return self.queue
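The DuplicateKeyError handling above only works if the data and queue collections carry a unique index on "url". A minimal sketch of that setup with PyMongo, using an illustrative database name and a local server as assumptions:

import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017")
db = client["crawtext_demo"]                # illustrative database name
db.data.create_index("url", unique=True)    # a second insert of the same url now raises DuplicateKeyError
db.queue.create_index("url", unique=True)   # same guarantee for queued outlinks

With the index in place, insert_url() can treat the exception as "already known" and decide between skipping the page and pushing a fresh history entry.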
Code example #2
File: crawtext.py  Project: malstor/crawtext
	def controled_crawl(self):
		while self.queue.count() > 0:
			for item in self.queue.find().sort('depth', pymongo.ASCENDING):
				logger.info(item["depth"])
				#logger.info("url %s depth %d" %(item["url"], item['depth']))
				
				p = Page(item["url"], item["source_url"],item["depth"], item["date"], True)
				
				if p.fetch():
					a = Article(p.url,p.html, p.source_url, p.depth,p.date, True)
					if a.extract(): 
						logging.info("extracted")
						if a.filter(self.query, self.directory):
							logging.info("valid")
							if a.check_depth(a.depth):
								
								a.fetch_links()
								if len(a.links) > 0:
									for url, domain in zip(a.links, a.domains):
										if url not in self.queue.distinct("url") and url not in self.results.distinct("url") and url not in self.logs.distinct("url"):
											self.queue.insert({"url": url, "source_url": item['url'], "depth": int(item['depth'])+1, "domain": domain, "date": a.date})
											
									logging.info("Inserted %d nexts url" %len(a.links))
								try:
									
									self.results.insert(a.export())
								except pymongo.errors.DuplicateKeyError:
									logging.info("Exists already")
									
									
					else:
						try:
							self.logs.insert(a.log())
						except pymongo.errors.DuplicateKeyError:
							logging.info("Exists already")
							
				else:
					try:
						self.logs.insert(p.log())
					except pymongo.errors.DuplicateKeyError:
						logging.info("Exists already")
						
						
				self.queue.remove(item)
				logging.info("Processing %i urls"%self.queue.count())
				if self.queue.count() == 0:
					break
			if self.queue.count() == 0:
				break
			if self.results.count() > 200000:
				self.queue.drop()
				break
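controled_crawl() uses the legacy PyMongo collection API (insert, remove, count), which was removed in PyMongo 4. A small sketch of the modern equivalents for the queue operations, with an illustrative collection and a local server assumed:

import pymongo

queue = pymongo.MongoClient("mongodb://localhost:27017")["crawtext_demo"]["queue"]

item = {"url": "http://example.com/page", "source_url": "http://example.com",
        "depth": 1, "domain": "example.com"}
queue.insert_one(item)                    # replaces queue.insert(...)
queue.delete_one({"_id": item["_id"]})    # replaces queue.remove(item); insert_one filled in _id
print(queue.count_documents({}))          # replaces queue.count()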
Code example #3
File: wikipedia.py  Project: Auzzy/personal
def get_albums(artist):
	artist_page = Page(artist)
	discog_section = artist_page.get_section("Discography")
	album_page_names = parse_discog_section(discog_section)

	albums = {}
	for album_name in album_page_names:
		album_page_name = album_page_names[album_name]
		album_page = Page(album_page_name)
		track_section = album_page.get_section("Track listing")
		albums[album_name] = parse_tracklist_section(track_section)
	
	return albums
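A hedged usage sketch for get_albums(); the artist name is borrowed from the __main__ block in code example #7, and it assumes Page, parse_discog_section and parse_tracklist_section are importable from this wikipedia module:

albums = get_albums("OSI_(band)")
for album_name in albums:
    print(album_name)                 # album title
    for track in albums[album_name]:
        print(track)                  # track names parsed from the "Track listing" section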
Code example #4
    def crawler(self):
        logging.info("Crawler activated with query filter %s" % self.target)
        # if self.sources.nb == 0:
        # 	sys.exit("Error: no sources found in the project.")
        try:
            self.project.load_sources()
            self.project.load_queue()
            self.project.load_logs()
        except AttributeError:
            self.load_project()

        #logging.info("Begin crawl with %i active urls"%self.sources.active_nb)
        self.push_to_queue()
        logging.info("Processing %i urls" % self.queue.count())

        #print self.queue.list

        while self.queue.count() > 0:
            for item in self.queue.find().sort([("depth", 1)]):
                if item["url"] in self.results.distinct("url"):
                    logging.info("in results")
                    self.queue.remove(item)

                elif item["url"] in self.logs.distinct("url"):
                    logging.info("in logs")
                    self.queue.remove(item)
                else:
                    #print "Treating", item["url"], item["depth"]
                    try:
                        p = Page(item["url"], item["source_url"],
                                 item["depth"], item["date"], True)
                    except KeyError:
                        p = Page(item["url"], item["source_url"],
                                 item["depth"], self.date, True)
                    if p.download():
                        a = Article(p.url, p.html, p.source_url, p.depth,
                                    p.date, True)
                        if a.extract():
                            # Targeted crawl filtering for pertinence
                            if self.target:
                                if a.filter(self.query, self.directory):
                                    if a.check_depth(a.depth):
                                        a.fetch_links()
                                        if len(a.links) > 0:
                                            for url, domain in zip(
                                                    a.links, a.domains):
                                                if url not in self.queue.distinct(
                                                        "url"
                                                ) and url not in self.results.distinct(
                                                        "url"):
                                                    self.queue.insert({
                                                        "url":
                                                        url,
                                                        "source_url":
                                                        item['url'],
                                                        "depth":
                                                        int(item['depth']) + 1,
                                                        "domain":
                                                        domain,
                                                        "date":
                                                        a.date
                                                    })
                                                    if self.debug:
                                                        logging.info(
                                                            "\t-inserted %d nexts url"
                                                            % len(a.links))
                                                try:
                                                    self.results.insert(
                                                        a.export())
                                                except pymongo.errors.DuplicateKeyError:
                                                    #self.results.update(a.export())
                                                    pass

                                    else:
                                        logging.debug("depth exceeded")
                                        self.logs.insert(a.log())
                                else:
                                    logging.debug("Not relevant")
                                    self.logs.insert(a.log())
                            else:
                                if a.check_depth(a.depth):
                                    a.fetch_links()
                                    if len(a.links) > 0:
                                        for url, domain in zip(
                                                a.links, a.domains):
                                            try:
                                                self.queue.insert({
                                                    "url":
                                                    url,
                                                    "source_url":
                                                    item['url'],
                                                    "depth":
                                                    int(item['depth']) + 1,
                                                    "domain":
                                                    domain,
                                                    "date":
                                                    a.date
                                                })
                                            except pymongo.errors.DuplicateKeyError:
                                                pass
                                                if self.debug:
                                                    logging.info(
                                                        "\t-inserted %d nexts url"
                                                        % len(a.links))
                                            try:
                                                self.results.insert(a.export())
                                            except pymongo.errors.DuplicateKeyError:
                                                pass
                                else:
                                    logging.debug("Depth exceeded")
                                    try:
                                        self.logs.insert(a.log())
                                    except pymongo.errors.DuplicateKeyError:
                                        self.logs.update(
                                            {"url": a.url},
                                            {"$push": {
                                                "msg": a.msg
                                            }})

                        else:
                            logging.debug("Error Extracting")
                            try:
                                self.logs.insert(a.log())
                            except pymongo.errors.DuplicateKeyError:
                                self.logs.update({"url": a.url},
                                                 {"$push": {
                                                     "msg": a.msg
                                                 }})
                    else:
                        logging.debug("Error Downloading")
                        self.logs.insert(p.log())

                    self.queue.remove(item)
                    logging.info("Processing %i urls" % self.queue.count())
                if self.queue.nb == 0:
                    break
            if self.queue.nb == 0:
                break
            if self.results.count() > 200000:
                self.queue.drop()
                break

        return sys.exit(1)
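Stripped of the Page/Article work, the crawler() loop above is a depth-ordered frontier stored in MongoDB, with the results and logs collections acting as "seen" sets. A rough sketch of that skeleton, with illustrative collection names and a placeholder for the per-page processing:

import pymongo

db = pymongo.MongoClient("mongodb://localhost:27017")["crawtext_demo"]
queue, results, logs = db["queue"], db["results"], db["logs"]

while queue.count_documents({}) > 0:
    for item in queue.find().sort([("depth", 1)]):
        seen = (item["url"] in results.distinct("url")
                or item["url"] in logs.distinct("url"))
        if not seen:
            pass  # download the page, extract/filter the article,
                  # enqueue outlinks at depth + 1, store a result or a log
        queue.delete_one({"_id": item["_id"]})   # always pop the processed item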
Code example #5
File: crawtext.py  Project: malstor/crawtext
	def crawler(self):
		logging.info("Crawler activated with query filter %s" %self.target)
		# if self.sources.nb == 0:
		# 	sys.exit("Error: no sources found in the project.")
		try:
			self.project.load_sources()
			self.project.load_queue()
			self.project.load_logs()
		except AttributeError:
			self.load_project()





		#logging.info("Begin crawl with %i active urls"%self.sources.active_nb)
		self.push_to_queue()
		logging.info("Processing %i urls"%self.queue.count())



		#print self.queue.list

		while self.queue.count() > 0:
			for item in self.queue.find().sort([("depth", 1)]):
				if item["url"] in self.results.distinct("url"):
					logging.info("in results")
					self.queue.remove(item)

				elif item["url"] in self.logs.distinct("url"):
					logging.info("in logs")
					self.queue.remove(item)
				else:
					#print "Treating", item["url"], item["depth"]
					try:
						p = Page(item["url"], item["source_url"],item["depth"], item["date"], True)
					except KeyError:
						p = Page(item["url"], item["source_url"],item["depth"], self.date, True)
					if p.download():
						a = Article(p.url,p.html, p.source_url, p.depth,p.date, True)
						if a.extract():
							# Targeted crawl filtering for pertinence
							if self.target:
								if a.filter(self.query, self.directory):
									if a.check_depth(a.depth):
										a.fetch_links()
										if len(a.links) > 0:
											for url, domain in zip(a.links, a.domains):
												if url not in self.queue.distinct("url") and url not in self.results.distinct("url"):
													self.queue.insert({"url": url, "source_url": item['url'], "depth": int(item['depth'])+1, "domain": domain, "date": a.date})
													if self.debug: logging.info("\t-inserted %d nexts url" %len(a.links))
												try:
													self.results.insert(a.export())
												except pymongo.errors.DuplicateKeyError:
													#self.results.update(a.export())
													pass

									else:
										logging.debug("depth exceeded")
										self.logs.insert(a.log())
								else:
									logging.debug("Not relevant")
									self.logs.insert(a.log())
							else:
								if a.check_depth(a.depth):
									a.fetch_links()
									if len(a.links) > 0:
										for url, domain in zip(a.links, a.domains):
											try:
												self.queue.insert({"url": url, "source_url": item['url'], "depth": int(item['depth'])+1, "domain": domain, "date": a.date})
											except pymongo.errors.DuplicateKeyError:
												pass
												if self.debug: logging.info("\t-inserted %d nexts url" %len(a.links))
											try:
												self.results.insert(a.export())
											except pymongo.errors.DuplicateKeyError:
												pass
								else:
									logging.debug("Depth exceeded")
									try:
										self.logs.insert(a.log())
									except pymongo.errors.DuplicateKeyError:
										self.logs.update({"url":a.url}, {"$push":{"msg": a.msg}})

						else:
							logging.debug("Error Extracting")
							try:
								self.logs.insert(a.log())
							except pymongo.errors.DuplicateKeyError:
								self.logs.update({"url":a.url}, {"$push":{"msg": a.msg}})
					else:
						logging.debug("Error Downloading")
						self.logs.insert(p.log())

					self.queue.remove(item)
					logging.info("Processing %i urls"%self.queue.count())
				if self.queue.nb == 0:
					break
			if self.queue.nb == 0:
				break
			if self.results.count() > 200000:
				self.queue.drop()
				break

		return sys.exit(1)
Code example #6
File: crawtext.py  Project: gollivier/crawtext
 def global_crawl(self):
     logger.debug("***************CRAWL********")
     while self.queue.count() > 0:
         print "%i urls in process" %self.queue.count()
         print "in which %i sources in process" %self.queue.count({"depth":0})
         self.report.report("mail")
         for item in self.queue.find(no_cursor_timeout=True).sort([('depth', pymongo.ASCENDING)]):
             print "%i urls in process" %self.queue.count()
             
             #~ #Once a day
             #~ if self.task["repeat"] is False:
                 #~ date = self.date.replace(hour=0)
                 #~ p_date = p.date[-1].replace(hour=0)
                 #~ if p_date == date:
                     #~ print "Already treated today"
                     #~ self.queue.delete_one({"url":p.url})
                     #~ continue
               
             # if it is a source
             #~ if item["depth"] == 0:
                 #~ print "is source"
                 #~ self.queue.delete_one({"url": item["url"]})
                 #~ continue
             #~ else:
             
                 
             page = Page(item, self.task)
             # relevance check
             status = page.process()                    
             try:
                 
                 # create and insert the page
                 self.data.insert_one(page.set_data())
                 #self.data.update_one({"url":item["url"]}, {"$set":page.set_last(), "$inc":{"crawl_nb":1}})
                 
                 if page.status:
                     cpt = 0
                     if page.depth+1 < page.max_depth:
                         for outlink in page.outlinks:
                             if outlink["url"] not in self.data.distinct("url"):
                                 try:
                                     cpt = cpt+1
                                     self.queue.insert_one(outlink)
                                 except pymongo.errors.DuplicateKeyError:
                                     continue
                             else: continue
                         print "adding %i new urls in queue  with depth %i" %(cpt, page.depth+1)
                         self.data.update_one({"url":item["url"]}, {"$set":{"type": "page"}})
                 else:
                     self.data.update_one({"url":item["url"]}, {"$set":{"type": "log"}})
                 
                 self.data.update_one({"url":item["url"]}, {"$push":page.add_data()})
                 self.queue.delete_one({"url": item["url"]})
                 continue
                 
             except pymongo.errors.DuplicateKeyError:
                 #~ if page.status:
                     #~ self.data.update_one({"url":item["url"]}, {"$set":{"type": "page"})
                 #~ else:
                     #~ self.data.update_one({"url":item["url"]}, {"$set":{"type": "log"})
                 #self.data.update_one({"url":item["url"]}, {"$push":page.add_data()}
                 
                     
                 self.queue.delete_one({"url": item["url"]})
                 continue
                 #check_last_modif
                 #####################"
                 #check_last_crawl
                 ########################
                 #~ date = self.date.replace(hour=0)
                 #~ p_date = page.date[-1]
                 #~ p_date = (p_date).replace(hour=0, day=p_date.day+1)
                 #~ print p_date, date
                 #~ if p_date == date:
                     #~ print "Already treated today"
                     #~ self.queue.delete_one({"url":item['url']})
                     #~ continue
                 #~ else:
                 
                     #check_last_modif
                     #####################"
                     #~ #if self.has_modification():
                         #~ if page.status:
                             #diff btw page.outlinks and last_page.outlinks
                         
                             #~ for outlink in page.outlinks:
                                 #~ try:
                                     #~ self.queue.insert_one(outlink)
                                 #~ except pymongo.errors.DuplicateKeyError:
                                     #~ continue
                         
                         #~ self.data.update_one({"url":item["url"]}, {"$push": page.add_info(),"$set":page.set_last(), "$inc":{"crawl_nb":1}})
                     #~ else:
                        #~ pass
                     #~ self.data.update_one({"url":item["url"]}, {"$push": page.add_data(), "$inc":{"crawl_nb":1}})
                     #~ self.queue.delete_one({"url": item["url"]})
                     #~ continue
             #~ except Exception as e:
                 #~ self.data.update_one({"url":item["url"]}, {"$push": {"msg":str(e), "status":False, "code":909, "date": self.date }})
                 #~ self.queue.delete_one({"url": item["url"]})
                 #~ continue
         self.report.report("mail")
                 
     logger.debug("***************END********")
     #s = Stats(self.name)
     #s.show(self)
     self.report.report("mail")
     return True
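global_crawl() iterates the queue with find(no_cursor_timeout=True), which keeps the server-side cursor alive until it is exhausted or closed, so closing it explicitly is the safer pattern if the loop can exit early. A minimal sketch, with an illustrative collection name:

import pymongo

queue = pymongo.MongoClient("mongodb://localhost:27017")["crawtext_demo"]["queue"]
cursor = queue.find(no_cursor_timeout=True).sort([("depth", pymongo.ASCENDING)])
try:
    for item in cursor:
        pass  # process the item as global_crawl() does
finally:
    cursor.close()  # release the no-timeout cursor on the server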
Code example #7
File: wikipedia.py  Project: Auzzy/personal
		if is_list_item(line):
			album_page,album_name = handle_list_item(line)
			if album_page:
				album_pages.append(album_page)
	return album_pages


if __name__=="__main__":
	import article
	article.DEBUG = True

	# name = "Godsmack"
	name = "OSI_(band)"
	# name = "Depswa"

	artist_page = Page(name)
	discog_section = artist_page.get_section("Discography")
	album_page_names = parse_discog_section(discog_section)

	albums = {}
	for album_page_name in album_page_names:
		album_page = Page(album_page_name)
		track_section = album_page.get_section("Track listing")
		albums[album_page_name] = parse_track_names(track_section)
	
	for album in albums:
		print album.upper()
		for track_name in albums[album]:
			print track_name
		print