if len(links) == 0: self.distributor.stop() return for link in links: title = link.text relative_url = link.get("href") # logging.info(title, relative_url) self.distributor.add_task(ReviewScrapeTask, { "link" : relative_url, "title" : title, "start" : ReviewScrapeTask.START }) self.distributor.add_task(MovieScrapeTask, start + self.MOVIES_PER_PAGE) yield None if __name__ == "__main__": logging.basicConfig(\ level = logging.DEBUG, \ filename="imdb/log.txt", \ filemode="w", \ format="%(asctime)s\t%(levelname)s\t(%(threadName)-10s)\t%(message)s"\ ) distributor = ThreadDistributor(N_THREADS) distributor.add_task(MovieScrapeTask, MovieScrapeTask.START) distributor.run()
imdb_id = url.split("/")[-2] image.retrieve(image_url, "../www/static/images/posters/%s.png" % imdb_id) cursor.execute( "INSERT IGNORE INTO movies (imdb_id, name, image_url, plot, created_time)" "VALUES (%s, %s, %s, %s, %s)", (imdb_id, name.encode("utf-8"), image_url, plot.encode("utf-8"), datetime.now().strftime("%Y-%m-%d %H:%M:%S"))) if cursor.lastrowid != 0: self.distributor.add_task(DetailScrapeTask, { "url": url, "imdb_id": imdb_id, "id": cursor.lastrowid }) yield None if __name__ == "__main__": logging.basicConfig( level=logging.DEBUG, filename="logs/imdb_nowplaying.log", filemode="w", format="%(asctime)s\t%(levelname)s\t%(threadName)s\t%(message)s") distributor = ThreadDistributor(1) distributor.add_task(NowPlayingScrapeTask, "http://www.imdb.com/nowplaying/") distributor.run()
for e in doms.cssselect(".result"): found = True try: code = e.get("name").split("_")[0] or "" except: code = "" try: name = e.cssselect("a.title")[0].text or "" except: name = "" self.distributor.add_task(ReviewScrapeTask, { "name": name, "code": code, "page": 1 }) logging.info("SEARCH dvd: %s, code: %s, page: %d" \ % (name, code, page)) if found: self.distributor.add_task(SearchTask, page + 1) yield None if __name__ == "__main__": logging.basicConfig(\ level = logging.DEBUG, \ filename="Amazon/DVD/log.txt", \ filemode="w", \ format="%(asctime)s\t%(levelname)s\t%(threadName)s\t%(message)s"\ ) distributor = ThreadDistributor(20) distributor.add_task(SearchTask, SEARCH_START_PAGE) distributor.run()
except: code = "" try: name = e.cssselect("a.title")[0].text or "" except: name = "" self.distributor.add_task(ReviewScrapeTask, { "name": name, "code": code, "page": 1 }) logging.info("SEARCH dvd: %s, code: %s, page: %d" \ % (name, code, page)) if found: self.distributor.add_task(SearchTask, page + 1) yield None if __name__ == "__main__": logging.basicConfig(\ level = logging.DEBUG, \ filename="Amazon/DVD/log.txt", \ filemode="w", \ format="%(asctime)s\t%(levelname)s\t%(threadName)s\t%(message)s"\ ) distributor = ThreadDistributor(20) distributor.add_task(SearchTask, SEARCH_START_PAGE) distributor.run()
image = urllib.URLopener() for name, url, image_url, plot in zip(names, urls, image_urls, plots): cursor = db.cursor() imdb_id = url.split("/")[-2] image.retrieve(image_url, "../www/static/images/posters/%s.png" % imdb_id) cursor.execute("INSERT IGNORE INTO movies (imdb_id, name, image_url, plot, created_time)" "VALUES (%s, %s, %s, %s, %s)", (imdb_id, name.encode("utf-8"), image_url, plot.encode("utf-8"), datetime.now().strftime("%Y-%m-%d %H:%M:%S") )) if cursor.lastrowid != 0: self.distributor.add_task( DetailScrapeTask, {"url": url, "imdb_id": imdb_id, "id": cursor.lastrowid} ) yield None if __name__ == "__main__": logging.basicConfig( level = logging.DEBUG, filename="logs/imdb_nowplaying.log", filemode="w", format="%(asctime)s\t%(levelname)s\t%(threadName)s\t%(message)s" ) distributor = ThreadDistributor(1) distributor.add_task(NowPlayingScrapeTask, "http://www.imdb.com/nowplaying/") distributor.run()
# Module-level connection and cursor, opened at import time and used by
# InitializeTask.run below.
db_connect = MySQLdb.connect(DB["host"], DB["user"], DB["passwd"], DB["name"])
cursor = db_connect.cursor()


class InitializeTask(Task):
    """Look up recently created movies in the database and print them."""

    def run(self):
        """Select id and name of recent movies, print the rows, yield None.

        A movie is selected when ``DATEDIFF(NOW(), created_time)`` is at most
        ``TIME_TH``.
        """
        # DB-API parameters must be a sequence, so the single value is wrapped
        # in a one-element tuple instead of being passed as a bare scalar.
        cursor.execute(
            """SELECT `id`, `name` FROM movies WHERE DATEDIFF(NOW(), created_time) <= %s""",
            (TIME_TH,))
        movies = cursor.fetchall()
        # Parenthesised form prints the same text under the Python 2 print
        # statement and also parses under Python 3.
        print(movies)
        yield None


if __name__ == "__main__":
    # "filemode" (previously misspelled "filemod") makes the log file get
    # truncated on every run, as the "w" value intends.
    logging.basicConfig(
        level=logging.DEBUG,
        filename="logs/tweets_error.log",
        filemode="w",
        format="%(asctime)s\t%(levelname)s\t(%(threadName)-10s)\t%(message)s"
    )
    distributor = ThreadDistributor(N_THREADS)
    distributor.add_task(InitializeTask)
    try:
        distributor.run()
    finally:
        # Release the database resources even if the run fails.
        cursor.close()
        db_connect.close()
self.distributor.stop() return for link in links: title = link.text relative_url = link.get("href") # logging.info(title, relative_url) self.distributor.add_task( ReviewScrapeTask, { "link": relative_url, "title": title, "start": ReviewScrapeTask.START }) self.distributor.add_task(MovieScrapeTask, start + self.MOVIES_PER_PAGE) yield None if __name__ == "__main__": logging.basicConfig(\ level = logging.DEBUG, \ filename="imdb/log.txt", \ filemode="w", \ format="%(asctime)s\t%(levelname)s\t(%(threadName)-10s)\t%(message)s"\ ) distributor = ThreadDistributor(N_THREADS) distributor.add_task(MovieScrapeTask, MovieScrapeTask.START) distributor.run()