Esempio n. 1
0
        if len(links) == 0:
            self.distributor.stop()
            return
            
        for link in links:
            title = link.text
            relative_url = link.get("href")
            # logging.info(title, relative_url)
            self.distributor.add_task(ReviewScrapeTask, {
                "link"  : relative_url,
                "title" : title,
                "start" : ReviewScrapeTask.START
            })
            
        self.distributor.add_task(MovieScrapeTask, start + self.MOVIES_PER_PAGE)
            
        yield None


if __name__ == "__main__":
    # Send every record from DEBUG upwards to a log file that is truncated
    # on each run. Fields are tab-separated; the thread name is left-justified
    # in a 10-character column.
    log_options = {
        "level": logging.DEBUG,
        "filename": "imdb/log.txt",
        "filemode": "w",
        "format": "%(asctime)s\t%(levelname)s\t(%(threadName)-10s)\t%(message)s",
    }
    logging.basicConfig(**log_options)

    # Queue the first MovieScrapeTask on the distributor and run it.
    distributor = ThreadDistributor(N_THREADS)
    distributor.add_task(MovieScrapeTask, MovieScrapeTask.START)
    distributor.run()
    
Esempio n. 2
0
            imdb_id = url.split("/")[-2]
            image.retrieve(image_url,
                           "../www/static/images/posters/%s.png" % imdb_id)
            cursor.execute(
                "INSERT IGNORE INTO movies (imdb_id, name, image_url,  plot, created_time)"
                "VALUES (%s, %s, %s, %s, %s)",
                (imdb_id,
                 name.encode("utf-8"), image_url, plot.encode("utf-8"),
                 datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

            if cursor.lastrowid != 0:
                self.distributor.add_task(DetailScrapeTask, {
                    "url": url,
                    "imdb_id": imdb_id,
                    "id": cursor.lastrowid
                })

        yield None


if __name__ == "__main__":
    # DEBUG-level, tab-separated logging into a file truncated on each run.
    log_options = {
        "level": logging.DEBUG,
        "filename": "logs/imdb_nowplaying.log",
        "filemode": "w",
        "format": "%(asctime)s\t%(levelname)s\t%(threadName)s\t%(message)s",
    }
    logging.basicConfig(**log_options)

    # Distributor created with an argument of 1, seeded with the
    # now-playing listing URL as the first task's argument.
    distributor = ThreadDistributor(1)
    distributor.add_task(
        NowPlayingScrapeTask,
        "http://www.imdb.com/nowplaying/",
    )
    distributor.run()
Esempio n. 3
0
                    for e in doms.cssselect(".result"): 
                        found = True
                    
                        try:    code = e.get("name").split("_")[0] or ""
                        except: code = ""
                    
                        try:    name = e.cssselect("a.title")[0].text or ""
                        except: name = ""
                        
                        self.distributor.add_task(ReviewScrapeTask, {
                            "name": name, 
                            "code": code, 
                            "page": 1 
                        }) 
                        logging.info("SEARCH dvd: %s, code: %s, page: %d" \
                                     % (name, code, page))
                        
        if found: self.distributor.add_task(SearchTask, page + 1)
        yield None
    
if __name__ == "__main__":
    # Log DEBUG and above to a file that is overwritten on every run,
    # one tab-separated record per line.
    logging.basicConfig(
        format="%(asctime)s\t%(levelname)s\t%(threadName)s\t%(message)s",
        filemode="w",
        filename="Amazon/DVD/log.txt",
        level=logging.DEBUG,
    )

    # Seed the distributor with the first search page and run it.
    distributor = ThreadDistributor(20)
    distributor.add_task(SearchTask, SEARCH_START_PAGE)
    distributor.run()
Esempio n. 4
0
                        except:
                            code = ""

                        try:
                            name = e.cssselect("a.title")[0].text or ""
                        except:
                            name = ""

                        self.distributor.add_task(ReviewScrapeTask, {
                            "name": name,
                            "code": code,
                            "page": 1
                        })
                        logging.info("SEARCH dvd: %s, code: %s, page: %d" \
                                     % (name, code, page))

        if found: self.distributor.add_task(SearchTask, page + 1)
        yield None


if __name__ == "__main__":
    # File logging at DEBUG level; the log file is truncated on start and
    # each record is written as tab-separated fields.
    log_options = dict(
        level=logging.DEBUG,
        filename="Amazon/DVD/log.txt",
        filemode="w",
        format="%(asctime)s\t%(levelname)s\t%(threadName)s\t%(message)s",
    )
    logging.basicConfig(**log_options)

    # Queue the initial SearchTask for the starting page, then run.
    distributor = ThreadDistributor(20)
    distributor.add_task(SearchTask, SEARCH_START_PAGE)
    distributor.run()
Esempio n. 5
0
        image = urllib.URLopener()

        for name, url, image_url, plot in zip(names, urls, image_urls, plots):
            cursor = db.cursor()
            imdb_id = url.split("/")[-2]
            image.retrieve(image_url, "../www/static/images/posters/%s.png" % imdb_id)
            cursor.execute("INSERT IGNORE INTO movies (imdb_id, name, image_url,  plot, created_time)"
                           "VALUES (%s, %s, %s, %s, %s)",
                           (imdb_id, name.encode("utf-8"), image_url, plot.encode("utf-8"),
                            datetime.now().strftime("%Y-%m-%d %H:%M:%S") ))

            if cursor.lastrowid != 0:                              
                self.distributor.add_task(
                    DetailScrapeTask, 
                    {"url": url, "imdb_id": imdb_id, "id": cursor.lastrowid}
                )
        
        yield None


if __name__ == "__main__":
    # Write DEBUG-and-above records, tab-separated, to a log file that is
    # truncated each time the script starts.
    logging.basicConfig(
        format="%(asctime)s\t%(levelname)s\t%(threadName)s\t%(message)s",
        filemode="w",
        filename="logs/imdb_nowplaying.log",
        level=logging.DEBUG,
    )

    # Distributor created with an argument of 1; the first task receives
    # the now-playing listing URL.
    distributor = ThreadDistributor(1)
    distributor.add_task(
        NowPlayingScrapeTask,
        "http://www.imdb.com/nowplaying/",
    )
    distributor.run()
Esempio n. 6
0
# Open the module-level MySQL connection. The DB settings are passed
# positionally in the order host, user, passwd, name.
db_connect = MySQLdb.connect(
    *(DB[field] for field in ("host", "user", "passwd", "name"))
)
# Module-level cursor shared by the code below.
cursor = db_connect.cursor()


class InitializeTask(Task):
    """Fetch the recently added movies from the database.

    Selects the ``id`` and ``name`` of every row in ``movies`` for which
    ``DATEDIFF(NOW(), created_time)`` is at most ``TIME_TH`` and prints the
    resulting rows.
    """

    def run(self):
        """Query the recent movies, print them, then yield ``None`` once."""
        # DB-API 2.0 (PEP 249) expects query parameters as a sequence or a
        # mapping; a bare scalar is only tolerated by some driver versions,
        # so wrap the single parameter in a 1-tuple.
        cursor.execute("""SELECT `id`, `name` FROM movies
                       WHERE DATEDIFF(NOW(), created_time) <= %s""",
                       (TIME_TH,))
        movies = cursor.fetchall()
        # The parenthesised single-argument form prints the same text under
        # Python 2 and is also valid Python 3 syntax.
        print(movies)

        yield None


if __name__ == "__main__":
    logging.basicConfig(
        level=logging.DEBUG,
        filename="logs/tweets_error.log",
        # This keyword was previously misspelled as "filemod". Python 2
        # silently ignored it (so the log was appended to, not truncated)
        # and Python 3 rejects unknown keywords with ValueError.
        filemode="w",
        format="%(asctime)s\t%(levelname)s\t(%(threadName)-10s)\t%(message)s"
    )
    distributor = ThreadDistributor(N_THREADS)
    distributor.add_task(InitializeTask)
    try:
        distributor.run()
    finally:
        # Release the database resources even if the run fails; the
        # connection itself was previously never closed.
        cursor.close()
        db_connect.close()
Esempio n. 7
0
            self.distributor.stop()
            return

        for link in links:
            title = link.text
            relative_url = link.get("href")
            # logging.info(title, relative_url)
            self.distributor.add_task(
                ReviewScrapeTask, {
                    "link": relative_url,
                    "title": title,
                    "start": ReviewScrapeTask.START
                })

        self.distributor.add_task(MovieScrapeTask,
                                  start + self.MOVIES_PER_PAGE)

        yield None


if __name__ == "__main__":
    # Log DEBUG and above to a file that is truncated on each run. Records
    # are tab-separated, with the thread name left-justified to 10 characters.
    logging.basicConfig(
        format="%(asctime)s\t%(levelname)s\t(%(threadName)-10s)\t%(message)s",
        filemode="w",
        filename="imdb/log.txt",
        level=logging.DEBUG,
    )

    # Seed the distributor with the first MovieScrapeTask and run it.
    distributor = ThreadDistributor(N_THREADS)
    distributor.add_task(MovieScrapeTask, MovieScrapeTask.START)
    distributor.run()