def __mtime_get(movie_site): # 断点重续:将爬取成功的年份与页数记录下来,当下次调用函数时可以直接从下一页开始 with open("../movie_sites/" + movie_site + "/crawled_year.txt", "r") as f: lines = f.readlines() if len(lines) > 0: crawling_year = lines[-1].split("`")[0] total_page = lines[-1].split("`")[1] crawling_page = lines[-1].split("`")[2].replace("\n", "") print crawling_year, total_page, crawling_page else: # 第一次调用函数时,将总页数记为-1,从0页开始 crawling_year = '2007' total_page = '-1' crawling_page = '0' print crawling_year, total_page, crawling_page # 当总页数与已获取页数不一样时,年份不变 if total_page != crawling_page: movie_year = int(crawling_year) else: # 一样时,年份增加一年,已获取页数为0 movie_year = int(crawling_year) + 1 crawling_page = '0' for year in range(movie_year, 2019): year_url = 'http://movie.mtime.com/movie/search/section/#year=' + str( year) browser.get(year_url) time.sleep(2) try: WebDriverWait( browser, 20, 5).until(lambda browser: browser.find_element_by_xpath( "//div[@class='mt15 mr15']")) movie_num = browser.find_element_by_xpath( "//div[@class='mt15 mr15']//h4[@class='px14']").text.split( '共')[1].replace('部', '') if int(movie_num) % 20 == 0: for page in range( int(crawling_page) + 1, int(movie_num) / 20 + 1): page_url = year_url + '&pageIndex=' + str(page) print "共有", int( movie_num) / 20, "页,", "第", page, "页:", page_url browser.get(page_url) time.sleep(1) browser.refresh() time.sleep(2) try: WebDriverWait(browser, 15, 5).until( lambda browser: browser.find_element_by_xpath( "//ul[@class='ser_mlist2']")) movies = browser.find_elements_by_xpath( "//ul[@class='ser_mlist2']//h3[@class='normal mt6']" ) print len(movies) movie_list = [] for movie in movies: time.sleep(0.2) print year, movie_name = movie.find_element_by_xpath("a").text if len(movie_name) > 0: print movie_name, movie_url = movie.find_element_by_xpath( "a").get_attribute("href") print movie_url, movie_id = movie_url.split("com/")[1].replace( "/", "") print movie_id time.sleep(0.2) movie_list.append(year) movie_list.append(movie_name) movie_list.append(movie_url) movie_list.append(movie_id) else: print "movie name does not exist!" pass # 每获取20部电影,按年份,名字,链接,id存成列表,一起存入redis中 name = movie_site + str(year) time.sleep(1) connect_redis.__redis_storage(name, movie_list) time.sleep(1) # 爬取成功,记录文件 with open( "../movie_sites/" + movie_site + "/crawled_year.txt", "a+") as m: m.write( str(year) + "`" + str(int(movie_num) / 20) + "`" + str(page) + "\n") except: time.sleep(120) __mtime_get(first_movie_website) if int(movie_num) % 20 != 0: for page in range( int(crawling_page) + 1, int(movie_num) / 20 + 2): page_url = year_url + '&pageIndex=' + str(page) print "共有", int( movie_num) / 20 + 1, "页,", "第", page, "页:", page_url browser.get(page_url) time.sleep(1) browser.refresh() time.sleep(2) try: WebDriverWait(browser, 15, 5).until( lambda browser: browser.find_element_by_xpath( "//ul[@class='ser_mlist2']")) movies = browser.find_elements_by_xpath( "//ul[@class='ser_mlist2']//h3[@class='normal mt6']" ) print len(movies) movie_list = [] for movie in movies: time.sleep(0.2) print year, movie_name = movie.find_element_by_xpath("a").text print movie_name, movie_url = movie.find_element_by_xpath( "a").get_attribute("href") print movie_url, movie_id = movie_url.split("com/")[1].replace( "/", "") print movie_id time.sleep(0.2) movie_list.append(year) movie_list.append(movie_name) movie_list.append(movie_url) movie_list.append(movie_id) name = movie_site + str(year) time.sleep(1) connect_redis.__redis_storage(name, movie_list) time.sleep(1) with open( "../movie_sites/" + movie_site + "/crawled_year.txt", "a+") as m: m.write( str(year) + "`" + str(int(movie_num) / 20 + 1) + "`" + str(page) + "\n") except: time.sleep(120) __mtime_get(first_movie_website) except: browser.quit() sys.exit()
def __1905_get(movie_site): with open("../movie_sites/" + movie_site + "/crawled_year.txt", "r") as f: lines = f.readlines() if len(lines) > 0: crawling_year = lines[-1].split("`")[0] total_page = lines[-1].split("`")[1] crawling_page = lines[-1].split("`")[2].replace("\n", "") print crawling_year, crawling_page else: crawling_year = '2007' total_page = '-1' crawling_page = '0' print crawling_year, crawling_page if total_page != crawling_page: movie_year = int(crawling_year) else: movie_year = int(crawling_year) + 1 crawling_page = '0' for year in range(movie_year, 2019): year_url = 'http://www.1905.com/mdb/film/list/year-' + str(year) + '/' browser.get(year_url) time.sleep(1) try: WebDriverWait( browser, 10, 2).until(lambda browser: browser.find_element_by_xpath( "//div[@class='lineG pl10 pb12']")) movie_num = browser.find_element_by_xpath( "//div[@class='lineG pl10 pb12']").text.replace("共", "").replace( "部影片", "") print movie_num if int(movie_num) % 30 != 0: for page in range( int(crawling_page) + 1, int(movie_num) / 30 + 2): page_url = year_url + 'o0d0p' + str(page) + '.html' print "共有", int( movie_num) / 30 + 1, "页,", "第", page, "页:", page_url browser.get(page_url) time.sleep(2) try: WebDriverWait(browser, 15, 3).until( lambda browser: browser.find_element_by_xpath( "//div[@class='leftArea']")) movies = browser.find_elements_by_xpath( "//div[@class='leftArea']/ul[@class='inqList pt18']/li" ) movie_list = [] for movie in movies: print year, time.sleep(0.1) movie_name = movie.find_element_by_xpath( "div[@class='text']/p/a").text print movie_name, movie_url = movie.find_element_by_xpath( "div[@class='text']/p/a").get_attribute("href") print movie_url, movie_id = movie_url.split('/film/')[1].replace( "/", "") print movie_id time.sleep(0.1) movie_list.append(year) movie_list.append(movie_name) movie_list.append(movie_url) movie_list.append(movie_id) name = movie_site + str(year) time.sleep(1) connect_redis.__redis_storage(name, movie_list) time.sleep(1) with open( "../movie_sites/" + movie_site + "/crawled_year.txt", "a+") as m: m.write( str(year) + "`" + str(int(movie_num) / 30 + 1) + "`" + str(page) + "\n") except: browser.close() time.sleep(120) __1905_get(second_movie_website) if int(movie_num) % 30 == 0: for page in range( int(crawling_page) + 1, int(movie_num) / 30 + 1): page_url = year_url + 'o0d0p' + str(page) + '.html' print "共有", int( movie_num) / 30, "页,", "第", page, "页:", page_url browser.get(page_url) time.sleep(2) try: WebDriverWait(browser, 15, 3).until( lambda browser: browser.find_element_by_xpath( "//div[@class='leftArea']")) movies = browser.find_elements_by_xpath( "//div[@class='leftArea']/ul[@class='inqList pt18']/li" ) movie_list = [] for movie in movies: print year, time.sleep(0.1) movie_name = movie.find_element_by_xpath( "div[@class='text']/p/a").text print movie_name, movie_url = movie.find_element_by_xpath( "div[@class='text']/p/a").get_attribute("href") print movie_url, movie_id = movie_url.split('/film/')[1].replace( "/", "") print movie_id time.sleep(0.1) movie_list.append(year) movie_list.append(movie_name) movie_list.append(movie_url) movie_list.append(movie_id) name = movie_site + str(year) time.sleep(1) connect_redis.__redis_storage(name, movie_list) time.sleep(1) with open( "../movie_sites/" + movie_site + "/crawled_year.txt", "a+") as m: m.write( str(year) + "`" + str(int(movie_num) / 30) + "`" + str(page) + "\n") except: browser.close() time.sleep(120) __1905_get(second_movie_website) except: browser.close() time.sleep(120) __1905_get(second_movie_website)
def __cbooo_get(movie_site): browser.get("http://www.cbooo.cn/movies") time.sleep(1) with open("../movie_sites/" + movie_site + "/crawled_year.txt", "r") as f: lines = f.readlines() if len(lines) > 0: crawling_year = lines[-1].split("`")[0] total_page = lines[-1].split("`")[1] crawling_page = lines[-1].split("`")[2].replace("\n", "") print crawling_year, total_page, crawling_page else: crawling_year = '2007' total_page = '-1' crawling_page = '0' print crawling_year, total_page, crawling_page if total_page != crawling_page: movie_year = int(crawling_year) else: movie_year = int(crawling_year) + 1 crawling_page = '0' for year in range(movie_year, 2019): try: WebDriverWait( browser, 15, 3).until(lambda browser: browser.find_element_by_xpath( "//select[@id='selYear']")) browser.find_element_by_xpath( "//select[@id='selYear']/option[@value='" + str(year) + "']").click() time.sleep(1) browser.find_element_by_xpath("//input[@id='btnSearch']").click() time.sleep(1) next_page = browser.find_elements_by_xpath( "//ul[@id='ulpage']/li")[-1] time.sleep(1) total_pages = next_page.get_attribute("onclick").split( "1,")[1].replace(")", "") print total_pages for page in range(int(crawling_page) + 1, int(total_pages) + 1): page_url = 'http://www.cbooo.cn/Mdata/getMdata_movie?area=50&type=0&year=' + str( movie_year) + '&initial=全部&pIndex=' + str(page) print "共有" + total_pages + "页。现在爬取第" + str( page) + "页:" + page_url html = urllib2.urlopen(page_url) time.sleep(1) json_content = json.loads(html.read()) movie_content = json_content["pData"] movie_list = [] for movie in movie_content: movie_year = str(year) movie_name = movie['MovieName'] movie_id = movie['ID'] movie_url = 'http://www.cbooo.cn/m/' + movie_id print movie_year, movie_name, movie_url, movie_id time.sleep(0.5) with open("../data/movies.txt", "a+") as f: f.write(movie_name + "``" + movie_year + "\n") movie_list.append(movie_year) movie_list.append(movie_name) movie_list.append(movie_url) movie_list.append(movie_id) time.sleep(0.5) name = movie_site + str(year) time.sleep(1) connect_redis.__redis_storage(name, movie_list) time.sleep(1) with open("../movie_sites/" + movie_site + "/crawled_year.txt", "a+") as m: m.write( str(year) + "`" + total_pages + "`" + str(page) + "\n") except: browser.close() time.sleep(120) __cbooo_get(second_movie_website)
def __douban_get(movie_site): # 集合法断点重续 total_movies = set() crawled_movies = set() error_movies = set() with open("../data/movies.txt", "r") as f: line = f.readlines() for r in range(1, len(line)): total_movies.add(line[r].replace("\n", "")) print "全部电影集合完成!!" with open("../data/movies_crawled.txt", "r") as m: lines = m.readlines() for line in lines: crawled_movies.add(line.replace("\n", "")) print "已爬电影集合完成!!" with open("../movie_sites/豆瓣网/movie_error.log", "r") as n: liness = n.readlines() for line in liness: error_movies.add(line.replace("\n", "")) print "出错电影集合完成!!" movies = set(total_movies - (crawled_movies | error_movies)) for movie in movies: movie_name = movie.split("``")[0] print type(movie_name) movie_year = movie.split("``")[1] print movie_name, movie_year movie_url = 'https://movie.douban.com/subject_search?search_text=' + urllib.quote( movie_name) + '&cat=1002' browser.get(movie_url) time.sleep(2) try: WebDriverWait(browser, 20, 2).until(lambda browser: browser. find_element_by_xpath("//div[@id='root']")) search_result = browser.find_elements_by_xpath( "//div[@class='title']") flag = 0 movie_list = [] for result in search_result: time.sleep(0.2) try: name = result.find_element_by_xpath("a") if movie_year in name.text or str(int(movie_year) + 1) in name.text or str( int(movie_year) - 1) in name.text: movie_url = name.get_attribute("href") print movie_url, movie_id = movie_url.split("subject/")[1].replace( "/", "") print movie_id with open("../data/movies_crawled.txt", "r") as n: n.write(movie_name + "``" + movie_year + "\n") movie_list.append(movie_year) movie_list.append(movie_name) movie_list.append(movie_url) movie_list.append(movie_id) time.sleep(0.5) name = movie_site + str(movie_year) time.sleep(1) connect_redis.__redis_storage(name, movie_list) time.sleep(1) break else: flag = 1 + flag print "this movie is wrong!" except: flag = 1 + flag print "can not find name!!!" pass if flag == int(len(search_result)): with open("../movie_sites/豆瓣网/movie_error.log", "a+") as n: n.write(movie_name + "``" + movie_year + "\n") except: time.sleep(120) __douban_get(third_movie_website)
def __58921_get(movie_site): with open("../movie_sites/" + movie_site + "/crawled_year.txt", "r") as f: lines = f.readlines() if len(lines) > 0: crawling_year = lines[-1].split("`")[0] total_page = lines[-1].split("`")[1] crawling_page = lines[-1].split("`")[2].replace("\n", "") print crawling_year, total_page, crawling_page else: crawling_year = '2007' total_page = '-1' crawling_page = '0' print crawling_year, total_page, crawling_page if total_page != crawling_page: movie_year = int(crawling_year) else: movie_year = int(crawling_year) + 1 crawling_page = '0' for year in range(movie_year, 2019): year_url = 'http://58921.com/alltime/' + str(year) browser.get(year_url) time.sleep(1) try: WebDriverWait( browser, 10, 2).until(lambda browser: browser.find_element_by_xpath( "//div[@class='item-list item_pager']")) total_movies = browser.find_element_by_xpath( "//div[@class='item-list item_pager']//span[@class='pager_number']" ).text if int(total_movies) % 20 == 0: for page in range(int(crawling_page), int(total_movies) / 20): page_url = year_url + '?page=' + str(page) print "共有", int( total_movies) / 20, "页,", "第", page + 1, "页:", page_url browser.get(page_url) time.sleep(1) movie_list = [] try: WebDriverWait(browser, 15, 3).until( lambda browser: browser.find_element_by_xpath( "//div[@class='table-responsive']")) movies = browser.find_elements_by_xpath( "//div[@class='table-responsive']//tbody/tr") for movie in movies: time.sleep(0.2) print year, movie_name = movie.find_elements_by_xpath( "td")[2].text print movie_name, movie_url = movie.find_element_by_xpath( "td/a").get_attribute("href") print movie_url, movie_id = movie_url.split('/film/')[1] print movie_id time.sleep(0.1) movie_list.append(year) movie_list.append(movie_name) movie_list.append(movie_url) movie_list.append(movie_id) name = movie_site + str(year) time.sleep(1) movie_list = [] connect_redis.__redis_storage(name, movie_list) time.sleep(1) with open( "../movie_sites/" + movie_site + "/crawled_year.txt", "a+") as m: m.write( str(year) + "`" + str(int(total_movies) / 20) + "`" + str(page + 1) + "\n") except: browser.close() time.sleep(120) __58921_get(first_movie_website) if int(total_movies) % 20 != 0: for page in range(0, int(total_movies) / 20 + 1): page_url = year_url + '?page=' + str(page) print "共有", int( total_movies ) / 20 + 1, "页,", "第", page + 1, "页:", page_url browser.get(page_url) time.sleep(1) try: WebDriverWait(browser, 15, 3).until( lambda browser: browser.find_element_by_xpath( "//div[@class='table-responsive']")) movies = browser.find_elements_by_xpath( "//div[@class='table-responsive']//tbody/tr") for movie in movies: time.sleep(0.2) print year, movie_name = movie.find_elements_by_xpath( "td")[2].text print movie_name, movie_url = movie.find_element_by_xpath( "td/a").get_attribute("href") print movie_url, movie_id = movie_url.split('/film/')[1] print movie_id time.sleep(0.1) movie_list.append(year) movie_list.append(movie_name) movie_list.append(movie_url) movie_list.append(movie_id) name = movie_site + str(year) time.sleep(1) connect_redis.__redis_storage(name, movie_list) time.sleep(1) with open( "../movie_sites/" + movie_site + "/crawled_year.txt", "a+") as m: m.write( str(year) + "`" + str(int(total_movies) / 20 + 1) + "`" + str(page + 1) + "\n") except: browser.close() time.sleep(120) __58921_get(first_movie_website) except: browser.close() time.sleep(120) __58921_get(first_movie_website)