from datetime import timedelta

from DiskCache import MongoCache
from ProxiesPool import ProxiesPool
# get_urls, DetailPageCallback and threaded_crawler are assumed to be imported
# from the project's own modules.


def main(max_threads):
    # cache = MongoCache(expires=timedelta())
    seed_url = get_urls()
    # print(seed_url)
    cache = MongoCache(expires=timedelta())
    # cache.clear()
    callback = DetailPageCallback(
        path='/Users/zhangming/ENV/fang_crawler/data_csv')
    # Note: the MongoCache instance above is created but not passed in, so this
    # crawl runs with caching disabled (cache=None).
    threaded_crawler(seed_url, cache=None, scrape_callback=callback,
                     proxies=ProxiesPool(), max_threads=max_threads)
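A minimal invocation sketch, assuming this function is the script's entry point; the thread count of 5 is only an illustration, not a value taken from the project:

if __name__ == '__main__':
    # Hypothetical entry point: start the threaded detail-page crawl.
    main(max_threads=5)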
# Method of AlexaCallback; assumes `from zipfile import ZipFile` and
# `import pandas as pd` at module level.
def __call__(self, url, html):
    if url == 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip':
        zf = ZipFile(self.seed_path)
        file_name = zf.namelist()[0]
        df = pd.read_csv(zf.open(file_name))
        urls = []
        # Column 1 of the CSV holds the ranked top-1-million domains.
        for website in df.iloc[:, 1].tolist():
            urls.append('http://' + website)
            if len(urls) == self.max_urls:
                break
        return urls


if __name__ == '__main__':
    cache = MongoCache(expires=timedelta())
    url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'
    # This url can't be downloaded directly, so the requests package is used
    # in Downloader.py; otherwise the script runs incorrectly.
    link_crawler(url, link_regex=None, delay=5, max_depth=-1, max_urls=-1,
                 headers=None, proxies=None, num_retries=1,
                 scrape_callback=AlexaCallback(url), cache=cache)
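The comment above notes that the Alexa zip cannot be fetched by the default downloader, so Downloader.py falls back to the requests package. A minimal sketch of that idea, assuming nothing about Downloader.py's real interface (the helper name fetch_alexa_zip and the in-memory ZipFile handling are illustrative only):

import io

import requests
from zipfile import ZipFile


def fetch_alexa_zip(url, timeout=30):
    # Hypothetical helper: download the zip with requests and open it in
    # memory, instead of relying on the default urllib-based downloader.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return ZipFile(io.BytesIO(response.content))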
from datetime import timedelta
from ScrapeCallback import ScrapeClassbackIhdexPage, ScrapeClassbackStreetPage
from DiskCache import MongoCache
import csv
from ProxiesPool import ProxiesPool
# from urllib.parse import urljoin
# link_crawler is assumed to be imported from the project's crawler module.

# Old urls; the new urls contain arguments that can load 100 divs in one page.
file_name = '/Users/zhangming/ENV/fang_crawler/data_csv/street_seed_urls.csv'
urls = []
with open(file_name) as csvfile:
    reader = csv.reader(csvfile)
    header = next(reader)  # consume the header so reading starts from the second row
    for row in reader:
        urls.extend(row)

proxies = ProxiesPool()
cache = MongoCache(expires=timedelta())
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    'Connection': 'keep-alive'
}
# cookies=
cache.clear()

# Return the address of every detail page found on each index (listing) page.
write_csv = ScrapeClassbackIhdexPage(
    path='/Users/zhangming/ENV/fang_crawler/data_csv')
for url in urls:
    link_crawler(url, max_urls=5, headers=headers, cookies=None, proxies=proxies,
                 cache=None, scrape_callback=write_csv)
# Results are saved in data_csv/index_urls.csv
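The detail-page crawl in main() starts from get_urls(); one plausible reading, stated here only as an assumption, is that it loads the index_urls.csv file this script writes. A hypothetical sketch of such a loader:

import csv


def get_urls(path='/Users/zhangming/ENV/fang_crawler/data_csv/index_urls.csv'):
    # Hypothetical loader: return every URL the index-page crawl wrote out.
    urls = []
    with open(path) as csvfile:
        for row in csv.reader(csvfile):
            urls.extend(row)
    return urls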