Example 1
def main(max_threads):
    # cache = MongoCache(expires=timedelta())
    seed_url = get_urls()
    # print(seed_url)
    cache = MongoCache(expires=timedelta())
    # cache.clear()
    callback = DetailPageCallback(
        path='/Users/zhangming/ENV/fang_crawler/data_csv')
    threaded_crawler(seed_url,
                     cache=None,  # note: the MongoCache created above is not passed to the crawler here
                     scrape_callback=callback,
                     proxies=ProxiesPool(),
                     max_threads=max_threads)
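
A minimal sketch of how this entry point might be invoked; the argparse wiring and the default of 5 threads are assumptions for illustration, not part of the original script:

import argparse

if __name__ == '__main__':
    # read the thread count from the command line and hand it to main()
    parser = argparse.ArgumentParser(description='Run the fang_crawler detail-page crawl')
    parser.add_argument('--max-threads', type=int, default=5,
                        help='number of crawler threads')
    args = parser.parse_args()
    main(args.max_threads)
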
Example 2
    def __call__(self, url, html):
        if url == 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip':
            urls = []
            zf = ZipFile(self.seed_path)
            file_name = zf.namelist()[0]
            df = pd.read_csv(zf.open(file_name))
            # the second CSV column holds the top-1-million domain names
            for website in df.iloc[:, 1].tolist():
                urls.append('http://' + website)
                if len(urls) == self.max_urls:
                    break
            return urls


if __name__ == '__main__':
    cache = MongoCache(expires=timedelta())
    url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'
    # this url can't be downloaded with the default downloader, so the requests
    # package is used in Downloader.py; otherwise the crawl does not run correctly.
    link_crawler(url,
                 link_regex=None,
                 delay=5,
                 max_depth=-1,
                 max_urls=-1,
                 headers=None,
                 proxies=None,
                 num_retries=1,
                 scrape_callback=AlexaCallback(url),
                 cache=cache)
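
For reference, the ZIP-and-CSV handling in the __call__ method above can be exercised on its own. The sketch below fetches the archive with requests and returns the first max_urls entries; the function name top_alexa_urls, the standalone download, and header=None are assumptions for illustration (the project instead reads the archive through its own Downloader and the callback above):

from io import BytesIO
from zipfile import ZipFile

import pandas as pd
import requests

ALEXA_ZIP = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'


def top_alexa_urls(max_urls=500):
    # fetch the archive; as the comment in the example notes, this endpoint
    # may no longer be downloadable
    resp = requests.get(ALEXA_ZIP, timeout=30)
    resp.raise_for_status()
    zf = ZipFile(BytesIO(resp.content))
    file_name = zf.namelist()[0]  # the archive contains a single CSV file
    # the CSV has no header row: column 0 is the rank, column 1 the domain
    df = pd.read_csv(zf.open(file_name), header=None)
    urls = []
    for website in df.iloc[:, 1].tolist():
        urls.append('http://' + website)
        if len(urls) == max_urls:
            break
    return urls
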
Example 3
from datetime import timedelta
from ScrapeCallback import ScrapeClassbackIhdexPage, ScrapeClassbackStreetPage
from DiskCache import MongoCache
import csv
from ProxiesPool import ProxiesPool
# link_crawler (used below) comes from the project's crawler module; its import is not shown in this snippet
# from urllib.parse import urljoin

# old urls, because the new urls contain arguments that can load 100 divs per page.

file_name = '/Users/zhangming/ENV/fang_crawler/data_csv/street_seed_urls.csv'
urls = []
with open(file_name) as csvfile:
    reader = csv.reader(csvfile)
    header = next(reader)  # skip the header row so reading starts from the second line
    for row in reader:
        urls.extend(row)

proxies = ProxiesPool()
cache = MongoCache(expires=timedelta())
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    'Connection': 'keep-alive'
}
# cookies=
cache.clear()
write_csv = ScrapeClassbackIhdexPage(path='/Users/zhangming/ENV/fang_crawler/data_csv')
# return the url of every detail page listed on each index page
for url in urls:
    link_crawler(url, max_urls=5, headers=headers, cookies=None, proxies=proxies, cache=None, scrape_callback=write_csv)
# saved in data_csv/index_urls.csv
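
MongoCache and ProxiesPool come from the project's DiskCache.py and ProxiesPool.py, which are not shown in these examples. As a rough sketch of the idea behind a timedelta-based page cache (the class, database, and field names below are illustrative assumptions over pymongo, not the project's actual implementation):

from datetime import datetime, timedelta

from pymongo import MongoClient


class SimpleMongoCache:
    """Illustrative url -> html cache whose records expire after `expires`."""

    def __init__(self, expires=timedelta(days=30), client=None):
        self.client = client or MongoClient('localhost', 27017)
        self.db = self.client.cache
        self.expires = expires

    def __getitem__(self, url):
        record = self.db.webpage.find_one({'_id': url})
        if record is None:
            raise KeyError(url + ' is not cached')
        if datetime.utcnow() > record['timestamp'] + self.expires:
            raise KeyError(url + ' has expired')
        return record['result']

    def __setitem__(self, url, result):
        record = {'result': result, 'timestamp': datetime.utcnow()}
        self.db.webpage.update_one({'_id': url}, {'$set': record}, upsert=True)

    def clear(self):
        self.db.webpage.delete_many({})

Under this sketch's semantics, expires=timedelta() treats every record as already expired, so cached pages are never reused; whether the project's MongoCache behaves the same way is not shown in these examples.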