def __init__(self):
    self.urls = url_manager.Url_Manager()
    self.downloader = html_downLoader.HtmlDownLoader()
    self.parser = html_parser.HtmlParser()
    self.outper = html_outputer.OutPuter()
    # self.cache = disk_cache.DiskCache()
    self.cache = mongo_cache.MongoCache()
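# A minimal sketch of how these components are typically wired together in a
# crawl loop; the method names on Url_Manager / HtmlParser / OutPuter below
# (add_new_url, has_new_url, get_new_url, add_new_urls, parse, collect_data,
# output_html) are assumptions, not confirmed by the snippet above, and the
# cache is presumably consulted inside the downloader.
def craw(self, root_url):
    self.urls.add_new_url(root_url)
    while self.urls.has_new_url():
        new_url = self.urls.get_new_url()
        html = self.downloader.download(new_url)       # fetch the page
        new_urls, data = self.parser.parse(new_url, html)
        self.urls.add_new_urls(new_urls)               # queue newly discovered links
        self.outper.collect_data(data)                 # accumulate scraped fields
    self.outper.output_html()                          # write out the results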
def main():
    template_url = 'http://127.0.0.1:8000/places/ajax/search.json?page={}&page_size=10&search_term={}'
    countries = set()
    # download = downloader.Downloader(mongo_cache.MongoCache())
    cache = mongo_cache.MongoCache()
    cache.clear()
    download = downloader.Downloader(cache=cache)
    for letter in string.lowercase:
        page = 0
        while True:
            html = download(template_url.format(page, letter))
            try:
                ajax = json.loads(html)
            except ValueError as e:
                print e
                ajax = None
            else:
                for record in ajax['records']:
                    countries.add(record['country'])
            page += 1
            if ajax is None or page >= ajax['num_pages']:
                break
    open('countries.txt', 'w').write('\n'.join(sorted(countries)))
def download_comments(self, url):
    # Comments are stored page by page here; they still need further cleaning
    # into the required storage format.
    downloader = Downloader(cache=mongo_cache.MongoCache())
    try:
        data, code = downloader(url)
        if not data:
            raise Exception('crawl result is empty')
        # Strip the JSONP wrapper, then rewrite the JSON literals so the
        # payload can be parsed safely with ast.literal_eval.
        dictStr = data[data.find('(') + 1:-2]
        astDictStr = dictStr.replace("true", "True").replace("false", "False").replace("null", "None")
        dictResult = ast.literal_eval(astDictStr)
        oriComments = dictResult["comments"]
        # Refine the raw records down to what we need: content and score.
        comments = []
        for com in oriComments:
            comments.append({
                'content': com["content"].decode("GBK").replace(u'\n', u'。'),
                'score': com["score"],
            })
        return comments
    except Exception as exc:
        raise self.retry(exc=exc)  # follow the default retry parameters
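# A self-contained illustration of the JSONP-unwrapping trick above; the
# callback name and payload are made up. Note that the blunt replace() calls
# would also rewrite 'true'/'false'/'null' occurring inside string values.
import ast

raw = 'fetchJSON_comment({"comments": [{"content": "good", "score": 5}], "more": false, "tag": null});'
body = raw[raw.find('(') + 1:-2]                   # drop 'callback(' and ');'
body = body.replace("true", "True").replace("false", "False").replace("null", "None")
parsed = ast.literal_eval(body)                    # safe literal parsing, no eval()
print(parsed["comments"][0]["score"])              # -> 5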
def main():
    writer = csv.writer(open(CSV_FILE, 'w'))
    downloader = Downloader(cache=mongo_cache.MongoCache())
    html = downloader(URL)
    ajax = json.loads(html)
    for record in ajax['records']:
        writer.writerow([record['country']])
    print('Records written in {}'.format(CSV_FILE))
def task(base_url, start, stop, db_helper, col):
    # Join all cookie pairs into a single Cookie header (the original dict
    # comprehension kept only the last pair).
    cookie = {'Cookie': '; '.join('%s=%s' % (k, v) for k, v in COOKIE.items())}
    downloader = Downloader(cache=mongo_cache.MongoCache(), cookie=cookie)
    try:
        for i in range(start, stop, 20):
            new_url = set_query_parameter(base_url, 'start', i)
            # --------------- input stream of the I/O flow --------------------
            html, code = downloader(new_url)
            if code != 200:
                print u'request error...'
                break
            # --------------- parsing / normalization -------------------------
            comm_elements = lxml.html.fromstring(html).cssselect(
                'div#comments .comment')
            comments = []
            for e in comm_elements:
                # Build the comment document for MongoDB.
                comment = {}
                comment['comment-vote'] = int(
                    e.cssselect('h3 span.comment-vote span.votes')
                    [0].text_content())
                if not e.cssselect('h3 span.comment-info span.rating'):
                    comment['comment-rating'] = 0
                else:
                    # e.g. class="allstar40 rating": the second-to-last char
                    # of the first class token is the star score.
                    comment['comment-rating'] = int(
                        e.cssselect('h3 span.comment-info span.rating')[0].get(
                            'class').split(' ')[0][-2])
                comment['comment-text'] = e.cssselect(
                    'p')[0].text_content().encode('utf-8')
                comments.append(comment)
            # --------------- output stream of the I/O flow -------------------
            print len(comments)
            db_helper.insert_documents(collection=col, documents=comments)
            print 'finish %d th download' % (i / 20)
    finally:
        # Close once, after all pages are handled; closing inside the loop
        # (as the original did) killed the connection after the first batch.
        db_helper.close()
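# set_query_parameter is called above but not defined in this snippet; a
# minimal sketch of what it presumably does (rewrite a single query-string
# parameter), portable across Python 2 and 3:
try:
    from urllib.parse import urlparse, urlunparse, parse_qsl, urlencode  # Python 3
except ImportError:
    from urlparse import urlparse, urlunparse, parse_qsl                 # Python 2
    from urllib import urlencode

def set_query_parameter(url, param_name, param_value):
    parts = urlparse(url)
    query = dict(parse_qsl(parts.query))
    query[param_name] = param_value                  # add or overwrite the parameter
    return urlunparse(parts._replace(query=urlencode(query)))

# e.g. set_query_parameter('http://example.com/comments?start=0', 'start', 20)
# -> 'http://example.com/comments?start=20'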
def main(string):
    countries = set()
    downloader = Downloader(cache=mongo_cache.MongoCache())
    # Note: the parameter shadows the stdlib string module; only the
    # characters of the given search string are iterated here.
    for letter in string.lower():
        page = 0
        while True:
            html = downloader(TEMPLATE_URL.format(page, letter))
            try:
                ajax = json.loads(html)
            except ValueError as e:
                print(e)
                ajax = None
            else:
                for record in ajax['records']:
                    countries.add(record['country'])
            page += 1
            if ajax is None or page >= ajax['num_pages']:
                break
    with open(TXT_FILE, 'w') as f:
        f.write('\n'.join(sorted(countries)))
    print('Records written in {}'.format(TXT_FILE))
def main():
    template_url = 'http://example.webscraping.com/ajax/search.json?page={}&page_size=10&search_term={}'
    countries = set()
    html_cache = mongo_cache.MongoCache()
    download = downloader.Downloader(delay=3, num_retries=1, timeout=60, cache=html_cache)
    for search_term in string.ascii_lowercase:
        page = 1
        while True:
            html = download(template_url.format(page, search_term))
            try:
                ajax = json.loads(html)
            except ValueError as e:
                print(e)
                ajax = None
            else:
                for record in ajax['records']:
                    countries.add(record['country'])
            page += 1
            if ajax is None or page >= ajax['num_pages']:
                break
    open('countries.txt', 'w').write('\n'.join(sorted(countries)))
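# The Downloader above receives the cache as a constructor argument; in the
# book-style design this implies a mapping contract (an assumption here):
# cache[url] returns a stored result or raises KeyError on a miss, and
# cache[url] = result stores one. A minimal in-memory stand-in for
# mongo_cache.MongoCache under that assumption:
class DictCache(object):
    def __init__(self):
        self._store = {}
    def __getitem__(self, url):
        return self._store[url]          # KeyError signals a cache miss
    def __setitem__(self, url, result):
        self._store[url] = result
    def clear(self):
        self._store.clear()              # mirrors the clear() used earlier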
def main():
    template_url = 'http://example.webscraping.com/places/ajax/search.json?&search_term={}&page_size=10&page={}'
    countries = set()  # collect into a set first, since a set stores no duplicates
    download = downloader.Downloader(mongo_cache.MongoCache())
    for letter in string.lowercase:  # search through the letters a-z
        page = 0
        while True:
            html = download(template_url.format(letter, page))
            try:
                ajax = json.loads(html)  # parse the JSON response into a dict with the json module
            except ValueError as e:
                print e
                ajax = None
            else:
                for record in ajax['records']:
                    # print record['country']
                    countries.add(record['country'])
            page += 1
            if ajax is None or page >= ajax['num_pages']:
                break
    # Use a raw string so the backslash in the Windows path is not treated as an escape.
    open(r'D:\countries.txt', 'w').write('\n'.join(sorted(countries)))
def process_crawler(args, **kwargs):
    num_cpus = multiprocessing.cpu_count()
    # pool = multiprocessing.Pool(processes=num_cpus)
    print('Starting {} processes'.format(num_cpus))
    html_cache1 = mongo_cache.MongoCache()
    html_cache1.clear()
    mongo_result1 = mongo_result.MongoResult()
    mongo_result1.clear()
    processes = []
    for i in range(num_cpus):
        # Note: threaded_crawler(...) executes here in the parent; only the
        # bound .run method is handed to each child process as the target.
        p = multiprocessing.Process(
            target=threaded_crawler(seed_url=target_url,
                                    link_regex='/(places|view)',
                                    scrape_callback=mongo_result1,
                                    html_cache=html_cache1).run,
            args=[args], kwargs=kwargs)
        # parsed = pool.apply_async(threaded_link_crawler, args, kwargs)
        p.start()
        processes.append(p)
    # wait for processes to complete
    for p in processes:
        p.join()
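# Caveat sketch: because threaded_crawler(...) above is constructed in the
# parent, the crawler object (and any live MongoDB handles it holds) must
# survive the trip into each child process. A common alternative builds
# everything inside the worker instead; this assumes threaded_crawler
# accepts the same keyword arguments as above.
def _crawler_worker(args, kwargs):
    crawler = threaded_crawler(seed_url=target_url,
                               link_regex='/(places|view)',
                               scrape_callback=mongo_result.MongoResult(),
                               html_cache=mongo_cache.MongoCache())
    crawler.run(args, **kwargs)

# usage inside process_crawler:
#     p = multiprocessing.Process(target=_crawler_worker, args=(args, kwargs))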