def main():
    Helper.html_header()
    ycrawler = YCrawler(
        4,
        'http://5550555.tw/chinanews/chinanews_list.php?page_1=a&s_key=1&start_year=2015&start_month=9&start_day=29',
        '5550555')
    ycrawler.crawl()
def __save_bucket(self, to_db=True):
    if to_db:
        storage = Storage()
        storage.connect('crawldb')
        storage.set_collection('crawlset')
        Helper.log("Insert count", len(self.crawlset_bucket))
        storage.insert_documents(self.crawlset_bucket)
    else:
        FileStorage.bulk_write(self.crawlset_bucket, 'content')
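# Storage itself is not shown in this excerpt. Below is a minimal sketch of
# a pymongo-backed implementation, inferred only from the call sites above
# (connect / set_collection / insert_documents); the client settings and
# everything else here are assumptions, not the project's actual code:
from pymongo import MongoClient

class Storage:
    def connect(self, db_name):
        # Assumes a MongoDB instance on the default local host and port.
        self.client = MongoClient('localhost', 27017)
        self.db = self.client[db_name]

    def set_collection(self, collection_name):
        self.collection = self.db[collection_name]

    def insert_documents(self, documents):
        # insert_many requires a non-empty list of dicts.
        if documents:
            self.collection.insert_many(documents)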
def __finalize_crawlset_bucket(self):
    Helper.log("Adding content..")
    # Deduplicate, then make sure every crawlset carries its page content
    # before converting the bucket to plain dictionaries for storage.
    self.crawlset_bucket = set(self.crawlset_bucket)
    json_bucket = []
    for crawlset in self.crawlset_bucket:
        if crawlset.content == '':
            Helper.log('Downloading content for URL', crawlset.link)
            crawlset.content = UrlGrabber.retrieve_html(crawlset.link)
            Helper.log('OK!')
        json_bucket.append(crawlset.to_dictionary())
    self.crawlset_bucket = json_bucket
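# UrlGrabber.retrieve_html is defined outside this excerpt. A minimal
# standard-library sketch of such a helper, assuming pages decode as UTF-8:
import urllib.request

def retrieve_html(link):
    # Fetch the raw page body; replace any bytes that fail to decode.
    with urllib.request.urlopen(link, timeout=10) as response:
        return response.read().decode('utf-8', errors='replace')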
def __fill_crawlset_bucket(self):
    # Breadth-first crawl: grab the seed page, then follow the links found
    # at each depth, until self.level depths have been fetched. This
    # replaces the hand-unrolled level 1-4 blocks, which also double-
    # incremented current_depth after the first grab.
    frontier = [self.site_url]
    for current_depth in range(self.level):
        next_frontier = []
        for url in frontier:
            Helper.log('Depth', current_depth)
            grabber = UrlGrabber(current_depth, url, self.site_root)
            crawlsets = grabber.grab()
            if crawlsets is not None:
                self.crawlset_bucket.extend(crawlsets)
                next_frontier.extend(crawlset.link for crawlset in crawlsets)
        frontier = next_frontier
def get_id():
    uri = Helper.current_uri()
    params = Helper.url_params(uri)
    return params["id"]
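# Helper.url_params is not shown here; it presumably parses the query
# string of the current request URI. A standard-library sketch, assuming
# single-valued parameters:
from urllib.parse import urlparse, parse_qs

def url_params(uri):
    # parse_qs maps each key to a list of values; keep the first of each.
    query = urlparse(uri).query
    return {key: values[0] for key, values in parse_qs(query).items()}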
def main():
    Helper.html_header()
    # doc_id = '562cdf0fe4a4c906e73d4ee1'
    doc_id = get_id()
    html = get_html(doc_id)
    print(html)
def main():
    Helper.html_header()
    crawlsets = retrieve_crawlsets()
    html = generate_html_table(crawlsets)
    print(html)
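# generate_html_table is defined elsewhere. A minimal sketch, assuming
# retrieve_crawlsets returns the MongoDB documents saved by the crawler,
# i.e. dicts with at least a 'link' field (an assumption based on the
# fields used above):
import html

def generate_html_table(crawlsets):
    # One row per crawled document, linking to its URL.
    rows = []
    for crawlset in crawlsets:
        link = html.escape(crawlset.get('link', ''), quote=True)
        rows.append('<tr><td><a href="{0}">{0}</a></td></tr>'.format(link))
    return '<table>{0}</table>'.format('\n'.join(rows))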