Example #1
def main():
    Helper.html_header()
    ycrawler = YCrawler(
        4,
        'http://5550555.tw/chinanews/chinanews_list.php?page_1=a&s_key=1&start_year=2015&start_month=9&start_day=29',
        '5550555')
    ycrawler.crawl()
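The YCrawler constructor itself is not among these examples. Judging from Examples #2-#4, which reference self.level, self.site_url, self.site_root and self.crawlset_bucket, the three arguments are most likely the maximum crawl depth, the start URL, and the site root. A hypothetical skeleton under those assumptions:

class YCrawler:
    # Hypothetical skeleton inferred from Examples #2-#4; the real
    # constructor and crawl() are not shown in this listing.
    def __init__(self, level, site_url, site_root):
        self.level = level            # maximum crawl depth
        self.site_url = site_url      # start URL
        self.site_root = site_root    # root used to keep the crawl on-site
        self.crawlset_bucket = []

    def crawl(self):
        # Assumed orchestration of the private methods shown below.
        self.__fill_crawlset_bucket()
        self.__finalize_crawlset_bucket()
        self.__save_bucket()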
Example #2
def __save_bucket(self, to_db=True):
    # Persist the finalized bucket either to MongoDB or to flat files.
    if to_db:
        storage = Storage()
        storage.connect('crawldb')
        storage.set_collection('crawlset')
        Helper.log("Insert count", len(self.crawlset_bucket))
        storage.insert_documents(self.crawlset_bucket)
    else:
        FileStorage.bulk_write(self.crawlset_bucket, 'content')
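Storage is not defined in this listing. Given the connect('crawldb'), set_collection('crawlset') and insert_documents() calls, it reads like a thin wrapper around a MongoDB driver; a minimal sketch using pymongo (an assumption, not the project's actual class):

from pymongo import MongoClient

class Storage:
    # Hypothetical pymongo-backed wrapper matching the calls in __save_bucket.
    def __init__(self, host='localhost', port=27017):
        self.client = MongoClient(host, port)
        self.db = None
        self.collection = None

    def connect(self, db_name):
        self.db = self.client[db_name]

    def set_collection(self, name):
        self.collection = self.db[name]

    def insert_documents(self, documents):
        # insert_many expects a non-empty list of dictionaries.
        self.collection.insert_many(documents)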
Example #3
def __finalize_crawlset_bucket(self):
    Helper.log("Adding content..")
    # Deduplicate the bucket, download any missing page content, then
    # convert every crawlset to a plain dictionary ready for storage.
    self.crawlset_bucket = set(self.crawlset_bucket)
    json_bucket = []
    for crawlset in self.crawlset_bucket:
        if crawlset.content == '':
            Helper.log('Downloading content for URL', crawlset.link)
            crawlset.content = UrlGrabber.retrieve_html(crawlset.link)
            Helper.log('OK!')
        json_bucket.append(crawlset.to_dictionary())
    self.crawlset_bucket = json_bucket
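The set() call above only deduplicates if the crawlset objects are hashable, and to_dictionary() has to produce something the document store will accept. The real class is not shown; a hypothetical sketch that satisfies both requirements, assuming two crawlsets count as equal when their links match:

class Crawlset:
    # Hypothetical record type; the real class is not in this listing.
    def __init__(self, link, content=''):
        self.link = link
        self.content = content

    # Required for the set() deduplication in __finalize_crawlset_bucket.
    def __eq__(self, other):
        return isinstance(other, Crawlset) and self.link == other.link

    def __hash__(self):
        return hash(self.link)

    def to_dictionary(self):
        return {'link': self.link, 'content': self.content}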
Example #4
def __fill_crawlset_bucket(self):
    # Breadth-first crawl: links found at depth n are grabbed at
    # depth n + 1, until self.level levels have been collected.
    frontier = [self.site_url]
    current_depth = 0
    while current_depth < self.level and frontier:
        Helper.log('Depth', current_depth)
        next_frontier = []
        for url in frontier:
            grabber = UrlGrabber(current_depth, url, self.site_root)
            crawlsets = grabber.grab()
            if crawlsets is not None:
                self.crawlset_bucket.extend(crawlsets)
                next_frontier.extend(crawlset.link for crawlset in crawlsets)
        frontier = next_frontier
        current_depth += 1
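UrlGrabber does the actual fetching and link extraction but is not shown here. Its static retrieve_html helper, used in Example #3, could be as simple as a standard-library download; a sketch under that assumption:

import urllib.request

def retrieve_html(url, timeout=10):
    # Hypothetical stand-in for UrlGrabber.retrieve_html: fetch a page
    # and decode it, replacing bytes that are not valid UTF-8.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        return response.read().decode('utf-8', errors='replace')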
Example #5
def get_id():
    # Read the "id" query parameter from the current request URI.
    # Returning directly avoids shadowing the built-in id().
    uri = Helper.current_uri()
    params = Helper.url_params(uri)
    return params["id"]
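Helper.url_params is not shown either. One plausible implementation parses the query string with urllib.parse; parse_qs returns a list of values per key, so this sketch keeps only the first:

from urllib.parse import parse_qs, urlparse

def url_params(uri):
    # Hypothetical version of Helper.url_params, e.g.
    # 'index.py?id=abc' -> {'id': 'abc'}.
    return {key: values[0]
            for key, values in parse_qs(urlparse(uri).query).items()}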
Example #6
def main():
    Helper.html_header()
    # crawl_id = '562cdf0fe4a4c906e73d4ee1'
    crawl_id = get_id()
    html = get_html(crawl_id)
    print(html)
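get_html is not among the examples. The commented-out sample id is 24 hex characters, which matches a MongoDB ObjectId, so a plausible sketch (hypothetical, like the Storage class above) looks the document up by id in the collection used in Example #2:

from bson.objectid import ObjectId
from pymongo import MongoClient

def get_html(doc_id):
    # Hypothetical lookup: fetch one crawled document from the
    # crawldb/crawlset collection and return its stored content.
    collection = MongoClient()['crawldb']['crawlset']
    document = collection.find_one({'_id': ObjectId(doc_id)})
    return document['content'] if document else ''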
Example #7
def main():
    Helper.html_header()
    # Load every stored crawlset and render it as an HTML table.
    crawlsets = retrieve_crawlsets()
    html = generate_html_table(crawlsets)
    print(html)
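Neither retrieve_crawlsets nor generate_html_table is shown. Assuming the stored documents carry the link and content fields used elsewhere in this listing, a minimal table renderer might look like this (hypothetical names throughout):

import html

def generate_html_table(crawlsets):
    # Hypothetical renderer: one linked row per stored crawlset document.
    rows = []
    for doc in crawlsets:
        link = html.escape(doc.get('link', ''))
        rows.append('<tr><td><a href="{0}">{0}</a></td></tr>'.format(link))
    return '<table>\n' + '\n'.join(rows) + '\n</table>'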