Code example #1
    c2_password = config[
        'admin_cs2_password'] if 'admin_cs2_password' in config else None
    backup_dir = config['backups_path'] \
        if 'backups_path' in config and config['backups_path'] != "." \
        else script_dir
    report_path = config['report_path'] if 'report_path' in config else None
    latest_backup_symlink = config[
        'latest_backup_symlink'] if 'latest_backup_symlink' in config else None

    if not os.path.isdir(backup_dir):
        os.makedirs(backup_dir)

    a = Authenticator(event_name, c2_login, c2_password, interactive=False)
    if not a.sign_in():
        exit()

    f = Fetcher(a.event_name, a.cookie)
    if not f.fetch_data():
        exit()
    f.fetch_etickets()
    f.fetch_details()

    db_path = os.path.join(backup_dir,
                           datetime.now().strftime('%y-%m-%d_%H-%M-%S.db'))
    MakeDB(db_path, f.data)

    if latest_backup_symlink:
        # Remove any stale symlink before pointing it at the new backup.
        if os.path.exists(latest_backup_symlink):
            os.remove(latest_backup_symlink)
        try:
            os.symlink(db_path, latest_backup_symlink)
        except OSError:
            pass  # symlink creation may be unsupported on this platform
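
The repeated pattern of config['key'] if 'key' in config else None used above can be written more compactly with dict.get, which returns None (or a supplied default) when the key is missing. A minimal equivalent sketch of the same assignments:

c2_password = config.get('admin_cs2_password')
report_path = config.get('report_path')
latest_backup_symlink = config.get('latest_backup_symlink')
backup_dir = config.get('backups_path', script_dir)
if backup_dir == ".":
    backup_dir = script_dir
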
Code example #2
def getLinks(url):
    """Fetch url and print each link found on the page, numbered from 0."""
    page = Fetcher(url)
    page.fetch()
    for i, link in enumerate(page):
        print("%d. %s" % (i, link))
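
Since the loop uses enumerate(page), Fetcher is assumed to be iterable over the URLs it collected during fetch(). A call like the following (the URL is purely illustrative) prints them as a numbered list:

getLinks("http://example.com/")
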
Code example #3
    def crawl(self):
        """ Main function in the crawling process.  Core algorithm is:
        q <- starting page
        while q not empty:
           url <- q.get()
           if url is new and suitable:
              page <- fetch(url)   
              q.put(urls found in page)
           else:
              nothing

        "New and suitable" means that we don't re-visit URLs we've
        already fetched, and that user-supplied criteria such as the
        maximum search depth are respected.
        """

        q = Queue()  # FIFO work queue of (url, depth) pairs to visit
        q.put((self.root, 0))

        while not q.empty():
            this_url, depth = q.get()

            # Non-URL-specific filter: discard anything over the depth limit.
            if depth > self.depth_limit:
                continue

            # Apply URL-based filters; collect the filters that reject this URL.
            do_not_follow = [
                f for f in self.pre_visit_filters if not f(this_url)
            ]

            # Special-case depth 0 (the starting URL).
            if depth == 0 and do_not_follow:
                print("Whoops! Starting URL %s rejected by the following filters: %s"
                      % (this_url, do_not_follow), file=sys.stderr)

            # If no filters failed (that is, all passed), process the URL.
            if not do_not_follow:
                try:
                    self.visited_links.add(this_url)
                    self.num_followed += 1
                    page = Fetcher(this_url)
                    page.fetch()
                    for link_url in [
                            self._pre_visit_url_condense(l)
                            for l in page.out_links()
                    ]:
                        if link_url not in self.urls_seen:
                            q.put((link_url, depth + 1))
                            self.urls_seen.add(link_url)

                        do_not_remember = [
                            f for f in self.out_url_filters if not f(link_url)
                        ]
                        if not do_not_remember:
                            self.num_links += 1
                            self.urls_remembered.add(link_url)
                            link = Link(this_url, link_url, "href")
                            if link not in self.links_remembered:
                                self.links_remembered.add(link)
                except Exception as e:
                    print("ERROR: Can't process url '%s' (%s)" % (this_url, e),
                          file=sys.stderr)
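
crawl() treats self.pre_visit_filters and self.out_url_filters as lists of callables that take a URL string and return a truthy value to accept it; a URL is only processed (or remembered) when no filter rejects it. Below is a minimal sketch of such a filter under that assumption; same_host_filter and allowed_host are illustrative names, not part of the original crawler:

import re

def same_host_filter(allowed_host):
    """Return a filter that accepts only http(s) URLs on allowed_host."""
    pattern = re.compile(r"^https?://%s([/:?#]|$)" % re.escape(allowed_host))
    def accept(url):
        return bool(pattern.match(url))
    return accept

# For example: crawler.pre_visit_filters = [same_host_filter("example.com")]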