def crawl(self):
        """ Main function in the crawling process.  Core algorithm is:
        q <- starting page
        while q not empty:
           url <- q.get()
           if url is new and suitable:
              page <- fetch(url)   
              q.put(urls found in page)
           else:
              nothing

        "New and suitable" means that we don't re-visit URLs we've
        already fetched, and that user-supplied criteria such as the
        maximum search depth are checked.
        """
        
        q = Queue()
        q.put((self.root, 0))

        while not q.empty():
            this_url, depth = q.get()
            
            #Non-URL-specific filter: Discard anything over depth limit
            if depth > self.depth_limit:
                continue
            
            #Apply URL-based filters.
            do_not_follow = [f for f in self.pre_visit_filters if not f(this_url)]
            
            #Special-case depth 0 (starting URL)
            if depth == 0 and do_not_follow:
                print >> sys.stderr, "Whoops! Starting URL %s rejected by the following filters: %s" % (this_url, do_not_follow)

            #If no filters failed (that is, all passed), process URL
            if not do_not_follow:
                try:
                    self.visited_links.add(this_url)
                    self.num_followed += 1
                    page = Fetcher(this_url)
                    page.fetch()
                    for link_url in [self._pre_visit_url_condense(l) for l in page.out_links()]:
                        if link_url not in self.urls_seen:
                            q.put((link_url, depth+1))
                            self.urls_seen.add(link_url)
                            
                        do_not_remember = [f for f in self.out_url_filters if not f(link_url)]
                        if not do_not_remember:
                            self.num_links += 1
                            self.urls_remembered.add(link_url)
                            link = Link(this_url, link_url, "href")
                            if link not in self.links_remembered:
                                self.links_remembered.add(link)
                except Exception, e:
                    print >>sys.stderr, "ERROR: Can't process url '%s' (%s)" % (this_url, e)
def getLinks(url):
    page = Fetcher(url)
    page.fetch()
    for i, url in enumerate(page):
        print "%d. %s" % (i, url)
    c2_password = config['admin_cs2_password'] \
        if 'admin_cs2_password' in config else None
    backup_dir = config['backups_path'] \
        if 'backups_path' in config and config['backups_path'] != "." \
        else script_dir
    report_path = config['report_path'] if 'report_path' in config else None
    latest_backup_symlink = config['latest_backup_symlink'] \
        if 'latest_backup_symlink' in config else None

    if not os.path.isdir(backup_dir):
        os.makedirs(backup_dir)

    a = Authenticator(event_name, c2_login, c2_password, interactive=False)
    if not a.sign_in():
        exit()

    f = Fetcher(a.event_name, a.cookie)
    if not f.fetch_data():
        exit()
    f.fetch_etickets()
    f.fetch_details()

    db_path = os.path.join(backup_dir,
                           datetime.now().strftime('%y-%m-%d_%H-%M-%S.db'))
    MakeDB(db_path, f.data)

    if latest_backup_symlink:
        if os.path.exists(latest_backup_symlink):
            os.remove(latest_backup_symlink)
        try:
            os.symlink(db_path, latest_backup_symlink)
        except OSError:
            pass  # handler body elided in this excerpt
class FetcherTest(unittest.TestCase):
	""" testing for success of Fetcher functions """

	def get_data_file(self, filename):
		""" returns the content of a test data file in ./data"""
		test_data = os.path.join(os.path.dirname(__file__), "data")
		f = open(os.path.join(test_data,filename))
		data = f.read()
		f.close()
		return data

	def setUp(self):
		self.fetcher = Fetcher()

	def test_get_full_url(self):
		control_data = "https://api.github.com/helloworld"
		result_url = self.fetcher.get_full_url("helloworld")
		self.assertEqual(control_data, result_url, "Full URL does not match: "+control_data+" vs "+result_url)

	def test_process_repo_single_repo(self):
		self.fetcher.get_from_net = Mock(return_value=self.get_data_file("octocat.Spoon-Knife.json"))
		result = self.fetcher.process_repo("")

		self.assertIsInstance(result, type(list()), "Result was not a list")
		self.assertNotEqual(len(result), 0, "List is empty")
		self.assertEqual(len(result), 1, "List has extra items")
		self.assertIsInstance(result[0], type(dict()), "List item is not a dictionary: "+repr(result[0]))

		# testing membership
		self.assertIn("full_name", result[0], "Full name missing from dictionary: "+ repr(result[0]))
		self.assertIn("name", result[0], "Name missing from dictionary: "+ repr(result[0]))
		self.assertIn("fork", result[0], "Fork missing from dictionary: "+ repr(result[0]))
		self.assertIn("url", result[0], "URL missing from dictionary: "+ repr(result[0]))
		self.assertIn("language", result[0], "Language missing from dictionary: "+ repr(result[0]))
		self.assertIn("created", result[0], "Created missing from dictionary: "+ repr(result[0]))

		# testing values
		self.assertEqual(result[0]["full_name"], "octocat/Spoon-Knife", "Full name does not match, Fullname: "+ repr(result[0]["full_name"]))
		self.assertEqual(result[0]["name"], "Spoon-Knife", "Name does not match, Name: "+ repr(result[0]["name"]))
		self.assertEqual(result[0]["fork"], False, "Fork does not match, Fork: "+ repr(result[0]["fork"]))
		self.assertEqual(result[0]["url"], "https://api.github.com/repos/octocat/Spoon-Knife", "URL does not match, URL: "+ repr(result[0]["url"]))
		self.assertEqual(result[0]["language"], None, "Langauge does not match, Language: "+ repr(result[0]["language"]))
		self.assertEqual(result[0]["created"], "2011-01-27T19:30:43Z", "Created does not match, Created: "+ repr(result[0]["created"]))

	def test_process_repo_multiple_repo(self):
		self.fetcher.get_from_net = Mock(return_value=self.get_data_file("octocat.json"))
		result = self.fetcher.process_repo("", True)

		self.assertIsInstance(result, type(list()), "Result was not a list")
		self.assertNotEqual(len(result), 0, "List is empty")
		self.assertEqual(len(result), 3, "List has extra items")
		self.assertIsInstance(result[0], type(dict()), "List item is not a dictionary: "+repr(result[0]))
		self.assertIsInstance(result[1], type(dict()), "List item is not a dictionary: "+repr(result[1]))
		self.assertIsInstance(result[2], type(dict()), "List item is not a dictionary: "+repr(result[2]))

		# testing membership - first item
		self.assertIn("full_name", result[0], "Full name missing from dictionary: "+ repr(result[0]))
		self.assertIn("name", result[0], "Name missing from dictionary: "+ repr(result[0]))
		self.assertIn("fork", result[0], "Fork missing from dictionary: "+ repr(result[0]))
		self.assertIn("url", result[0], "URL missing from dictionary: "+ repr(result[0]))
		self.assertIn("language", result[0], "Language missing from dictionary: "+ repr(result[0]))
		self.assertIn("created", result[0], "Created missing from dictionary: "+ repr(result[0]))

		# testing membership - second item
		self.assertIn("full_name", result[1], "Full name missing from dictionary: "+ repr(result[1]))
		self.assertIn("name", result[1], "Name missing from dictionary: "+ repr(result[1]))
		self.assertIn("fork", result[1], "Fork missing from dictionary: "+ repr(result[1]))
		self.assertIn("url", result[1], "URL missing from dictionary: "+ repr(result[1]))
		self.assertIn("language", result[1], "Language missing from dictionary: "+ repr(result[1]))
		self.assertIn("created", result[1], "Created missing from dictionary: "+ repr(result[1]))

		# testing values - third item
		self.assertEqual(result[2]["full_name"], "octocat/ThisIsATest", "Full name does not match, Fullname: "+ repr(result[2]["full_name"]))
		self.assertEqual(result[2]["name"], "ThisIsATest", "Name does not match, Name: "+ repr(result[2]["name"]))
		self.assertEqual(result[2]["fork"], False, "Fork does not match, Fork: "+ repr(result[2]["fork"]))
		self.assertEqual(result[2]["url"], "https://api.github.com/repos/octocat/ThisIsATest", "URL does not match, URL: "+ repr(result[2]["url"]))
		self.assertEqual(result[2]["language"], None, "Langauge does not match, Language: "+ repr(result[2]["language"]))
		self.assertEqual(result[2]["created"], "2012-03-07T23:25:47Z", "Created does not match, Created: "+ repr(result[2]["created"]))

	def tearDown(self):
		self.fetcher = None
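The test class above relies on unittest, with Fetcher.get_from_net replaced by a Mock; assuming the usual imports (unittest, os, Mock, and the Fetcher module) sit at the top of the test file, the suite can be run through the standard entry point:

# Standard unittest entry point (the surrounding module layout is an assumption, not shown in the excerpt).
if __name__ == '__main__':
	unittest.main()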
Example #8
    event_name = config['event_name']
    c2_login = config['admin_cs2_name']
    c2_password = config['admin_cs2_password'] \
        if 'admin_cs2_password' in config else None
    db_path = config['db_path']
    sql = config['sql_after_get'].strip() if 'sql_after_get' in config else None

    all_data = len(sys.argv) > 1 and sys.argv[1] == '-a'

    a = Authenticator(event_name, c2_login, c2_password)
    if not a.sign_in():
        exit()

    print()
    f = Fetcher(a.event_name, a.cookie)
    if not f.fetch_data():
        exit()

    if all_data:
        if not f.fetch_etickets():
            exit()
        if not f.fetch_details():
            exit()

    print('\nCreating ' + db_path + '...')
    MakeDB(db_path, f.data)

    if sql:
        from tabulate import tabulate
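        # The original example is truncated at the import above. Presumably the
        # configured sql_after_get statement is run against the freshly written
        # database and the rows printed with tabulate; the lines below are a
        # hypothetical sketch of that, not code from the original source.
        import sqlite3
        conn = sqlite3.connect(db_path)
        cur = conn.execute(sql)
        print(tabulate(cur.fetchall(), headers=[col[0] for col in cur.description]))
        conn.close()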
Example #9
def main(user='', repo='', logfile='', frmt='json'):
    """Entry point
    :param user: username
    :param repo: repo name (if username is provided) or full name (if it isn't)
    :param logfile: path of the log file to write to; stdout is used if empty
    :param frmt: output format, csv or json (json not yet implemented; only csv is written)
    """
    repo_url = ''
    single_repo = False  # True if fetching a single repo, False if fetching many
    public_repos = False  # True if the multiple repos to fetch are the public repos
    count = 0  # maximum number of repos to fetch when public_repos is selected
    if user != '' and repo != '':
        fullname = "{0}/{1}".format(user, repo)
        repo_url = REPO_URL.format(full_name=fullname)
        single_repo = True
    elif user != '':
        repo_url = USER_REPO_LIST.format(user=user)
        single_repo = False
    elif repo != '':  #fullname of repo is provided
        repo_url = REPO_URL.format(full_name=repo)
        single_repo = True
    else:  #fetch public repos
        ans = raw_input('fetch all repos [y/n]? ')
        if ans in ('y', 'Y'):
            count = raw_input('maximum number of repos [{0}]? '.format(DEFAULT_MAX_PUBLIC_REPOS))
            if count == '':
                count = DEFAULT_MAX_PUBLIC_REPOS
            else:
                try:
                    count = int(count)
                except ValueError:
                    print 'Invalid integer, falling back to', DEFAULT_MAX_PUBLIC_REPOS
                    count = DEFAULT_MAX_PUBLIC_REPOS

            repo_url = ALL_REPO_LIST
            single_repo = False
            public_repos = True
        else:
            game_over('no repo/user selected')

    fetcher = Fetcher()
    repo_url = fetcher.get_full_url(repo_url)
    repo_dets = None
    if single_repo:
        repo_dets = fetcher.process_repo(repo_url)
    elif public_repos:
        repo_dets = fetcher.get_public_repos(count)
    else:
        repo_dets = fetcher.process_repo(repo_url, multiple=True)

    if logfile == '': #TODO use reponame if logfile is not present
        logfile = stdout
    else:
        logfile = open(logfile, 'w')

    if type(logfile) is not file:
        game_over('nowhere to log')

    print 'fetched', len(repo_dets), 'repos'

    for i in repo_dets:
        commits = fetcher.extract_commits(i)
        fetcher.write_commits(logfile, i, commits) #TODO write file in JSON

    if logfile != stdout:
        logfile.close()
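For orientation, a direct call of main for a single repository could look like the following; the repository name reuses octocat/Spoon-Knife from the test data above, and the log-file name is a placeholder.

# Hypothetical invocation: fetch the commits of one repository and write them to a CSV log.
main(user='octocat', repo='Spoon-Knife', logfile='spoon-knife-commits.csv')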
Example #10
#!/usr/bin/python3.3
# --------------------------------------------------------
# Copyright (c) 2013 Matthew Pate and Daniel Catalano
# [This program is licensed under the "MIT License"]
# Please see the file COPYING in the source distribution
# of this software for license terms.
# --------------------------------------------------------

# Add the project root directory to the module search path; needed for the next import statement.

from sys import path
path.append('..')
from lib.fetcher import Fetcher

spider = Fetcher()
spider.download_param_grib_range('gfs', 2013082400, 00, 240, 12, 1.0)
spider.download_param_grib_range('gfs', 2013082400, 00, 240, 12, 0.5)
#spider.download_param_grib_range('gfs', 2013080300, 00, 240, 12, 2.5)