Exemple #1
0
 def __init__(self):
     """Populate the handler with a fixed set of example resources.

     ``self.archives`` maps each resource URI to its list of version URIs
     and ``self.dates`` maps the same keys to lists of timestamp strings.
     """
     Handler.__init__(self)
     # Initialization code here. This part is run only once

     # Keys shared by both lookup tables.
     resource_a = 'http://www.example.com/resourceA'
     resource_b = 'http://www.example.com/resourceB'
     resource_space = 'http://www.example.com/resource%20space'

     self.archives = {
         resource_a: [
             'http://www.example.com/resourceA_v1',
             'http://www.example.com/resourceA_v2',
             'http://www.example.com/resourceA_v3',
         ],
         resource_b: [
             'http://www.example.com/resourceB_v1',
             'http://www.example.com/resourceB_v2',
         ],
         resource_space: ['http://www.example.com/space'],
     }
     self.dates = {
         resource_a: [
             '1999-09-30T01:50:50Z',
             '2010-10-16T13:27:27Z',
             '2015-01-03T22:00:00Z',
         ],
         resource_b: ['1998-07-17T17:47:31Z', '2000-11-08T19:05:09Z'],
         resource_space: ['1970-01-01T00:00:00Z'],
     }
Exemple #2
0
    def __init__(self):
        """Initialise the timestamp format and the in-memory cache."""
        Handler.__init__(self)

        # Storing first mementos
        # NOTE(review): max_inner_cache_size is presumably the entry limit
        # for inner_cache -- confirm against the code that fills the cache.
        self.max_inner_cache_size = 100000
        self.inner_cache = {}

        # strftime-style pattern: year through second, no separators.
        self.TIMESTAMPFMT = "%Y%m%d%H%M%S"
 def __init__(self, base_uri='http://www.example.com/'):
     """Populate the handler with example resources rooted at *base_uri*.

     :param base_uri: Value substituted at the front of every resource
         and version URI built here.
     """
     Handler.__init__(self)
     # Initialization code here. This part is run only once

     def expand(*templates):
         # Fill the single '{0}' placeholder of each template with base_uri.
         return [template.format(base_uri) for template in templates]

     # Keys shared by both lookup tables.
     resource_a = '{0}resourceA'.format(base_uri)
     resource_b = '{0}resourceB'.format(base_uri)
     resource_space = '{0}resource%20space'.format(base_uri)

     self.archives = {
         resource_a: expand(
             '{0}resourceA_v1',
             '{0}resourceA_v2',
             '{0}resourceA_v3',
         ),
         resource_b: expand('{0}resourceB_v1', '{0}resourceB_v2'),
         resource_space: expand('{0}space'),
     }
     self.dates = {
         resource_a: [
             '1999-09-30T01:50:50Z',
             '2010-10-16T13:27:27Z',
             '2015-01-03T22:00:00Z',
         ],
         resource_b: ['1998-07-17T17:47:31Z', '2000-11-08T19:05:09Z'],
         resource_space: ['1970-01-01T00:00:00Z'],
     }
Exemple #4
0
    def __init__(self):
        """Set the timestamp pattern and create the empty memento cache."""
        Handler.__init__(self)

        # Storing first mementos
        self.inner_cache = dict()
        self.max_inner_cache_size = 100000

        # strftime-style pattern (YYYYMMDDhhmmss).
        self.TIMESTAMPFMT = '%Y%m%d%H%M%S'
Exemple #5
0
    def __init__(self):
        """Configure the archive base URI and the anchor-matching regex.

        ``Handler.__init__`` runs last, after both attributes are set.
        """
        # Matches anchor tags that carry a SetAnchorDate('...') onclick
        # handler and an href starting with the archive's address.
        self.uriRegex = re.compile(
            r'<a onclick="SetAnchorDate\(\'.*\'\);" href="http://83.212.204.92:8080/[\S]*">'
        )
        self.baseuri = "http://83.212.204.92:8080/*/"

        Handler.__init__(self)
Exemple #6
0
    def __init__(self):
        """Store the archive query prefix and compile the link regex.

        The base ``Handler`` initialiser is invoked only after the
        attributes below have been assigned.
        """
        self.baseuri = "http://83.212.204.92:8080/*/"

        # Pattern for <a> tags with a SetAnchorDate('...') onclick attribute
        # whose href points at the same host and port as baseuri.
        anchor_pattern = r'<a onclick="SetAnchorDate\(\'.*\'\);" href="http://83.212.204.92:8080/[\S]*">'
        self.uriRegex = re.compile(anchor_pattern)

        Handler.__init__(self)
Exemple #7
0
    def __init__(self):
        """Set the W3C API URL template and the spec-name regex."""
        Handler.__init__(self)

        # Local fields
        # Template with two %s placeholders: one in the 'specifications/'
        # path segment and one for the 'apikey' query parameter.
        self.api_url = 'https://api.w3.org/specifications/%s/versions?_format=json&apikey=%s&embed=1'

        # Raw string: '\/' is not a valid escape sequence in a plain string
        # literal and triggers an invalid-escape warning on recent Pythons.
        # The compiled pattern is unchanged.
        # NOTE(review): the dots in 'www.' and 'w3.org' are unescaped and so
        # match any character -- confirm whether that is intended.
        self.re_spec_name = re.compile(
            r"https?:\/\/(www.)?w3.org\/TR\/(.*)", re.IGNORECASE)
Exemple #8
0
    def __init__(self):
        """Define the collection identifiers and the URI regex.

        ``self.datere`` has one capture group: the run of digits that
        follows the first path segment of a webarchive.loc.gov URI.
        """
        Handler.__init__(self)

        self.colls = [
            'lcwa0001', 'lcwa0002', 'lcwa0003', 'lcwa0004', 'lcwa0005',
            'lcwa0006', 'lcwa0007', 'lcwa0008', 'lcwa0009', 'lcwa0010',
            'lcwa0011', 'lcwa0012', 'lcwa0013', 'lcwa0014', 'lcwa0015',
            'lcwa0016', 'lcwa0017', 'lcwa0018', 'lcwa0019', 'lcwa0020',
            'lcwa0029', 'lcwa0031', 'lcwa0032', 'lcwa0033', 'lcwa0037',
        ]
        self.datere = re.compile(
            'http://webarchive.loc.gov/[a-zA-Z0-9]+/([0-9]+)/.+')
Exemple #9
0
    def __init__(self):
        """Populate the handler with two hard-coded example resources.

        ``self.archives`` maps each resource URI to its version URIs and
        ``self.dates`` maps the same keys to timestamp strings.
        """
        Handler.__init__(self)
        # Initialization code here. This part is run only once

        resource_a = 'http://www.example.com/resourceA'
        resource_b = 'http://www.example.com/resourceB'

        # (version URI, timestamp) rows, in list order.
        # NOTE(review): presumably each timestamp belongs to the version at
        # the same index -- confirm against the code that reads these tables.
        history_a = [
            ('http://www.example.com/resourceA_v1', '1999-09-30T01:50:50Z'),
            ('http://www.example.com/resourceA_v2', '2010-10-16T13:27:27Z'),
            ('http://www.example.com/resourceA_v3', '2015-01-03T22:00:00Z'),
        ]
        history_b = [
            ('http://www.example.com/resourceB_v1', '1998-07-17T17:47:31Z'),
            ('http://www.example.com/resourceB_v2', '2000-11-08T19:05:09Z'),
        ]

        self.archives = {
            resource_a: [version for version, _ in history_a],
            resource_b: [version for version, _ in history_b],
        }
        self.dates = {
            resource_a: [stamp for _, stamp in history_a],
            resource_b: [stamp for _, stamp in history_b],
        }
Exemple #10
0
    def __init__(self):
        """Set the arXiv OAI endpoint and compile the resource URI regex."""
        Handler.__init__(self)

        self.api_base = 'http://export.arxiv.org/oai2'

        # Resources
        # Groups: site root, 'pdf' or 'abs', the numeric identifier, and
        # whatever trails it (? params, vX version, ...), which is ignored.
        arxiv_uri = r'(http://arxiv.org)/((?:pdf)|(?:abs))/(\d+\.\d+)(.*)'
        self.rex = re.compile(arxiv_uri)
Exemple #11
0
    def __init__(self):
        """Prepare the regex for arXiv pdf/abs URIs and the OAI base URL."""
        Handler.__init__(self)

        # Resources
        # Ignores all that trails the identifier (? params, vX version,...info)
        pattern = r'(http://arxiv.org)/((?:pdf)|(?:abs))/(\d+\.\d+)(.*)'
        self.rex = re.compile(pattern)

        # Base URL of the OAI interface.
        self.api_base = 'http://export.arxiv.org/oai2'
Exemple #12
0
    def __init__(self):
        """Configure the W3C versions API template and the /TR/ URI regex."""
        Handler.__init__(self)

        # Local fields
        # The two %s placeholders sit in the 'specifications/' path segment
        # and in the 'apikey' query parameter.
        self.api_url = 'https://api.w3.org/specifications/%s/versions?_format=json&apikey=%s&embed=1'

        # A raw string is required here: '\/' in a plain literal is an
        # invalid escape sequence and triggers a warning on recent Pythons.
        # The pattern text handed to ``re`` is exactly the same as before.
        # NOTE(review): '.' in 'www.' and 'w3.org' is unescaped (matches any
        # character) -- confirm whether that looseness is intended.
        self.re_spec_name = re.compile(r"https?:\/\/(www.)?w3.org\/TR\/(.*)",
                                       re.IGNORECASE)
Exemple #13
0
    def __init__(self):
        """Record the wiki host names this handler is configured with."""
        Handler.__init__(self)

        known_hosts = (
            'www.wowwiki.com',
            'en.memory-alpha.org',
            'wiki.ffxiclopedia.org',
            'www.jedipedia.de',
        )
        self.hosts = list(known_hosts)
Exemple #14
0
 def __init__(self):
     """Open the MySQL connection and cursor, then configure logging."""
     Handler.__init__(self)

     # NOTE(review): connection credentials are hard-coded here; consider
     # moving them to configuration.
     connection_settings = {
         'host': "localhost",
         'user': "******",
         'passwd': "password",
         'db': "amber_timegate",
     }
     self.db = MySQLdb.connect(**connection_settings)
     self.cursor = self.db.cursor()

     # Route the root logger's output to a file.
     logging.basicConfig(filename='/tmp/temp.log')
Exemple #15
0
    def __init__(self):
        """Build the list of collection names for webharvest.gov.

        ``self.collections`` starts with ``"peth04"`` followed by one
        ``"congress<N>th"`` entry per two-year step from 2006 up to (but not
        including) the current UTC year, with N starting at 109.
        """
        Handler.__init__(self)
        self.baseuri = "http://webharvest.gov/"

        first_congress = 109
        first_year = 2006
        current_year = datetime.utcnow().year

        # Number of two-year steps in [first_year, current_year).
        congress_count = len(range(first_year, current_year, 2))

        self.collections = ["peth04"]
        self.collections.extend([
            "congress%sth" % (first_congress + offset)
            for offset in range(congress_count)
        ])
Exemple #16
0
    def __init__(self):
        """Compile the loc.gov URI regex and list the collection ids."""
        Handler.__init__(self)

        # One capture group: the digits after the first path segment.
        uri_pattern = 'http://webarchive.loc.gov/[a-zA-Z0-9]+/([0-9]+)/.+'
        self.datere = re.compile(uri_pattern)

        # Contiguous block 0001-0020 ...
        numbered_block = [
            'lcwa0001', 'lcwa0002', 'lcwa0003', 'lcwa0004',
            'lcwa0005', 'lcwa0006', 'lcwa0007', 'lcwa0008',
            'lcwa0009', 'lcwa0010', 'lcwa0011', 'lcwa0012',
            'lcwa0013', 'lcwa0014', 'lcwa0015', 'lcwa0016',
            'lcwa0017', 'lcwa0018', 'lcwa0019', 'lcwa0020',
        ]
        # ... followed by the non-contiguous ids.
        scattered = ['lcwa0029', 'lcwa0031', 'lcwa0032', 'lcwa0033',
                     'lcwa0037']
        self.colls = numbered_block + scattered
Exemple #17
0
    def __init__(self):
        """Set the GitHub resource patterns, API root and regexes."""
        Handler.__init__(self)

        # Local fields
        self.api = 'https://api.github.com'

        # Mandatory fields
        self.resources = [
            'https://github.com/.+',
            'https://raw.githubusercontent.com/',
        ]

        # Precompiles regular expressions
        # Verbose-mode pattern: whitespace and '#' comments inside the
        # literal are ignored by the regex engine.
        uri_r_pattern = """  # The format of URI-Rs
                              (https://)  # protocol
                              ((?:raw.githubusercontent|github).com/)  # base
                              ([^/]+)/  # user
                              ([^/]+)  # repo
                              (/.*)?  # optional path
                              """
        self.rex = re.compile(uri_r_pattern, re.VERBOSE)

        # The regex for the query continuation header
        next_link_pattern = '<(.+?)>; rel="next"'
        self.header_rex = re.compile(next_link_pattern)

        # The regex for files
        self.file_rex = re.compile('(/blob)?/master')
Exemple #18
0
    def __init__(self):
        """Fetch the list of sites exposed by the PastPages API.

        Follows the paginated ``/api/beta/sites/`` listing and stores one
        ``(url, slug)`` tuple per returned object in ``self.pages_list``.
        Any exception raised while doing so is logged at CRITICAL level and
        re-raised.
        """
        Handler.__init__(self)
        self.LIMIT_MAX = 100
        self.BASE = 'http://www.pastpages.org'
        self.API_TIMEFMT = '%Y-%m-%dT%H:%M:%S'
        self.FIRST_DATE = datetime(2012, 4, 27).strftime(self.API_TIMEFMT)

        # Building pages list of ('uri', 'slug') pairs
        self.pages_list = []

        try:
            next_path = '/api/beta/sites/'
            # Only the first request carries explicit parameters.
            query = {'limit': self.LIMIT_MAX}

            # Each response has a non null 'meta.next' value if it has a
            # continuation; stop once it is None.
            while next_path is not None:
                response = self.request(self.BASE + next_path, params=query)
                payload = response.json()

                # 'objects' is the list of responses; 'url' and 'slug' are
                # the URI and the website's short name respectively.
                page_pairs = [
                    (site['url'], site['slug'])
                    for site in payload['objects']
                ]
                self.pages_list.extend(page_pairs)

                next_path = payload['meta']['next']
                # the 'next' link already contains &limit and &offset
                query = None

        except Exception as e:
            logging.critical("Cannot create the handler's page list:")
            raise e

        site_count = len(self.pages_list)
        logging.info("Found %s websites on pastpages' API." % site_count)
Exemple #19
0
    def __init__(self):
        """Define the GitHub URI patterns and API root for this handler."""
        Handler.__init__(self)

        # Precompiles regular expressions
        self.file_rex = re.compile('(/blob)?/master')  # The regex for files
        # The regex for the query continuation header
        self.header_rex = re.compile('<(.+?)>; rel="next"')
        # Compiled in verbose mode, so the whitespace, newlines and inline
        # '#' comments inside the literal are not part of the match.
        self.rex = re.compile(
            """  # The format of URI-Rs
                              (https://)  # protocol
                              ((?:raw.githubusercontent|github).com/)  # base
                              ([^/]+)/  # user
                              ([^/]+)  # repo
                              (/.*)?  # optional path
                              """,
            re.X,
        )

        # Mandatory fields
        self.resources = ['https://github.com/.+',
                          'https://raw.githubusercontent.com/']

        # Local fields
        self.api = 'https://api.github.com'
Exemple #20
0
    def __init__(self):
        """Build ``self.pages_list`` from the PastPages sites listing.

        Requests ``/api/beta/sites/`` page by page, following each
        response's ``meta.next`` link, and collects ``(url, slug)`` tuples.
        Failures are logged at CRITICAL level and the exception is re-raised.
        """
        Handler.__init__(self)
        self.LIMIT_MAX = 100
        self.BASE = 'http://www.pastpages.org'
        self.API_TIMEFMT = '%Y-%m-%dT%H:%M:%S'
        first_day = datetime(2012, 4, 27)
        self.FIRST_DATE = first_day.strftime(self.API_TIMEFMT)

        # Building pages list of ('uri', 'slug') pairs
        self.pages_list = []

        try:
            request = '/api/beta/sites/'
            params = {'limit': self.LIMIT_MAX}

            # Keep going while there are still result pages
            while True:
                page = self.request(self.BASE + request, params=params).json()

                # 'objects' is the list of responses
                # 'objects.url' and 'objects.slug' are the URI and the
                # website's short name respectively
                self.pages_list.extend(
                    [(entry['url'], entry['slug'])
                     for entry in page['objects']])

                request = page['meta']['next']
                params = None  # the request already contains &limit and &offset
                # A null 'meta.next' means there is no continuation
                if request is None:
                    break

        except Exception as e:
            logging.critical("Cannot create the handler's page list:")
            raise e

        found = len(self.pages_list)
        logging.info("Found %s websites on pastpages' API." % found)
Exemple #21
0
    def __init__(self):
        """Set the GitLab resource pattern, API settings and regexes."""
        Handler.__init__(self)

        # Local fields
        # TODO: move to config file
        self.api = 'https://gitlab.ub.uni-bielefeld.de/api/v3'
        # NOTE(review): API key is committed in source.
        # TODO: move to config file
        self.apikey = 'VqeqaShAw4GWVc3dp7--'

        # Mandatory fields
        # TODO: move to config file
        self.resources = ['https://gitlab.ub.uni-bielefeld.de/.+']

        # Precompiles regular expressions
        # TODO: generalize for URLs with numeric project ID instead of
        # user/repo!!!
        # Verbose-mode pattern: whitespace and '#' comments inside the
        # literal are ignored by the regex engine.
        uri_r_pattern = """  # The format of URI-Rs
                              (https://)  # protocol
                              ([^/]+)/  # base
                              ([^/]+)/  # user
                              ([^/]+)  # repo
                              (/.*)?  # optional path
                              """
        self.rex = re.compile(uri_r_pattern, re.VERBOSE)

        # The regex for the query continuation header
        self.header_rex = re.compile('<(.+?)>; rel="next"')
        # The regex for files
        self.file_rex = re.compile('(/blob)?/master')
Exemple #22
0
    def __init__(self, base_uri='http://www.example.com/'):
        """Populate the example resources using *base_uri* as URI prefix.

        :param base_uri: Value formatted in front of every resource and
            version name below.
        """
        Handler.__init__(self)
        # Initialization code here. This part is run only once

        def expand(suffix):
            # Same text as '{0}<suffix>'.format(base_uri).
            return '{0}{1}'.format(base_uri, suffix)

        # resource name -> (version name, timestamp) rows, in list order.
        # NOTE(review): presumably each timestamp belongs to the version at
        # the same index -- confirm against the code that reads these tables.
        histories = [
            ('resourceA', [
                ('resourceA_v1', '1999-09-30T01:50:50Z'),
                ('resourceA_v2', '2010-10-16T13:27:27Z'),
                ('resourceA_v3', '2015-01-03T22:00:00Z'),
            ]),
            ('resourceB', [
                ('resourceB_v1', '1998-07-17T17:47:31Z'),
                ('resourceB_v2', '2000-11-08T19:05:09Z'),
            ]),
            ('resource%20space', [
                ('space', '1970-01-01T00:00:00Z'),
            ]),
        ]

        self.archives = {}
        self.dates = {}
        for resource, rows in histories:
            key = expand(resource)
            self.archives[key] = [expand(version) for version, _ in rows]
            self.dates[key] = [stamp for _, stamp in rows]
Exemple #23
0
 def __init__(self):
     """Set the wayback query prefix and compile the anchor regex.

     ``Handler.__init__`` is called last, once both attributes exist.
     """
     # Matches <a> tags with a SetAnchorDate('...') onclick attribute and
     # an href that starts with the wayback address.
     self.uriRegex = re.compile(
         r'<a onclick="SetAnchorDate\(\'.*\'\);" href="http://nukrobi2.nuk.uni-lj.si:8080/wayback/[\S]*">'
     )
     self.baseuri = "http://nukrobi2.nuk.uni-lj.si:8080/wayback/*/"

     Handler.__init__(self)
Exemple #24
0
 def __init__(self):
     """Run the base ``Handler`` initialiser; no extra state is added."""
     Handler.__init__(self)
Exemple #25
0
 def __init__(self):
     """Compile the regex that picks apart SetAnchorDate anchor tags.

     Group 1 is the quoted argument of ``SetAnchorDate(...)`` and group 2
     is the ``href`` value.
     """
     Handler.__init__(self)
     self.uriRegex = re.compile(
         r'<a onclick="SetAnchorDate\(\'(.*)\'\);" href="(.*)">')
Exemple #26
0
 def __init__(self):
     """Delegate entirely to ``Handler.__init__``."""
     Handler.__init__(self)
Exemple #27
0
 def __init__(self):
     """Prepare ``self.uriRegex`` for SetAnchorDate anchor tags."""
     Handler.__init__(self)

     # Two capture groups: the SetAnchorDate('...') argument, then the href.
     anchor_pattern = r'<a onclick="SetAnchorDate\(\'(.*)\'\);" href="(.*)">'
     self.uriRegex = re.compile(anchor_pattern)
Exemple #28
0
 def __init__(self):
     """Set the archive query prefix and the regex for archived URIs.

     ``self.dtre`` captures the run of digits that follows
     ``/webarchives/`` in a collectionscanada.gc.ca URI.
     """
     Handler.__init__(self)
     self.baseuri = "http://www.collectionscanada.gc.ca/webarchives/*/"
     # Raw string: "\d" in a plain literal is an invalid escape sequence
     # and triggers a warning on recent Pythons. The pattern is unchanged.
     self.dtre = re.compile(
         r"http://www.collectionscanada.gc.ca/webarchives/(\d+)/")
Exemple #29
0
 def __init__(self):
     """Record the arquivo.pt xmlquery endpoint in ``self.baseuri``."""
     Handler.__init__(self)

     xmlquery_endpoint = "http://arquivo.pt/wayback/wayback/xmlquery"
     self.baseuri = xmlquery_endpoint
Exemple #30
0
 def __init__(self):
     """Store the wayback prefix and the compiled link pattern.

     The base ``Handler`` initialiser runs after the attributes are set.
     """
     self.baseuri = "http://nukrobi2.nuk.uni-lj.si:8080/wayback/*/"

     # Pattern for anchors with a SetAnchorDate('...') onclick attribute
     # whose href begins with the wayback address.
     anchor_pattern = r'<a onclick="SetAnchorDate\(\'.*\'\);" href="http://nukrobi2.nuk.uni-lj.si:8080/wayback/[\S]*">'
     self.uriRegex = re.compile(anchor_pattern)

     Handler.__init__(self)
Exemple #31
0
 def __init__(self):
     """Set the strftime-style timestamp pattern used by this handler."""
     Handler.__init__(self)

     # Year, month, day, hour, minute, second -- no separators.
     self.TIMESTAMPFMT = '%Y%m%d%H%M%S'
Exemple #32
0
 def __init__(self):
     """Install a cookie-aware opener as urllib2's default opener."""
     Handler.__init__(self)

     cookie_jar = cookielib.LWPCookieJar()
     cookie_processor = urllib2.HTTPCookieProcessor(cookie_jar)
     urllib2.install_opener(urllib2.build_opener(cookie_processor))
Exemple #33
0
 def __init__(self):
     """Configure the base URI and the archived-URI regex.

     The single capture group of ``self.dtre`` is the digit sequence
     directly after ``/webarchives/``.
     """
     Handler.__init__(self)
     self.baseuri = "http://www.collectionscanada.gc.ca/webarchives/*/"
     # Use a raw string so that "\d" reaches the regex engine without
     # going through an invalid string escape, which triggers a warning on
     # recent Pythons. The compiled pattern is identical.
     self.dtre = re.compile(
         r"http://www.collectionscanada.gc.ca/webarchives/(\d+)/")
Exemple #34
0
 def __init__(self):
     """Set the timestamp pattern and the host list for this handler."""
     Handler.__init__(self)

     # NOTE(review): the leading dot suggests a domain suffix -- confirm how
     # callers match against these entries.
     self.hosts = [".orain.org"]
     # strftime-style pattern (YYYYMMDDhhmmss).
     self.TIMESTAMPFMT = '%Y%m%d%H%M%S'
Exemple #35
0
 def __init__(self):
     """Initialise the timestamp format and the list of hosts."""
     Handler.__init__(self)

     # Year through second with no separators.
     timestamp_format = '%Y%m%d%H%M%S'
     self.TIMESTAMPFMT = timestamp_format

     orain_hosts = (".orain.org",)
     self.hosts = list(orain_hosts)