Ejemplo n.º 1
0
    def __init__(self):
        """Configure the handler for the eresources.nlb.gov.sg Wayback archive.

        Sets ``baseuri`` and the compiled ``uriRegex`` first, then runs
        ``Handler.__init__`` (this ordering is kept as found).
        """
        # An earlier base URI, http://was.nl.sg/wayback/*/, is no longer used.
        self.baseuri = "http://eresources.nlb.gov.sg/webarchives/wayback/*/"

        # Matches anchor tags whose onclick calls SetAnchorDate('...') and whose
        # href points below this archive's wayback path.
        self.uriRegex = re.compile(
            r'<a onclick="SetAnchorDate\(\'.*\'\);" href="http://eresources.nlb.gov.sg/webarchives/wayback/[\S]*">'
        )
        Handler.__init__(self)
Ejemplo n.º 2
0
    def __init__(self):
        """Configure the handler for the www.padi.cat:8080 Wayback archive.

        Sets ``baseuri`` and the compiled ``uriRegex`` first, then runs
        ``Handler.__init__`` (this ordering is kept as found).
        """
        self.baseuri = "http://www.padi.cat:8080/wayback/*/"

        # Matches anchor tags whose onclick calls SetAnchorDate('...') and whose
        # href points below this archive's wayback path.
        self.uriRegex = re.compile(
            r'<a onclick="SetAnchorDate\(\'.*\'\);" href="http://www.padi.cat:8080/wayback/[\S]*">'
        )
        Handler.__init__(self)
Ejemplo n.º 3
0
    def __init__(self):
        """Initialise the handler with two hard-coded example resources.

        Runs once at construction. Fills ``self.archives`` (resource URI ->
        list of version URIs) and ``self.dates`` (resource URI -> list of
        ISO 8601 UTC timestamps).
        """
        Handler.__init__(self)

        resource_a = 'http://www.example.com/resourceA'
        resource_b = 'http://www.example.com/resourceB'

        # Version URIs known for each example resource.
        self.archives = {
            resource_a: [
                'http://www.example.com/resourceA_v1',
                'http://www.example.com/resourceA_v2',
                'http://www.example.com/resourceA_v3',
            ],
            resource_b: [
                'http://www.example.com/resourceB_v1',
                'http://www.example.com/resourceB_v2',
            ],
        }

        # Timestamps for each example resource; each list has the same length
        # as the matching list in self.archives (presumably index-aligned --
        # confirm against the lookup code).
        self.dates = {
            resource_a: [
                '1999-09-30T01:50:50Z',
                '2010-10-16T13:27:27Z',
                '2015-01-03T22:00:00Z',
            ],
            resource_b: [
                '1998-07-17T17:47:31Z',
                '2000-11-08T19:05:09Z',
            ],
        }
Ejemplo n.º 4
0
    def __init__(self):
        """Initialise the handler's timestamp format and its in-memory cache."""
        Handler.__init__(self)

        # Cache for first mementos, starting empty, with its size limit.
        self.inner_cache = {}
        self.max_inner_cache_size = 100000

        # strftime/strptime layout: 14 digits, year down to second.
        self.TIMESTAMPFMT = '%Y%m%d%H%M%S'
Ejemplo n.º 5
0
    def __init__(self):
        """Initialise the webarchive.loc.gov handler.

        Sets ``self.datere``, which captures the numeric path segment that
        follows the collection id in an archive URI, and ``self.colls``, the
        list of collection ids.
        """
        Handler.__init__(self)

        self.datere = re.compile(r'http://webarchive.loc.gov/[a-zA-Z0-9]+/([0-9]+)/.+')

        # Collection ids are 'lcwa' + a zero-padded 4-digit number:
        # 0001 through 0020, followed by a few non-contiguous ones.
        collection_numbers = list(range(1, 21)) + [29, 31, 32, 33, 37]
        self.colls = ['lcwa%04d' % number for number in collection_numbers]
Ejemplo n.º 6
0
    def __init__(self):
        """Set up the www.padi.cat:8080 Wayback handler.

        ``baseuri`` and ``uriRegex`` are assigned before ``Handler.__init__``
        is invoked; that order is kept as found.
        """
        self.baseuri = "http://www.padi.cat:8080/wayback/*/"

        # Pattern for anchors that call SetAnchorDate('...') on click and link
        # to a URI under this archive's wayback path.
        anchor_pattern = r'<a onclick="SetAnchorDate\(\'.*\'\);" href="http://www.padi.cat:8080/wayback/[\S]*">'
        self.uriRegex = re.compile(anchor_pattern)

        Handler.__init__(self)
Ejemplo n.º 7
0
    def __init__(self):
        """Set up the eresources.nlb.gov.sg Wayback handler.

        ``baseuri`` and ``uriRegex`` are assigned before ``Handler.__init__``
        is invoked; that order is kept as found.
        """
        # An earlier base URI, http://was.nl.sg/wayback/*/, is no longer used.
        self.baseuri = "http://eresources.nlb.gov.sg/webarchives/wayback/*/"

        # Pattern for anchors that call SetAnchorDate('...') on click and link
        # to a URI under this archive's wayback path.
        anchor_pattern = r'<a onclick="SetAnchorDate\(\'.*\'\);" href="http://eresources.nlb.gov.sg/webarchives/wayback/[\S]*">'
        self.uriRegex = re.compile(anchor_pattern)

        Handler.__init__(self)
Ejemplo n.º 8
0
    def __init__(self):
        """Initialise the arXiv handler: OAI endpoint and URI-matching regex."""
        Handler.__init__(self)

        self.api_base = 'http://export.arxiv.org/oai2'

        # Capture groups: (1) the http://arxiv.org prefix, (2) 'pdf' or 'abs',
        # (3) the numeric identifier, (4) whatever trails the identifier
        # (query parameters, vX version suffix, ...), which is ignored.
        self.rex = re.compile(r'(http://arxiv.org)/((?:pdf)|(?:abs))/(\d+\.\d+)(.*)')
Ejemplo n.º 9
0
    def __init__(self):
        """Initialise the handler and record the wiki host names it knows."""
        Handler.__init__(self)

        # Host names stored on the instance as ``self.hosts``.
        known_hosts = [
            'www.wowwiki.com',
            'en.memory-alpha.org',
            'wiki.ffxiclopedia.org',
            'www.jedipedia.de',
        ]
        self.hosts = known_hosts
Ejemplo n.º 10
0
    def __init__(self):
        """Initialise the webharvest.gov handler and its collection names.

        ``self.collections`` starts with ``"peth04"`` and gains one
        ``"congress<N>th"`` entry per two-year step from 2006 up to (but not
        including) the current year, with N counting up from 109.
        """
        Handler.__init__(self)
        self.baseuri = "http://webharvest.gov/"

        first_congress = 109
        first_year = 2006
        current_year = now().year

        # Number of two-year steps in [first_year, current_year).
        congress_count = len(range(first_year, current_year, 2))

        self.collections = ["peth04"] + [
            "congress%sth" % number
            for number in range(first_congress, first_congress + congress_count)
        ]
Ejemplo n.º 11
0
    def __init__(self):
        """Initialise the GitHub handler: resource patterns, API root, regexes."""
        Handler.__init__(self)

        # Mandatory fields
        self.resources = [
            'https://github.com/.+',
            'https://raw.githubusercontent.com/',
        ]

        # Local fields
        self.api = 'https://api.github.com'

        # Verbose-mode pattern (re.X ignores the whitespace, newlines and
        # '#' comments inside it).
        uri_r_pattern = """  # The format of URI-Rs
                              (https://)  # protocol
                              ((?:raw.githubusercontent|github).com/)  # base
                              ([^/]+)/  # user
                              ([^/]+)  # repo
                              (/.*)?  # optional path
                              """
        self.rex = re.compile(uri_r_pattern, re.X)

        # Captures the URI marked rel="next" in the query continuation header.
        self.header_rex = re.compile('<(.+?)>; rel="next"')

        # Matches the '/master' path part of files, with an optional '/blob'.
        self.file_rex = re.compile('(/blob)?/master')
Ejemplo n.º 12
0
    def __init__(self):
        """Initialise the GitLab (ub.uni-bielefeld.de) handler.

        Sets the resource patterns, the API root and key, and the
        precompiled regular expressions.
        """
        Handler.__init__(self)

        # Mandatory fields
        self.resources = ['https://gitlab.ub.uni-bielefeld.de/.+']  # TODO: move to config file

        # Local fields
        self.api = 'https://gitlab.ub.uni-bielefeld.de/api/v3'  # TODO: move to config file
        # NOTE(review): this credential is committed to source control; it
        # should be rotated and loaded from configuration instead.
        self.apikey = 'VqeqaShAw4GWVc3dp7--'  # TODO: move to config file

        # TODO: generalize for URLs with numeric project ID instead of user/repo!!!
        # Verbose-mode pattern (re.X ignores the whitespace, newlines and
        # '#' comments inside it).
        uri_r_pattern = """  # The format of URI-Rs
                              (https://)  # protocol
                              ([^/]+)/  # base
                              ([^/]+)/  # user
                              ([^/]+)  # repo
                              (/.*)?  # optional path
                              """
        self.rex = re.compile(uri_r_pattern, re.X)

        # Captures the URI marked rel="next" in the query continuation header.
        self.header_rex = re.compile('<(.+?)>; rel="next"')

        # Matches the '/master' path part of files, with an optional '/blob'.
        self.file_rex = re.compile('(/blob)?/master')
Ejemplo n.º 13
0
    def __init__(self):
        """Initialise the PastPages handler and download its list of sites.

        Walks the paginated ``/api/beta/sites/`` endpoint and stores one
        ``(url, slug)`` pair per site in ``self.pages_list``.

        Raises:
            Exception: whatever the request or the JSON handling raised; it
                is logged at critical level and re-raised unchanged.
        """
        Handler.__init__(self)
        self.LIMIT_MAX = 100
        self.BASE = 'http://www.pastpages.org'
        self.API_TIMEFMT = '%Y-%m-%dT%H:%M:%S'
        # The month is written as a plain decimal: a leading-zero literal such
        # as ``04`` is octal in Python 2 and a syntax error in Python 3.
        self.FIRST_DATE = datetime(2012, 4, 27).strftime(self.API_TIMEFMT)

        # Building pages list of ('uri', 'slug') pairs
        self.pages_list = []

        try:
            params = {'limit': self.LIMIT_MAX}
            request = '/api/beta/sites/'

            # Each response has a non-null 'meta.next' value while there are
            # further result pages.
            while request is not None:
                json_response = self.request(self.BASE + request, params=params).json()

                # 'objects' is the list of responses; 'url' and 'slug' are the
                # URI and the website's short name respectively.
                self.pages_list.extend([
                    (obj['url'], obj['slug'])
                    for obj in json_response['objects']
                ])

                request = json_response['meta']['next']
                params = None  # the next request already contains &limit and &offset

        except Exception:
            logging.critical("Cannot create the handler's page list:")
            # Bare raise keeps the original traceback intact.
            raise

        logging.info("Found %s websites on pastpages' API.", len(self.pages_list))
Ejemplo n.º 14
0
 def __init__(self):
     """Initialise the handler and set the arquivo.pt XML query endpoint."""
     Handler.__init__(self)
     # URI of the archive's ``xmlquery`` endpoint.
     self.baseuri = "http://arquivo.pt/wayback/wayback/xmlquery"
Ejemplo n.º 15
0
 def __init__(self):
     """Initialise the handler and compile the anchor-scraping regex.

     ``self.uriRegex`` has two capture groups: the argument passed to
     ``SetAnchorDate('...')`` and the anchor's ``href`` value.
     """
     Handler.__init__(self)
     self.uriRegex = re.compile(
         r'<a onclick="SetAnchorDate\(\'(.*)\'\);" href="(.*)">'
     )
Ejemplo n.º 16
0
 def __init__(self):
     """Initialise the handler and install a cookie-aware urllib2 opener.

     The opener is installed with ``urllib2.install_opener`` and is backed
     by a fresh ``LWPCookieJar``.
     """
     Handler.__init__(self)
     cookie_processor = urllib2.HTTPCookieProcessor(cookielib.LWPCookieJar())
     urllib2.install_opener(urllib2.build_opener(cookie_processor))
Ejemplo n.º 17
0
 def __init__(self):
     """Configure the handler for the nukrobi2.nuk.uni-lj.si Wayback archive.

     Sets ``baseuri`` and the compiled ``uriRegex`` first, then runs
     ``Handler.__init__`` (this ordering is kept as found).
     """
     self.baseuri = "http://nukrobi2.nuk.uni-lj.si:8080/wayback/*/"
     # Matches anchor tags whose onclick calls SetAnchorDate('...') and whose
     # href points below this archive's wayback path.
     self.uriRegex = re.compile(
         r'<a onclick="SetAnchorDate\(\'.*\'\);" href="http://nukrobi2.nuk.uni-lj.si:8080/wayback/[\S]*">'
     )
     Handler.__init__(self)
Ejemplo n.º 18
0
 def __init__(self):
     """Initialise the handler; only the base ``Handler`` set-up is run."""
     Handler.__init__(self)
Ejemplo n.º 19
0
 def __init__(self):
     """Initialise the collectionscanada.gc.ca web archive handler.

     Sets ``self.baseuri`` and ``self.dtre``, a regex capturing the run of
     digits that follows ``/webarchives/`` in an archive URI.
     """
     Handler.__init__(self)
     self.baseuri = "http://www.collectionscanada.gc.ca/webarchives/*/"
     # Raw string so that ``\d`` reaches the regex engine as written; in a
     # plain string it is an invalid escape sequence that newer Python
     # versions warn about. The compiled pattern is unchanged.
     self.dtre = re.compile(r"http://www.collectionscanada.gc.ca/webarchives/(\d+)/")
Ejemplo n.º 20
0
 def __init__(self):
     """Initialise the handler and set its timestamp format."""
     Handler.__init__(self)
     # strftime/strptime layout: 14 digits, year down to second.
     self.TIMESTAMPFMT = '%Y%m%d%H%M%S'