def __init__(self):
    """Set up the handler with a fixed, in-memory table of example resources.

    Fills ``self.archives`` (resource URI -> list of version URIs) and
    ``self.dates`` (resource URI -> list of timestamp strings). Both
    mappings use the same keys, and for each key the two lists have the
    same length.
    """
    Handler.__init__(self)

    # Everything below runs once, when the handler is instantiated.
    resource_a = 'http://www.example.com/resourceA'
    resource_b = 'http://www.example.com/resourceB'
    resource_space = 'http://www.example.com/resource%20space'

    self.archives = {
        resource_a: [
            'http://www.example.com/resourceA_v1',
            'http://www.example.com/resourceA_v2',
            'http://www.example.com/resourceA_v3',
        ],
        resource_b: [
            'http://www.example.com/resourceB_v1',
            'http://www.example.com/resourceB_v2',
        ],
        resource_space: [
            'http://www.example.com/space',
        ],
    }
    self.dates = {
        resource_a: [
            '1999-09-30T01:50:50Z',
            '2010-10-16T13:27:27Z',
            '2015-01-03T22:00:00Z',
        ],
        resource_b: [
            '1998-07-17T17:47:31Z',
            '2000-11-08T19:05:09Z',
        ],
        resource_space: [
            '1970-01-01T00:00:00Z',
        ],
    }
def __init__(self):
    """Initialize the handler's timestamp format and its inner cache.

    Sets:
        TIMESTAMPFMT: strftime-style pattern, YYYYMMDDHHMMSS.
        inner_cache: empty dict; used to store first mementos.
        max_inner_cache_size: size value for that cache (not enforced
            in this method).
    """
    Handler.__init__(self)

    self.TIMESTAMPFMT = "%Y%m%d%H%M%S"

    # Storage for first mementos, starting out empty.
    self.max_inner_cache_size = 100000
    self.inner_cache = dict()
def __init__(self, base_uri='http://www.example.com/'):
    """Set up the handler with a fixed table of example resources.

    Fills ``self.archives`` (resource URI -> list of version URIs) and
    ``self.dates`` (resource URI -> list of timestamp strings), with every
    URI built by prefixing ``base_uri``.

    Args:
        base_uri: Prefix placed in front of every resource and version
            name. Defaults to ``'http://www.example.com/'``.
    """
    Handler.__init__(self)

    def uri(name):
        # Expands to the same text as '{0}<name>'.format(base_uri).
        return '{0}{1}'.format(base_uri, name)

    # Everything below runs once, when the handler is instantiated.
    resource_a = uri('resourceA')
    resource_b = uri('resourceB')
    resource_space = uri('resource%20space')

    self.archives = {
        resource_a: [uri('resourceA_v1'), uri('resourceA_v2'),
                     uri('resourceA_v3')],
        resource_b: [uri('resourceB_v1'), uri('resourceB_v2')],
        resource_space: [uri('space')],
    }
    self.dates = {
        resource_a: ['1999-09-30T01:50:50Z', '2010-10-16T13:27:27Z',
                     '2015-01-03T22:00:00Z'],
        resource_b: ['1998-07-17T17:47:31Z', '2000-11-08T19:05:09Z'],
        resource_space: ['1970-01-01T00:00:00Z'],
    }
def __init__(self):
    """Initialize the handler's timestamp format and its inner cache.

    Sets:
        TIMESTAMPFMT: strftime-style pattern, YYYYMMDDHHMMSS.
        inner_cache: empty dict; used to store first mementos.
        max_inner_cache_size: size value for that cache (not enforced
            in this method).
    """
    Handler.__init__(self)

    self.TIMESTAMPFMT = "%Y%m%d%H%M%S"

    # Storage for first mementos, starting out empty.
    self.max_inner_cache_size = 100000
    self.inner_cache = dict()
def __init__(self):
    """Configure the base URI and the anchor-matching regex, then init the base.

    Sets:
        baseuri: URI prefix ending in the ``*`` wildcard path segment.
        uriRegex: compiled pattern matching ``<a onclick="SetAnchorDate(...)"
            href="http://83.212.204.92:8080/...">`` tags.
    """
    self.baseuri = "http://83.212.204.92:8080/*/"

    # The dots of the IP address are escaped so each matches a literal '.'
    # only; unescaped, every one of them would accept any character.
    self.uriRegex = re.compile(
        r'<a onclick="SetAnchorDate\(\'.*\'\);" '
        r'href="http://83\.212\.204\.92:8080/[\S]*">')

    # NOTE(review): the base initializer deliberately runs last, as in the
    # original; confirm whether it reads the attributes set above.
    Handler.__init__(self)
def __init__(self):
    """Configure the base URI and the anchor-matching regex, then init the base.

    Sets:
        baseuri: URI prefix ending in the ``*`` wildcard path segment.
        uriRegex: compiled pattern matching ``<a onclick="SetAnchorDate(...)"
            href="http://83.212.204.92:8080/...">`` tags.
    """
    self.baseuri = "http://83.212.204.92:8080/*/"

    # The dots of the IP address are escaped so each matches a literal '.'
    # only; unescaped, every one of them would accept any character.
    self.uriRegex = re.compile(
        r'<a onclick="SetAnchorDate\(\'.*\'\);" '
        r'href="http://83\.212\.204\.92:8080/[\S]*">')

    # NOTE(review): the base initializer deliberately runs last, as in the
    # original; confirm whether it reads the attributes set above.
    Handler.__init__(self)
def __init__(self):
    """Initialize the handler's W3C API URL template and spec-name regex.

    Sets:
        api_url: ``%``-style template with two ``%s`` placeholders (the
            specification path segment and the ``apikey`` query value).
        re_spec_name: case-insensitive pattern for ``http(s)://[www.]w3.org/TR/...``
            URIs; group 1 is the optional ``www.`` prefix, group 2 is
            everything after ``/TR/``.
    """
    Handler.__init__(self)

    # Local fields
    self.api_url = ('https://api.w3.org/specifications/%s/versions'
                    '?_format=json&apikey=%s&embed=1')

    # Raw string: the original non-raw literal relied on the invalid escape
    # sequence '\/', which newer Python versions warn about. Host dots are
    # escaped so they match a literal '.' rather than any character.
    self.re_spec_name = re.compile(
        r"https?://(www\.)?w3\.org/TR/(.*)", re.IGNORECASE)
def __init__(self):
    """Initialize the date-extraction regex and the list of collection ids.

    Sets:
        datere: pattern for ``http://webarchive.loc.gov/<id>/<digits>/...``
            URIs; group 1 captures the run of digits.
        colls: list of ``lcwaNNNN`` collection identifier strings.
    """
    Handler.__init__(self)

    # Host dots are escaped so they match a literal '.' only; unescaped
    # they would accept any character in those positions.
    self.datere = re.compile(
        r"http://webarchive\.loc\.gov/[a-zA-Z0-9]+/([0-9]+)/.+")

    self.colls = [
        "lcwa0001", "lcwa0002", "lcwa0003", "lcwa0004", "lcwa0005",
        "lcwa0006", "lcwa0007", "lcwa0008", "lcwa0009", "lcwa0010",
        "lcwa0011", "lcwa0012", "lcwa0013", "lcwa0014", "lcwa0015",
        "lcwa0016", "lcwa0017", "lcwa0018", "lcwa0019", "lcwa0020",
        "lcwa0029", "lcwa0031", "lcwa0032", "lcwa0033", "lcwa0037",
    ]
def __init__(self):
    """Set up the handler with a fixed, in-memory table of two example resources.

    Fills ``self.archives`` (resource URI -> list of version URIs) and
    ``self.dates`` (resource URI -> list of timestamp strings). Both
    mappings use the same keys, and for each key the two lists have the
    same length.
    """
    Handler.__init__(self)

    # Everything below runs once, when the handler is instantiated.
    resource_a = 'http://www.example.com/resourceA'
    resource_b = 'http://www.example.com/resourceB'

    self.archives = {
        resource_a: [
            'http://www.example.com/resourceA_v1',
            'http://www.example.com/resourceA_v2',
            'http://www.example.com/resourceA_v3',
        ],
        resource_b: [
            'http://www.example.com/resourceB_v1',
            'http://www.example.com/resourceB_v2',
        ],
    }
    self.dates = {
        resource_a: [
            '1999-09-30T01:50:50Z',
            '2010-10-16T13:27:27Z',
            '2015-01-03T22:00:00Z',
        ],
        resource_b: [
            '1998-07-17T17:47:31Z',
            '2000-11-08T19:05:09Z',
        ],
    }
def __init__(self):
    """Initialize the arXiv URI regex and the OAI API base URL.

    Sets:
        rex: pattern for ``http://arxiv.org/(pdf|abs)/<digits>.<digits>...``
            URIs. Groups: 1 = scheme and host, 2 = ``pdf`` or ``abs``,
            3 = the numeric identifier, 4 = whatever trails it.
        api_base: base URL of the export API.
    """
    Handler.__init__(self)

    # Resources
    # Everything after the identifier (query parameters, vX version
    # suffix, ...) lands in the last group and is otherwise ignored.
    # The host dot is escaped so it matches a literal '.' only.
    self.rex = re.compile(
        r'(http://arxiv\.org)/((?:pdf)|(?:abs))/(\d+\.\d+)(.*)')

    self.api_base = 'http://export.arxiv.org/oai2'
def __init__(self):
    """Initialize the arXiv URI regex and the OAI API base URL.

    Sets:
        rex: pattern for ``http://arxiv.org/(pdf|abs)/<digits>.<digits>...``
            URIs. Groups: 1 = scheme and host, 2 = ``pdf`` or ``abs``,
            3 = the numeric identifier, 4 = whatever trails it.
        api_base: base URL of the export API.
    """
    Handler.__init__(self)

    # Resources
    # Everything after the identifier (query parameters, vX version
    # suffix, ...) lands in the last group and is otherwise ignored.
    # The host dot is escaped so it matches a literal '.' only.
    self.rex = re.compile(
        r'(http://arxiv\.org)/((?:pdf)|(?:abs))/(\d+\.\d+)(.*)')

    self.api_base = 'http://export.arxiv.org/oai2'
def __init__(self):
    """Initialize the handler's W3C API URL template and spec-name regex.

    Sets:
        api_url: ``%``-style template with two ``%s`` placeholders (the
            specification path segment and the ``apikey`` query value).
        re_spec_name: case-insensitive pattern for ``http(s)://[www.]w3.org/TR/...``
            URIs; group 1 is the optional ``www.`` prefix, group 2 is
            everything after ``/TR/``.
    """
    Handler.__init__(self)

    # Local fields
    self.api_url = ('https://api.w3.org/specifications/%s/versions'
                    '?_format=json&apikey=%s&embed=1')

    # Raw string: the original non-raw literal relied on the invalid escape
    # sequence '\/', which newer Python versions warn about. Host dots are
    # escaped so they match a literal '.' rather than any character.
    self.re_spec_name = re.compile(
        r"https?://(www\.)?w3\.org/TR/(.*)", re.IGNORECASE)
def __init__(self):
    """Initialize the handler and record its list of host names.

    Sets:
        hosts: list of four host-name strings.
    """
    Handler.__init__(self)

    known_hosts = (
        'www.wowwiki.com',
        'en.memory-alpha.org',
        'wiki.ffxiclopedia.org',
        'www.jedipedia.de',
    )
    # Stored as a list, in the order given above.
    self.hosts = list(known_hosts)
def __init__(self):
    """Open the MySQL connection and cursor, and configure file logging.

    Sets:
        db: connection returned by ``MySQLdb.connect``.
        cursor: cursor obtained from that connection.

    Also calls ``logging.basicConfig`` so log output goes to
    ``/tmp/temp.log``.
    """
    Handler.__init__(self)

    # NOTE(review): connection settings (including the password) are
    # hard-coded here; consider moving them to configuration.
    connection_settings = {
        'host': "localhost",
        'user': "******",
        'passwd': "password",
        'db': "amber_timegate",
    }
    self.db = MySQLdb.connect(**connection_settings)
    self.cursor = self.db.cursor()

    logging.basicConfig(filename='/tmp/temp.log')
def __init__(self):
    """Initialize the base URI and build the list of collection names.

    Sets:
        baseuri: ``http://webharvest.gov/``.
        collections: ``"peth04"`` followed by one ``"congress<N>th"`` entry
            per two-year step from 2006 up to (not including) the current
            UTC year, with N starting at 109 and increasing by one per step.
    """
    Handler.__init__(self)
    self.baseuri = "http://webharvest.gov/"

    first_congress = 109
    first_year = 2006
    current_year = datetime.utcnow().year

    self.collections = ["peth04"]
    # One entry per step of the year range; only the step index matters.
    self.collections.extend([
        "congress%sth" % (first_congress + step)
        for step, _ in enumerate(range(first_year, current_year, 2))
    ])
def __init__(self):
    """Initialize the date-extraction regex and the list of collection ids.

    Sets:
        datere: pattern for ``http://webarchive.loc.gov/<id>/<digits>/...``
            URIs; group 1 captures the run of digits.
        colls: list of ``lcwaNNNN`` collection identifier strings.
    """
    Handler.__init__(self)

    # Host dots are escaped so they match a literal '.' only; unescaped
    # they would accept any character in those positions.
    self.datere = re.compile(
        r'http://webarchive\.loc\.gov/[a-zA-Z0-9]+/([0-9]+)/.+')

    self.colls = [
        'lcwa0001', 'lcwa0002', 'lcwa0003', 'lcwa0004', 'lcwa0005',
        'lcwa0006', 'lcwa0007', 'lcwa0008', 'lcwa0009', 'lcwa0010',
        'lcwa0011', 'lcwa0012', 'lcwa0013', 'lcwa0014', 'lcwa0015',
        'lcwa0016', 'lcwa0017', 'lcwa0018', 'lcwa0019', 'lcwa0020',
        'lcwa0029', 'lcwa0031', 'lcwa0032', 'lcwa0033', 'lcwa0037',
    ]
def __init__(self):
    """Initialize the GitHub handler's resource patterns, API root and regexes.

    Sets:
        resources: list of URI pattern strings (mandatory field).
        api: root URL of the GitHub API.
        rex: verbose-mode pattern splitting a URI-R into protocol, base,
            user, repo and optional path (groups 1 to 5).
        header_rex: pattern extracting the ``rel="next"`` target from a
            continuation header.
        file_rex: pattern matching an optional ``/blob`` followed by
            ``/master``.
    """
    Handler.__init__(self)

    # Mandatory fields
    self.resources = ['https://github.com/.+',
                      'https://raw.githubusercontent.com/']

    # Local fields
    self.api = 'https://api.github.com'

    # Precompiled regular expressions.
    # re.X (verbose) ignores whitespace and newlines inside the pattern.
    # Host dots are escaped so they match a literal '.' only, which is why
    # the literal is a raw string.
    self.rex = re.compile(r"""  # The format of URI-Rs
        (https://)                                  # protocol
        ((?:raw\.githubusercontent|github)\.com/)   # base
        ([^/]+)/                                    # user
        ([^/]+)                                     # repo
        (/.*)?                                      # optional path
        """, re.X)

    # The regex for the query continuation header
    self.header_rex = re.compile('<(.+?)>; rel="next"')

    # The regex for files
    self.file_rex = re.compile('(/blob)?/master')
def __init__(self):
    """Initialize API constants and download the list of tracked sites.

    Sets:
        LIMIT_MAX: value sent as the ``limit`` parameter of the first request.
        BASE: root URL of the pastpages site.
        API_TIMEFMT: strftime pattern used for API timestamps.
        FIRST_DATE: 2012-04-27 formatted with ``API_TIMEFMT``.
        pages_list: list of ``(url, slug)`` tuples collected from every
            result page of ``/api/beta/sites/``.

    Raises:
        Exception: whatever the request or the response parsing raised;
            it is logged at CRITICAL level and re-raised unchanged.
    """
    Handler.__init__(self)
    self.LIMIT_MAX = 100
    self.BASE = 'http://www.pastpages.org'
    self.API_TIMEFMT = '%Y-%m-%dT%H:%M:%S'
    # Plain decimal month instead of the octal literal 0o4 (same value).
    self.FIRST_DATE = datetime(2012, 4, 27).strftime(self.API_TIMEFMT)

    # Building pages list of ('uri', 'slug') pairs
    self.pages_list = []
    try:
        params = {'limit': self.LIMIT_MAX}
        request = '/api/beta/sites/'
        # Each response carries a non-null 'meta.next' while there are
        # still result pages to fetch.
        while request is not None:
            json_response = self.request(
                self.BASE + request, params=params).json()
            # 'objects' is the list of responses; 'url' and 'slug' are the
            # URI and the website's short name respectively.
            self.pages_list.extend([
                (obj['url'], obj['slug'])
                for obj in json_response['objects']
            ])
            request = json_response['meta']['next']
            # The continuation request already contains &limit and &offset.
            params = None
    except Exception:
        logging.critical("Cannot create the handler's page list:")
        # Bare raise keeps the original traceback intact.
        raise

    # Arguments are passed separately so formatting only happens if the
    # record is actually emitted.
    logging.info("Found %s websites on pastpages' API.", len(self.pages_list))
def __init__(self):
    """Initialize the GitHub handler's resource patterns, API root and regexes.

    Sets:
        resources: list of URI pattern strings (mandatory field).
        api: root URL of the GitHub API.
        rex: verbose-mode pattern splitting a URI-R into protocol, base,
            user, repo and optional path (groups 1 to 5).
        header_rex: pattern extracting the ``rel="next"`` target from a
            continuation header.
        file_rex: pattern matching an optional ``/blob`` followed by
            ``/master``.
    """
    Handler.__init__(self)

    # Mandatory fields
    self.resources = [
        'https://github.com/.+',
        'https://raw.githubusercontent.com/',
    ]

    # Local fields
    self.api = 'https://api.github.com'

    # Precompiled regular expressions.
    # re.X (verbose) ignores whitespace and newlines inside the pattern.
    # Host dots are escaped so they match a literal '.' only, which is why
    # the literal is a raw string.
    self.rex = re.compile(r"""  # The format of URI-Rs
        (https://)                                  # protocol
        ((?:raw\.githubusercontent|github)\.com/)   # base
        ([^/]+)/                                    # user
        ([^/]+)                                     # repo
        (/.*)?                                      # optional path
        """, re.X)

    # The regex for the query continuation header
    self.header_rex = re.compile('<(.+?)>; rel="next"')

    # The regex for files
    self.file_rex = re.compile('(/blob)?/master')
def __init__(self):
    """Initialize API constants and download the list of tracked sites.

    Sets:
        LIMIT_MAX: value sent as the ``limit`` parameter of the first request.
        BASE: root URL of the pastpages site.
        API_TIMEFMT: strftime pattern used for API timestamps.
        FIRST_DATE: 2012-04-27 formatted with ``API_TIMEFMT``.
        pages_list: list of ``(url, slug)`` tuples collected from every
            result page of ``/api/beta/sites/``.

    Raises:
        Exception: whatever the request or the response parsing raised;
            it is logged at CRITICAL level and re-raised unchanged.
    """
    Handler.__init__(self)
    self.LIMIT_MAX = 100
    self.BASE = 'http://www.pastpages.org'
    self.API_TIMEFMT = '%Y-%m-%dT%H:%M:%S'
    # Plain decimal month instead of the octal literal 0o4 (same value).
    self.FIRST_DATE = datetime(2012, 4, 27).strftime(self.API_TIMEFMT)

    # Building pages list of ('uri', 'slug') pairs
    self.pages_list = []
    try:
        params = {'limit': self.LIMIT_MAX}
        request = '/api/beta/sites/'
        # Each response carries a non-null 'meta.next' while there are
        # still result pages to fetch.
        while request is not None:
            json_response = self.request(
                self.BASE + request, params=params).json()
            # 'objects' is the list of responses; 'url' and 'slug' are the
            # URI and the website's short name respectively.
            self.pages_list.extend([
                (obj['url'], obj['slug'])
                for obj in json_response['objects']
            ])
            request = json_response['meta']['next']
            # The continuation request already contains &limit and &offset.
            params = None
    except Exception:
        logging.critical("Cannot create the handler's page list:")
        # Bare raise keeps the original traceback intact.
        raise

    # Arguments are passed separately so formatting only happens if the
    # record is actually emitted.
    logging.info("Found %s websites on pastpages' API.", len(self.pages_list))
def __init__(self):
    """Initialize the GitLab handler's resource patterns, API settings and regexes.

    Sets:
        resources: list of URI pattern strings (mandatory field).
        api: root URL of the GitLab v3 API.
        apikey: API key string.
        rex: verbose-mode pattern splitting a URI-R into protocol, base,
            user, repo and optional path (groups 1 to 5).
        header_rex: pattern extracting the ``rel="next"`` target from a
            continuation header.
        file_rex: pattern matching an optional ``/blob`` followed by
            ``/master``.
    """
    Handler.__init__(self)

    # Mandatory fields
    # TODO: move to config file
    self.resources = ['https://gitlab.ub.uni-bielefeld.de/.+']

    # Local fields
    # TODO: move to config file
    self.api = 'https://gitlab.ub.uni-bielefeld.de/api/v3'
    # TODO: move to config file
    # NOTE(review): this key is committed in source; moving it out would
    # also allow it to be rotated.
    self.apikey = 'VqeqaShAw4GWVc3dp7--'

    # Precompiled regular expressions.
    # TODO: generalize for URLs with a numeric project ID instead of
    # user/repo!
    # re.X (verbose) ignores whitespace and newlines inside the pattern.
    uri_r_format = """  # The format of URI-Rs
        (https://)   # protocol
        ([^/]+)/     # base
        ([^/]+)/     # user
        ([^/]+)      # repo
        (/.*)?       # optional path
        """
    self.rex = re.compile(uri_r_format, re.X)

    # The regex for the query continuation header
    self.header_rex = re.compile('<(.+?)>; rel="next"')

    # The regex for files
    self.file_rex = re.compile('(/blob)?/master')
def __init__(self, base_uri='http://www.example.com/'):
    """Set up the handler with a fixed table of example resources.

    Fills ``self.archives`` (resource URI -> list of version URIs) and
    ``self.dates`` (resource URI -> list of timestamp strings), with every
    URI built by prefixing ``base_uri``.

    Args:
        base_uri: Prefix placed in front of every resource and version
            name. Defaults to ``'http://www.example.com/'``.
    """
    Handler.__init__(self)

    def uri(name):
        # Expands to the same text as '{0}<name>'.format(base_uri).
        return '{0}{1}'.format(base_uri, name)

    # Everything below runs once, when the handler is instantiated.
    resource_a = uri('resourceA')
    resource_b = uri('resourceB')
    resource_space = uri('resource%20space')

    self.archives = {
        resource_a: [uri('resourceA_v1'), uri('resourceA_v2'),
                     uri('resourceA_v3')],
        resource_b: [uri('resourceB_v1'), uri('resourceB_v2')],
        resource_space: [uri('space')],
    }
    self.dates = {
        resource_a: ['1999-09-30T01:50:50Z', '2010-10-16T13:27:27Z',
                     '2015-01-03T22:00:00Z'],
        resource_b: ['1998-07-17T17:47:31Z', '2000-11-08T19:05:09Z'],
        resource_space: ['1970-01-01T00:00:00Z'],
    }
def __init__(self):
    """Configure the base URI and the anchor-matching regex, then init the base.

    Sets:
        baseuri: wayback URI prefix ending in the ``*`` wildcard segment.
        uriRegex: compiled pattern matching ``<a onclick="SetAnchorDate(...)"
            href="http://nukrobi2.nuk.uni-lj.si:8080/wayback/...">`` tags.
    """
    self.baseuri = "http://nukrobi2.nuk.uni-lj.si:8080/wayback/*/"

    # The dots of the host name are escaped so each matches a literal '.'
    # only; unescaped, every one of them would accept any character.
    self.uriRegex = re.compile(
        r'<a onclick="SetAnchorDate\(\'.*\'\);" '
        r'href="http://nukrobi2\.nuk\.uni-lj\.si:8080/wayback/[\S]*">')

    # NOTE(review): the base initializer deliberately runs last, as in the
    # original; confirm whether it reads the attributes set above.
    Handler.__init__(self)
def __init__(self):
    """Initialize the handler; no state is added beyond the base Handler's."""
    Handler.__init__(self)
def __init__(self):
    """Initialize the handler and compile the anchor-matching regex.

    Sets:
        uriRegex: pattern for ``<a onclick="SetAnchorDate('...');" href="...">``
            tags; group 1 is the ``SetAnchorDate`` argument, group 2 the
            ``href`` value.
    """
    Handler.__init__(self)

    # Both groups use a greedy '.*'.
    self.uriRegex = re.compile(
        r'<a onclick="SetAnchorDate\(\'(.*)\'\);" href="(.*)">')
def __init__(self):
    """Initialize the handler; no state is added beyond the base Handler's."""
    Handler.__init__(self)
def __init__(self):
    """Initialize the handler and compile the anchor-matching regex.

    Sets:
        uriRegex: pattern for ``<a onclick="SetAnchorDate('...');" href="...">``
            tags; group 1 is the ``SetAnchorDate`` argument, group 2 the
            ``href`` value.
    """
    Handler.__init__(self)

    # Both groups use a greedy '.*'.
    self.uriRegex = re.compile(
        r'<a onclick="SetAnchorDate\(\'(.*)\'\);" href="(.*)">')
def __init__(self):
    """Initialize the archive base URI and the datetime-extraction regex.

    Sets:
        baseuri: web-archive URI prefix ending in the ``*`` wildcard segment.
        dtre: pattern for ``http://www.collectionscanada.gc.ca/webarchives/<digits>/``;
            group 1 captures the run of digits.
    """
    Handler.__init__(self)
    self.baseuri = "http://www.collectionscanada.gc.ca/webarchives/*/"

    # Raw string: the original non-raw literal relied on the invalid escape
    # sequence '\d', which newer Python versions warn about. Host dots are
    # escaped so they match a literal '.' rather than any character.
    self.dtre = re.compile(
        r"http://www\.collectionscanada\.gc\.ca/webarchives/(\d+)/")
def __init__(self):
    """Initialize the handler and set the XML query endpoint.

    Sets:
        baseuri: URL of the arquivo.pt wayback ``xmlquery`` endpoint.
    """
    Handler.__init__(self)

    xmlquery_endpoint = "http://arquivo.pt/wayback/wayback/xmlquery"
    self.baseuri = xmlquery_endpoint
def __init__(self):
    """Configure the base URI and the anchor-matching regex, then init the base.

    Sets:
        baseuri: wayback URI prefix ending in the ``*`` wildcard segment.
        uriRegex: compiled pattern matching ``<a onclick="SetAnchorDate(...)"
            href="http://nukrobi2.nuk.uni-lj.si:8080/wayback/...">`` tags.
    """
    self.baseuri = "http://nukrobi2.nuk.uni-lj.si:8080/wayback/*/"

    # The dots of the host name are escaped so each matches a literal '.'
    # only; unescaped, every one of them would accept any character.
    self.uriRegex = re.compile(
        r'<a onclick="SetAnchorDate\(\'.*\'\);" '
        r'href="http://nukrobi2\.nuk\.uni-lj\.si:8080/wayback/[\S]*">')

    # NOTE(review): the base initializer deliberately runs last, as in the
    # original; confirm whether it reads the attributes set above.
    Handler.__init__(self)
def __init__(self):
    """Initialize the handler and its timestamp format.

    Sets:
        TIMESTAMPFMT: strftime-style pattern, YYYYMMDDHHMMSS.
    """
    Handler.__init__(self)

    # Year, month, day, hour, minute, second with no separators.
    self.TIMESTAMPFMT = "%Y%m%d%H%M%S"
def __init__(self):
    """Initialize the handler and install a cookie-aware urllib2 opener.

    Builds an opener around a fresh ``LWPCookieJar`` and passes it to
    ``urllib2.install_opener``. Nothing is stored on the instance.
    """
    Handler.__init__(self)

    cookie_jar = cookielib.LWPCookieJar()
    cookie_processor = urllib2.HTTPCookieProcessor(cookie_jar)
    urllib2.install_opener(urllib2.build_opener(cookie_processor))
def __init__(self):
    """Initialize the archive base URI and the datetime-extraction regex.

    Sets:
        baseuri: web-archive URI prefix ending in the ``*`` wildcard segment.
        dtre: pattern for ``http://www.collectionscanada.gc.ca/webarchives/<digits>/``;
            group 1 captures the run of digits.
    """
    Handler.__init__(self)
    self.baseuri = "http://www.collectionscanada.gc.ca/webarchives/*/"

    # Raw string: the original non-raw literal relied on the invalid escape
    # sequence '\d', which newer Python versions warn about. Host dots are
    # escaped so they match a literal '.' rather than any character.
    self.dtre = re.compile(
        r"http://www\.collectionscanada\.gc\.ca/webarchives/(\d+)/")
def __init__(self):
    """Initialize the handler's timestamp format and host list.

    Sets:
        TIMESTAMPFMT: strftime-style pattern, YYYYMMDDHHMMSS.
        hosts: single-element list holding ``".orain.org"``.
    """
    Handler.__init__(self)

    # Year, month, day, hour, minute, second with no separators.
    self.TIMESTAMPFMT = "%Y%m%d%H%M%S"

    # Note the leading dot in the entry.
    host_entries = ('.orain.org',)
    self.hosts = list(host_entries)
def __init__(self):
    """Initialize the handler's timestamp format and host list.

    Sets:
        TIMESTAMPFMT: strftime-style pattern, YYYYMMDDHHMMSS.
        hosts: single-element list holding ``".orain.org"``.
    """
    Handler.__init__(self)

    # Year, month, day, hour, minute, second with no separators.
    self.TIMESTAMPFMT = "%Y%m%d%H%M%S"

    # Note the leading dot in the entry.
    host_entries = ('.orain.org',)
    self.hosts = list(host_entries)