def __init__(self):
    """Set the Wayback base URI and the link-matching regex, then run Handler setup."""
    # Matches anchors whose onclick calls SetAnchorDate('...') and whose href
    # points below the archive's wayback prefix.
    anchor_pattern = (
        r'<a onclick="SetAnchorDate\(\'.*\'\);" '
        r'href="http://eresources.nlb.gov.sg/webarchives/wayback/[\S]*">'
    )
    # Previous endpoint, kept for reference: http://was.nl.sg/wayback/*/
    self.baseuri = "http://eresources.nlb.gov.sg/webarchives/wayback/*/"
    self.uriRegex = re.compile(anchor_pattern)
    Handler.__init__(self)
def __init__(self):
    """Set the padi.cat Wayback base URI and link regex, then run Handler setup."""
    # Matches anchors whose onclick calls SetAnchorDate('...') and whose href
    # points below http://www.padi.cat:8080/wayback/.
    anchor_pattern = (
        r'<a onclick="SetAnchorDate\(\'.*\'\);" '
        r'href="http://www.padi.cat:8080/wayback/[\S]*">'
    )
    self.baseuri = "http://www.padi.cat:8080/wayback/*/"
    self.uriRegex = re.compile(anchor_pattern)
    Handler.__init__(self)
def __init__(self):
    """Run Handler setup and populate the static example archive tables.

    ``self.archives`` maps each original resource URI to the list of its
    version URIs; ``self.dates`` maps the same keys to the matching list of
    ISO-8601 timestamps, in the same order.
    """
    Handler.__init__(self)

    # Static sample data, built once at construction time.
    # Each entry pairs a version URI with its datetime.
    history = {
        'http://www.example.com/resourceA': [
            ('http://www.example.com/resourceA_v1', '1999-09-30T01:50:50Z'),
            ('http://www.example.com/resourceA_v2', '2010-10-16T13:27:27Z'),
            ('http://www.example.com/resourceA_v3', '2015-01-03T22:00:00Z'),
        ],
        'http://www.example.com/resourceB': [
            ('http://www.example.com/resourceB_v1', '1998-07-17T17:47:31Z'),
            ('http://www.example.com/resourceB_v2', '2000-11-08T19:05:09Z'),
        ],
    }

    self.archives = {}
    self.dates = {}
    for resource, snapshots in history.items():
        self.archives[resource] = [version for version, _ in snapshots]
        self.dates[resource] = [stamp for _, stamp in snapshots]
def __init__(self):
    """Run Handler setup, then define the timestamp format and the inner cache."""
    Handler.__init__(self)

    # strftime/strptime layout: YYYYMMDDhhmmss
    self.TIMESTAMPFMT = '%Y%m%d%H%M%S'

    # Cache of first mementos (per the original note), with its size limit
    self.max_inner_cache_size = 100000
    self.inner_cache = {}
def __init__(self):
    """Run Handler setup, compile the date-extracting regex and list the collections."""
    Handler.__init__(self)

    # Group 1 captures the numeric path segment that follows the collection id
    self.datere = re.compile('http://webarchive.loc.gov/[a-zA-Z0-9]+/([0-9]+)/.+')

    # Collection ids are 'lcwa' followed by a zero-padded 4-digit number:
    # 0001 through 0020, then a handful of later, non-contiguous ones.
    collection_numbers = list(range(1, 21)) + [29, 31, 32, 33, 37]
    self.colls = ['lcwa%04d' % number for number in collection_numbers]
def __init__(self):
    """Set the Wayback base URI and the link-matching regex, then run Handler setup."""
    # Previous endpoint, kept for reference: http://was.nl.sg/wayback/*/
    self.baseuri = "http://eresources.nlb.gov.sg/webarchives/wayback/*/"

    # Matches anchors whose onclick calls SetAnchorDate('...') and whose href
    # points below the archive's wayback prefix.
    self.uriRegex = re.compile(
        r'<a onclick="SetAnchorDate\(\'.*\'\);" '
        r'href="http://eresources.nlb.gov.sg/webarchives/wayback/[\S]*">'
    )

    Handler.__init__(self)
def __init__(self):
    """Run Handler setup, then define the arXiv URI regex and the OAI endpoint."""
    Handler.__init__(self)

    self.api_base = 'http://export.arxiv.org/oai2'

    # Resource URIs. Groups: (1) site root, (2) 'pdf' or 'abs', (3) the
    # numeric identifier, (4) whatever trails it (query params, vX version, ...),
    # which is captured separately so it can be ignored.
    self.rex = re.compile(
        r'(http://arxiv.org)/((?:pdf)|(?:abs))/(\d+\.\d+)(.*)'
    )
def __init__(self):
    """Run Handler setup and record the list of host names kept in ``self.hosts``."""
    Handler.__init__(self)

    known_hosts = []
    known_hosts.append('www.wowwiki.com')
    known_hosts.append('en.memory-alpha.org')
    known_hosts.append('wiki.ffxiclopedia.org')
    known_hosts.append('www.jedipedia.de')
    self.hosts = known_hosts
def __init__(self):
    """Run Handler setup, set the base URI and build the collection names.

    ``self.collections`` starts with "peth04" and gains one "congress<N>th"
    entry for every second year from 2006 up to, but not including, the
    year reported by ``now()``; N starts at 109 and grows by one per entry.
    """
    Handler.__init__(self)
    self.baseuri = "http://webharvest.gov/"

    first_congress = 109
    first_year = 2006
    current_year = now().year

    # One entry per two-year step; only the position in the sequence matters.
    biennial_steps = range(first_year, current_year, 2)
    self.collections = ["peth04"] + [
        "congress%sth" % (first_congress + position)
        for position, _ in enumerate(biennial_steps)
    ]
def __init__(self):
    """Run Handler setup, then define the GitHub URI patterns, API root and regexes.

    ``self.rex`` splits a resource URI into five groups: protocol, base
    (host plus trailing slash), user, repo and an optional path.
    """
    Handler.__init__(self)

    # Mandatory fields
    self.resources = ['https://github.com/.+',
                      'https://raw.githubusercontent.com/']

    # Local fields
    self.api = 'https://api.github.com'

    # Precompiled regular expressions.
    # The dots of the host names are escaped so that only the literal hosts
    # match; an unescaped '.' would also accept e.g. 'githubXcom/'.
    self.rex = re.compile(r"""  # The format of URI-Rs
        (https://)                                   # protocol
        ((?:raw\.githubusercontent|github)\.com/)    # base
        ([^/]+)/                                     # user
        ([^/]+)                                      # repo
        (/.*)?                                       # optional path
        """, re.X)  # verbose mode: whitespace and '#' comments are ignored
    # The regex for the query continuation header
    self.header_rex = re.compile('<(.+?)>; rel="next"')
    # The regex for files
    self.file_rex = re.compile('(/blob)?/master')
def __init__(self):
    """Run Handler setup, then define the GitLab URI pattern, API settings and regexes.

    ``self.rex`` splits a resource URI into five groups: protocol, base
    (host), user, repo and an optional path.
    """
    Handler.__init__(self)

    # Mandatory fields
    # TODO: move to config file
    self.resources = ['https://gitlab.ub.uni-bielefeld.de/.+']

    # Local fields
    # TODO: move to config file
    self.api = 'https://gitlab.ub.uni-bielefeld.de/api/v3'
    # TODO: move to config file
    # NOTE(review): credential committed in source -- consider rotating it
    # and loading it from configuration instead.
    self.apikey = 'VqeqaShAw4GWVc3dp7--'

    # Precompiled regular expressions
    # TODO: generalize for URLs with numeric project ID instead of user/repo!!!
    uri_pattern = """  # The format of URI-Rs
        (https://)    # protocol
        ([^/]+)/      # base
        ([^/]+)/      # user
        ([^/]+)       # repo
        (/.*)?        # optional path
        """
    # Verbose mode: whitespace and '#' comments inside the pattern are ignored
    self.rex = re.compile(uri_pattern, re.X)
    # The regex for the query continuation header
    self.header_rex = re.compile('<(.+?)>; rel="next"')
    # The regex for files
    self.file_rex = re.compile('(/blob)?/master')
def __init__(self):
    """Run Handler setup, define API constants and fetch the list of sites.

    The site list is read page by page from ``/api/beta/sites/`` through
    ``self.request`` and stored in ``self.pages_list`` as ``(url, slug)``
    tuples.

    Raises:
        Exception: whatever the listing request or the response parsing
            raised; it is logged at CRITICAL level and re-raised unchanged.
    """
    Handler.__init__(self)
    self.LIMIT_MAX = 100
    self.BASE = 'http://www.pastpages.org'
    self.API_TIMEFMT = '%Y-%m-%dT%H:%M:%S'
    # Plain decimal month: a leading-zero literal such as 04 is octal on
    # Python 2 and a syntax error on Python 3.
    self.FIRST_DATE = datetime(2012, 4, 27).strftime(self.API_TIMEFMT)

    # Building pages list of ('uri', 'slug') pairs
    self.pages_list = []
    try:
        params = {'limit': self.LIMIT_MAX}
        request = '/api/beta/sites/'
        # Keep going while there are still result pages: each response has
        # a non-null 'meta.next' value if it has a continuation.
        while request is not None:
            json_response = self.request(self.BASE + request, params=params).json()
            # 'objects' is the list of responses; 'url' and 'slug' are the
            # URI and the website's short name respectively.
            self.pages_list.extend([
                (obj['url'], obj['slug']) for obj in json_response['objects']
            ])
            request = json_response['meta']['next']
            # The continuation request already contains &limit and &offset
            params = None
    except Exception:
        logging.critical("Cannot create the handler's page list:")
        # Bare raise keeps the original traceback intact
        raise

    logging.info("Found %s websites on pastpages' API.", len(self.pages_list))
def __init__(self):
    """Run Handler setup and set the arquivo.pt XML query endpoint as base URI."""
    Handler.__init__(self)

    xmlquery_endpoint = "http://arquivo.pt/wayback/wayback/xmlquery"
    self.baseuri = xmlquery_endpoint
def __init__(self):
    """Run Handler setup and compile the anchor-matching regex.

    The pattern captures two groups: the argument passed to SetAnchorDate
    in the onclick attribute, and the href value.
    """
    Handler.__init__(self)
    self.uriRegex = re.compile(
        r'<a onclick="SetAnchorDate\(\'(.*)\'\);" '
        r'href="(.*)">'
    )
def __init__(self):
    """Run Handler setup and install a cookie-aware urllib2 opener.

    Note that ``urllib2.install_opener`` replaces the module-wide default
    opener, so the effect is not limited to this instance.
    """
    Handler.__init__(self)

    cookie_handler = urllib2.HTTPCookieProcessor(cookielib.LWPCookieJar())
    urllib2.install_opener(urllib2.build_opener(cookie_handler))
def __init__(self):
    """Set the nuk.uni-lj.si Wayback base URI and link regex, then run Handler setup."""
    # Matches anchors whose onclick calls SetAnchorDate('...') and whose href
    # points below http://nukrobi2.nuk.uni-lj.si:8080/wayback/.
    anchor_pattern = (
        r'<a onclick="SetAnchorDate\(\'.*\'\);" '
        r'href="http://nukrobi2.nuk.uni-lj.si:8080/wayback/[\S]*">'
    )
    self.baseuri = "http://nukrobi2.nuk.uni-lj.si:8080/wayback/*/"
    self.uriRegex = re.compile(anchor_pattern)
    Handler.__init__(self)
def __init__(self):
    """Delegate all initialisation to the base Handler."""
    Handler.__init__(self)
def __init__(self):
    """Run Handler setup, set the archive base URI and compile the date regex.

    ``self.dtre`` captures, as group 1, the run of digits that follows
    ``/webarchives/`` in an archive URI.
    """
    Handler.__init__(self)
    self.baseuri = "http://www.collectionscanada.gc.ca/webarchives/*/"
    # Raw string: '\d' is not a valid string escape and triggers a warning
    # on recent Python versions when written in a plain literal. The host
    # dots are escaped so they only match a literal '.'.
    self.dtre = re.compile(
        r"http://www\.collectionscanada\.gc\.ca/webarchives/(\d+)/"
    )
def __init__(self):
    """Run Handler setup and define the timestamp format used by this handler."""
    Handler.__init__(self)

    # strftime/strptime layout: YYYYMMDDhhmmss
    timestamp_layout = '%Y%m%d%H%M%S'
    self.TIMESTAMPFMT = timestamp_layout