Ejemplos de Handler en Python, ejemplos de core.handler_baseclass.Handler en Python

Ejemplo n.º 1

0

Mostrar archivo

Archivo: sg.py Proyecto: anukat2015/timegate

    def __init__(self):
        """Set the archive base URI and anchor regex, then run the base initializer."""
        # Earlier endpoint, kept for reference: http://was.nl.sg/wayback/*/
        self.baseuri = "http://eresources.nlb.gov.sg/webarchives/wayback/*/"

        # Matches an <a> tag whose onclick calls SetAnchorDate('...') and whose
        # href starts with this archive's wayback prefix.
        self.uriRegex = re.compile(
            r'<a onclick="SetAnchorDate\(\'.*\'\);" href="http://eresources.nlb.gov.sg/webarchives/wayback/[\S]*">'
        )

        # The base initializer deliberately runs last, after the attributes exist.
        Handler.__init__(self)

Ejemplo n.º 2

0

Mostrar archivo

Archivo: cat.py Proyecto: mementoweb/timegate

    def __init__(self):
        """Set the archive base URI and anchor regex, then run the base initializer."""
        self.baseuri = "http://www.padi.cat:8080/wayback/*/"

        # Matches an <a> tag whose onclick calls SetAnchorDate('...') and whose
        # href starts with this archive's wayback prefix.
        self.uriRegex = re.compile(
            r'<a onclick="SetAnchorDate\(\'.*\'\);" href="http://www.padi.cat:8080/wayback/[\S]*">'
        )

        # The base initializer deliberately runs last, after the attributes exist.
        Handler.__init__(self)

Ejemplo n.º 3

0

Mostrar archivo

Archivo: example.py Proyecto: jirikuncar/timegate

    def __init__(self):
        """Run the base initializer and populate the hard-coded example data.

        ``self.archives`` maps each original resource URI to its list of
        version URIs; ``self.dates`` maps the same keys to lists of ISO 8601
        timestamp strings of matching length.
        """
        Handler.__init__(self)

        # Initialization code here. This part is run only once.
        self.archives = {
            'http://www.example.com/resourceA': [
                'http://www.example.com/resourceA_v1',
                'http://www.example.com/resourceA_v2',
                'http://www.example.com/resourceA_v3',
            ],
            'http://www.example.com/resourceB': [
                'http://www.example.com/resourceB_v1',
                'http://www.example.com/resourceB_v2',
            ],
        }
        self.dates = {
            'http://www.example.com/resourceA': [
                '1999-09-30T01:50:50Z',
                '2010-10-16T13:27:27Z',
                '2015-01-03T22:00:00Z',
            ],
            'http://www.example.com/resourceB': [
                '1998-07-17T17:47:31Z',
                '2000-11-08T19:05:09Z',
            ],
        }

Ejemplo n.º 4

0

Mostrar archivo

Archivo: wikipedia.py Proyecto: jirikuncar/timegate

    def __init__(self):
        """Run the base initializer, then set the timestamp format and an empty cache."""
        Handler.__init__(self)
        # strftime-style pattern: 14 digits, YYYYMMDDHHMMSS.
        self.TIMESTAMPFMT = '%Y%m%d%H%M%S'

        # Storing first mementos
        self.inner_cache = {}
        # NOTE(review): presumably an upper bound on len(self.inner_cache);
        # the enforcing code is not visible in this block -- confirm.
        self.max_inner_cache_size = 100000

Ejemplo n.º 5

0

Mostrar archivo

Archivo: loc.py Proyecto: jirikuncar/timegate

    def __init__(self):
        """Run the base initializer, then set the date regex and collection names."""
        Handler.__init__(self)

        # Captures the all-digit path segment that follows the collection
        # segment in a webarchive.loc.gov URI.
        self.datere = re.compile('http://webarchive.loc.gov/[a-zA-Z0-9]+/([0-9]+)/.+')

        # Collection names are 'lcwa' plus a zero-padded 4-digit number:
        # 0001 through 0020, followed by a handful of non-contiguous ones.
        collection_numbers = list(range(1, 21)) + [29, 31, 32, 33, 37]
        self.colls = ['lcwa%04d' % number for number in collection_numbers]

Ejemplo n.º 6

0

Mostrar archivo

Archivo: cat.py Proyecto: skbly7/amber_timegate

    def __init__(self):
        """Set the archive base URI and anchor regex, then run the base initializer."""
        self.baseuri = "http://www.padi.cat:8080/wayback/*/"

        # Matches an <a> tag whose onclick calls SetAnchorDate('...') and whose
        # href starts with this archive's wayback prefix.
        self.uriRegex = re.compile(
            r'<a onclick="SetAnchorDate\(\'.*\'\);" href="http://www.padi.cat:8080/wayback/[\S]*">'
        )

        # The base initializer deliberately runs last, after the attributes exist.
        Handler.__init__(self)

Ejemplo n.º 7

0

Mostrar archivo

    def __init__(self):
        """Set the archive base URI and anchor regex, then run the base initializer."""
        # Earlier endpoint, kept for reference: http://was.nl.sg/wayback/*/
        self.baseuri = "http://eresources.nlb.gov.sg/webarchives/wayback/*/"

        # Matches an <a> tag whose onclick calls SetAnchorDate('...') and whose
        # href starts with this archive's wayback prefix.
        self.uriRegex = re.compile(
            r'<a onclick="SetAnchorDate\(\'.*\'\);" href="http://eresources.nlb.gov.sg/webarchives/wayback/[\S]*">'
        )

        # The base initializer deliberately runs last, after the attributes exist.
        Handler.__init__(self)

Ejemplo n.º 8

0

Mostrar archivo

Archivo: arxiv.py Proyecto: jirikuncar/timegate

    def __init__(self):
        """Run the base initializer, then set the arXiv URI regex and API base URL."""
        Handler.__init__(self)

        # Resources
        # Groups: (1) scheme and host, (2) 'pdf' or 'abs', (3) the numeric
        # identifier, (4) everything trailing it (? params, vX version, ...),
        # which is captured separately so it can be ignored.
        arxiv_uri_pattern = r'(http://arxiv.org)/((?:pdf)|(?:abs))/(\d+\.\d+)(.*)'
        self.rex = re.compile(arxiv_uri_pattern)

        self.api_base = 'http://export.arxiv.org/oai2'

Ejemplo n.º 9

0

Mostrar archivo

Archivo: wikia.py Proyecto: jirikuncar/timegate

    def __init__(self):
        """Run the base initializer, then record the list of wiki host names."""
        Handler.__init__(self)

        # Host names only (no scheme, no path).
        self.hosts = [
            'www.wowwiki.com',
            'en.memory-alpha.org',
            'wiki.ffxiclopedia.org',
            'www.jedipedia.de',
        ]

Ejemplo n.º 10

0

Mostrar archivo

Archivo: nara.py Proyecto: jirikuncar/timegate

    def __init__(self):
        """Run the base initializer, then build the list of collection names.

        The list starts with "peth04" and gains one "congress<N>th" entry for
        every two-year step from FIRST_YEAR up to (not including) the current
        year, with N starting at 109.
        """
        Handler.__init__(self)
        self.baseuri = "http://webharvest.gov/"
        first_congress = 109
        FIRST_YEAR = 2006
        THIS_YEAR = now().year
        self.collections = ["peth04"]

        # Only the number of two-year steps matters; the year itself is unused.
        for offset, _year in enumerate(range(FIRST_YEAR, THIS_YEAR, 2)):
            self.collections.append("congress%sth" % (first_congress + offset))

Ejemplo n.º 11

0

Mostrar archivo

Archivo: github.py Proyecto: jirikuncar/timegate

    def __init__(self):
        """Run the base initializer, then set resource patterns, API root and regexes."""
        Handler.__init__(self)
        # Mandatory fields
        # NOTE(review): presumably regex patterns for the URIs this handler
        # serves; the consumer of self.resources is not visible here -- confirm.
        self.resources = ['https://github.com/.+',
                          'https://raw.githubusercontent.com/']

        # Local fields
        self.api = 'https://api.github.com'

        # Precompiles regular expressions
        # Five groups: protocol, base host, user, repo, optional path. The
        # '#' remarks inside the literal are regex comments (re.X), not Python.
        self.rex = re.compile("""  # The format of URI-Rs
                              (https://)  # protocol
                              ((?:raw.githubusercontent|github).com/)  # base
                              ([^/]+)/  # user
                              ([^/]+)  # repo
                              (/.*)?  # optional path
                              """, re.X)  # verbose mode: whitespace and newlines in the pattern are ignored
        self.header_rex = re.compile('<(.+?)>; rel="next"')  # Captures the URI inside <...> that is followed by rel="next"
        self.file_rex = re.compile('(/blob)?/master')  # Matches '/master', optionally preceded by '/blob'

Ejemplo n.º 12

0

Mostrar archivo

Archivo: gitlab.py Proyecto: jirikuncar/timegate

    def __init__(self):
        """Run the base initializer, then set resource patterns, API settings and regexes."""
        Handler.__init__(self)
        # Mandatory fields
        self.resources = ['https://gitlab.ub.uni-bielefeld.de/.+']  # TODO: move to config file

        # Local fields
        self.api = 'https://gitlab.ub.uni-bielefeld.de/api/v3'  # TODO: move to config file
        # NOTE(review): hard-coded credential committed to source. It should be
        # loaded from configuration or the environment, and this key rotated.
        self.apikey = 'VqeqaShAw4GWVc3dp7--'  # TODO: move to config file

        # Precompiles regular expressions  ## TODO: generalize for URLs with numeric project ID instead of user/repo!!!
        # Five groups: protocol, base host, user, repo, optional path. The
        # '#' remarks inside the literal are regex comments (re.X), not Python.
        self.rex = re.compile("""  # The format of URI-Rs
                              (https://)  # protocol
                              ([^/]+)/  # base
                              ([^/]+)/  # user
                              ([^/]+)  # repo
                              (/.*)?  # optional path
                              """, re.X)  # verbose mode: whitespace and newlines in the pattern are ignored
        self.header_rex = re.compile('<(.+?)>; rel="next"')  # Captures the URI inside <...> that is followed by rel="next"
        self.file_rex = re.compile('(/blob)?/master')  # Matches '/master', optionally preceded by '/blob'

Ejemplo n.º 13

0

Mostrar archivo

Archivo: pastpages.py Proyecto: jirikuncar/timegate

    def __init__(self):
        """Run the base initializer and fetch the full site list from the API.

        Sets the API constants, then pages through ``/api/beta/sites/`` and
        stores one ``(url, slug)`` tuple per returned object in
        ``self.pages_list``.

        Raises:
            Exception: whatever the request or response parsing raised; it is
                logged at CRITICAL level and re-raised unchanged.
        """
        Handler.__init__(self)
        self.LIMIT_MAX = 100
        self.BASE = 'http://www.pastpages.org'
        self.API_TIMEFMT = '%Y-%m-%dT%H:%M:%S'
        # Plain `4`, not `04`: a leading-zero literal is an octal in Python 2
        # and a SyntaxError in Python 3.
        self.FIRST_DATE = datetime(2012, 4, 27).strftime(self.API_TIMEFMT)

        # Building pages list of ('uri', 'slug') pairs
        self.pages_list = []

        try:
            params = {
                'limit': self.LIMIT_MAX
            }
            request = '/api/beta/sites/'
            has_next = True

            # Keep while there are still result pages
            while has_next:
                json_response = self.request(self.BASE + request, params=params).json()

                self.pages_list.extend([
                    # 'objects' is the list of responses
                    # 'objects.url' and 'objects.slug' are the URI and the website's short name respectively
                    (obj['url'], obj['slug'])
                    for obj in json_response['objects']
                ])

                request = json_response['meta']['next']
                params = None  # the request already contains &limit and &offset
                # Each response has a non null 'meta.next' value if it has a continuation
                has_next = request is not None

        except Exception:
            logging.critical("Cannot create the handler's page list:")
            # Bare raise keeps the original traceback (`raise e` drops it on Python 2).
            raise

        # Lazy %-args: the message is only formatted if INFO is enabled.
        logging.info("Found %s websites on pastpages' API.", len(self.pages_list))

Ejemplo n.º 14

0

Mostrar archivo

Archivo: po.py Proyecto: jirikuncar/timegate

 def __init__(self):
     """Run the base initializer, then set the XML query endpoint URI."""
     Handler.__init__(self)
     self.baseuri = "http://arquivo.pt/wayback/wayback/xmlquery"

Ejemplo n.º 15

0

Mostrar archivo

Archivo: es.py Proyecto: jirikuncar/timegate

 def __init__(self):
     """Run the base initializer, then compile the anchor-matching regex."""
     Handler.__init__(self)
     # Two capture groups: the SetAnchorDate('...') argument and the href value.
     self.uriRegex = re.compile(
         r'<a onclick="SetAnchorDate\(\'(.*)\'\);" href="(.*)">'
     )

Ejemplo n.º 16

0

Mostrar archivo

Archivo: webcite.py Proyecto: jirikuncar/timegate

 def __init__(self):
     """Run the base initializer, then install a cookie-aware urllib2 opener.

     ``urllib2.install_opener`` replaces the module-wide default opener, so
     this affects every later ``urllib2.urlopen`` call in the process.
     """
     Handler.__init__(self)
     cookie_jar = cookielib.LWPCookieJar()
     cookie_processor = urllib2.HTTPCookieProcessor(cookie_jar)
     urllib2.install_opener(urllib2.build_opener(cookie_processor))

Ejemplo n.º 17

0

Mostrar archivo

Archivo: si.py Proyecto: jirikuncar/timegate

 def __init__(self):
     """Set the archive base URI and anchor regex, then run the base initializer."""
     self.baseuri = "http://nukrobi2.nuk.uni-lj.si:8080/wayback/*/"

     # Matches an <a> tag whose onclick calls SetAnchorDate('...') and whose
     # href starts with this archive's wayback prefix.
     self.uriRegex = re.compile(
         r'<a onclick="SetAnchorDate\(\'.*\'\);" href="http://nukrobi2.nuk.uni-lj.si:8080/wayback/[\S]*">'
     )

     # The base initializer deliberately runs last, after the attributes exist.
     Handler.__init__(self)

Ejemplo n.º 18

0

Mostrar archivo

Archivo: cr.py Proyecto: jirikuncar/timegate

 def __init__(self):
     """Run the base Handler initializer; this handler adds no state of its own."""
     Handler.__init__(self)

Ejemplo n.º 19

0

Mostrar archivo

Archivo: can.py Proyecto: jirikuncar/timegate

 def __init__(self):
     """Run the base initializer, then set the base URI and the datetime regex."""
     Handler.__init__(self)
     self.baseuri = "http://www.collectionscanada.gc.ca/webarchives/*/"
     # Raw string: "\d" in a plain literal is an invalid escape sequence
     # (DeprecationWarning since Python 3.6, SyntaxWarning in 3.12). The
     # compiled pattern is unchanged; group 1 captures the all-digit path
     # segment after /webarchives/.
     self.dtre = re.compile(r"http://www.collectionscanada.gc.ca/webarchives/(\d+)/")

Ejemplo n.º 20

0

Mostrar archivo

Archivo: mediawiki.py Proyecto: jirikuncar/timegate

 def __init__(self):
     """Run the base initializer, then set the timestamp format."""
     Handler.__init__(self)
     # strftime-style pattern: 14 digits, YYYYMMDDHHMMSS.
     self.TIMESTAMPFMT = '%Y%m%d%H%M%S'