Example #1
0
class Crawler(object):
    def __init__(self):
        """Create an item representing the 'PeeringDB organization ID' class if 
        doesn't already exist. And fetch QIDs for organizations already in the
        wikibase."""

        sys.stderr.write('Initialization...\n')

        # Helper for wiki access
        self.wh = Wikihandy()

        # Get the QID for the item representing the organization IDs
        orgid_qid = self.wh.get_qid(
            ORGID_LABEL,
            create={  # Create it if it doesn't exist
                'summary':
                'add PeeringDB org IDs',  # Commit message
                'description':
                'Identifier for an organization in the PeeringDB database'
            })

        # Load the QIDs for organizations already available in the wikibase
        self.orgid2qid = self.wh.extid2qid(qid=orgid_qid)

        # Added properties will have this reference information
        today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'),
                           self.wh.get_qid('PeeringDB')),
                          (self.wh.get_pid('reference URL'), URL_PDB_ORGS),
                          (self.wh.get_pid('point in time'), today)]

    def run(self):
        """Fetch organizations information from PeeringDB and push to wikibase"""

        sys.stderr.write('Fetching PeeringDB data...\n')
        req = requests.get(URL_PDB_ORGS)
        if req.status_code != 200:
            sys.exit('Error while fetching AS names')
        organizations = json.loads(req.text)['data']

        self.wh.login()  # Login once for all threads

        for i, res in enumerate(map(self.update_org, organizations)):
            sys.stderr.write(f'\rProcessing... {i+1}/{len(organizations)}')

    def update_org(self, organization):
        """Add the organization to wikibase if it's not there and update properties"""

        # set property name
        statements = [[
            self.wh.get_pid('instance of'),
            self.wh.get_qid('organization')
        ],
                      [
                          self.wh.get_pid('name'),
                          organization['name'].strip(), self.reference
                      ]]

        # set property website
        if organization['website']:
            statements.append([
                self.wh.get_pid('website'), organization['website'],
                self.reference
            ])

        # set property country
        if organization['country'] in iso3166.countries_by_alpha2:
            country_qid = self.wh.get_qid(
                iso3166.countries_by_alpha2[organization['country']].name)
            if country_qid is not None:
                statements.append(
                    [self.wh.get_pid('country'), country_qid, self.reference])

        # Update name, website, and country for this organization
        org_qid = self.org_qid(organization)
        self.wh.upsert_statements('update peeringDB organization', org_qid,
                                  statements)

        return org_qid

    def org_qid(self, organization):
        """Find the organization QID or add it to wikibase if it is not yet there.
        Return the organization QID."""

        # Check if the organization is in the wikibase
        if str(organization['id']) not in self.orgid2qid:
            # Set properties for this new organization
            org_qualifier = [
                (self.wh.get_pid('instance of'), self.wh.get_qid(ORGID_LABEL)),
            ]
            statements = [[
                self.wh.get_pid('external ID'),
                str(organization['id']), [], org_qualifier
            ]]

            # Add this organization to the wikibase
            org_qid = self.wh.add_item('add new peeringDB organization',
                                       label=organization['name'],
                                       statements=statements)
            # keep track of this QID
            self.orgid2qid[str(organization['id'])] = org_qid

        return self.orgid2qid[str(organization['id'])]
class Crawler(object):
    def __init__(self):
        """Create an item representing the PeeringDB exchange point ID class if 
        doesn't already exist. And fetch QIDs for exchange points already in the
        wikibase."""

        # Helper for wiki access
        self.wh = Wikihandy()

        # Get the QID of the item representing PeeringDB IX IDs
        ixid_qid = self.wh.get_qid(
            IXID_LABEL,
            create={  # Create it if it doesn't exist
                'summary':
                'add PeeringDB ix IDs',  # Commit message
                'description':
                'Identifier for an exchange point in the PeeringDB database'  # Description
            })

        # Load the QIDs for ix already available in the wikibase
        self.ixid2qid = self.wh.extid2qid(qid=ixid_qid)
        # Load the QIDs for peeringDB organizations
        self.orgid2qid = self.wh.extid2qid(label=ORGID_LABEL)

        # Added properties will have this reference information
        self.today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'),
                           self.wh.get_qid('PeeringDB')),
                          (self.wh.get_pid('reference URL'), URL_PDB_IXS),
                          (self.wh.get_pid('point in time'), self.today)]

    def run(self):
        """Fetch ixs information from PeeringDB and push to wikibase. 
        Using multiple threads for better performances."""

        req = requests.get(URL_PDB_IXS)
        if req.status_code != 200:
            sys.exit('Error while fetching IXs data')
        ixs = json.loads(req.text)['data']

        self.wh.login()  # Login once for all threads

        for i, ix in enumerate(ixs):

            # Get more info for this IX
            req = requests.get(f'{URL_PDB_IXS}/{ix["id"]}')
            if req.status_code != 200:
                sys.exit('Error while fetching IXs data')
            ix_info = json.loads(req.text)['data'][0]

            # Update info in wiki
            self.update_ix(ix_info)

            sys.stderr.write(f'\rProcessing... {i+1}/{len(ixs)}')

    def update_ix(self, ix):
        """Add the ix to wikibase if it's not already there and update its
        properties."""

        # set property name
        statements = [[
            self.wh.get_pid('instance of'),
            self.wh.get_qid('Internet exchange point')
        ], [self.wh.get_pid('name'), ix['name'].strip(), self.reference]]

        # link to corresponding organization
        org_qid = self.orgid2qid.get(str(ix['org_id']))
        if org_qid is not None:
            statements.append(
                [self.wh.get_pid('managed by'), org_qid, self.reference])
        else:
            print('Error this organization is not in wikibase: ', ix['org_id'])

        # set property country
        if ix['country']:
            country_qid = self.wh.country2qid(ix['country'])
            if country_qid is not None:
                statements.append(
                    [self.wh.get_pid('country'), country_qid, self.reference])

        # set property website
        if ix['website']:
            statements.append(
                [self.wh.get_pid('website'), ix['website'], self.reference])

        # set traffic webpage
        if ix['url_stats']:
            statements.append([
                self.wh.get_pid('website'),
                ix['url_stats'],  # statement
                self.reference,  # reference 
                [
                    (self.wh.get_pid('instance of'),
                     self.wh.get_qid('traffic statistics')),
                ]  # qualifier
            ])

        ix_qid = self.ix_qid(ix)
        # Update name, website, and organization for this IX
        self.wh.upsert_statements('update peeringDB ixs', ix_qid, statements)

        # update LAN corresponding to this IX
        if 'ixlan_set' in ix:
            for ixlan in ix['ixlan_set']:
                pfx_url = f'{URL_PDB_LAN}/{ixlan["id"]}'
                pfx_ref = [(self.wh.get_pid('source'),
                            self.wh.get_qid('PeeringDB')),
                           (self.wh.get_pid('reference URL'), pfx_url),
                           (self.wh.get_pid('point in time'), self.today)]

                req = requests.get(pfx_url)
                if req.status_code != 200:
                    sys.exit('Error while fetching IXs data')
                lans = json.loads(req.text)['data']

                for lan in lans:
                    for prefix in lan['ixpfx_set']:
                        pfx_qid = self.wh.prefix2qid(prefix['prefix'],
                                                     create=True)

                        pfx_stmts = [[
                            self.wh.get_pid('instance of'),
                            self.wh.get_qid('peering LAN'), pfx_ref
                        ], [self.wh.get_pid('managed by'), ix_qid, pfx_ref]]

                        self.wh.upsert_statements('update peeringDB ixlan',
                                                  pfx_qid, pfx_stmts)

        return ix_qid

    def ix_qid(self, ix):
        """Find the ix QID for the given ix.
        If this ix is not yet registered in the wikibase then add it.

        Return the ix QID."""

        # Check if the IX is in the wikibase
        if str(ix['id']) not in self.ixid2qid:
            # Set properties for this new ix
            ix_qualifiers = [
                (self.wh.get_pid('instance of'), self.wh.get_qid(IXID_LABEL)),
            ]
            statements = [(self.wh.get_pid('instance of'),
                           self.wh.get_qid('Internet exchange point')),
                          (self.wh.get_pid('external ID'), str(ix['id']), [],
                           ix_qualifiers)]

            # Add this ix to the wikibase
            ix_qid = self.wh.add_item('add new peeringDB IX',
                                      label=ix['name'],
                                      description=ix['name_long'],
                                      statements=statements)
            # keep track of this QID
            self.ixid2qid[str(ix['id'])] = ix_qid

        return self.ixid2qid[str(ix['id'])]
class Crawler(object):
    def __init__(self):
        """Create an item representing the PeeringDB network ID class if 
        doesn't already exist. And fetch QIDs for networks already in the
        wikibase."""

        # Helper for wiki access
        self.wh = Wikihandy()

        # Get the QID of the item representing PeeringDB network IDs
        netid_qid = self.wh.get_qid(
            NETID_LABEL,
            create={  # Create it if it doesn't exist
                'summary': 'add PeeringDB net IDs',  # Commit message
                'description':
                'Identifier for a network in the PeeringDB database'  # Description
            })

        # Load the QIDs for networks already available in the wikibase
        self.netid2qid = self.wh.extid2qid(qid=netid_qid)
        # Load the QIDs for peeringDB organizations
        self.orgid2qid = self.wh.extid2qid(label=ORGID_LABEL)
        # Load the QIDs for peeringDB IXs
        self.ixid2qid = self.wh.extid2qid(label=IXID_LABEL)

        # Added properties will have this reference information
        today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'),
                           self.wh.get_qid('PeeringDB')),
                          (self.wh.get_pid('reference URL'), URL_PDB_NETS),
                          (self.wh.get_pid('point in time'), today)]

        # Session object to fetch peeringdb data
        retries = Retry(total=5,
                        backoff_factor=0.1,
                        status_forcelist=[104, 500, 502, 503, 504])

        self.http_session = requests.Session()
        self.http_session.mount('https://', HTTPAdapter(max_retries=retries))

    def run(self):
        """Fetch networks information from PeeringDB and push to wikibase. 
        Using multiple threads for better performances."""

        req = self.http_session.get(URL_PDB_NETS)
        if req.status_code != 200:
            sys.exit('Error while fetching data from API')
        networks = json.loads(req.text)['data']

        self.wh.login()  # Login once for all threads

        for i, _ in enumerate(map(self.update_net, networks)):
            sys.stderr.write(f'\rProcessing... {i+1}/{len(networks)}')

    def update_net(self, network):
        """Add the network to wikibase if it's not already there and update its
        properties."""

        # set property name
        statements = [[
            self.wh.get_pid('name'), network['name'].strip(), self.reference
        ]]

        # link to corresponding organization
        org_qid = self.orgid2qid.get(str(network['org_id']))
        if org_qid is not None:
            statements.append(
                [self.wh.get_pid('managed by'), org_qid, self.reference])
        else:
            print('Error this organization is not in wikibase: ',
                  network['org_id'])

        # set property website
        if network['website']:
            statements.append([
                self.wh.get_pid('website'), network['website'], self.reference
            ])

        # Update IX membership
        # Fetch membership for this network
        netixlan_url = URL_PDB_NETS + f'/{network["id"]}'

        req = self.http_session.get(netixlan_url)
        if req.status_code != 200:
            sys.exit(f'Error while fetching network data (id={network["id"]})')

        net_details = json.loads(req.text)['data']
        if len(net_details) > 1:
            print(net_details)

        net_details = net_details[0]

        # Push membership to wikidata
        today = self.wh.today()
        netixlan_ref = [(self.wh.get_pid('source'),
                         self.wh.get_qid('PeeringDB')),
                        (self.wh.get_pid('reference URL'), netixlan_url),
                        (self.wh.get_pid('point in time'), today)]

        for ixlan in net_details['netixlan_set']:
            ix_qid = self.ixid2qid.get(str(ixlan['ix_id']))
            if ix_qid is None:
                print(f'Unknown IX: ix_id={ixlan["ix_id"]}')
                continue
            statements.append(
                [self.wh.get_pid('member of'), ix_qid, netixlan_ref])

        # Update name, website, and organization for this network
        net_qid = self.net_qid(network)
        self.wh.upsert_statements('update peeringDB networks', net_qid,
                                  statements)

        return net_qid

    def net_qid(self, network):
        """Find the network QID for the given network.
        If this network is not yet registered in the wikibase then find (or 
        create) the item corresponding to the network ASN and register 
        the peeringDB network ID with this item.

        Return the network QID."""

        # Check if the network is in the wikibase
        if str(network['id']) not in self.netid2qid:
            # Find or create the corresponding ASN item
            net_qid = self.wh.asn2qid(network['asn'], create=True)
            # Set properties for this new network
            net_qualifiers = [
                (self.wh.get_pid('instance of'), self.wh.get_qid(NETID_LABEL)),
            ]
            statements = [[
                self.wh.get_pid('external ID'),
                str(network['id']), [], net_qualifiers
            ]]

            # Add this network to the wikibase
            self.wh.upsert_statements('add new peeringDB network',
                                      net_qid,
                                      statements=statements)
            # keep track of this QID
            self.netid2qid[str(network['id'])] = net_qid

        return self.netid2qid[str(network['id'])]
class Crawler(object):
    def __init__(self):
        """
        """

        # Helper for wiki access
        self.wh = Wikihandy(preload=True)

        # Get the QID for RIPE Atlas
        self.atlas_qid = self.wh.get_qid(
            'RIPE Atlas',
            create={  # Create it if it doesn't exist
                'summary':
                'add RIPE Atlas',  # Commit message
                'description':
                'RIPE Atlas is a global, open, distributed Internet measurement platform, consisting of thousands of measurement devices that measure Internet connectivity in real time.',  # Item description
                'aliases':
                'Atlas|atlas',
                'statements':
                [[self.wh.get_pid('managed by'),
                  self.wh.get_qid('RIPE NCC')]]
            })

        # Get the QID for Atlas Probe
        self.atlas_probe_qid = self.wh.get_qid(
            'Atlas probe',
            create={  # Create it if it doesn't exist
                'summary': 'add RIPE Atlas',  # Commit message
                'description':
                'RIPE Atlas probes form the backbone of the RIPE Atlas infrastructure.',  # Item description
                'aliases': 'RIPE Atlas probe|atlas probe|RIPE atlas probe',
                'statements': [[self.wh.get_pid('part of'), self.atlas_qid]]
            })

        # Get the QID for Atlas Anchor
        self.atlas_anchor_qid = self.wh.get_qid(
            'Atlas anchor',
            create={  # Create it if it doesn't exist
                'summary': 'add RIPE Atlas',  # Commit message
                'description':
                'RIPE Atlas Anchors are located at hosts that can provide sufficient bandwidth to support a large number of incoming and outgoing measurements.',  # Item description
                'aliases': 'RIPE Atlas anchor|atlas anchor|RIPE atlas anchor',
                'statements': [[self.wh.get_pid('part of'), self.atlas_qid]]
            })

        # Get the QID of the item representing PeeringDB IX IDs
        self.probeid_qid = self.wh.get_qid(
            PROBEID_LABEL,
            create={  # Create it if it doesn't exist
                'summary':
                'add RIPE Atlas probes',  # Commit message
                'description':
                'Identifier for a probe in the RIPE Atlas measurement platform'  # Description
            })

        # Load the QIDs for probes already available in the wikibase
        self.probeid2qid = self.wh.extid2qid(qid=self.probeid_qid)

        # Added properties will have this additional information
        today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'),
                           self.wh.get_qid('RIPE NCC')),
                          (self.wh.get_pid('reference URL'), URL),
                          (self.wh.get_pid('point in time'), today)]

        self.v4_qualifiers = [(self.wh.get_pid('IP version'),
                               self.wh.get_qid('IPv4'))]

        self.v6_qualifiers = [(self.wh.get_pid('IP version'),
                               self.wh.get_qid('IPv6'))]

    def run(self):
        """Fetch probe information from Atlas API and push to wikibase. """

        next_page = URL

        while next_page is not None:
            req = requests.get(next_page)
            if req.status_code != 200:
                sys.exit('Error while fetching the blocklist')

            info = json.loads(req.text)
            next_page = info['next']

            for i, probe in enumerate(info['results']):

                self.update_probe(probe)
                sys.stderr.write(f'\rProcessed {i+1} probes')
            sys.stderr.write(f'\n')

    def update_probe(self, probe):
        """Add the probe to wikibase if it's not already there and update its
        properties."""

        # TODO add status, geometry (geo-location) and IPs?

        # Properties for this probe
        statements = []

        if probe['is_anchor']:
            statements.append(
                [self.wh.get_pid('instance of'), self.atlas_probe_qid])
            statements.append(
                [self.wh.get_pid('instance of'), self.atlas_anchor_qid])
        if probe['asn_v4']:
            as_qid = self.wh.asn2qid(probe['asn_v4'])
            if as_qid:
                statements.append([
                    self.wh.get_pid('part of'), as_qid, self.reference,
                    self.v4_qualifiers
                ])
        if probe['asn_v6']:
            as_qid = self.wh.asn2qid(probe['asn_v6'])
            if as_qid:
                statements.append([
                    self.wh.get_pid('part of'), as_qid, self.reference,
                    self.v6_qualifiers
                ])
        if probe['prefix_v4']:
            prefix_qid = self.wh.prefix2qid(probe['prefix_v4'])
            if prefix_qid:
                statements.append(
                    [self.wh.get_pid('part of'), prefix_qid, self.reference])
        if probe['prefix_v6']:
            prefix_qid = self.wh.prefix2qid(probe['prefix_v6'])
            if prefix_qid:
                statements.append(
                    [self.wh.get_pid('part of'), prefix_qid, self.reference])
        if probe['country_code']:
            statements.append([
                self.wh.get_pid('country'),
                self.wh.country2qid(probe['country_code']), self.reference
            ])
        if probe['first_connected']:
            statements.append([
                self.wh.get_pid('start time'),
                self.wh.to_wbtime(probe['first_connected']), self.reference
            ])

        if 'name' in probe['status']:
            # Get the QIDs for probes status
            status_qid = self.wh.get_qid(
                f'RIPE Atlas probe status: {probe["status"]["name"]}',
                create={  # Create it if it doesn't exist
                    'summary': 'add RIPE Atlas probe status',  # Commit message
                })

            if probe['status_since']:
                statements.append([
                    self.wh.get_pid('status'), status_qid, self.reference,
                    [(self.wh.get_pid('start time'),
                      self.wh.to_wbtime(probe['status_since']))]
                ])

            # set end time if the probe is abandonned
            if probe['status']['name'] == 'Abandoned' and probe['status_since']:
                statements.append([
                    self.wh.get_pid('end time'),
                    self.wh.to_wbtime(probe['status_since'])
                ])

        # Add probe tags
        for tag in probe['tags']:
            statements.append([
                self.wh.get_pid('tag'),
                self.wh.get_qid(tag['name'],
                                create={
                                    'summary': 'Add RIPE Atlas tag',
                                })
            ])

        # Commit to wikibase
        # Get the probe QID (create if probe is not yet registered) and commit changes
        probe_qid = self.probe_qid(probe)
        self.wh.upsert_statements('update from RIPE Atlas probes', probe_qid,
                                  statements)

    def probe_qid(self, probe):
        """Find the ix QID for the given probe ID.
        If this probe is not yet registered in the wikibase then add it.

        Return the probe QID."""

        id = str(probe['id'])

        # Check if the IX is in the wikibase
        if id not in self.probeid2qid:
            # Set properties for this new probe
            probeid_qualifiers = [
                (self.wh.get_pid('instance of'), self.probeid_qid),
            ]
            statements = [
                (self.wh.get_pid('instance of'), self.atlas_probe_qid),
                (self.wh.get_pid('external ID'), id, [], probeid_qualifiers)
            ]

            # Add this probe to the wikibase
            probe_qid = self.wh.add_item('add new RIPE Atlas probe',
                                         label=f'RIPE Atlas probe #{id}',
                                         description=probe['description'],
                                         statements=statements)
            # keep track of this QID
            self.probeid2qid[id] = probe_qid

        return self.probeid2qid[id]