Example #1
0
class Crawler(object):
    def __init__(self):

        # Helper for wiki access
        self.wh = Wikihandy()

        # Reference information for data pushed to the wikibase
        self.reference = [
            (self.wh.get_pid('source'), self.wh.get_qid('RIPE NCC')),
            (self.wh.get_pid('reference URL'), URL_RIPE_AS_NAME),
            (self.wh.get_pid('point in time'), self.wh.today())
            ]

    def run(self):
        """Fetch the AS name file from RIPE website and process lines one by one"""

        req = requests.get(URL_RIPE_AS_NAME)
        if req.status_code != 200:
            sys.exit('Error while fetching AS names')

        self.wh.login() # Login once for all threads, not needed with OAuth
        
        for i, res in enumerate(map(self.update_asn, req.text.splitlines())):
            sys.stderr.write(f'\rProcessed {i} ASes')
            

    def update_asn(self, one_line):
        # Parse given line to get ASN, name, and country code 
        asn, _, name_cc = one_line.partition(' ')
        name, _, cc = name_cc.rpartition(', ')

        asn_qid = self.wh.asn2qid(asn, create=True)
        cc_qid = self.wh.country2qid(cc, create=True)

        statements = []
        statements.append( [self.wh.get_pid('country'), cc_qid, self.reference] )  # Set country
        if cc_qid is not None:
            statements.append( [self.wh.get_pid('name'), name, self.reference] )       # Set AS name

        try:
            # Update AS name and country
            self.wh.upsert_statements('updates from RIPE AS names', asn_qid, statements)

        except Exception as error:
            # print errors and continue running
            print('Error for: ', one_line)
            print(error)

        return asn_qid
Example #2
0
class Crawler(object):
    def __init__(self):
        """Create an item representing the 'PeeringDB organization ID' class if 
        doesn't already exist. And fetch QIDs for organizations already in the
        wikibase."""

        sys.stderr.write('Initialization...\n')

        # Helper for wiki access
        self.wh = Wikihandy()

        # Get the QID for the item representing the organization IDs
        orgid_qid = self.wh.get_qid(
            ORGID_LABEL,
            create={  # Create it if it doesn't exist
                'summary':
                'add PeeringDB org IDs',  # Commit message
                'description':
                'Identifier for an organization in the PeeringDB database'
            })

        # Load the QIDs for organizations already available in the wikibase
        self.orgid2qid = self.wh.extid2qid(qid=orgid_qid)

        # Added properties will have this reference information
        today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'),
                           self.wh.get_qid('PeeringDB')),
                          (self.wh.get_pid('reference URL'), URL_PDB_ORGS),
                          (self.wh.get_pid('point in time'), today)]

    def run(self):
        """Fetch organizations information from PeeringDB and push to wikibase"""

        sys.stderr.write('Fetching PeeringDB data...\n')
        req = requests.get(URL_PDB_ORGS)
        if req.status_code != 200:
            sys.exit('Error while fetching AS names')
        organizations = json.loads(req.text)['data']

        self.wh.login()  # Login once for all threads

        for i, res in enumerate(map(self.update_org, organizations)):
            sys.stderr.write(f'\rProcessing... {i+1}/{len(organizations)}')

    def update_org(self, organization):
        """Add the organization to wikibase if it's not there and update properties"""

        # set property name
        statements = [[
            self.wh.get_pid('instance of'),
            self.wh.get_qid('organization')
        ],
                      [
                          self.wh.get_pid('name'),
                          organization['name'].strip(), self.reference
                      ]]

        # set property website
        if organization['website']:
            statements.append([
                self.wh.get_pid('website'), organization['website'],
                self.reference
            ])

        # set property country
        if organization['country'] in iso3166.countries_by_alpha2:
            country_qid = self.wh.get_qid(
                iso3166.countries_by_alpha2[organization['country']].name)
            if country_qid is not None:
                statements.append(
                    [self.wh.get_pid('country'), country_qid, self.reference])

        # Update name, website, and country for this organization
        org_qid = self.org_qid(organization)
        self.wh.upsert_statements('update peeringDB organization', org_qid,
                                  statements)

        return org_qid

    def org_qid(self, organization):
        """Find the organization QID or add it to wikibase if it is not yet there.
        Return the organization QID."""

        # Check if the organization is in the wikibase
        if str(organization['id']) not in self.orgid2qid:
            # Set properties for this new organization
            org_qualifier = [
                (self.wh.get_pid('instance of'), self.wh.get_qid(ORGID_LABEL)),
            ]
            statements = [[
                self.wh.get_pid('external ID'),
                str(organization['id']), [], org_qualifier
            ]]

            # Add this organization to the wikibase
            org_qid = self.wh.add_item('add new peeringDB organization',
                                       label=organization['name'],
                                       statements=statements)
            # keep track of this QID
            self.orgid2qid[str(organization['id'])] = org_qid

        return self.orgid2qid[str(organization['id'])]
class Crawler(object):
    def __init__(self):
        """Create an item representing the PeeringDB network ID class if 
        doesn't already exist. And fetch QIDs for networks already in the
        wikibase."""

        # Helper for wiki access
        self.wh = Wikihandy()

        # Get the QID of the item representing PeeringDB network IDs
        netid_qid = self.wh.get_qid(
            NETID_LABEL,
            create={  # Create it if it doesn't exist
                'summary': 'add PeeringDB net IDs',  # Commit message
                'description':
                'Identifier for a network in the PeeringDB database'  # Description
            })

        # Load the QIDs for networks already available in the wikibase
        self.netid2qid = self.wh.extid2qid(qid=netid_qid)
        # Load the QIDs for peeringDB organizations
        self.orgid2qid = self.wh.extid2qid(label=ORGID_LABEL)
        # Load the QIDs for peeringDB IXs
        self.ixid2qid = self.wh.extid2qid(label=IXID_LABEL)

        # Added properties will have this reference information
        today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'),
                           self.wh.get_qid('PeeringDB')),
                          (self.wh.get_pid('reference URL'), URL_PDB_NETS),
                          (self.wh.get_pid('point in time'), today)]

        # Session object to fetch peeringdb data
        retries = Retry(total=5,
                        backoff_factor=0.1,
                        status_forcelist=[104, 500, 502, 503, 504])

        self.http_session = requests.Session()
        self.http_session.mount('https://', HTTPAdapter(max_retries=retries))

    def run(self):
        """Fetch networks information from PeeringDB and push to wikibase. 
        Using multiple threads for better performances."""

        req = self.http_session.get(URL_PDB_NETS)
        if req.status_code != 200:
            sys.exit('Error while fetching data from API')
        networks = json.loads(req.text)['data']

        self.wh.login()  # Login once for all threads

        for i, _ in enumerate(map(self.update_net, networks)):
            sys.stderr.write(f'\rProcessing... {i+1}/{len(networks)}')

    def update_net(self, network):
        """Add the network to wikibase if it's not already there and update its
        properties."""

        # set property name
        statements = [[
            self.wh.get_pid('name'), network['name'].strip(), self.reference
        ]]

        # link to corresponding organization
        org_qid = self.orgid2qid.get(str(network['org_id']))
        if org_qid is not None:
            statements.append(
                [self.wh.get_pid('managed by'), org_qid, self.reference])
        else:
            print('Error this organization is not in wikibase: ',
                  network['org_id'])

        # set property website
        if network['website']:
            statements.append([
                self.wh.get_pid('website'), network['website'], self.reference
            ])

        # Update IX membership
        # Fetch membership for this network
        netixlan_url = URL_PDB_NETS + f'/{network["id"]}'

        req = self.http_session.get(netixlan_url)
        if req.status_code != 200:
            sys.exit(f'Error while fetching network data (id={network["id"]})')

        net_details = json.loads(req.text)['data']
        if len(net_details) > 1:
            print(net_details)

        net_details = net_details[0]

        # Push membership to wikidata
        today = self.wh.today()
        netixlan_ref = [(self.wh.get_pid('source'),
                         self.wh.get_qid('PeeringDB')),
                        (self.wh.get_pid('reference URL'), netixlan_url),
                        (self.wh.get_pid('point in time'), today)]

        for ixlan in net_details['netixlan_set']:
            ix_qid = self.ixid2qid.get(str(ixlan['ix_id']))
            if ix_qid is None:
                print(f'Unknown IX: ix_id={ixlan["ix_id"]}')
                continue
            statements.append(
                [self.wh.get_pid('member of'), ix_qid, netixlan_ref])

        # Update name, website, and organization for this network
        net_qid = self.net_qid(network)
        self.wh.upsert_statements('update peeringDB networks', net_qid,
                                  statements)

        return net_qid

    def net_qid(self, network):
        """Find the network QID for the given network.
        If this network is not yet registered in the wikibase then find (or 
        create) the item corresponding to the network ASN and register 
        the peeringDB network ID with this item.

        Return the network QID."""

        # Check if the network is in the wikibase
        if str(network['id']) not in self.netid2qid:
            # Find or create the corresponding ASN item
            net_qid = self.wh.asn2qid(network['asn'], create=True)
            # Set properties for this new network
            net_qualifiers = [
                (self.wh.get_pid('instance of'), self.wh.get_qid(NETID_LABEL)),
            ]
            statements = [[
                self.wh.get_pid('external ID'),
                str(network['id']), [], net_qualifiers
            ]]

            # Add this network to the wikibase
            self.wh.upsert_statements('add new peeringDB network',
                                      net_qid,
                                      statements=statements)
            # keep track of this QID
            self.netid2qid[str(network['id'])] = net_qid

        return self.netid2qid[str(network['id'])]
Example #4
0
class Crawler(object):
    def __init__(self):
        """Initialize wikihandy and qualifiers for pushed data"""

        # Helper for wiki access
        self.wh = Wikihandy()

        # Added properties will have this additional information
        today = self.wh.today()
        self.caida_qid = self.wh.get_qid('CAIDA')

        # Get the QID for ASRank project
        self.asrank_qid = self.wh.get_qid(
            'CAIDA ASRank',
            create={  # Create it if it doesn't exist
                'summary': 'add CAIDA ASRank',  # Commit message
                'description':
                "CAIDA's AS ranking derived from topological data collected by CAIDA's Archipelago Measurement Infrastructure and BGP routing data collected by the Route Views Project and RIPE NCC.",  # Item description
                'statements': [[self.wh.get_pid('managed by'), self.caida_qid]]
            })

        self.reference = [(self.wh.get_pid('source'), self.caida_qid),
                          (self.wh.get_pid('reference URL'), URL_API),
                          (self.wh.get_pid('point in time'), today)]

    def run(self):
        """Fetch networks information from ASRank and push to wikibase. """

        self.wh.login()  # Login once for all threads
        pool = ThreadPoolExecutor()
        has_next = True
        i = 0
        while has_next:
            req = requests.get(URL_API + f'?offset={i}')
            if req.status_code != 200:
                sys.exit('Error while fetching data from API')

            ranking = json.loads(req.text)['data']['asns']
            has_next = ranking['pageInfo']['hasNextPage']

            for res in pool.map(self.update_net, ranking['edges']):
                sys.stderr.write(
                    f'\rProcessing... {i+1}/{ranking["totalCount"]}')
                i += 1

        pool.shutdown()

    def update_net(self, asn):
        """Add the network to wikibase if it's not already there and update its
        properties."""

        asn = asn['node']

        # Properties
        statements = []

        if asn['asnName']:
            statements.append(
                [self.wh.get_pid('name'), asn['asnName'], self.reference])

        # set countries
        cc = asn['country']['iso']
        if cc:
            statements.append([
                self.wh.get_pid('country'),
                self.wh.country2qid(cc), self.reference
            ])

        # set rank
        statements.append([
            self.wh.get_pid('ranking'), {
                'amount': asn['rank'],
                'unit': self.asrank_qid,
            }, self.reference
        ])

        # Commit to wikibase
        # Get the AS QID (create if AS is not yet registered) and commit changes
        net_qid = self.wh.asn2qid(asn['asn'], create=True)
        self.wh.upsert_statements('update from CAIDA ASRank', net_qid,
                                  statements)
class Crawler(object):
    def __init__(self):
        """Create an item representing the PeeringDB exchange point ID class if 
        doesn't already exist. And fetch QIDs for exchange points already in the
        wikibase."""

        # Helper for wiki access
        self.wh = Wikihandy()

        # Get the QID of the item representing PeeringDB IX IDs
        ixid_qid = self.wh.get_qid(
            IXID_LABEL,
            create={  # Create it if it doesn't exist
                'summary':
                'add PeeringDB ix IDs',  # Commit message
                'description':
                'Identifier for an exchange point in the PeeringDB database'  # Description
            })

        # Load the QIDs for ix already available in the wikibase
        self.ixid2qid = self.wh.extid2qid(qid=ixid_qid)
        # Load the QIDs for peeringDB organizations
        self.orgid2qid = self.wh.extid2qid(label=ORGID_LABEL)

        # Added properties will have this reference information
        self.today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'),
                           self.wh.get_qid('PeeringDB')),
                          (self.wh.get_pid('reference URL'), URL_PDB_IXS),
                          (self.wh.get_pid('point in time'), self.today)]

    def run(self):
        """Fetch ixs information from PeeringDB and push to wikibase. 
        Using multiple threads for better performances."""

        req = requests.get(URL_PDB_IXS)
        if req.status_code != 200:
            sys.exit('Error while fetching IXs data')
        ixs = json.loads(req.text)['data']

        self.wh.login()  # Login once for all threads

        for i, ix in enumerate(ixs):

            # Get more info for this IX
            req = requests.get(f'{URL_PDB_IXS}/{ix["id"]}')
            if req.status_code != 200:
                sys.exit('Error while fetching IXs data')
            ix_info = json.loads(req.text)['data'][0]

            # Update info in wiki
            self.update_ix(ix_info)

            sys.stderr.write(f'\rProcessing... {i+1}/{len(ixs)}')

    def update_ix(self, ix):
        """Add the ix to wikibase if it's not already there and update its
        properties."""

        # set property name
        statements = [[
            self.wh.get_pid('instance of'),
            self.wh.get_qid('Internet exchange point')
        ], [self.wh.get_pid('name'), ix['name'].strip(), self.reference]]

        # link to corresponding organization
        org_qid = self.orgid2qid.get(str(ix['org_id']))
        if org_qid is not None:
            statements.append(
                [self.wh.get_pid('managed by'), org_qid, self.reference])
        else:
            print('Error this organization is not in wikibase: ', ix['org_id'])

        # set property country
        if ix['country']:
            country_qid = self.wh.country2qid(ix['country'])
            if country_qid is not None:
                statements.append(
                    [self.wh.get_pid('country'), country_qid, self.reference])

        # set property website
        if ix['website']:
            statements.append(
                [self.wh.get_pid('website'), ix['website'], self.reference])

        # set traffic webpage
        if ix['url_stats']:
            statements.append([
                self.wh.get_pid('website'),
                ix['url_stats'],  # statement
                self.reference,  # reference 
                [
                    (self.wh.get_pid('instance of'),
                     self.wh.get_qid('traffic statistics')),
                ]  # qualifier
            ])

        ix_qid = self.ix_qid(ix)
        # Update name, website, and organization for this IX
        self.wh.upsert_statements('update peeringDB ixs', ix_qid, statements)

        # update LAN corresponding to this IX
        if 'ixlan_set' in ix:
            for ixlan in ix['ixlan_set']:
                pfx_url = f'{URL_PDB_LAN}/{ixlan["id"]}'
                pfx_ref = [(self.wh.get_pid('source'),
                            self.wh.get_qid('PeeringDB')),
                           (self.wh.get_pid('reference URL'), pfx_url),
                           (self.wh.get_pid('point in time'), self.today)]

                req = requests.get(pfx_url)
                if req.status_code != 200:
                    sys.exit('Error while fetching IXs data')
                lans = json.loads(req.text)['data']

                for lan in lans:
                    for prefix in lan['ixpfx_set']:
                        pfx_qid = self.wh.prefix2qid(prefix['prefix'],
                                                     create=True)

                        pfx_stmts = [[
                            self.wh.get_pid('instance of'),
                            self.wh.get_qid('peering LAN'), pfx_ref
                        ], [self.wh.get_pid('managed by'), ix_qid, pfx_ref]]

                        self.wh.upsert_statements('update peeringDB ixlan',
                                                  pfx_qid, pfx_stmts)

        return ix_qid

    def ix_qid(self, ix):
        """Find the ix QID for the given ix.
        If this ix is not yet registered in the wikibase then add it.

        Return the ix QID."""

        # Check if the IX is in the wikibase
        if str(ix['id']) not in self.ixid2qid:
            # Set properties for this new ix
            ix_qualifiers = [
                (self.wh.get_pid('instance of'), self.wh.get_qid(IXID_LABEL)),
            ]
            statements = [(self.wh.get_pid('instance of'),
                           self.wh.get_qid('Internet exchange point')),
                          (self.wh.get_pid('external ID'), str(ix['id']), [],
                           ix_qualifiers)]

            # Add this ix to the wikibase
            ix_qid = self.wh.add_item('add new peeringDB IX',
                                      label=ix['name'],
                                      description=ix['name_long'],
                                      statements=statements)
            # keep track of this QID
            self.ixid2qid[str(ix['id'])] = ix_qid

        return self.ixid2qid[str(ix['id'])]
class Crawler(object):
    def __init__(self):
        """Initialize wikihandy and qualifiers for pushed data"""

        # Helper for wiki access
        self.wh = Wikihandy()

        # Added properties will have this additional information
        today = self.wh.today()
        self.apnic_qid = self.wh.get_qid('APNIC')
        self.url = URL_API  # url will change for each country
        self.reference = [(self.wh.get_pid('source'), self.apnic_qid),
                          (self.wh.get_pid('reference URL'), self.url),
                          (self.wh.get_pid('point in time'), today)]

        self.countries = iso3166.countries_by_alpha2

    def run(self):
        """Fetch data from APNIC and push to wikibase. """

        self.wh.login()  # Login once for all threads
        pool = ThreadPoolExecutor()

        for cc, country in self.countries.items():

            # Get the QID of the selected country / create this country if needed
            self.countryrank_qid = self.wh.get_qid(
                f'APNIC eyeball estimates ({cc})',
                create={  # Create it if it doesn't exist
                    'summary':
                    'add APNIC eyeball estimates for ' + cc,
                    'description':
                    "APNIC's AS population estimates" +
                    "based on advertisement for " + country.name,
                    'statements': [
                        [self.wh.get_pid('managed by'), self.apnic_qid],
                        [self.wh.get_pid('website'), URL_API],
                        [self.wh.get_pid('country'),
                         self.wh.country2qid(cc)],
                    ]
                })

            self.countrypercent_qid = self.wh.get_qid(
                f'% of Internet users in {country.name}',
                create={  # Create it if it doesn't exist
                    'summary':
                    'add APNIC eyeball estimates for ' + cc,
                    'description':
                    "APNIC's AS population estimates" +
                    "based on advertisement for " + country.name,
                    'statements': [
                        [self.wh.get_pid('managed by'), self.apnic_qid],
                        [self.wh.get_pid('website'), URL_API],
                        [self.wh.get_pid('country'),
                         self.wh.country2qid(cc)],
                    ]
                })

            self.url = URL_API + f'{cc}/{cc}.asns.json?m={MIN_POP_PERC}'
            req = requests.get(self.url)
            if req.status_code != 200:
                sys.exit('Error while fetching data for ' + cc)

            ranking = json.loads(req.text)
            # Make sure the ranking is sorted and add rank field
            ranking.sort(key=lambda x: x['percent'], reverse=True)
            for i, asn in enumerate(ranking):
                asn['rank'] = i

            # Push data to wiki
            for i, res in enumerate(pool.map(self.update_net, ranking)):
                sys.stderr.write(
                    f'\rProcessing {country.name}... {i+1}/{len(ranking)}')

        pool.shutdown()

    def update_net(self, asn):
        """Add the network to wikibase if it's not already there and update its
        properties."""

        # Properties
        statements = []

        # set name
        if asn['autnum']:
            statements.append(
                [self.wh.get_pid('name'), asn['autnum'], self.reference])

        # set country
        if asn['cc']:
            statements.append([
                self.wh.get_pid('country'),
                self.wh.country2qid(asn['cc']), self.reference
            ])

        # set rank
        statements.append([
            self.wh.get_pid('ranking'), {
                'amount': asn['rank'],
                'unit': self.countryrank_qid,
            }, self.reference
        ])

        # set population
        statements.append([
            self.wh.get_pid('population'), {
                'amount': asn['percent'],
                'unit': self.countrypercent_qid,
            }, self.reference
        ])

        # Commit to wikibase
        # Get the AS QID (create if AS is not yet registered) and commit changes
        net_qid = self.wh.asn2qid(asn['as'], create=True)
        self.wh.upsert_statements('update from APNIC eyeball ranking', net_qid,
                                  statements)