Esempio n. 1
0
class Crawler(object):
    def __init__(self):
        """Create an item representing the 'PeeringDB organization ID' class if 
        doesn't already exist. And fetch QIDs for organizations already in the
        wikibase."""

        sys.stderr.write('Initialization...\n')

        # Helper for wiki access
        self.wh = Wikihandy()

        # Get the QID for the item representing the organization IDs
        orgid_qid = self.wh.get_qid(
            ORGID_LABEL,
            create={  # Create it if it doesn't exist
                'summary':
                'add PeeringDB org IDs',  # Commit message
                'description':
                'Identifier for an organization in the PeeringDB database'
            })

        # Load the QIDs for organizations already available in the wikibase
        self.orgid2qid = self.wh.extid2qid(qid=orgid_qid)

        # Added properties will have this reference information
        today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'),
                           self.wh.get_qid('PeeringDB')),
                          (self.wh.get_pid('reference URL'), URL_PDB_ORGS),
                          (self.wh.get_pid('point in time'), today)]

    def run(self):
        """Fetch organizations information from PeeringDB and push to wikibase"""

        sys.stderr.write('Fetching PeeringDB data...\n')
        req = requests.get(URL_PDB_ORGS)
        if req.status_code != 200:
            sys.exit('Error while fetching AS names')
        organizations = json.loads(req.text)['data']

        self.wh.login()  # Login once for all threads

        for i, res in enumerate(map(self.update_org, organizations)):
            sys.stderr.write(f'\rProcessing... {i+1}/{len(organizations)}')

    def update_org(self, organization):
        """Add the organization to wikibase if it's not there and update properties"""

        # set property name
        statements = [[
            self.wh.get_pid('instance of'),
            self.wh.get_qid('organization')
        ],
                      [
                          self.wh.get_pid('name'),
                          organization['name'].strip(), self.reference
                      ]]

        # set property website
        if organization['website']:
            statements.append([
                self.wh.get_pid('website'), organization['website'],
                self.reference
            ])

        # set property country
        if organization['country'] in iso3166.countries_by_alpha2:
            country_qid = self.wh.get_qid(
                iso3166.countries_by_alpha2[organization['country']].name)
            if country_qid is not None:
                statements.append(
                    [self.wh.get_pid('country'), country_qid, self.reference])

        # Update name, website, and country for this organization
        org_qid = self.org_qid(organization)
        self.wh.upsert_statements('update peeringDB organization', org_qid,
                                  statements)

        return org_qid

    def org_qid(self, organization):
        """Find the organization QID or add it to wikibase if it is not yet there.
        Return the organization QID."""

        # Check if the organization is in the wikibase
        if str(organization['id']) not in self.orgid2qid:
            # Set properties for this new organization
            org_qualifier = [
                (self.wh.get_pid('instance of'), self.wh.get_qid(ORGID_LABEL)),
            ]
            statements = [[
                self.wh.get_pid('external ID'),
                str(organization['id']), [], org_qualifier
            ]]

            # Add this organization to the wikibase
            org_qid = self.wh.add_item('add new peeringDB organization',
                                       label=organization['name'],
                                       statements=statements)
            # keep track of this QID
            self.orgid2qid[str(organization['id'])] = org_qid

        return self.orgid2qid[str(organization['id'])]
print('Adding items')
statements = defaultdict(list)
# wikidata = wikihandy.Wikihandy(wikidata_project='wikidata', lang='wikidata')

with open(BASIC_ITEMS_FNAME, 'r') as fp:
    csvdata = csv.reader(decomment(fp), skipinitialspace=True)

    for row in csvdata:
        if not row:
            continue

        label, description, aliases, statements = [col.strip() for col in row]
        print(label)

        # Retrive statements from the csv file
        # Assume all properties have the 'wikidata-item' datatype
        claims = []
        for statement in statements.split('|'):
            try:
                property, target = statement.split(':')
            except ValueError:
                # skip lines with no statement
                continue

            claims.append(
                [wh.get_pid(property.strip()),
                 wh.get_qid(target), []])

        wh.add_item("bootstrap", label, description, aliases, claims)
class Crawler(object):
    def __init__(self):
        """Create an item representing the PeeringDB exchange point ID class if 
        doesn't already exist. And fetch QIDs for exchange points already in the
        wikibase."""

        # Helper for wiki access
        self.wh = Wikihandy()

        # Get the QID of the item representing PeeringDB IX IDs
        ixid_qid = self.wh.get_qid(
            IXID_LABEL,
            create={  # Create it if it doesn't exist
                'summary':
                'add PeeringDB ix IDs',  # Commit message
                'description':
                'Identifier for an exchange point in the PeeringDB database'  # Description
            })

        # Load the QIDs for ix already available in the wikibase
        self.ixid2qid = self.wh.extid2qid(qid=ixid_qid)
        # Load the QIDs for peeringDB organizations
        self.orgid2qid = self.wh.extid2qid(label=ORGID_LABEL)

        # Added properties will have this reference information
        self.today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'),
                           self.wh.get_qid('PeeringDB')),
                          (self.wh.get_pid('reference URL'), URL_PDB_IXS),
                          (self.wh.get_pid('point in time'), self.today)]

    def run(self):
        """Fetch ixs information from PeeringDB and push to wikibase. 
        Using multiple threads for better performances."""

        req = requests.get(URL_PDB_IXS)
        if req.status_code != 200:
            sys.exit('Error while fetching IXs data')
        ixs = json.loads(req.text)['data']

        self.wh.login()  # Login once for all threads

        for i, ix in enumerate(ixs):

            # Get more info for this IX
            req = requests.get(f'{URL_PDB_IXS}/{ix["id"]}')
            if req.status_code != 200:
                sys.exit('Error while fetching IXs data')
            ix_info = json.loads(req.text)['data'][0]

            # Update info in wiki
            self.update_ix(ix_info)

            sys.stderr.write(f'\rProcessing... {i+1}/{len(ixs)}')

    def update_ix(self, ix):
        """Add the ix to wikibase if it's not already there and update its
        properties."""

        # set property name
        statements = [[
            self.wh.get_pid('instance of'),
            self.wh.get_qid('Internet exchange point')
        ], [self.wh.get_pid('name'), ix['name'].strip(), self.reference]]

        # link to corresponding organization
        org_qid = self.orgid2qid.get(str(ix['org_id']))
        if org_qid is not None:
            statements.append(
                [self.wh.get_pid('managed by'), org_qid, self.reference])
        else:
            print('Error this organization is not in wikibase: ', ix['org_id'])

        # set property country
        if ix['country']:
            country_qid = self.wh.country2qid(ix['country'])
            if country_qid is not None:
                statements.append(
                    [self.wh.get_pid('country'), country_qid, self.reference])

        # set property website
        if ix['website']:
            statements.append(
                [self.wh.get_pid('website'), ix['website'], self.reference])

        # set traffic webpage
        if ix['url_stats']:
            statements.append([
                self.wh.get_pid('website'),
                ix['url_stats'],  # statement
                self.reference,  # reference 
                [
                    (self.wh.get_pid('instance of'),
                     self.wh.get_qid('traffic statistics')),
                ]  # qualifier
            ])

        ix_qid = self.ix_qid(ix)
        # Update name, website, and organization for this IX
        self.wh.upsert_statements('update peeringDB ixs', ix_qid, statements)

        # update LAN corresponding to this IX
        if 'ixlan_set' in ix:
            for ixlan in ix['ixlan_set']:
                pfx_url = f'{URL_PDB_LAN}/{ixlan["id"]}'
                pfx_ref = [(self.wh.get_pid('source'),
                            self.wh.get_qid('PeeringDB')),
                           (self.wh.get_pid('reference URL'), pfx_url),
                           (self.wh.get_pid('point in time'), self.today)]

                req = requests.get(pfx_url)
                if req.status_code != 200:
                    sys.exit('Error while fetching IXs data')
                lans = json.loads(req.text)['data']

                for lan in lans:
                    for prefix in lan['ixpfx_set']:
                        pfx_qid = self.wh.prefix2qid(prefix['prefix'],
                                                     create=True)

                        pfx_stmts = [[
                            self.wh.get_pid('instance of'),
                            self.wh.get_qid('peering LAN'), pfx_ref
                        ], [self.wh.get_pid('managed by'), ix_qid, pfx_ref]]

                        self.wh.upsert_statements('update peeringDB ixlan',
                                                  pfx_qid, pfx_stmts)

        return ix_qid

    def ix_qid(self, ix):
        """Find the ix QID for the given ix.
        If this ix is not yet registered in the wikibase then add it.

        Return the ix QID."""

        # Check if the IX is in the wikibase
        if str(ix['id']) not in self.ixid2qid:
            # Set properties for this new ix
            ix_qualifiers = [
                (self.wh.get_pid('instance of'), self.wh.get_qid(IXID_LABEL)),
            ]
            statements = [(self.wh.get_pid('instance of'),
                           self.wh.get_qid('Internet exchange point')),
                          (self.wh.get_pid('external ID'), str(ix['id']), [],
                           ix_qualifiers)]

            # Add this ix to the wikibase
            ix_qid = self.wh.add_item('add new peeringDB IX',
                                      label=ix['name'],
                                      description=ix['name_long'],
                                      statements=statements)
            # keep track of this QID
            self.ixid2qid[str(ix['id'])] = ix_qid

        return self.ixid2qid[str(ix['id'])]
class Crawler(object):
    def __init__(self):
        """
        """

        # Helper for wiki access
        self.wh = Wikihandy(preload=True)

        # Get the QID for RIPE Atlas
        self.atlas_qid = self.wh.get_qid(
            'RIPE Atlas',
            create={  # Create it if it doesn't exist
                'summary':
                'add RIPE Atlas',  # Commit message
                'description':
                'RIPE Atlas is a global, open, distributed Internet measurement platform, consisting of thousands of measurement devices that measure Internet connectivity in real time.',  # Item description
                'aliases':
                'Atlas|atlas',
                'statements':
                [[self.wh.get_pid('managed by'),
                  self.wh.get_qid('RIPE NCC')]]
            })

        # Get the QID for Atlas Probe
        self.atlas_probe_qid = self.wh.get_qid(
            'Atlas probe',
            create={  # Create it if it doesn't exist
                'summary': 'add RIPE Atlas',  # Commit message
                'description':
                'RIPE Atlas probes form the backbone of the RIPE Atlas infrastructure.',  # Item description
                'aliases': 'RIPE Atlas probe|atlas probe|RIPE atlas probe',
                'statements': [[self.wh.get_pid('part of'), self.atlas_qid]]
            })

        # Get the QID for Atlas Anchor
        self.atlas_anchor_qid = self.wh.get_qid(
            'Atlas anchor',
            create={  # Create it if it doesn't exist
                'summary': 'add RIPE Atlas',  # Commit message
                'description':
                'RIPE Atlas Anchors are located at hosts that can provide sufficient bandwidth to support a large number of incoming and outgoing measurements.',  # Item description
                'aliases': 'RIPE Atlas anchor|atlas anchor|RIPE atlas anchor',
                'statements': [[self.wh.get_pid('part of'), self.atlas_qid]]
            })

        # Get the QID of the item representing PeeringDB IX IDs
        self.probeid_qid = self.wh.get_qid(
            PROBEID_LABEL,
            create={  # Create it if it doesn't exist
                'summary':
                'add RIPE Atlas probes',  # Commit message
                'description':
                'Identifier for a probe in the RIPE Atlas measurement platform'  # Description
            })

        # Load the QIDs for probes already available in the wikibase
        self.probeid2qid = self.wh.extid2qid(qid=self.probeid_qid)

        # Added properties will have this additional information
        today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'),
                           self.wh.get_qid('RIPE NCC')),
                          (self.wh.get_pid('reference URL'), URL),
                          (self.wh.get_pid('point in time'), today)]

        self.v4_qualifiers = [(self.wh.get_pid('IP version'),
                               self.wh.get_qid('IPv4'))]

        self.v6_qualifiers = [(self.wh.get_pid('IP version'),
                               self.wh.get_qid('IPv6'))]

    def run(self):
        """Fetch probe information from Atlas API and push to wikibase. """

        next_page = URL

        while next_page is not None:
            req = requests.get(next_page)
            if req.status_code != 200:
                sys.exit('Error while fetching the blocklist')

            info = json.loads(req.text)
            next_page = info['next']

            for i, probe in enumerate(info['results']):

                self.update_probe(probe)
                sys.stderr.write(f'\rProcessed {i+1} probes')
            sys.stderr.write(f'\n')

    def update_probe(self, probe):
        """Add the probe to wikibase if it's not already there and update its
        properties."""

        # TODO add status, geometry (geo-location) and IPs?

        # Properties for this probe
        statements = []

        if probe['is_anchor']:
            statements.append(
                [self.wh.get_pid('instance of'), self.atlas_probe_qid])
            statements.append(
                [self.wh.get_pid('instance of'), self.atlas_anchor_qid])
        if probe['asn_v4']:
            as_qid = self.wh.asn2qid(probe['asn_v4'])
            if as_qid:
                statements.append([
                    self.wh.get_pid('part of'), as_qid, self.reference,
                    self.v4_qualifiers
                ])
        if probe['asn_v6']:
            as_qid = self.wh.asn2qid(probe['asn_v6'])
            if as_qid:
                statements.append([
                    self.wh.get_pid('part of'), as_qid, self.reference,
                    self.v6_qualifiers
                ])
        if probe['prefix_v4']:
            prefix_qid = self.wh.prefix2qid(probe['prefix_v4'])
            if prefix_qid:
                statements.append(
                    [self.wh.get_pid('part of'), prefix_qid, self.reference])
        if probe['prefix_v6']:
            prefix_qid = self.wh.prefix2qid(probe['prefix_v6'])
            if prefix_qid:
                statements.append(
                    [self.wh.get_pid('part of'), prefix_qid, self.reference])
        if probe['country_code']:
            statements.append([
                self.wh.get_pid('country'),
                self.wh.country2qid(probe['country_code']), self.reference
            ])
        if probe['first_connected']:
            statements.append([
                self.wh.get_pid('start time'),
                self.wh.to_wbtime(probe['first_connected']), self.reference
            ])

        if 'name' in probe['status']:
            # Get the QIDs for probes status
            status_qid = self.wh.get_qid(
                f'RIPE Atlas probe status: {probe["status"]["name"]}',
                create={  # Create it if it doesn't exist
                    'summary': 'add RIPE Atlas probe status',  # Commit message
                })

            if probe['status_since']:
                statements.append([
                    self.wh.get_pid('status'), status_qid, self.reference,
                    [(self.wh.get_pid('start time'),
                      self.wh.to_wbtime(probe['status_since']))]
                ])

            # set end time if the probe is abandonned
            if probe['status']['name'] == 'Abandoned' and probe['status_since']:
                statements.append([
                    self.wh.get_pid('end time'),
                    self.wh.to_wbtime(probe['status_since'])
                ])

        # Add probe tags
        for tag in probe['tags']:
            statements.append([
                self.wh.get_pid('tag'),
                self.wh.get_qid(tag['name'],
                                create={
                                    'summary': 'Add RIPE Atlas tag',
                                })
            ])

        # Commit to wikibase
        # Get the probe QID (create if probe is not yet registered) and commit changes
        probe_qid = self.probe_qid(probe)
        self.wh.upsert_statements('update from RIPE Atlas probes', probe_qid,
                                  statements)

    def probe_qid(self, probe):
        """Find the ix QID for the given probe ID.
        If this probe is not yet registered in the wikibase then add it.

        Return the probe QID."""

        id = str(probe['id'])

        # Check if the IX is in the wikibase
        if id not in self.probeid2qid:
            # Set properties for this new probe
            probeid_qualifiers = [
                (self.wh.get_pid('instance of'), self.probeid_qid),
            ]
            statements = [
                (self.wh.get_pid('instance of'), self.atlas_probe_qid),
                (self.wh.get_pid('external ID'), id, [], probeid_qualifiers)
            ]

            # Add this probe to the wikibase
            probe_qid = self.wh.add_item('add new RIPE Atlas probe',
                                         label=f'RIPE Atlas probe #{id}',
                                         description=probe['description'],
                                         statements=statements)
            # keep track of this QID
            self.probeid2qid[id] = probe_qid

        return self.probeid2qid[id]