class Crawler(object): def __init__(self): """Create an item representing the 'PeeringDB organization ID' class if doesn't already exist. And fetch QIDs for organizations already in the wikibase.""" sys.stderr.write('Initialization...\n') # Helper for wiki access self.wh = Wikihandy() # Get the QID for the item representing the organization IDs orgid_qid = self.wh.get_qid( ORGID_LABEL, create={ # Create it if it doesn't exist 'summary': 'add PeeringDB org IDs', # Commit message 'description': 'Identifier for an organization in the PeeringDB database' }) # Load the QIDs for organizations already available in the wikibase self.orgid2qid = self.wh.extid2qid(qid=orgid_qid) # Added properties will have this reference information today = self.wh.today() self.reference = [(self.wh.get_pid('source'), self.wh.get_qid('PeeringDB')), (self.wh.get_pid('reference URL'), URL_PDB_ORGS), (self.wh.get_pid('point in time'), today)] def run(self): """Fetch organizations information from PeeringDB and push to wikibase""" sys.stderr.write('Fetching PeeringDB data...\n') req = requests.get(URL_PDB_ORGS) if req.status_code != 200: sys.exit('Error while fetching AS names') organizations = json.loads(req.text)['data'] self.wh.login() # Login once for all threads for i, res in enumerate(map(self.update_org, organizations)): sys.stderr.write(f'\rProcessing... {i+1}/{len(organizations)}') def update_org(self, organization): """Add the organization to wikibase if it's not there and update properties""" # set property name statements = [[ self.wh.get_pid('instance of'), self.wh.get_qid('organization') ], [ self.wh.get_pid('name'), organization['name'].strip(), self.reference ]] # set property website if organization['website']: statements.append([ self.wh.get_pid('website'), organization['website'], self.reference ]) # set property country if organization['country'] in iso3166.countries_by_alpha2: country_qid = self.wh.get_qid( iso3166.countries_by_alpha2[organization['country']].name) if country_qid is not None: statements.append( [self.wh.get_pid('country'), country_qid, self.reference]) # Update name, website, and country for this organization org_qid = self.org_qid(organization) self.wh.upsert_statements('update peeringDB organization', org_qid, statements) return org_qid def org_qid(self, organization): """Find the organization QID or add it to wikibase if it is not yet there. Return the organization QID.""" # Check if the organization is in the wikibase if str(organization['id']) not in self.orgid2qid: # Set properties for this new organization org_qualifier = [ (self.wh.get_pid('instance of'), self.wh.get_qid(ORGID_LABEL)), ] statements = [[ self.wh.get_pid('external ID'), str(organization['id']), [], org_qualifier ]] # Add this organization to the wikibase org_qid = self.wh.add_item('add new peeringDB organization', label=organization['name'], statements=statements) # keep track of this QID self.orgid2qid[str(organization['id'])] = org_qid return self.orgid2qid[str(organization['id'])]
import csv
from collections import defaultdict

# BASIC_ITEMS_FNAME, the decomment() helper, and the Wikihandy instance `wh`
# are assumed to be defined earlier in this script.

print('Adding items')
statements = defaultdict(list)
# wikidata = wikihandy.Wikihandy(wikidata_project='wikidata', lang='wikidata')

with open(BASIC_ITEMS_FNAME, 'r') as fp:
    csvdata = csv.reader(decomment(fp), skipinitialspace=True)

    for row in csvdata:
        if not row:
            continue

        label, description, aliases, statements = [col.strip() for col in row]
        print(label)

        # Retrieve statements from the csv file
        # Assume all properties have the 'wikidata-item' datatype
        claims = []
        for statement in statements.split('|'):
            try:
                property, target = statement.split(':')
            except ValueError:
                # skip lines with no statement
                continue

            claims.append(
                [wh.get_pid(property.strip()), wh.get_qid(target), []])

        wh.add_item("bootstrap", label, description, aliases, claims)
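# Sketch (an assumption, not from the original script): a minimal decomment()
# helper matching how it is used above, i.e. stripping '#' comments before the
# CSV reader sees the lines. The expected row format, inferred from the loop, is:
#   label, description, aliases, "property:target|property:target"
# This naive version does not handle '#' inside quoted fields.
def decomment(fp):
    """Yield lines from fp with '#' comments and blank lines removed."""
    for line in fp:
        stripped = line.split('#', 1)[0]
        if stripped.strip():
            yield stripped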
class Crawler(object): def __init__(self): """Create an item representing the PeeringDB exchange point ID class if doesn't already exist. And fetch QIDs for exchange points already in the wikibase.""" # Helper for wiki access self.wh = Wikihandy() # Get the QID of the item representing PeeringDB IX IDs ixid_qid = self.wh.get_qid( IXID_LABEL, create={ # Create it if it doesn't exist 'summary': 'add PeeringDB ix IDs', # Commit message 'description': 'Identifier for an exchange point in the PeeringDB database' # Description }) # Load the QIDs for ix already available in the wikibase self.ixid2qid = self.wh.extid2qid(qid=ixid_qid) # Load the QIDs for peeringDB organizations self.orgid2qid = self.wh.extid2qid(label=ORGID_LABEL) # Added properties will have this reference information self.today = self.wh.today() self.reference = [(self.wh.get_pid('source'), self.wh.get_qid('PeeringDB')), (self.wh.get_pid('reference URL'), URL_PDB_IXS), (self.wh.get_pid('point in time'), self.today)] def run(self): """Fetch ixs information from PeeringDB and push to wikibase. Using multiple threads for better performances.""" req = requests.get(URL_PDB_IXS) if req.status_code != 200: sys.exit('Error while fetching IXs data') ixs = json.loads(req.text)['data'] self.wh.login() # Login once for all threads for i, ix in enumerate(ixs): # Get more info for this IX req = requests.get(f'{URL_PDB_IXS}/{ix["id"]}') if req.status_code != 200: sys.exit('Error while fetching IXs data') ix_info = json.loads(req.text)['data'][0] # Update info in wiki self.update_ix(ix_info) sys.stderr.write(f'\rProcessing... {i+1}/{len(ixs)}') def update_ix(self, ix): """Add the ix to wikibase if it's not already there and update its properties.""" # set property name statements = [[ self.wh.get_pid('instance of'), self.wh.get_qid('Internet exchange point') ], [self.wh.get_pid('name'), ix['name'].strip(), self.reference]] # link to corresponding organization org_qid = self.orgid2qid.get(str(ix['org_id'])) if org_qid is not None: statements.append( [self.wh.get_pid('managed by'), org_qid, self.reference]) else: print('Error this organization is not in wikibase: ', ix['org_id']) # set property country if ix['country']: country_qid = self.wh.country2qid(ix['country']) if country_qid is not None: statements.append( [self.wh.get_pid('country'), country_qid, self.reference]) # set property website if ix['website']: statements.append( [self.wh.get_pid('website'), ix['website'], self.reference]) # set traffic webpage if ix['url_stats']: statements.append([ self.wh.get_pid('website'), ix['url_stats'], # statement self.reference, # reference [ (self.wh.get_pid('instance of'), self.wh.get_qid('traffic statistics')), ] # qualifier ]) ix_qid = self.ix_qid(ix) # Update name, website, and organization for this IX self.wh.upsert_statements('update peeringDB ixs', ix_qid, statements) # update LAN corresponding to this IX if 'ixlan_set' in ix: for ixlan in ix['ixlan_set']: pfx_url = f'{URL_PDB_LAN}/{ixlan["id"]}' pfx_ref = [(self.wh.get_pid('source'), self.wh.get_qid('PeeringDB')), (self.wh.get_pid('reference URL'), pfx_url), (self.wh.get_pid('point in time'), self.today)] req = requests.get(pfx_url) if req.status_code != 200: sys.exit('Error while fetching IXs data') lans = json.loads(req.text)['data'] for lan in lans: for prefix in lan['ixpfx_set']: pfx_qid = self.wh.prefix2qid(prefix['prefix'], create=True) pfx_stmts = [[ self.wh.get_pid('instance of'), self.wh.get_qid('peering LAN'), pfx_ref ], [self.wh.get_pid('managed by'), ix_qid, pfx_ref]] 
self.wh.upsert_statements('update peeringDB ixlan', pfx_qid, pfx_stmts) return ix_qid def ix_qid(self, ix): """Find the ix QID for the given ix. If this ix is not yet registered in the wikibase then add it. Return the ix QID.""" # Check if the IX is in the wikibase if str(ix['id']) not in self.ixid2qid: # Set properties for this new ix ix_qualifiers = [ (self.wh.get_pid('instance of'), self.wh.get_qid(IXID_LABEL)), ] statements = [(self.wh.get_pid('instance of'), self.wh.get_qid('Internet exchange point')), (self.wh.get_pid('external ID'), str(ix['id']), [], ix_qualifiers)] # Add this ix to the wikibase ix_qid = self.wh.add_item('add new peeringDB IX', label=ix['name'], description=ix['name_long'], statements=statements) # keep track of this QID self.ixid2qid[str(ix['id'])] = ix_qid return self.ixid2qid[str(ix['id'])]
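# Illustration (inferred from the calls above, not a definitive description of
# the Wikihandy API): each statement passed to upsert_statements()/add_item()
# is a list of the form [property PID, target value, references, qualifiers],
# where the last two entries are optional and are lists of (PID, value) pairs.
# For example, the traffic-statistics statement built in update_ix() looks like:
#
#   [wh.get_pid('website'),                    # property
#    ix['url_stats'],                          # value
#    [(wh.get_pid('source'), wh.get_qid('PeeringDB')),
#     (wh.get_pid('reference URL'), URL_PDB_IXS),
#     (wh.get_pid('point in time'), today)],   # references
#    [(wh.get_pid('instance of'),
#      wh.get_qid('traffic statistics'))]]     # qualifiers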
class Crawler(object): def __init__(self): """ """ # Helper for wiki access self.wh = Wikihandy(preload=True) # Get the QID for RIPE Atlas self.atlas_qid = self.wh.get_qid( 'RIPE Atlas', create={ # Create it if it doesn't exist 'summary': 'add RIPE Atlas', # Commit message 'description': 'RIPE Atlas is a global, open, distributed Internet measurement platform, consisting of thousands of measurement devices that measure Internet connectivity in real time.', # Item description 'aliases': 'Atlas|atlas', 'statements': [[self.wh.get_pid('managed by'), self.wh.get_qid('RIPE NCC')]] }) # Get the QID for Atlas Probe self.atlas_probe_qid = self.wh.get_qid( 'Atlas probe', create={ # Create it if it doesn't exist 'summary': 'add RIPE Atlas', # Commit message 'description': 'RIPE Atlas probes form the backbone of the RIPE Atlas infrastructure.', # Item description 'aliases': 'RIPE Atlas probe|atlas probe|RIPE atlas probe', 'statements': [[self.wh.get_pid('part of'), self.atlas_qid]] }) # Get the QID for Atlas Anchor self.atlas_anchor_qid = self.wh.get_qid( 'Atlas anchor', create={ # Create it if it doesn't exist 'summary': 'add RIPE Atlas', # Commit message 'description': 'RIPE Atlas Anchors are located at hosts that can provide sufficient bandwidth to support a large number of incoming and outgoing measurements.', # Item description 'aliases': 'RIPE Atlas anchor|atlas anchor|RIPE atlas anchor', 'statements': [[self.wh.get_pid('part of'), self.atlas_qid]] }) # Get the QID of the item representing PeeringDB IX IDs self.probeid_qid = self.wh.get_qid( PROBEID_LABEL, create={ # Create it if it doesn't exist 'summary': 'add RIPE Atlas probes', # Commit message 'description': 'Identifier for a probe in the RIPE Atlas measurement platform' # Description }) # Load the QIDs for probes already available in the wikibase self.probeid2qid = self.wh.extid2qid(qid=self.probeid_qid) # Added properties will have this additional information today = self.wh.today() self.reference = [(self.wh.get_pid('source'), self.wh.get_qid('RIPE NCC')), (self.wh.get_pid('reference URL'), URL), (self.wh.get_pid('point in time'), today)] self.v4_qualifiers = [(self.wh.get_pid('IP version'), self.wh.get_qid('IPv4'))] self.v6_qualifiers = [(self.wh.get_pid('IP version'), self.wh.get_qid('IPv6'))] def run(self): """Fetch probe information from Atlas API and push to wikibase. """ next_page = URL while next_page is not None: req = requests.get(next_page) if req.status_code != 200: sys.exit('Error while fetching the blocklist') info = json.loads(req.text) next_page = info['next'] for i, probe in enumerate(info['results']): self.update_probe(probe) sys.stderr.write(f'\rProcessed {i+1} probes') sys.stderr.write(f'\n') def update_probe(self, probe): """Add the probe to wikibase if it's not already there and update its properties.""" # TODO add status, geometry (geo-location) and IPs? 
# Properties for this probe statements = [] if probe['is_anchor']: statements.append( [self.wh.get_pid('instance of'), self.atlas_probe_qid]) statements.append( [self.wh.get_pid('instance of'), self.atlas_anchor_qid]) if probe['asn_v4']: as_qid = self.wh.asn2qid(probe['asn_v4']) if as_qid: statements.append([ self.wh.get_pid('part of'), as_qid, self.reference, self.v4_qualifiers ]) if probe['asn_v6']: as_qid = self.wh.asn2qid(probe['asn_v6']) if as_qid: statements.append([ self.wh.get_pid('part of'), as_qid, self.reference, self.v6_qualifiers ]) if probe['prefix_v4']: prefix_qid = self.wh.prefix2qid(probe['prefix_v4']) if prefix_qid: statements.append( [self.wh.get_pid('part of'), prefix_qid, self.reference]) if probe['prefix_v6']: prefix_qid = self.wh.prefix2qid(probe['prefix_v6']) if prefix_qid: statements.append( [self.wh.get_pid('part of'), prefix_qid, self.reference]) if probe['country_code']: statements.append([ self.wh.get_pid('country'), self.wh.country2qid(probe['country_code']), self.reference ]) if probe['first_connected']: statements.append([ self.wh.get_pid('start time'), self.wh.to_wbtime(probe['first_connected']), self.reference ]) if 'name' in probe['status']: # Get the QIDs for probes status status_qid = self.wh.get_qid( f'RIPE Atlas probe status: {probe["status"]["name"]}', create={ # Create it if it doesn't exist 'summary': 'add RIPE Atlas probe status', # Commit message }) if probe['status_since']: statements.append([ self.wh.get_pid('status'), status_qid, self.reference, [(self.wh.get_pid('start time'), self.wh.to_wbtime(probe['status_since']))] ]) # set end time if the probe is abandonned if probe['status']['name'] == 'Abandoned' and probe['status_since']: statements.append([ self.wh.get_pid('end time'), self.wh.to_wbtime(probe['status_since']) ]) # Add probe tags for tag in probe['tags']: statements.append([ self.wh.get_pid('tag'), self.wh.get_qid(tag['name'], create={ 'summary': 'Add RIPE Atlas tag', }) ]) # Commit to wikibase # Get the probe QID (create if probe is not yet registered) and commit changes probe_qid = self.probe_qid(probe) self.wh.upsert_statements('update from RIPE Atlas probes', probe_qid, statements) def probe_qid(self, probe): """Find the ix QID for the given probe ID. If this probe is not yet registered in the wikibase then add it. Return the probe QID.""" id = str(probe['id']) # Check if the IX is in the wikibase if id not in self.probeid2qid: # Set properties for this new probe probeid_qualifiers = [ (self.wh.get_pid('instance of'), self.probeid_qid), ] statements = [ (self.wh.get_pid('instance of'), self.atlas_probe_qid), (self.wh.get_pid('external ID'), id, [], probeid_qualifiers) ] # Add this probe to the wikibase probe_qid = self.wh.add_item('add new RIPE Atlas probe', label=f'RIPE Atlas probe #{id}', description=probe['description'], statements=statements) # keep track of this QID self.probeid2qid[id] = probe_qid return self.probeid2qid[id]
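# Illustration (hypothetical values; field names are those read by update_probe()
# above): the subset of a RIPE Atlas API probe record that this crawler uses.
# ASN and prefixes below are documentation/example values, not real probe data.
example_probe = {
    'id': 6001,                        # hypothetical probe ID
    'description': 'example anchor',
    'is_anchor': True,
    'asn_v4': 64496,                   # ASN reserved for documentation
    'asn_v6': 64496,
    'prefix_v4': '203.0.113.0/24',     # documentation prefix
    'prefix_v6': '2001:db8::/32',      # documentation prefix
    'country_code': 'JP',
    'first_connected': 1500000000,     # UNIX timestamp
    'status': {'name': 'Connected'},
    'status_since': 1500000000,
    'tags': [{'name': 'anchor'}],
}
# With a configured wikibase, a single record could then be pushed with:
#   Crawler().update_probe(example_probe)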