class Crawler(object):
    def __init__(self):
        # Helper for wiki access
        self.wh = Wikihandy()

        # Reference information for data pushed to the wikibase
        self.reference = [
            (self.wh.get_pid('source'), self.wh.get_qid('RIPE NCC')),
            (self.wh.get_pid('reference URL'), URL_RIPE_AS_NAME),
            (self.wh.get_pid('point in time'), self.wh.today())
        ]

    def run(self):
        """Fetch the AS name file from the RIPE website and process lines one by one."""

        req = requests.get(URL_RIPE_AS_NAME)
        if req.status_code != 200:
            sys.exit('Error while fetching AS names')

        self.wh.login()  # Login once for all threads, not needed with OAuth

        for i, res in enumerate(map(self.update_asn, req.text.splitlines())):
            sys.stderr.write(f'\rProcessed {i} ASes')

    def update_asn(self, one_line):
        # Parse the given line to get ASN, name, and country code
        asn, _, name_cc = one_line.partition(' ')
        name, _, cc = name_cc.rpartition(', ')

        asn_qid = self.wh.asn2qid(asn, create=True)
        cc_qid = self.wh.country2qid(cc, create=True)

        statements = []
        if cc_qid is not None:
            statements.append(
                [self.wh.get_pid('country'), cc_qid, self.reference])  # Set country
        statements.append(
            [self.wh.get_pid('name'), name, self.reference])  # Set AS name

        try:
            # Update AS name and country
            self.wh.upsert_statements('updates from RIPE AS names', asn_qid,
                                      statements)
        except Exception as error:
            # Print errors and continue running
            print('Error for: ', one_line)
            print(error)

        return asn_qid
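if __name__ == '__main__':
    # Illustrative parsing sketch, assuming the RIPE file uses
    # "<ASN> <name>, <country code>" records as parsed in update_asn() above
    # (the ASN, name, and country below are made-up values):
    one_line = '64496 EXAMPLE-NET Example Networks, ZZ'
    asn, _, name_cc = one_line.partition(' ')   # asn = '64496'
    name, _, cc = name_cc.rpartition(', ')      # name = 'EXAMPLE-NET Example Networks', cc = 'ZZ'
    print(asn, name, cc)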
class Crawler(object): def __init__(self): """Fetch QIDs for Tranco and affiliation (create them if they are not in the wikibase).""" sys.stderr.write('Initialization...\n') # Helper for wiki access self.wh = Wikihandy() self.tranco_qid = self.wh.get_qid( 'Tranco Top 1M', create={ # Create it if it doesn't exist 'summary': 'add Tranco ranking', # Commit message 'description': 'A Research-Oriented Top Sites Ranking Hardened Against Manipulation', # Item description 'statements': [ [self.wh.get_pid('website'), 'https://tranco-list.eu/'], [ self.wh.get_pid('publication'), 'https://tranco-list.eu/assets/tranco-ndss19.pdf' ], [ self.wh.get_pid('source code repository'), 'https://github.com/DistriNet/tranco-list' ], ] }) self.org_qid = self.wh.get_qid( 'imec-DistriNet', create={ # Create it if it doesn't exist 'summary': 'add Tranco ranking', # Commit message 'description': 'The imec-DistriNet research group is part of the Department of Computer Science at the KU Leuven and part of the imec High Impact Initiative Distributed Trust.', # Item description 'statements': [ [ self.wh.get_pid('website'), 'https://distrinet.cs.kuleuven.be/' ], ] }) # Added properties will have this additional information today = self.wh.today() self.reference = [(self.wh.get_pid('source'), self.org_qid), (self.wh.get_pid('reference URL'), URL), (self.wh.get_pid('point in time'), today)] def run(self): """Fetch Tranco top 1M and push to wikibase. """ sys.stderr.write('Downloading latest list...\n') req = requests.get(URL) if req.status_code != 200: sys.exit('Error while fetching Tranco csv file') # open zip file and read top list with ZipFile(io.BytesIO(req.content)) as z: with z.open('top-1m.csv') as list: for i, row in enumerate(io.TextIOWrapper(list)): row = row.rstrip() sys.stderr.write(f'\rProcessed {i} domains \t {row}') self.update(row) def update(self, one_line): """Add the network to wikibase if it's not already there and update its properties.""" rank, domain = one_line.split(',') # set rank statements = [[ self.wh.get_pid('ranking'), { 'amount': rank, 'unit': self.tranco_qid, }, self.reference ]] # Commit to wikibase # Get the domain name QID (create if it is not yet registered) and commit changes dn_qid = self.wh.get_qid(domain, create={ 'summary': 'add Tranco ranking', 'statements': [[ self.wh.get_pid('instance of'), self.wh.get_qid('domain name') ]] }) self.wh.upsert_statements('update from tranco top 1M', dn_qid, statements)
class Crawler(object): def __init__(self): """Initialize wikihandy """ # Helper for wiki access self.wh = Wikihandy() # Added properties will have this additional information self.org_qid = self.wh.get_qid(ORG) self.countries = iso3166.countries_by_alpha2 # Session object to fetch peeringdb data retries = Retry(total=15, backoff_factor=0.2, status_forcelist=[104, 500, 502, 503, 504]) self.http_session = requests.Session() self.http_session.mount('https://', HTTPAdapter(max_retries=retries)) def run(self): """Fetch data from API and push to wikibase. """ for cc, country in self.countries.items(): # Query IHR self.url = URL_API.format(country=cc) req = self.http_session.get(self.url + '&format=json') if req.status_code != 200: sys.exit('Error while fetching data for ' + cc) data = json.loads(req.text) ranking = data['results'] # Setup references today = self.wh.today() self.references = [ (self.wh.get_pid('source'), self.org_qid), (self.wh.get_pid('reference URL'), self.url), (self.wh.get_pid('point in time'), today), ] # Setup qualifiers country_qid = self.wh.country2qid(country.name) if country_qid is not None: self.qualifiers = [(self.wh.get_pid('country'), country_qid)] else: self.qualifiers = [] # Find the latest timebin in the data last_timebin = '1970-01-01' for r in ranking: if arrow.get(r['timebin']) > arrow.get(last_timebin): last_timebin = r['timebin'] # Make ranking and push data for metric, weight in [('Total eyeball', 'eyeball'), ('Total AS', 'as')]: # Get the QID of the selected country / create this country if needed self.countryrank_qid = self.wh.get_qid( f'IHR country ranking: {metric} ({cc})', create={ # Create it if it doesn't exist 'summary': f'add IHR {metric} ranking for ' + cc, 'description': f"IHR's ranking of networks ({metric}) for " + country.name, 'statements': [[self.wh.get_pid('managed by'), self.org_qid]] }) # Filter out unnecessary data selected = [ r for r in ranking if (r['weightscheme'] == weight and r['transitonly'] == False and r['hege'] > MIN_HEGE and r['timebin'] == last_timebin) ] # Make sure the ranking is sorted and add rank field selected.sort(key=lambda x: x['hege'], reverse=True) for i, asn in enumerate(selected): asn['rank'] = i # Push data to wiki for i, res in enumerate(map(self.update_entry, selected)): sys.stderr.write( f'\rProcessing {country.name}... {i+1}/{len(selected)}' ) sys.stderr.write('\n') def update_entry(self, asn): """Add the network to wikibase if it's not already there and update its properties.""" # Properties statements = [] # set rank statements.append([ self.wh.get_pid('ranking'), { 'amount': asn['rank'], 'unit': self.countryrank_qid, }, self.references, self.qualifiers ]) # Commit to wikibase # Get the AS QID (create if AS is not yet registered) and commit changes net_qid = self.wh.asn2qid(asn['asn'], create=True) self.wh.upsert_statements('update from IHR country ranking', net_qid, statements, asynchronous=False)
class ip2asn(object):
    def __init__(self, wikihandy=None, sparql=DEFAULT_WIKI_SPARQL):
        """Fetch routing prefixes and their origin AS from iyp.

        wikihandy: a Wikihandy instance to use. A new one will be created if
        this is set to None.
        """

        logging.info('ip2asn initialization...\n')
        if wikihandy is None:
            self.wh = Wikihandy()
        else:
            self.wh = wikihandy

        self.rtree = radix.Radix()
        self.sparql = SPARQLWrapper(sparql)

        logging.info('Fetching prefix info...\n')
        # Fetch routing prefixes, their origin AS item, and the corresponding ASN
        QUERY = """
        SELECT ?item ?prefix ?as_qid ?asn
        WHERE
        {
                ?item wdt:%s wd:%s.
                ?item rdfs:label ?prefix.
                ?item wdt:%s ?as_qid.
                ?as_qid wdt:%s ?asn.
        }
        """ % (
            self.wh.get_pid('instance of'),
            self.wh.get_qid('IP routing prefix'),
            self.wh.get_pid('originated by'),
            self.wh.get_pid('autonomous system number'),
        )

        # Query the wiki
        self.sparql.setQuery(QUERY)
        self.sparql.setReturnFormat(JSON)
        response = self.sparql.query().convert()
        results = response['results']

        # Parse the results
        for res in results['bindings']:
            prefix_qid = res['item']['value'].rpartition('/')[2]
            prefix = res['prefix']['value']
            asn = res['asn']['value']
            as_qid = res['as_qid']['value'].rpartition('/')[2]

            rnode = self.rtree.add(prefix)
            rnode.data['prefix'] = prefix
            rnode.data['asn'] = asn
            rnode.data['prefix_qid'] = prefix_qid
            rnode.data['as_qid'] = as_qid

    def lookup(self, ip):
        """Look up the given IP address.

        Returns a dictionary with the corresponding prefix and ASN, as well as
        the corresponding QIDs."""
        try:
            node = self.rtree.search_best(ip)
        except ValueError:
            print('Wrong IP address: %s' % ip)
            return None

        if node is None:
            return None
        else:
            return node.data
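if __name__ == '__main__':
    # Minimal usage sketch: this queries the SPARQL endpoint configured above
    # and assumes the wikibase already holds prefix and AS items (the IP
    # address below is only an example):
    ia = ip2asn()
    info = ia.lookup('192.0.2.1')
    if info is not None:
        print(info['prefix'], info['asn'], info['prefix_qid'], info['as_qid'])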
class Crawler(object): def __init__(self): """Create an item representing the PeeringDB network ID class if doesn't already exist. And fetch QIDs for networks already in the wikibase.""" # Helper for wiki access self.wh = Wikihandy() # Get the QID of the item representing PeeringDB network IDs netid_qid = self.wh.get_qid( NETID_LABEL, create={ # Create it if it doesn't exist 'summary': 'add PeeringDB net IDs', # Commit message 'description': 'Identifier for a network in the PeeringDB database' # Description }) # Load the QIDs for networks already available in the wikibase self.netid2qid = self.wh.extid2qid(qid=netid_qid) # Load the QIDs for peeringDB organizations self.orgid2qid = self.wh.extid2qid(label=ORGID_LABEL) # Load the QIDs for peeringDB IXs self.ixid2qid = self.wh.extid2qid(label=IXID_LABEL) # Added properties will have this reference information today = self.wh.today() self.reference = [(self.wh.get_pid('source'), self.wh.get_qid('PeeringDB')), (self.wh.get_pid('reference URL'), URL_PDB_NETS), (self.wh.get_pid('point in time'), today)] # Session object to fetch peeringdb data retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[104, 500, 502, 503, 504]) self.http_session = requests.Session() self.http_session.mount('https://', HTTPAdapter(max_retries=retries)) def run(self): """Fetch networks information from PeeringDB and push to wikibase. Using multiple threads for better performances.""" req = self.http_session.get(URL_PDB_NETS) if req.status_code != 200: sys.exit('Error while fetching data from API') networks = json.loads(req.text)['data'] self.wh.login() # Login once for all threads for i, _ in enumerate(map(self.update_net, networks)): sys.stderr.write(f'\rProcessing... {i+1}/{len(networks)}') def update_net(self, network): """Add the network to wikibase if it's not already there and update its properties.""" # set property name statements = [[ self.wh.get_pid('name'), network['name'].strip(), self.reference ]] # link to corresponding organization org_qid = self.orgid2qid.get(str(network['org_id'])) if org_qid is not None: statements.append( [self.wh.get_pid('managed by'), org_qid, self.reference]) else: print('Error this organization is not in wikibase: ', network['org_id']) # set property website if network['website']: statements.append([ self.wh.get_pid('website'), network['website'], self.reference ]) # Update IX membership # Fetch membership for this network netixlan_url = URL_PDB_NETS + f'/{network["id"]}' req = self.http_session.get(netixlan_url) if req.status_code != 200: sys.exit(f'Error while fetching network data (id={network["id"]})') net_details = json.loads(req.text)['data'] if len(net_details) > 1: print(net_details) net_details = net_details[0] # Push membership to wikidata today = self.wh.today() netixlan_ref = [(self.wh.get_pid('source'), self.wh.get_qid('PeeringDB')), (self.wh.get_pid('reference URL'), netixlan_url), (self.wh.get_pid('point in time'), today)] for ixlan in net_details['netixlan_set']: ix_qid = self.ixid2qid.get(str(ixlan['ix_id'])) if ix_qid is None: print(f'Unknown IX: ix_id={ixlan["ix_id"]}') continue statements.append( [self.wh.get_pid('member of'), ix_qid, netixlan_ref]) # Update name, website, and organization for this network net_qid = self.net_qid(network) self.wh.upsert_statements('update peeringDB networks', net_qid, statements) return net_qid def net_qid(self, network): """Find the network QID for the given network. 
If this network is not yet registered in the wikibase then find (or create) the item corresponding to the network ASN and register the peeringDB network ID with this item. Return the network QID.""" # Check if the network is in the wikibase if str(network['id']) not in self.netid2qid: # Find or create the corresponding ASN item net_qid = self.wh.asn2qid(network['asn'], create=True) # Set properties for this new network net_qualifiers = [ (self.wh.get_pid('instance of'), self.wh.get_qid(NETID_LABEL)), ] statements = [[ self.wh.get_pid('external ID'), str(network['id']), [], net_qualifiers ]] # Add this network to the wikibase self.wh.upsert_statements('add new peeringDB network', net_qid, statements=statements) # keep track of this QID self.netid2qid[str(network['id'])] = net_qid return self.netid2qid[str(network['id'])]
class ip2plan(object):
    def __init__(self, wikihandy=None, sparql=DEFAULT_WIKI_SPARQL):
        """Fetch peering LANs and their corresponding IXP from iyp.

        wikihandy: a Wikihandy instance to use. A new one will be created if
        this is set to None.
        """

        logging.info('ip2plan initialization...\n')
        if wikihandy is None:
            self.wh = Wikihandy()
        else:
            self.wh = wikihandy

        self.rtree = radix.Radix()
        self.sparql = SPARQLWrapper(sparql)

        logging.info('Fetching prefix info...\n')
        # Fetch peering LAN prefixes, the IXP managing them, and the IXP's organization
        QUERY = """
        SELECT ?item ?prefix ?ix_qid ?org_qid
        WHERE
        {
                ?item wdt:%s wd:%s.
                ?item rdfs:label ?prefix.
                ?item wdt:%s ?ix_qid.
                ?ix_qid wdt:%s ?org_qid.
        }
        """ % (
            self.wh.get_pid('instance of'),
            self.wh.get_qid('peering LAN'),
            self.wh.get_pid('managed by'),
            self.wh.get_pid('managed by'),
        )

        # Query the wiki
        self.sparql.setQuery(QUERY)
        self.sparql.setReturnFormat(JSON)
        response = self.sparql.query().convert()
        results = response['results']

        # Parse the results
        for res in results['bindings']:
            prefix_qid = res['item']['value'].rpartition('/')[2]
            prefix = res['prefix']['value']
            ix_qid = res['ix_qid']['value'].rpartition('/')[2]
            org_qid = res['org_qid']['value'].rpartition('/')[2]

            rnode = self.rtree.add(prefix)
            rnode.data['prefix'] = prefix
            rnode.data['ix_qid'] = ix_qid
            rnode.data['prefix_qid'] = prefix_qid
            rnode.data['org_qid'] = org_qid

        logging.info(QUERY)
        logging.info(f'Found {len(self.rtree.nodes())} peering LANs')

    def lookup(self, ip):
        """Look up the given IP address.

        Returns a dictionary with the corresponding peering LAN prefix, as well
        as the QIDs of the prefix, its IXP, and the IXP's organization."""
        try:
            node = self.rtree.search_best(ip)
        except ValueError:
            print('Wrong IP address: %s' % ip)
            return None

        if node is None:
            return None
        else:
            return node.data
class Crawler(object): def __init__(self): """Initialize wikihandy and qualifiers for pushed data""" # Helper for wiki access self.wh = Wikihandy() # Added properties will have this additional information today = self.wh.today() self.caida_qid = self.wh.get_qid('CAIDA') # Get the QID for ASRank project self.asrank_qid = self.wh.get_qid( 'CAIDA ASRank', create={ # Create it if it doesn't exist 'summary': 'add CAIDA ASRank', # Commit message 'description': "CAIDA's AS ranking derived from topological data collected by CAIDA's Archipelago Measurement Infrastructure and BGP routing data collected by the Route Views Project and RIPE NCC.", # Item description 'statements': [[self.wh.get_pid('managed by'), self.caida_qid]] }) self.reference = [(self.wh.get_pid('source'), self.caida_qid), (self.wh.get_pid('reference URL'), URL_API), (self.wh.get_pid('point in time'), today)] def run(self): """Fetch networks information from ASRank and push to wikibase. """ self.wh.login() # Login once for all threads pool = ThreadPoolExecutor() has_next = True i = 0 while has_next: req = requests.get(URL_API + f'?offset={i}') if req.status_code != 200: sys.exit('Error while fetching data from API') ranking = json.loads(req.text)['data']['asns'] has_next = ranking['pageInfo']['hasNextPage'] for res in pool.map(self.update_net, ranking['edges']): sys.stderr.write( f'\rProcessing... {i+1}/{ranking["totalCount"]}') i += 1 pool.shutdown() def update_net(self, asn): """Add the network to wikibase if it's not already there and update its properties.""" asn = asn['node'] # Properties statements = [] if asn['asnName']: statements.append( [self.wh.get_pid('name'), asn['asnName'], self.reference]) # set countries cc = asn['country']['iso'] if cc: statements.append([ self.wh.get_pid('country'), self.wh.country2qid(cc), self.reference ]) # set rank statements.append([ self.wh.get_pid('ranking'), { 'amount': asn['rank'], 'unit': self.asrank_qid, }, self.reference ]) # Commit to wikibase # Get the AS QID (create if AS is not yet registered) and commit changes net_qid = self.wh.asn2qid(asn['asn'], create=True) self.wh.upsert_statements('update from CAIDA ASRank', net_qid, statements)
class Crawler(object): def __init__(self): """Initialize wikihandy and qualifiers for pushed data""" # Helper for wiki access self.wh = Wikihandy() # Added properties will have this additional information today = self.wh.today() self.org_qid = self.wh.get_qid('RIPE NCC') self.url = URL_API # url will change for each country self.reference = [ (self.wh.get_pid('source'), self.org_qid), (self.wh.get_pid('reference URL'), self.url), (self.wh.get_pid('point in time'), today) ] def get_last_line(self,line): """Keep the end of the last given line""" self.last_line = line.rpartition(' ')[2] def get_all_lines(self, line): """Keep the end of each given lines""" self.all_lines.append(line.rpartition(' ')[2]) def run(self): """Fetch data from RIPE and push to wikibase. """ now = date.today() today = f'{now.year}/{now.month:02d}/{now.day:02d}' logging.info('Connecting to the FTP server..') # Find latest roa files filepaths = [] ftp = FTP(FTP_URL) ftp.login() ftp.cwd(FTP_ROOT) self.all_lines = [] self.last_line = '' ftp.retrlines('LIST', callback=self.get_all_lines) logging.info('Listing directories...') logging.info(f'{self.all_lines}') for dir in self.all_lines: path = FTP_ROOT+'/'+dir ftp.cwd(path) self.last_line = '' while self.last_line not in ['roas.csv', 'repo.tar.gz']: ftp.cwd(self.last_line) path += self.last_line + '/' ftp.retrlines('LIST', callback=self.get_last_line) if self.last_line == 'roas.csv' and today in path: path += 'roas.csv' logging.info(f'Found ROA file: {path}') filepaths.append(path) for filepath in filepaths: self.url = URL_API+filepath logging.info(f'Fetching ROA file: {self.url}') req = requests.get( self.url ) if req.status_code != 200: sys.exit('Error while fetching data for '+filepath) # Aggregate data per prefix prefix_info = defaultdict(list) for line in req.text.splitlines(): url, asn, prefix, max_length, start, end = line.split(',') # Skip header if url=='URI': continue prefix_info[prefix].append({ 'url': url, 'asn': asn, 'max_length': max_length, 'start': start, 'end': end}) for i, (prefix, attributes) in enumerate(prefix_info.items()): self.update(prefix, attributes) sys.stderr.write(f'\rProcessing {filepath}... {i+1} prefixes ({prefix}) ') def update(self, prefix, attributes): """Add the prefix to wikibase if it's not already there and update its properties.""" statements = [] for att in attributes: qualifiers = [ [self.wh.get_pid('start time'), self.wh.to_wbtime(att['start'])], [self.wh.get_pid('end time'), self.wh.to_wbtime(att['end'])], # [self.wh.get_pid('reference URL'), url ] ] if att['max_length']: qualifiers.append( [self.wh.get_pid('maxLength'), {'amount': att['max_length']} ] ) # Properties asn_qid = self.wh.asn2qid(att['asn'], create=True) if asn_qid is None: print('Error: ', line) return statements.append( [ self.wh.get_pid('route origin authorization'), asn_qid, self.reference, qualifiers ] ) # Commit to wikibase # Get the prefix QID (create if prefix is not yet registered) and commit changes prefix_qid = self.wh.prefix2qid(prefix, create=True) self.wh.upsert_statements('update from RIPE RPKI data', prefix_qid, statements )
class Crawler(object): def __init__(self): """Create an item representing the PeeringDB exchange point ID class if doesn't already exist. And fetch QIDs for exchange points already in the wikibase.""" # Helper for wiki access self.wh = Wikihandy() # Get the QID of the item representing PeeringDB IX IDs ixid_qid = self.wh.get_qid( IXID_LABEL, create={ # Create it if it doesn't exist 'summary': 'add PeeringDB ix IDs', # Commit message 'description': 'Identifier for an exchange point in the PeeringDB database' # Description }) # Load the QIDs for ix already available in the wikibase self.ixid2qid = self.wh.extid2qid(qid=ixid_qid) # Load the QIDs for peeringDB organizations self.orgid2qid = self.wh.extid2qid(label=ORGID_LABEL) # Added properties will have this reference information self.today = self.wh.today() self.reference = [(self.wh.get_pid('source'), self.wh.get_qid('PeeringDB')), (self.wh.get_pid('reference URL'), URL_PDB_IXS), (self.wh.get_pid('point in time'), self.today)] def run(self): """Fetch ixs information from PeeringDB and push to wikibase. Using multiple threads for better performances.""" req = requests.get(URL_PDB_IXS) if req.status_code != 200: sys.exit('Error while fetching IXs data') ixs = json.loads(req.text)['data'] self.wh.login() # Login once for all threads for i, ix in enumerate(ixs): # Get more info for this IX req = requests.get(f'{URL_PDB_IXS}/{ix["id"]}') if req.status_code != 200: sys.exit('Error while fetching IXs data') ix_info = json.loads(req.text)['data'][0] # Update info in wiki self.update_ix(ix_info) sys.stderr.write(f'\rProcessing... {i+1}/{len(ixs)}') def update_ix(self, ix): """Add the ix to wikibase if it's not already there and update its properties.""" # set property name statements = [[ self.wh.get_pid('instance of'), self.wh.get_qid('Internet exchange point') ], [self.wh.get_pid('name'), ix['name'].strip(), self.reference]] # link to corresponding organization org_qid = self.orgid2qid.get(str(ix['org_id'])) if org_qid is not None: statements.append( [self.wh.get_pid('managed by'), org_qid, self.reference]) else: print('Error this organization is not in wikibase: ', ix['org_id']) # set property country if ix['country']: country_qid = self.wh.country2qid(ix['country']) if country_qid is not None: statements.append( [self.wh.get_pid('country'), country_qid, self.reference]) # set property website if ix['website']: statements.append( [self.wh.get_pid('website'), ix['website'], self.reference]) # set traffic webpage if ix['url_stats']: statements.append([ self.wh.get_pid('website'), ix['url_stats'], # statement self.reference, # reference [ (self.wh.get_pid('instance of'), self.wh.get_qid('traffic statistics')), ] # qualifier ]) ix_qid = self.ix_qid(ix) # Update name, website, and organization for this IX self.wh.upsert_statements('update peeringDB ixs', ix_qid, statements) # update LAN corresponding to this IX if 'ixlan_set' in ix: for ixlan in ix['ixlan_set']: pfx_url = f'{URL_PDB_LAN}/{ixlan["id"]}' pfx_ref = [(self.wh.get_pid('source'), self.wh.get_qid('PeeringDB')), (self.wh.get_pid('reference URL'), pfx_url), (self.wh.get_pid('point in time'), self.today)] req = requests.get(pfx_url) if req.status_code != 200: sys.exit('Error while fetching IXs data') lans = json.loads(req.text)['data'] for lan in lans: for prefix in lan['ixpfx_set']: pfx_qid = self.wh.prefix2qid(prefix['prefix'], create=True) pfx_stmts = [[ self.wh.get_pid('instance of'), self.wh.get_qid('peering LAN'), pfx_ref ], [self.wh.get_pid('managed by'), ix_qid, pfx_ref]] 
self.wh.upsert_statements('update peeringDB ixlan', pfx_qid, pfx_stmts) return ix_qid def ix_qid(self, ix): """Find the ix QID for the given ix. If this ix is not yet registered in the wikibase then add it. Return the ix QID.""" # Check if the IX is in the wikibase if str(ix['id']) not in self.ixid2qid: # Set properties for this new ix ix_qualifiers = [ (self.wh.get_pid('instance of'), self.wh.get_qid(IXID_LABEL)), ] statements = [(self.wh.get_pid('instance of'), self.wh.get_qid('Internet exchange point')), (self.wh.get_pid('external ID'), str(ix['id']), [], ix_qualifiers)] # Add this ix to the wikibase ix_qid = self.wh.add_item('add new peeringDB IX', label=ix['name'], description=ix['name_long'], statements=statements) # keep track of this QID self.ixid2qid[str(ix['id'])] = ix_qid return self.ixid2qid[str(ix['id'])]
class Crawler(object): def __init__(self): """ """ # Helper for wiki access self.wh = Wikihandy(preload=True) # Get the QID for Routeviews organization self.org_qid = self.wh.get_qid('Route Views') self.today = self.wh.today() def run(self): """Fetch BGP data from collectors and push to wikibase. """ today = arrow.now().replace(hour=0, minute=0) start = today.shift(hours=-1) end = today.shift(hours=1) stream = pybgpstream.BGPStream( from_time=int(start.timestamp()), until_time=int(end.timestamp()), record_type="ribs", ) rtree = radix.Radix() sys.stderr.write(f'\nReading BGP data:\n') for i, elem in enumerate(stream): # Extract the prefix and origin ASN msg = elem.fields prefix = msg['prefix'] origin_asn_str = msg['as-path'].split(' ')[-1] origin_asns = [] if '{' in origin_asn_str: origin_asns = origin_asn_str[1:-1].split(',') else: origin_asns = [origin_asn_str] # Store origin ASN in radix tree rnode = rtree.search_exact(prefix) if rnode is None: rnode = rtree.add(prefix) rnode.data['origin'] = defaultdict(set) for asn in origin_asns: rnode.data['origin'][asn].add(elem.collector) sys.stderr.write(f'\rProcessed {i+1} BGP messages') sys.stderr.write(f'\nPushing data to IYP...\n') # Push all prefixes data to IYP for i, rnode in enumerate(rtree): data = rnode.data['origin'] self.update_entry(rnode.prefix, data) sys.stderr.write(f'\rProcessed {i+1} prefixes') def update_entry(self, prefix, originasn_collector): """Add the prefix to wikibase if it's not already there and update its properties.""" statements = [] # set origin AS for asn, collectors in originasn_collector.items(): for collector in collectors: # Added properties will have this additional information url = URL_RV if 'rrc' in collector: url = URL_RIS self.reference = [ (self.wh.get_pid('source'), self.org_qid), (self.wh.get_pid('reference URL'), url.format(collector)), (self.wh.get_pid('point in time'), self.today) ] as_qid = self.wh.asn2qid(asn, create=True) statements.append( [self.wh.get_pid('originated by'), as_qid, self.reference]) # Commit to wikibase # Get the prefix QID (create if prefix is not yet registered) and commit changes prefix_qid = self.wh.prefix2qid(prefix, create=True) self.wh.upsert_statements('update from RIS/Routeviews RIBs', prefix_qid, statements)
print('Adding items')

statements = defaultdict(list)
# wikidata = wikihandy.Wikihandy(wikidata_project='wikidata', lang='wikidata')
with open(BASIC_ITEMS_FNAME, 'r') as fp:
    csvdata = csv.reader(decomment(fp), skipinitialspace=True)

    for row in csvdata:
        if not row:
            continue

        label, description, aliases, statements = [col.strip() for col in row]
        print(label)

        # Retrieve statements from the csv file
        # Assume all properties have the 'wikidata-item' datatype
        claims = []
        for statement in statements.split('|'):
            try:
                property, target = statement.split(':')
            except ValueError:
                # Skip lines with no statement
                continue

            claims.append(
                [wh.get_pid(property.strip()), wh.get_qid(target), []])

        wh.add_item("bootstrap", label, description, aliases, claims)
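# Hypothetical row of the basic-items CSV, matching the four columns unpacked
# above; the statements column holds 'property:target' pairs separated by '|',
# where targets are item labels (all values here are made up):
#
#   Example org, An example organization, ex-org, instance of:organization|managed by:Example org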
class Crawler(object): def __init__(self): """Initialize wikihandy and qualifiers for pushed data""" # Helper for wiki access self.wh = Wikihandy() # Added properties will have this additional information today = self.wh.today() self.apnic_qid = self.wh.get_qid('APNIC') self.url = URL_API # url will change for each country self.reference = [(self.wh.get_pid('source'), self.apnic_qid), (self.wh.get_pid('reference URL'), self.url), (self.wh.get_pid('point in time'), today)] self.countries = iso3166.countries_by_alpha2 def run(self): """Fetch data from APNIC and push to wikibase. """ self.wh.login() # Login once for all threads pool = ThreadPoolExecutor() for cc, country in self.countries.items(): # Get the QID of the selected country / create this country if needed self.countryrank_qid = self.wh.get_qid( f'APNIC eyeball estimates ({cc})', create={ # Create it if it doesn't exist 'summary': 'add APNIC eyeball estimates for ' + cc, 'description': "APNIC's AS population estimates" + "based on advertisement for " + country.name, 'statements': [ [self.wh.get_pid('managed by'), self.apnic_qid], [self.wh.get_pid('website'), URL_API], [self.wh.get_pid('country'), self.wh.country2qid(cc)], ] }) self.countrypercent_qid = self.wh.get_qid( f'% of Internet users in {country.name}', create={ # Create it if it doesn't exist 'summary': 'add APNIC eyeball estimates for ' + cc, 'description': "APNIC's AS population estimates" + "based on advertisement for " + country.name, 'statements': [ [self.wh.get_pid('managed by'), self.apnic_qid], [self.wh.get_pid('website'), URL_API], [self.wh.get_pid('country'), self.wh.country2qid(cc)], ] }) self.url = URL_API + f'{cc}/{cc}.asns.json?m={MIN_POP_PERC}' req = requests.get(self.url) if req.status_code != 200: sys.exit('Error while fetching data for ' + cc) ranking = json.loads(req.text) # Make sure the ranking is sorted and add rank field ranking.sort(key=lambda x: x['percent'], reverse=True) for i, asn in enumerate(ranking): asn['rank'] = i # Push data to wiki for i, res in enumerate(pool.map(self.update_net, ranking)): sys.stderr.write( f'\rProcessing {country.name}... {i+1}/{len(ranking)}') pool.shutdown() def update_net(self, asn): """Add the network to wikibase if it's not already there and update its properties.""" # Properties statements = [] # set name if asn['autnum']: statements.append( [self.wh.get_pid('name'), asn['autnum'], self.reference]) # set country if asn['cc']: statements.append([ self.wh.get_pid('country'), self.wh.country2qid(asn['cc']), self.reference ]) # set rank statements.append([ self.wh.get_pid('ranking'), { 'amount': asn['rank'], 'unit': self.countryrank_qid, }, self.reference ]) # set population statements.append([ self.wh.get_pid('population'), { 'amount': asn['percent'], 'unit': self.countrypercent_qid, }, self.reference ]) # Commit to wikibase # Get the AS QID (create if AS is not yet registered) and commit changes net_qid = self.wh.asn2qid(asn['as'], create=True) self.wh.upsert_statements('update from APNIC eyeball ranking', net_qid, statements)
class Crawler(object): def __init__(self): """ """ # Helper for wiki access self.wh = Wikihandy(preload=True) # Get the QID for RIPE Atlas self.atlas_qid = self.wh.get_qid( 'RIPE Atlas', create={ # Create it if it doesn't exist 'summary': 'add RIPE Atlas', # Commit message 'description': 'RIPE Atlas is a global, open, distributed Internet measurement platform, consisting of thousands of measurement devices that measure Internet connectivity in real time.', # Item description 'aliases': 'Atlas|atlas', 'statements': [[self.wh.get_pid('managed by'), self.wh.get_qid('RIPE NCC')]] }) # Get the QID for Atlas Probe self.atlas_probe_qid = self.wh.get_qid( 'Atlas probe', create={ # Create it if it doesn't exist 'summary': 'add RIPE Atlas', # Commit message 'description': 'RIPE Atlas probes form the backbone of the RIPE Atlas infrastructure.', # Item description 'aliases': 'RIPE Atlas probe|atlas probe|RIPE atlas probe', 'statements': [[self.wh.get_pid('part of'), self.atlas_qid]] }) # Get the QID for Atlas Anchor self.atlas_anchor_qid = self.wh.get_qid( 'Atlas anchor', create={ # Create it if it doesn't exist 'summary': 'add RIPE Atlas', # Commit message 'description': 'RIPE Atlas Anchors are located at hosts that can provide sufficient bandwidth to support a large number of incoming and outgoing measurements.', # Item description 'aliases': 'RIPE Atlas anchor|atlas anchor|RIPE atlas anchor', 'statements': [[self.wh.get_pid('part of'), self.atlas_qid]] }) # Get the QID of the item representing PeeringDB IX IDs self.probeid_qid = self.wh.get_qid( PROBEID_LABEL, create={ # Create it if it doesn't exist 'summary': 'add RIPE Atlas probes', # Commit message 'description': 'Identifier for a probe in the RIPE Atlas measurement platform' # Description }) # Load the QIDs for probes already available in the wikibase self.probeid2qid = self.wh.extid2qid(qid=self.probeid_qid) # Added properties will have this additional information today = self.wh.today() self.reference = [(self.wh.get_pid('source'), self.wh.get_qid('RIPE NCC')), (self.wh.get_pid('reference URL'), URL), (self.wh.get_pid('point in time'), today)] self.v4_qualifiers = [(self.wh.get_pid('IP version'), self.wh.get_qid('IPv4'))] self.v6_qualifiers = [(self.wh.get_pid('IP version'), self.wh.get_qid('IPv6'))] def run(self): """Fetch probe information from Atlas API and push to wikibase. """ next_page = URL while next_page is not None: req = requests.get(next_page) if req.status_code != 200: sys.exit('Error while fetching the blocklist') info = json.loads(req.text) next_page = info['next'] for i, probe in enumerate(info['results']): self.update_probe(probe) sys.stderr.write(f'\rProcessed {i+1} probes') sys.stderr.write(f'\n') def update_probe(self, probe): """Add the probe to wikibase if it's not already there and update its properties.""" # TODO add status, geometry (geo-location) and IPs? 
# Properties for this probe statements = [] if probe['is_anchor']: statements.append( [self.wh.get_pid('instance of'), self.atlas_probe_qid]) statements.append( [self.wh.get_pid('instance of'), self.atlas_anchor_qid]) if probe['asn_v4']: as_qid = self.wh.asn2qid(probe['asn_v4']) if as_qid: statements.append([ self.wh.get_pid('part of'), as_qid, self.reference, self.v4_qualifiers ]) if probe['asn_v6']: as_qid = self.wh.asn2qid(probe['asn_v6']) if as_qid: statements.append([ self.wh.get_pid('part of'), as_qid, self.reference, self.v6_qualifiers ]) if probe['prefix_v4']: prefix_qid = self.wh.prefix2qid(probe['prefix_v4']) if prefix_qid: statements.append( [self.wh.get_pid('part of'), prefix_qid, self.reference]) if probe['prefix_v6']: prefix_qid = self.wh.prefix2qid(probe['prefix_v6']) if prefix_qid: statements.append( [self.wh.get_pid('part of'), prefix_qid, self.reference]) if probe['country_code']: statements.append([ self.wh.get_pid('country'), self.wh.country2qid(probe['country_code']), self.reference ]) if probe['first_connected']: statements.append([ self.wh.get_pid('start time'), self.wh.to_wbtime(probe['first_connected']), self.reference ]) if 'name' in probe['status']: # Get the QIDs for probes status status_qid = self.wh.get_qid( f'RIPE Atlas probe status: {probe["status"]["name"]}', create={ # Create it if it doesn't exist 'summary': 'add RIPE Atlas probe status', # Commit message }) if probe['status_since']: statements.append([ self.wh.get_pid('status'), status_qid, self.reference, [(self.wh.get_pid('start time'), self.wh.to_wbtime(probe['status_since']))] ]) # set end time if the probe is abandonned if probe['status']['name'] == 'Abandoned' and probe['status_since']: statements.append([ self.wh.get_pid('end time'), self.wh.to_wbtime(probe['status_since']) ]) # Add probe tags for tag in probe['tags']: statements.append([ self.wh.get_pid('tag'), self.wh.get_qid(tag['name'], create={ 'summary': 'Add RIPE Atlas tag', }) ]) # Commit to wikibase # Get the probe QID (create if probe is not yet registered) and commit changes probe_qid = self.probe_qid(probe) self.wh.upsert_statements('update from RIPE Atlas probes', probe_qid, statements) def probe_qid(self, probe): """Find the ix QID for the given probe ID. If this probe is not yet registered in the wikibase then add it. Return the probe QID.""" id = str(probe['id']) # Check if the IX is in the wikibase if id not in self.probeid2qid: # Set properties for this new probe probeid_qualifiers = [ (self.wh.get_pid('instance of'), self.probeid_qid), ] statements = [ (self.wh.get_pid('instance of'), self.atlas_probe_qid), (self.wh.get_pid('external ID'), id, [], probeid_qualifiers) ] # Add this probe to the wikibase probe_qid = self.wh.add_item('add new RIPE Atlas probe', label=f'RIPE Atlas probe #{id}', description=probe['description'], statements=statements) # keep track of this QID self.probeid2qid[id] = probe_qid return self.probeid2qid[id]
class Crawler(object):
    def __init__(self, fdns_url=URL):
        """Fetch the QID for Rapid7 and initialize wikihandy."""

        sys.stderr.write('Initialization...\n')
        self.fdns_url = fdns_url
        # Helper for wiki access
        self.wh = Wikihandy()

        self.org_qid = self.wh.get_qid(
            'Rapid7',
            create={  # Create it if it doesn't exist
                'summary': 'add Rapid7 forward DNS data',  # Commit message
                'description': 'Rapid7, a security company that provides unified vulnerability management solutions',  # Item description
                'statements': [
                    [
                        self.wh.get_pid('instance of'),
                        self.wh.get_qid('organization')
                    ],
                    [self.wh.get_pid('website'), 'https://www.rapid7.com/'],
                ]
            })

        # Added properties will have this additional information
        today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'), self.org_qid),
                          (self.wh.get_pid('reference URL'), fdns_url),
                          (self.wh.get_pid('point in time'), today)]

        self.ia = ip2asn(wikihandy=self.wh)

        # Keep track of all resolved prefixes so we just make one push per domain
        self.tld_pfx = defaultdict(set)

    def match_domain_prefix(self, line):
        """Parse a line from the Rapid7 dataset, extract the domain and IP, and
        find the corresponding IP prefix.

        return: (domain name, prefix) or None, None if the domain is not in the
        wiki
        """

        tld = None
        prefix = None

        datapoint = json.loads(line)
        if (datapoint['type'] in ['a', 'aaaa']
                and 'value' in datapoint
                and 'name' in datapoint):

            ext = tldextract.extract(datapoint['name'])
            tld = ext[-2] + '.' + ext[-1]

            # Skip domains not in the wiki
            if self.wh.domain2qid(tld) is None:
                return tld, None

            ip_info = self.ia.lookup(datapoint['value'])
            if ip_info is None:
                return tld, None

            prefix = ip_info['prefix']
            self.tld_pfx[tld].add(prefix)

        return tld, prefix

    def run(self):
        """Fetch Rapid7 forward DNS data, find the corresponding BGP prefixes,
        and push resolutions for domains already in the wikibase."""

        # Download Rapid7 data and find the corresponding prefixes
        sys.stderr.write('Downloading Rapid7 dataset...\n')
        fname = self.fdns_url.split('/')[-1]
        if not os.path.exists(fname):
            fname = download_file(self.fdns_url, fname)

        sys.stderr.write('Processing dataset...\n')
        if os.path.exists(fname + '.pickle'):
            sys.stderr.write('Load data from cache!')
            self.tld_pfx = pickle.load(open(fname + '.pickle', 'rb'))
        else:
            with gzip.open(fname, 'rt') as finput:
                for line in finput:
                    self.match_domain_prefix(line)

            pickle.dump(self.tld_pfx, open(fname + '.pickle', 'wb'))

        sys.stderr.write(
            f'Found {len(self.tld_pfx)} domain names in the Rapid7 dataset out of the {len(self.wh._domain2qid)} domain names in the wiki\n'
        )

        # Push data to the wiki
        for i, (tld, pfxs) in enumerate(self.tld_pfx.items()):
            sys.stderr.write(
                f'\33[2K\rUpdating iyp... {i+1}/{len(self.tld_pfx)}\t{tld} {len(pfxs)} prefixes'
            )
            self.update(tld, pfxs)
        sys.stderr.write('\n')

    def update(self, tld, pfxs):
        """Update statements for the given domain name."""

        # Make all statements
        statements = []
        for pfx in pfxs:
            pfx_qid = self.wh.prefix2qid(pfx, create=True)
            statements.append(
                [self.wh.get_pid('forward DNS'), pfx_qid, self.reference])

        # Commit to wikibase
        # Get the domain name QID and commit changes
        dn_qid = self.wh.domain2qid(tld)
        try:
            # TODO remove old data with URL regex
            self.wh.upsert_statements('update from Rapid7 forward DNS data',
                                      dn_qid, statements)
        except Exception as e:
            logging.error(f'Could not update domain {dn_qid}')
            logging.error(str(e))
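if __name__ == '__main__':
    # Hypothetical Rapid7 forward-DNS record, limited to the fields that
    # match_domain_prefix() reads (the domain and IP are made-up values):
    line = '{"name": "www.example.com", "type": "a", "value": "192.0.2.10"}'
    datapoint = json.loads(line)
    print(datapoint['name'], datapoint['type'], datapoint['value'])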
class Crawler(object): def __init__(self): """ """ # Helper for wiki access self.wh = Wikihandy(preload=True) # Get the QID for Spamhaus organization self.spamhaus_qid = self.wh.get_qid( 'Spamhaus', create={ # Create it if it doesn't exist 'summary': 'add Spamhaus organization', # Commit message 'description': 'The Spamhaus Project is an international organisation to track email spammers and spam-related activity', # Item description 'aliases': 'The Spamhaus Project|the spamhaus project', 'statements': [[ self.wh.get_pid('instance of'), self.wh.get_qid('organization') ]] }) # Get the QID for Spamhaus DROP project self.drop_qid = self.wh.get_qid( 'Spamhaus DROP lists', create={ # Create it if it doesn't exist 'summary': 'add Spamhaus block list', # Commit message 'description': "The Spamhaus Don't Route Or Peer Lists", # Item description 'statements': [[self.wh.get_pid('managed by'), self.spamhaus_qid]] }) # Get the QID for Spamhaus ASN-DROP list self.asn_drop_qid = self.wh.get_qid( 'Spamhaus ASN-DROP list', create={ # Create it if it doesn't exist 'summary': 'add Spamhaus block list', # Commit message 'description': 'ASN-DROP contains a list of Autonomous System Numbers controlled by spammers or cyber criminals, as well as "hijacked" ASNs. ', # Item description 'statements': [[self.wh.get_pid('managed by'), self.spamhaus_qid], [self.wh.get_pid('part of'), self.drop_qid]] }) # Added properties will have this additional information today = self.wh.today() self.reference = [(self.wh.get_pid('source'), self.spamhaus_qid), (self.wh.get_pid('reference URL'), URL), (self.wh.get_pid('point in time'), today)] def run(self): """Fetch blocklist from Spamhaus and push to wikibase. """ req = requests.get(URL) if req.status_code != 200: sys.exit('Error while fetching the blocklist') for i, row in enumerate(req.text.splitlines()): # Skip the header if row.startswith(';'): continue self.update_net(row) sys.stderr.write(f'\rProcessed {i+1} ASes') sys.stderr.write(f'\n') def update_net(self, one_line): """Add the network to wikibase if it's not already there and update its properties.""" asn, _, cc_name = one_line.partition(';') asn = int(asn[2:]) cc, name = [word.strip() for word in cc_name.split('|')] # Properties for this AS statements = [ [ self.wh.get_pid('reported in'), self.asn_drop_qid, self.reference ], [self.wh.get_pid('name'), name, self.reference], ] # set countries if len(cc) == 2: cc_qid = self.wh.country2qid(cc) if cc_qid is not None: statements.append( [self.wh.get_pid('country'), cc_qid, self.reference]) # Commit to wikibase # Get the AS QID (create if AS is not yet registered) and commit changes net_qid = self.wh.asn2qid(asn, create=True) self.wh.upsert_statements('update from Spamhaus ASN DROP list', net_qid, statements)