class Crawler(object):
    """Import RIPE RPKI Route Origin Authorizations (ROAs) into the wikibase.

    The crawler walks the RIPE FTP archive to locate today's ``roas.csv``
    files, downloads each of them over HTTP, groups the ROAs per prefix and
    pushes one 'route origin authorization' statement per ROA.
    """

    def __init__(self):
        """Initialize wikihandy and qualifiers for pushed data"""

        # Helper for wiki access
        self.wh = Wikihandy()

        # Added properties will have this additional information
        self.today = self.wh.today()
        self.org_qid = self.wh.get_qid('RIPE NCC')
        # Placeholder: run() replaces it with the URL of each fetched file
        self.url = URL_API
        self.reference = self._make_reference(self.url)

    def _make_reference(self, url):
        """Build the reference attached to every pushed statement for `url`."""
        return [
            (self.wh.get_pid('source'), self.org_qid),
            (self.wh.get_pid('reference URL'), url),
            (self.wh.get_pid('point in time'), self.today),
        ]

    def get_last_line(self, line):
        """Keep the end of the last given line"""

        # Only the text after the last space is kept (FTP LIST callback)
        self.last_line = line.rpartition(' ')[2]

    def get_all_lines(self, line):
        """Keep the end of each given lines"""

        self.all_lines.append(line.rpartition(' ')[2])

    def _find_roa_files(self, today):
        """Walk the FTP archive and return the paths of today's roas.csv files.

        `today` is a 'YYYY/MM/DD' string; only files whose path contains it
        are returned. Below each entry of FTP_ROOT the walk always descends
        into the last listed entry (presumably the most recent date -- this
        relies on the server's listing order) until it reaches 'roas.csv' or
        'repo.tar.gz'.
        """
        logging.info('Connecting to the FTP server..')

        filepaths = []
        # The context manager closes the connection, even on error
        with FTP(FTP_URL) as ftp:
            ftp.login()
            ftp.cwd(FTP_ROOT)

            self.all_lines = []
            self.last_line = ''
            ftp.retrlines('LIST', callback=self.get_all_lines)

            logging.info('Listing directories...')
            logging.info(f'{self.all_lines}')

            for directory in self.all_lines:
                path = FTP_ROOT + '/' + directory
                ftp.cwd(path)
                self.last_line = ''

                while self.last_line not in ['roas.csv', 'repo.tar.gz']:
                    # First pass: last_line is '' so we stay in `path` and
                    # only append the separating '/'
                    ftp.cwd(self.last_line)
                    path += self.last_line + '/'
                    ftp.retrlines('LIST', callback=self.get_last_line)

                if self.last_line == 'roas.csv' and today in path:
                    path += 'roas.csv'
                    logging.info(f'Found ROA file: {path}')
                    filepaths.append(path)

        return filepaths

    @staticmethod
    def _group_by_prefix(text):
        """Parse the content of a roas.csv file and group the ROAs per prefix.

        Return a dict mapping each prefix to a list of dicts with the keys
        'url', 'asn', 'max_length', 'start' and 'end'.
        """
        prefix_info = defaultdict(list)
        for line in text.splitlines():
            # A blank line would break the six-fields unpacking below
            if not line.strip():
                continue

            url, asn, prefix, max_length, start, end = line.split(',')

            # Skip header
            if url == 'URI':
                continue

            prefix_info[prefix].append({
                'url': url,
                'asn': asn,
                'max_length': max_length,
                'start': start,
                'end': end,
            })

        return prefix_info

    def run(self):
        """Fetch data from RIPE and push to wikibase.

        Exits the program (sys.exit) if one of the ROA files cannot be
        downloaded.
        """

        now = date.today()
        today = f'{now.year}/{now.month:02d}/{now.day:02d}'

        # Find latest roa files
        for filepath in self._find_roa_files(today):
            self.url = URL_API + filepath
            # Keep the reference URL in sync with the file actually fetched
            self.reference = self._make_reference(self.url)

            logging.info(f'Fetching ROA file: {self.url}')
            req = requests.get(self.url)
            if req.status_code != 200:
                sys.exit('Error while fetching data for ' + filepath)

            # Aggregate data per prefix
            prefix_info = self._group_by_prefix(req.text)

            for i, (prefix, attributes) in enumerate(prefix_info.items()):
                self.update(prefix, attributes)
                sys.stderr.write(
                    f'\rProcessing {filepath}... {i+1} prefixes ({prefix})     ')

    def update(self, prefix, attributes):
        """Add the prefix to wikibase if it's not already there and update its
        properties.

        `attributes` is the list of ROAs (dicts with the keys 'asn',
        'max_length', 'start' and 'end') found for `prefix`. If the QID of
        one of the origin ASes cannot be obtained, the error is logged and
        nothing is pushed for this prefix.
        """

        start_pid = self.wh.get_pid('start time')
        end_pid = self.wh.get_pid('end time')
        roa_pid = self.wh.get_pid('route origin authorization')

        statements = []
        for att in attributes:

            qualifiers = [
                [start_pid, self.wh.to_wbtime(att['start'])],
                [end_pid, self.wh.to_wbtime(att['end'])],
            ]

            # maxLength is optional: the field may be empty
            if att['max_length']:
                qualifiers.append(
                    [self.wh.get_pid('maxLength'), {'amount': att['max_length']}])

            # Properties
            asn_qid = self.wh.asn2qid(att['asn'], create=True)
            if asn_qid is None:
                logging.error(
                    f'Error: no QID for AS {att["asn"]} (prefix {prefix}): {att}')
                return

            statements.append(
                [roa_pid, asn_qid, self.reference, qualifiers])

        # Commit to wikibase
        # Get the prefix QID (create if prefix is not yet registered) and
        # commit changes
        prefix_qid = self.wh.prefix2qid(prefix, create=True)
        self.wh.upsert_statements(
            'update from RIPE RPKI data', prefix_qid, statements)
class Crawler(object):
    """Import RIPE Atlas probes into the wikibase.

    Probes are read page by page from the Atlas API (URL) and each probe is
    pushed with its ASes, prefixes, country, connection time, status and tags.
    """

    def __init__(self):
        """Initialize wikihandy, make sure the items describing RIPE Atlas
        exist, and prepare the reference and qualifiers for pushed data."""

        # Helper for wiki access
        self.wh = Wikihandy(preload=True)

        # Get the QID for RIPE Atlas
        self.atlas_qid = self.wh.get_qid(
            'RIPE Atlas',
            create={  # Create it if it doesn't exist
                'summary': 'add RIPE Atlas',  # Commit message
                'description': 'RIPE Atlas is a global, open, distributed Internet measurement platform, consisting of thousands of measurement devices that measure Internet connectivity in real time.',  # Item description
                'aliases': 'Atlas|atlas',
                'statements': [[self.wh.get_pid('managed by'), self.wh.get_qid('RIPE NCC')]]
            })

        # Get the QID for Atlas Probe
        self.atlas_probe_qid = self.wh.get_qid(
            'Atlas probe',
            create={  # Create it if it doesn't exist
                'summary': 'add RIPE Atlas',  # Commit message
                'description': 'RIPE Atlas probes form the backbone of the RIPE Atlas infrastructure.',  # Item description
                'aliases': 'RIPE Atlas probe|atlas probe|RIPE atlas probe',
                'statements': [[self.wh.get_pid('part of'), self.atlas_qid]]
            })

        # Get the QID for Atlas Anchor
        self.atlas_anchor_qid = self.wh.get_qid(
            'Atlas anchor',
            create={  # Create it if it doesn't exist
                'summary': 'add RIPE Atlas',  # Commit message
                'description': 'RIPE Atlas Anchors are located at hosts that can provide sufficient bandwidth to support a large number of incoming and outgoing measurements.',  # Item description
                'aliases': 'RIPE Atlas anchor|atlas anchor|RIPE atlas anchor',
                'statements': [[self.wh.get_pid('part of'), self.atlas_qid]]
            })

        # Get the QID of the item representing RIPE Atlas probe IDs
        self.probeid_qid = self.wh.get_qid(
            PROBEID_LABEL,
            create={  # Create it if it doesn't exist
                'summary': 'add RIPE Atlas probes',  # Commit message
                'description': 'Identifier for a probe in the RIPE Atlas measurement platform'  # Description
            })

        # Load the QIDs for probes already available in the wikibase
        self.probeid2qid = self.wh.extid2qid(qid=self.probeid_qid)

        # Added properties will have this additional information
        today = self.wh.today()
        self.reference = [
            (self.wh.get_pid('source'), self.wh.get_qid('RIPE NCC')),
            (self.wh.get_pid('reference URL'), URL),
            (self.wh.get_pid('point in time'), today)
        ]

        # Qualifiers telling apart the IPv4 and IPv6 AS of a probe
        self.v4_qualifiers = [
            (self.wh.get_pid('IP version'), self.wh.get_qid('IPv4'))
        ]
        self.v6_qualifiers = [
            (self.wh.get_pid('IP version'), self.wh.get_qid('IPv6'))
        ]

    def run(self):
        """Fetch probe information from Atlas API and push to wikibase.

        Follows the 'next' link of each page until it is None. Exits the
        program (sys.exit) if a page cannot be fetched.
        """

        next_page = URL
        nb_probes = 0  # running total across all pages

        while next_page is not None:
            req = requests.get(next_page)
            if req.status_code != 200:
                sys.exit('Error while fetching RIPE Atlas probes')

            info = json.loads(req.text)
            next_page = info['next']

            for probe in info['results']:
                self.update_probe(probe)
                nb_probes += 1
                sys.stderr.write(f'\rProcessed {nb_probes} probes')

        sys.stderr.write('\n')

    def update_probe(self, probe):
        """Add the probe to wikibase if it's not already there and update its
        properties.

        `probe` is one entry of the 'results' list returned by the API.
        ASes, prefixes and countries that cannot be resolved to a QID are
        skipped.
        """

        # TODO add status, geometry (geo-location) and IPs?

        # Properties for this probe
        statements = []

        if probe['is_anchor']:
            statements.append(
                [self.wh.get_pid('instance of'), self.atlas_probe_qid])
            statements.append(
                [self.wh.get_pid('instance of'), self.atlas_anchor_qid])

        if probe['asn_v4']:
            as_qid = self.wh.asn2qid(probe['asn_v4'])
            if as_qid:
                statements.append([
                    self.wh.get_pid('part of'), as_qid,
                    self.reference, self.v4_qualifiers
                ])

        if probe['asn_v6']:
            as_qid = self.wh.asn2qid(probe['asn_v6'])
            if as_qid:
                statements.append([
                    self.wh.get_pid('part of'), as_qid,
                    self.reference, self.v6_qualifiers
                ])

        if probe['prefix_v4']:
            prefix_qid = self.wh.prefix2qid(probe['prefix_v4'])
            if prefix_qid:
                statements.append(
                    [self.wh.get_pid('part of'), prefix_qid, self.reference])

        if probe['prefix_v6']:
            prefix_qid = self.wh.prefix2qid(probe['prefix_v6'])
            if prefix_qid:
                statements.append(
                    [self.wh.get_pid('part of'), prefix_qid, self.reference])

        if probe['country_code']:
            country_qid = self.wh.country2qid(probe['country_code'])
            # Same guard as for ASes and prefixes: skip unresolved countries
            if country_qid:
                statements.append(
                    [self.wh.get_pid('country'), country_qid, self.reference])

        if probe['first_connected']:
            statements.append([
                self.wh.get_pid('start time'),
                self.wh.to_wbtime(probe['first_connected']),
                self.reference
            ])

        # Everything status-related needs the status name, so it is all kept
        # under this check
        if 'name' in probe['status']:
            status_name = probe['status']['name']

            # Get the QIDs for probes status
            status_qid = self.wh.get_qid(
                f'RIPE Atlas probe status: {status_name}',
                create={  # Create it if it doesn't exist
                    'summary': 'add RIPE Atlas probe status',  # Commit message
                })

            if probe['status_since']:
                statements.append([
                    self.wh.get_pid('status'), status_qid,
                    self.reference,
                    [(self.wh.get_pid('start time'),
                      self.wh.to_wbtime(probe['status_since']))]
                ])

                # set end time if the probe is abandoned
                if status_name == 'Abandoned':
                    statements.append([
                        self.wh.get_pid('end time'),
                        self.wh.to_wbtime(probe['status_since'])
                    ])

        # Add probe tags
        for tag in probe['tags']:
            statements.append([
                self.wh.get_pid('tag'),
                self.wh.get_qid(tag['name'], create={
                    'summary': 'Add RIPE Atlas tag',
                })
            ])

        # Commit to wikibase
        # Get the probe QID (create if probe is not yet registered) and
        # commit changes
        probe_qid = self.probe_qid(probe)
        self.wh.upsert_statements(
            'update from RIPE Atlas probes', probe_qid, statements)

    def probe_qid(self, probe):
        """Find the QID for the given probe.

        If this probe is not yet registered in the wikibase then add it.

        Return the probe QID."""

        # self.probeid2qid is keyed by the probe ID as a string
        probe_id = str(probe['id'])

        # Check if the probe is in the wikibase
        if probe_id not in self.probeid2qid:
            # Set properties for this new probe
            probeid_qualifiers = [
                (self.wh.get_pid('instance of'), self.probeid_qid),
            ]
            statements = [
                (self.wh.get_pid('instance of'), self.atlas_probe_qid),
                (self.wh.get_pid('external ID'), probe_id, [], probeid_qualifiers)
            ]

            # Add this probe to the wikibase
            new_qid = self.wh.add_item(
                'add new RIPE Atlas probe',
                label=f'RIPE Atlas probe #{probe_id}',
                description=probe['description'],
                statements=statements)

            # keep track of this QID
            self.probeid2qid[probe_id] = new_qid

        return self.probeid2qid[probe_id]