class Crawler(object):
    def __init__(self, url=URL):
        """Initialize wikihandy and the Spamhaus reference items."""

        # API endpoint
        self.url = url

        # Helper for wiki access
        self.wh = Wikihandy(preload=True)

        # Get the QID for the Spamhaus organization
        self.spamhaus_qid = self.wh.get_qid(
            'Spamhaus',
            create={  # Create it if it doesn't exist
                'summary': 'add Spamhaus organization',  # Commit message
                'description': 'The Spamhaus Project is an international organisation that tracks email spammers and spam-related activity',  # Item description
                'aliases': 'The Spamhaus Project|the spamhaus project',
                'statements': [[self.wh.get_pid('instance of'),
                                self.wh.get_qid('organization')]]
            })

        # Get the QID for the Spamhaus DROP project
        self.drop_project_qid = self.wh.get_qid(
            'Spamhaus DROP lists',
            create={  # Create it if it doesn't exist
                'summary': 'add Spamhaus block list',  # Commit message
                'description': "The Spamhaus Don't Route Or Peer Lists",  # Item description
                'statements': [[self.wh.get_pid('managed by'), self.spamhaus_qid]]
            })

        # Get the QID for the Spamhaus DROP list
        self.drop_qid = self.wh.get_qid(
            'Spamhaus DROP list',
            create={  # Create it if it doesn't exist
                'summary': 'add Spamhaus block list',  # Commit message
                'description': 'The DROP list only includes netblocks allocated directly by an established RIR or NIR.',
                'statements': [[self.wh.get_pid('managed by'), self.spamhaus_qid],
                               [self.wh.get_pid('part of'), self.drop_project_qid]]
            })

        # Added properties will have this additional information
        today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'), self.spamhaus_qid),
                          (self.wh.get_pid('reference URL'), self.url),
                          (self.wh.get_pid('point in time'), today)]

    def run(self):
        """Fetch the blocklist from Spamhaus and push it to the wikibase."""

        req = requests.get(self.url)
        if req.status_code != 200:
            sys.exit('Error while fetching the blocklist')

        for i, row in enumerate(req.text.splitlines()):
            # Skip the header
            if row.startswith(';'):
                continue

            self.update_net(row)
            sys.stderr.write(f'\rProcessed {i+1} prefixes')
        sys.stderr.write('\n')

    def update_net(self, one_line):
        """Add the prefix to the wikibase if it's not already there and update
        its properties."""

        prefix, _, _ = one_line.partition(';')

        # Properties for this prefix
        statements = [
            [self.wh.get_pid('reported in'), self.drop_qid, self.reference],
        ]

        # Commit to wikibase
        # Get the prefix QID (create it if the prefix is not yet registered)
        # and commit changes
        net_qid = self.wh.prefix2qid(prefix, create=True)
        self.wh.upsert_statements('update from Spamhaus DROP list',
                                  net_qid, statements)
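# Illustrative sketch of the line format handled by update_net() above,
# assuming the usual Spamhaus DROP text layout of '<prefix> ; <SBL id>' (the
# exact format is determined by the module-level URL constant, not shown here):
#
#     line = '192.0.2.0/24 ; SBL123456'    # hypothetical entry
#     prefix, _, _ = line.partition(';')   # prefix == '192.0.2.0/24 '
#
# partition(';') keeps everything before the first semicolon verbatim, so the
# extracted prefix may carry trailing whitespace.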
class Crawler(object):
    def __init__(self):
        """Initialize wikihandy and qualifiers for pushed data."""

        # Helper for wiki access
        self.wh = Wikihandy()

        # Added properties will have this additional information
        today = self.wh.today()
        self.org_qid = self.wh.get_qid('RIPE NCC')
        self.url = URL_API  # url will change for each country
        self.reference = [
            (self.wh.get_pid('source'), self.org_qid),
            (self.wh.get_pid('reference URL'), self.url),
            (self.wh.get_pid('point in time'), today)
        ]

    def get_last_line(self, line):
        """Keep the end of the last given line."""
        self.last_line = line.rpartition(' ')[2]

    def get_all_lines(self, line):
        """Keep the end of each given line."""
        self.all_lines.append(line.rpartition(' ')[2])

    def run(self):
        """Fetch data from RIPE and push it to the wikibase."""

        now = date.today()
        today = f'{now.year}/{now.month:02d}/{now.day:02d}'

        logging.info('Connecting to the FTP server...')
        # Find the latest roa files
        filepaths = []
        ftp = FTP(FTP_URL)
        ftp.login()
        ftp.cwd(FTP_ROOT)

        self.all_lines = []
        self.last_line = ''
        ftp.retrlines('LIST', callback=self.get_all_lines)
        logging.info('Listing directories...')
        logging.info(f'{self.all_lines}')

        for dir in self.all_lines:
            path = FTP_ROOT + '/' + dir
            ftp.cwd(path)

            self.last_line = ''

            while self.last_line not in ['roas.csv', 'repo.tar.gz']:
                ftp.cwd(self.last_line)
                path += self.last_line + '/'
                ftp.retrlines('LIST', callback=self.get_last_line)

            if self.last_line == 'roas.csv' and today in path:
                path += 'roas.csv'
                logging.info(f'Found ROA file: {path}')
                filepaths.append(path)

        for filepath in filepaths:
            self.url = URL_API + filepath
            logging.info(f'Fetching ROA file: {self.url}')
            req = requests.get(self.url)
            if req.status_code != 200:
                sys.exit('Error while fetching data for ' + filepath)

            # Aggregate data per prefix
            prefix_info = defaultdict(list)
            for line in req.text.splitlines():
                url, asn, prefix, max_length, start, end = line.split(',')

                # Skip header
                if url == 'URI':
                    continue

                prefix_info[prefix].append({
                    'url': url,
                    'asn': asn,
                    'max_length': max_length,
                    'start': start,
                    'end': end})

            for i, (prefix, attributes) in enumerate(prefix_info.items()):
                self.update(prefix, attributes)
                sys.stderr.write(
                    f'\rProcessing {filepath}... {i+1} prefixes ({prefix}) ')

    def update(self, prefix, attributes):
        """Add the prefix to the wikibase if it's not already there and update
        its properties."""

        statements = []
        for att in attributes:

            qualifiers = [
                [self.wh.get_pid('start time'), self.wh.to_wbtime(att['start'])],
                [self.wh.get_pid('end time'), self.wh.to_wbtime(att['end'])],
                # [self.wh.get_pid('reference URL'), url ]
            ]
            if att['max_length']:
                qualifiers.append(
                    [self.wh.get_pid('maxLength'), {'amount': att['max_length']}])

            # Properties
            asn_qid = self.wh.asn2qid(att['asn'], create=True)
            if asn_qid is None:
                print('Error: ', prefix, att)
                return

            statements.append(
                [self.wh.get_pid('route origin authorization'),
                 asn_qid,
                 self.reference,
                 qualifiers])

        # Commit to wikibase
        # Get the prefix QID (create it if the prefix is not yet registered)
        # and commit changes
        prefix_qid = self.wh.prefix2qid(prefix, create=True)
        self.wh.upsert_statements('update from RIPE RPKI data',
                                  prefix_qid, statements)
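# Illustrative sketch of one roas.csv record as consumed by run() above, with
# made-up values; the column order is an assumption based on the header check
# (url == 'URI') and the unpacking in the loop:
#
#     line = 'rsync://rpki.ripe.net/repo/example.roa,AS3333,193.0.0.0/21,21,2021-01-01 00:00:00,2022-07-01 00:00:00'
#     url, asn, prefix, max_length, start, end = line.split(',')
#     # prefix == '193.0.0.0/21', max_length == '21'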
class Crawler(object):
    def __init__(self):
        """Initialize wikihandy and the reference organization."""

        # Helper for wiki access
        self.wh = Wikihandy(preload=True)

        # Get the QID for the Route Views organization
        self.org_qid = self.wh.get_qid('Route Views')
        self.today = self.wh.today()

    def run(self):
        """Fetch BGP data from collectors and push it to the wikibase."""

        today = arrow.now().replace(hour=0, minute=0)
        start = today.shift(hours=-1)
        end = today.shift(hours=1)
        stream = pybgpstream.BGPStream(
            from_time=int(start.timestamp()),
            until_time=int(end.timestamp()),
            record_type="ribs",
        )

        rtree = radix.Radix()

        sys.stderr.write('\nReading BGP data:\n')
        for i, elem in enumerate(stream):
            # Extract the prefix and origin ASN
            msg = elem.fields
            prefix = msg['prefix']
            origin_asn_str = msg['as-path'].split(' ')[-1]
            origin_asns = []
            if '{' in origin_asn_str:
                # AS set: keep all members
                origin_asns = origin_asn_str[1:-1].split(',')
            else:
                origin_asns = [origin_asn_str]

            # Store origin ASNs in the radix tree
            rnode = rtree.search_exact(prefix)
            if rnode is None:
                rnode = rtree.add(prefix)
                rnode.data['origin'] = defaultdict(set)

            for asn in origin_asns:
                rnode.data['origin'][asn].add(elem.collector)
            sys.stderr.write(f'\rProcessed {i+1} BGP messages')

        sys.stderr.write('\nPushing data to IYP...\n')

        # Push all prefix data to IYP
        for i, rnode in enumerate(rtree):
            data = rnode.data['origin']
            self.update_entry(rnode.prefix, data)
            sys.stderr.write(f'\rProcessed {i+1} prefixes')

    def update_entry(self, prefix, originasn_collector):
        """Add the prefix to the wikibase if it's not already there and update
        its properties."""

        statements = []

        # Set the origin AS
        for asn, collectors in originasn_collector.items():
            for collector in collectors:
                # Added properties will have this additional information
                url = URL_RV
                if 'rrc' in collector:
                    url = URL_RIS

                self.reference = [
                    (self.wh.get_pid('source'), self.org_qid),
                    (self.wh.get_pid('reference URL'), url.format(collector)),
                    (self.wh.get_pid('point in time'), self.today)
                ]

                as_qid = self.wh.asn2qid(asn, create=True)
                statements.append(
                    [self.wh.get_pid('originated by'), as_qid, self.reference])

        # Commit to wikibase
        # Get the prefix QID (create it if the prefix is not yet registered)
        # and commit changes
        prefix_qid = self.wh.prefix2qid(prefix, create=True)
        self.wh.upsert_statements('update from RIS/Routeviews RIBs',
                                  prefix_qid, statements)
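# Illustrative sketch of the origin extraction in run() above (made-up paths):
# the last hop of the AS path is either a plain ASN or an AS set in braces.
#
#     'as-path': '2914 3356 3333'         ->  origin_asns == ['3333']
#     'as-path': '2914 3356 {3333,1200}'  ->  origin_asns == ['3333', '1200']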
class Crawler(object):
    def __init__(self):
        """Create an item representing the PeeringDB exchange point ID class,
        if it doesn't already exist, and fetch QIDs for exchange points
        already in the wikibase."""

        # Helper for wiki access
        self.wh = Wikihandy()

        # Get the QID of the item representing PeeringDB IX IDs
        ixid_qid = self.wh.get_qid(
            IXID_LABEL,
            create={  # Create it if it doesn't exist
                'summary': 'add PeeringDB ix IDs',  # Commit message
                'description': 'Identifier for an exchange point in the PeeringDB database'  # Description
            })

        # Load the QIDs for IXs already available in the wikibase
        self.ixid2qid = self.wh.extid2qid(qid=ixid_qid)
        # Load the QIDs for PeeringDB organizations
        self.orgid2qid = self.wh.extid2qid(label=ORGID_LABEL)

        # Added properties will have this reference information
        self.today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'), self.wh.get_qid('PeeringDB')),
                          (self.wh.get_pid('reference URL'), URL_PDB_IXS),
                          (self.wh.get_pid('point in time'), self.today)]

    def run(self):
        """Fetch IX information from PeeringDB and push it to the wikibase.
        Using multiple threads for better performance."""

        req = requests.get(URL_PDB_IXS)
        if req.status_code != 200:
            sys.exit('Error while fetching IXs data')
        ixs = json.loads(req.text)['data']

        self.wh.login()  # Login once for all threads

        for i, ix in enumerate(ixs):
            # Get more info for this IX
            req = requests.get(f'{URL_PDB_IXS}/{ix["id"]}')
            if req.status_code != 200:
                sys.exit('Error while fetching IXs data')
            ix_info = json.loads(req.text)['data'][0]

            # Update info in the wiki
            self.update_ix(ix_info)
            sys.stderr.write(f'\rProcessing... {i+1}/{len(ixs)}')

    def update_ix(self, ix):
        """Add the IX to the wikibase if it's not already there and update its
        properties."""

        # Set property name
        statements = [[self.wh.get_pid('instance of'),
                       self.wh.get_qid('Internet exchange point')],
                      [self.wh.get_pid('name'), ix['name'].strip(), self.reference]]

        # Link to the corresponding organization
        org_qid = self.orgid2qid.get(str(ix['org_id']))
        if org_qid is not None:
            statements.append(
                [self.wh.get_pid('managed by'), org_qid, self.reference])
        else:
            print('Error: this organization is not in the wikibase: ', ix['org_id'])

        # Set property country
        if ix['country']:
            country_qid = self.wh.country2qid(ix['country'])
            if country_qid is not None:
                statements.append(
                    [self.wh.get_pid('country'), country_qid, self.reference])

        # Set property website
        if ix['website']:
            statements.append(
                [self.wh.get_pid('website'), ix['website'], self.reference])

        # Set traffic webpage
        if ix['url_stats']:
            statements.append([
                self.wh.get_pid('website'), ix['url_stats'],  # statement
                self.reference,                               # reference
                [
                    (self.wh.get_pid('instance of'),
                     self.wh.get_qid('traffic statistics')),
                ]                                             # qualifier
            ])

        ix_qid = self.ix_qid(ix)
        # Update name, website, and organization for this IX
        self.wh.upsert_statements('update peeringDB ixs', ix_qid, statements)

        # Update the LANs corresponding to this IX
        if 'ixlan_set' in ix:
            for ixlan in ix['ixlan_set']:
                pfx_url = f'{URL_PDB_LAN}/{ixlan["id"]}'
                pfx_ref = [(self.wh.get_pid('source'), self.wh.get_qid('PeeringDB')),
                           (self.wh.get_pid('reference URL'), pfx_url),
                           (self.wh.get_pid('point in time'), self.today)]

                req = requests.get(pfx_url)
                if req.status_code != 200:
                    sys.exit('Error while fetching IXs data')
                lans = json.loads(req.text)['data']

                for lan in lans:
                    for prefix in lan['ixpfx_set']:
                        pfx_qid = self.wh.prefix2qid(prefix['prefix'], create=True)
                        pfx_stmts = [[self.wh.get_pid('instance of'),
                                      self.wh.get_qid('peering LAN'),
                                      pfx_ref],
                                     [self.wh.get_pid('managed by'), ix_qid, pfx_ref]]
                        self.wh.upsert_statements('update peeringDB ixlan',
                                                  pfx_qid, pfx_stmts)

        return ix_qid

    def ix_qid(self, ix):
        """Find the ix QID for the given ix. If this ix is not yet registered
        in the wikibase then add it. Return the ix QID."""

        # Check if the IX is in the wikibase
        if str(ix['id']) not in self.ixid2qid:
            # Set properties for this new ix
            ix_qualifiers = [
                (self.wh.get_pid('instance of'), self.wh.get_qid(IXID_LABEL)),
            ]
            statements = [(self.wh.get_pid('instance of'),
                           self.wh.get_qid('Internet exchange point')),
                          (self.wh.get_pid('external ID'),
                           str(ix['id']), [], ix_qualifiers)]

            # Add this ix to the wikibase
            ix_qid = self.wh.add_item('add new peeringDB IX',
                                      label=ix['name'],
                                      description=ix['name_long'],
                                      statements=statements)
            # keep track of this QID
            self.ixid2qid[str(ix['id'])] = ix_qid

        return self.ixid2qid[str(ix['id'])]
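# Note on the statement shape used throughout this class (and the other
# crawlers in this repository): judging from the calls to upsert_statements()
# and add_item(), statements appear to follow the convention
#
#     [property_pid, target_value]                          # bare claim
#     [property_pid, target_value, references]              # claim + references
#     [property_pid, target_value, references, qualifiers]  # claim + references + qualifiers
#
# The authoritative definition lives in the Wikihandy helper; this is only an
# inference from how it is called here.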
class Crawler(object):
    def __init__(self):
        """Initialize wikihandy and the RIPE Atlas reference items."""

        # Helper for wiki access
        self.wh = Wikihandy(preload=True)

        # Get the QID for RIPE Atlas
        self.atlas_qid = self.wh.get_qid(
            'RIPE Atlas',
            create={  # Create it if it doesn't exist
                'summary': 'add RIPE Atlas',  # Commit message
                'description': 'RIPE Atlas is a global, open, distributed Internet measurement platform, consisting of thousands of measurement devices that measure Internet connectivity in real time.',  # Item description
                'aliases': 'Atlas|atlas',
                'statements': [[self.wh.get_pid('managed by'),
                                self.wh.get_qid('RIPE NCC')]]
            })

        # Get the QID for Atlas probes
        self.atlas_probe_qid = self.wh.get_qid(
            'Atlas probe',
            create={  # Create it if it doesn't exist
                'summary': 'add RIPE Atlas',  # Commit message
                'description': 'RIPE Atlas probes form the backbone of the RIPE Atlas infrastructure.',  # Item description
                'aliases': 'RIPE Atlas probe|atlas probe|RIPE atlas probe',
                'statements': [[self.wh.get_pid('part of'), self.atlas_qid]]
            })

        # Get the QID for Atlas anchors
        self.atlas_anchor_qid = self.wh.get_qid(
            'Atlas anchor',
            create={  # Create it if it doesn't exist
                'summary': 'add RIPE Atlas',  # Commit message
                'description': 'RIPE Atlas Anchors are located at hosts that can provide sufficient bandwidth to support a large number of incoming and outgoing measurements.',  # Item description
                'aliases': 'RIPE Atlas anchor|atlas anchor|RIPE atlas anchor',
                'statements': [[self.wh.get_pid('part of'), self.atlas_qid]]
            })

        # Get the QID of the item representing RIPE Atlas probe IDs
        self.probeid_qid = self.wh.get_qid(
            PROBEID_LABEL,
            create={  # Create it if it doesn't exist
                'summary': 'add RIPE Atlas probes',  # Commit message
                'description': 'Identifier for a probe in the RIPE Atlas measurement platform'  # Description
            })

        # Load the QIDs for probes already available in the wikibase
        self.probeid2qid = self.wh.extid2qid(qid=self.probeid_qid)

        # Added properties will have this additional information
        today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'), self.wh.get_qid('RIPE NCC')),
                          (self.wh.get_pid('reference URL'), URL),
                          (self.wh.get_pid('point in time'), today)]

        self.v4_qualifiers = [(self.wh.get_pid('IP version'),
                               self.wh.get_qid('IPv4'))]

        self.v6_qualifiers = [(self.wh.get_pid('IP version'),
                               self.wh.get_qid('IPv6'))]

    def run(self):
        """Fetch probe information from the RIPE Atlas API and push it to the
        wikibase."""

        next_page = URL

        while next_page is not None:
            req = requests.get(next_page)
            if req.status_code != 200:
                sys.exit('Error while fetching the probe list')

            info = json.loads(req.text)
            next_page = info['next']

            for i, probe in enumerate(info['results']):
                self.update_probe(probe)
                sys.stderr.write(f'\rProcessed {i+1} probes')
        sys.stderr.write('\n')

    def update_probe(self, probe):
        """Add the probe to the wikibase if it's not already there and update
        its properties."""

        # TODO add status, geometry (geo-location) and IPs?
        # Properties for this probe
        statements = []
        if probe['is_anchor']:
            statements.append(
                [self.wh.get_pid('instance of'), self.atlas_probe_qid])
            statements.append(
                [self.wh.get_pid('instance of'), self.atlas_anchor_qid])

        if probe['asn_v4']:
            as_qid = self.wh.asn2qid(probe['asn_v4'])
            if as_qid:
                statements.append([
                    self.wh.get_pid('part of'), as_qid,
                    self.reference, self.v4_qualifiers
                ])

        if probe['asn_v6']:
            as_qid = self.wh.asn2qid(probe['asn_v6'])
            if as_qid:
                statements.append([
                    self.wh.get_pid('part of'), as_qid,
                    self.reference, self.v6_qualifiers
                ])

        if probe['prefix_v4']:
            prefix_qid = self.wh.prefix2qid(probe['prefix_v4'])
            if prefix_qid:
                statements.append(
                    [self.wh.get_pid('part of'), prefix_qid, self.reference])

        if probe['prefix_v6']:
            prefix_qid = self.wh.prefix2qid(probe['prefix_v6'])
            if prefix_qid:
                statements.append(
                    [self.wh.get_pid('part of'), prefix_qid, self.reference])

        if probe['country_code']:
            statements.append([
                self.wh.get_pid('country'),
                self.wh.country2qid(probe['country_code']),
                self.reference
            ])

        if probe['first_connected']:
            statements.append([
                self.wh.get_pid('start time'),
                self.wh.to_wbtime(probe['first_connected']),
                self.reference
            ])

        if 'name' in probe['status']:
            # Get the QID for this probe status
            status_qid = self.wh.get_qid(
                f'RIPE Atlas probe status: {probe["status"]["name"]}',
                create={  # Create it if it doesn't exist
                    'summary': 'add RIPE Atlas probe status',  # Commit message
                })

            if probe['status_since']:
                statements.append([
                    self.wh.get_pid('status'), status_qid, self.reference,
                    [(self.wh.get_pid('start time'),
                      self.wh.to_wbtime(probe['status_since']))]
                ])

            # Set the end time if the probe is abandoned
            if probe['status']['name'] == 'Abandoned' and probe['status_since']:
                statements.append([
                    self.wh.get_pid('end time'),
                    self.wh.to_wbtime(probe['status_since'])
                ])

        # Add probe tags
        for tag in probe['tags']:
            statements.append([
                self.wh.get_pid('tag'),
                self.wh.get_qid(tag['name'],
                                create={
                                    'summary': 'Add RIPE Atlas tag',
                                })
            ])

        # Commit to wikibase
        # Get the probe QID (create it if the probe is not yet registered) and
        # commit changes
        probe_qid = self.probe_qid(probe)
        self.wh.upsert_statements('update from RIPE Atlas probes', probe_qid,
                                  statements)

    def probe_qid(self, probe):
        """Find the probe QID for the given probe ID. If this probe is not yet
        registered in the wikibase then add it. Return the probe QID."""

        id = str(probe['id'])

        # Check if the probe is in the wikibase
        if id not in self.probeid2qid:
            # Set properties for this new probe
            probeid_qualifiers = [
                (self.wh.get_pid('instance of'), self.probeid_qid),
            ]
            statements = [
                (self.wh.get_pid('instance of'), self.atlas_probe_qid),
                (self.wh.get_pid('external ID'), id, [], probeid_qualifiers)
            ]

            # Add this probe to the wikibase
            probe_qid = self.wh.add_item('add new RIPE Atlas probe',
                                         label=f'RIPE Atlas probe #{id}',
                                         description=probe['description'],
                                         statements=statements)
            # Keep track of this QID
            self.probeid2qid[id] = probe_qid

        return self.probeid2qid[id]
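# Illustrative sketch of the probe record fields read by update_probe() above,
# with hypothetical values (the real RIPE Atlas API response contains more
# fields than shown here):
#
#     {
#         'id': 6012, 'is_anchor': True,
#         'asn_v4': 3333, 'asn_v6': 3333,
#         'prefix_v4': '193.0.0.0/21', 'prefix_v6': '2001:67c:2e8::/48',
#         'country_code': 'NL',
#         'first_connected': 1286276761,
#         'status': {'name': 'Connected'}, 'status_since': 1286276761,
#         'tags': [{'name': 'system-anchor'}],
#         'description': 'Example anchor probe',
#     }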
class Crawler(object):
    def __init__(self, fdns_url=URL):
        """Fetch the QID for Rapid7 and initialize wikihandy."""

        sys.stderr.write('Initialization...\n')
        self.fdns_url = fdns_url
        # Helper for wiki access
        self.wh = Wikihandy()

        self.org_qid = self.wh.get_qid(
            'Rapid7',
            create={  # Create it if it doesn't exist
                'summary': 'add Rapid7 forward DNS data',  # Commit message
                'description': 'Rapid7, a security company that provides unified vulnerability management solutions',  # Item description
                'statements': [
                    [self.wh.get_pid('instance of'),
                     self.wh.get_qid('organization')],
                    [self.wh.get_pid('website'), 'https://www.rapid7.com/'],
                ]
            })

        # Added properties will have this additional information
        today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'), self.org_qid),
                          (self.wh.get_pid('reference URL'), fdns_url),
                          (self.wh.get_pid('point in time'), today)]

        self.ia = ip2asn(wikihandy=self.wh)

        # Keep track of all resolved prefixes so we make only one push per
        # domain
        self.tld_pfx = defaultdict(set)

    def match_domain_prefix(self, line):
        """Parse a line from the Rapid7 dataset, extract the domain and IP,
        and find the corresponding IP prefix.

        return: (domain name, prefix) or None, None if the domain is not in
        the wiki
        """

        tld = None
        prefix = None

        datapoint = json.loads(line)
        if (datapoint['type'] in ['a', 'aaaa']
                and 'value' in datapoint
                and 'name' in datapoint):

            ext = tldextract.extract(datapoint['name'])
            tld = ext[-2] + '.' + ext[-1]

            # Skip domains not in the wiki
            if self.wh.domain2qid(tld) is None:
                return tld, None

            ip_info = self.ia.lookup(datapoint['value'])
            if ip_info is None:
                return tld, None

            prefix = ip_info['prefix']
            self.tld_pfx[tld].add(prefix)

        return tld, prefix

    def run(self):
        """Fetch Rapid7 forward DNS data, find the corresponding BGP prefixes
        and push the resolution for domains already in the wikibase."""

        # Download Rapid7 data and find the corresponding prefixes
        sys.stderr.write('Downloading Rapid7 dataset...\n')
        fname = self.fdns_url.split('/')[-1]
        if not os.path.exists(fname):
            fname = download_file(self.fdns_url, fname)

        sys.stderr.write('Processing dataset...\n')
        if os.path.exists(fname + '.pickle'):
            sys.stderr.write('Loading data from cache!\n')
            self.tld_pfx = pickle.load(open(fname + '.pickle', 'rb'))
        else:
            with gzip.open(fname, 'rt') as finput:
                for line in finput:
                    self.match_domain_prefix(line)

            pickle.dump(self.tld_pfx, open(fname + '.pickle', 'wb'))

        sys.stderr.write(
            f'Found {len(self.tld_pfx)} domain names in the Rapid7 dataset out of the {len(self.wh._domain2qid)} domain names in the wiki\n'
        )

        # Push data to the wiki
        for i, (tld, pfxs) in enumerate(self.tld_pfx.items()):
            sys.stderr.write(
                f'\33[2K\rUpdating iyp... {i+1}/{len(self.tld_pfx)}\t{tld} {len(pfxs)} prefixes'
            )
            self.update(tld, pfxs)
        sys.stderr.write('\n')

    def update(self, tld, pfxs):
        """Update statements for the given domain name."""

        # Make all statements
        statements = []
        for pfx in pfxs:
            pfx_qid = self.wh.prefix2qid(pfx, create=True)
            statements.append(
                [self.wh.get_pid('forward DNS'), pfx_qid, self.reference])

        # Commit to wikibase
        # Get the domain name QID and commit changes
        dn_qid = self.wh.domain2qid(tld)
        try:
            # TODO remove old data with URL regex
            self.wh.upsert_statements('update from Rapid7 forward DNS data',
                                      dn_qid, statements)
        except Exception as e:
            logging.error(f'Could not update domain {dn_qid}')
            logging.error(str(e))
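# Illustrative sketch of a forward-DNS record as parsed by match_domain_prefix()
# above (hypothetical values; the dataset is gzipped JSON lines):
#
#     {"timestamp": "1620000000", "name": "www.example.com", "type": "a", "value": "192.0.2.1"}
#
# tldextract reduces 'www.example.com' to the registered domain 'example.com',
# which is then looked up with wh.domain2qid().

# Minimal usage sketch, assuming these crawler modules are run as scripts (the
# actual entry point and the URL constant are defined elsewhere in the file):
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    crawler = Crawler()
    crawler.run()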