# Example no. 1
class Crawler(object):
    """Push AS names and countries from the RIPE NCC AS-name file to the wikibase."""

    def __init__(self):
        """Create the wiki helper and the reference attached to every statement."""

        # Helper for wiki access
        self.wh = Wikihandy()

        # Reference information for data pushed to the wikibase
        self.reference = [
            (self.wh.get_pid('source'), self.wh.get_qid('RIPE NCC')),
            (self.wh.get_pid('reference URL'), URL_RIPE_AS_NAME),
            (self.wh.get_pid('point in time'), self.wh.today()),
        ]

    def run(self):
        """Fetch the AS name file from RIPE website and process lines one by one"""

        req = requests.get(URL_RIPE_AS_NAME)
        if req.status_code != 200:
            sys.exit('Error while fetching AS names')

        self.wh.login()  # Login once for all threads, not needed with OAuth

        # Count from 1 so the progress line reports lines actually processed
        lines = req.text.splitlines()
        for i, _ in enumerate(map(self.update_asn, lines), start=1):
            sys.stderr.write(f'\rProcessed {i} ASes')

    def update_asn(self, one_line):
        """Push the name and country found on one line of the AS-name file.

        ``one_line`` is expected to look like ``'<asn> <name>, <cc>'``.
        Errors raised while committing are printed and swallowed so that a
        single bad line does not stop the whole crawl.

        Return the QID of the AS item.
        """
        # Parse given line to get ASN, name, and country code
        asn, _, name_cc = one_line.partition(' ')
        name, sep, cc = name_cc.rpartition(', ')
        if not sep:
            # No ', CC' suffix: everything after the ASN is the name
            name, cc = name_cc, ''

        asn_qid = self.wh.asn2qid(asn, create=True)
        cc_qid = self.wh.country2qid(cc, create=True) if cc else None

        # The name is always set; the country only when it could be resolved
        statements = [[self.wh.get_pid('name'), name, self.reference]]
        if cc_qid is not None:
            statements.append(
                [self.wh.get_pid('country'), cc_qid, self.reference])

        try:
            # Update AS name and country
            self.wh.upsert_statements('updates from RIPE AS names', asn_qid,
                                      statements)

        except Exception as error:
            # print errors and continue running
            print('Error for: ', one_line)
            print(error)

        return asn_qid
# Example no. 2
class Crawler(object):
    """Push MANRS membership, countries and implemented actions to the wikibase."""

    def __init__(self):
        """Fetch QIDs for MANRS actions (create them if they are not in the
        wikibase)."""

        # Helper for wiki access
        self.wh = Wikihandy()

        # Actions defined by MANRS
        self.actions = [{
            'label': 'MANRS Action 1: Filtering',
            'description': 'Prevent propagation of incorrect routing information'
        }, {
            'label': 'MANRS Action 2: Anti-spoofing',
            'description': 'Prevent traffic with spoofed source IP addresses'
        }, {
            'label': 'MANRS Action 3: Coordination',
            'description': 'Facilitate global operational communication and coordination'
        }, {
            'label': 'MANRS Action 4: Global Validation',
            'description': 'Facilitate routing information on a global scale'
        }]

        # Get the QID for the four items representing MANRS actions
        for action in self.actions:
            action['qid'] = self.wh.get_qid(
                action['label'],
                create={  # Create it if it doesn't exist
                    'summary': 'add MANRS actions',  # Commit message
                    'description': action['description']  # Item description
                })

        # Added properties will have this additional information
        today = self.wh.today()
        # NOTE(review): the source item was lost in extraction; 'MANRS' is the
        # label used for the 'member of' statement in update_net -- confirm.
        self.reference = [(self.wh.get_pid('source'), self.wh.get_qid('MANRS')),
                          (self.wh.get_pid('reference URL'), URL_MANRS),
                          (self.wh.get_pid('point in time'), today)]

    def run(self):
        """Fetch networks information from MANRS and push to wikibase. """

        req = requests.get(URL_MANRS)
        if req.status_code != 200:
            sys.exit('Error while fetching MANRS csv file')

        for i, row in enumerate(req.text.splitlines()):
            # Skip the header
            if i == 0:
                continue

            self.update_net(row)
            sys.stderr.write(f'\rProcessed {i} organizations')

    def update_net(self, one_line):
        """Add the network to wikibase if it's not already there and update its
        properties.

        ``one_line`` is one CSV row with seven comma-separated columns:
        organization, areas (';'-separated country codes), ASNs
        (';'-separated) and one 'Yes'/other flag per MANRS action.
        """

        _, areas, asns, act1, act2, act3, act4 = [
            col.strip() for col in one_line.split(',')
        ]

        # Properties
        statements = [
            [
                self.wh.get_pid('member of'),
                self.wh.get_qid('MANRS'), self.reference
            ],
        ]

        # set countries
        for cc in areas.split(';'):
            cc = cc.strip()
            if not cc:
                continue  # organization with no area listed
            country_qid = self.wh.country2qid(cc)
            if country_qid is None:
                continue  # unknown country code: nothing to link to
            statements.append(
                [self.wh.get_pid('country'), country_qid, self.reference])

        # set actions
        for i, action_bool in enumerate([act1, act2, act3, act4]):
            if action_bool == 'Yes':
                statements.append([
                    self.wh.get_pid('implements'), self.actions[i]['qid'],
                    self.reference
                ])

        # Commit to wikibase
        for asn in asns.split(';'):
            asn = asn.strip()
            if asn:  # ignore organizations with no ASN
                # Get the AS QID (create if AS is not yet registered) and commit changes
                net_qid = self.wh.asn2qid(asn, create=True)
                self.wh.upsert_statements('update from MANRS membership',
                                          net_qid, statements)
# Example no. 3
class Crawler(object):
    """Push PeeringDB organizations (name, website, country) to the wikibase."""

    def __init__(self):
        """Create an item representing the 'PeeringDB organization ID' class if
        doesn't already exist. And fetch QIDs for organizations already in the
        wikibase."""

        # Helper for wiki access
        self.wh = Wikihandy()

        # Get the QID for the item representing the organization IDs
        # NOTE(review): the label argument was lost in extraction; ORGID_LABEL
        # is the label org_qid() uses for the same item -- confirm.
        orgid_qid = self.wh.get_qid(
            ORGID_LABEL,
            create={  # Create it if it doesn't exist
                'summary': 'add PeeringDB org IDs',  # Commit message
                'description':
                'Identifier for an organization in the PeeringDB database'
            })

        # Load the QIDs for organizations already available in the wikibase
        self.orgid2qid = self.wh.extid2qid(qid=orgid_qid)

        # Added properties will have this reference information
        today = self.wh.today()
        # NOTE(review): the source item label was lost in extraction;
        # 'PeeringDB' is assumed -- confirm.
        self.reference = [(self.wh.get_pid('source'),
                           self.wh.get_qid('PeeringDB')),
                          (self.wh.get_pid('reference URL'), URL_PDB_ORGS),
                          (self.wh.get_pid('point in time'), today)]

    def run(self):
        """Fetch organizations information from PeeringDB and push to wikibase"""

        sys.stderr.write('Fetching PeeringDB data...\n')
        req = requests.get(URL_PDB_ORGS)
        if req.status_code != 200:
            sys.exit('Error while fetching AS names')
        organizations = json.loads(req.text)['data']

        self.wh.login()  # Login once for all threads

        for i, _ in enumerate(map(self.update_org, organizations)):
            sys.stderr.write(f'\rProcessing... {i+1}/{len(organizations)}')

    def update_org(self, organization):
        """Add the organization to wikibase if it's not there and update properties

        ``organization`` is one entry of the PeeringDB 'data' list; the keys
        read here are 'id', 'name', 'website' and 'country'.

        Return the organization QID.
        """

        # set property name
        # NOTE(review): the 'instance of' target was lost in extraction;
        # the 'organization' item is assumed -- confirm.
        statements = [
            [self.wh.get_pid('instance of'),
             self.wh.get_qid('organization')],
            [
                self.wh.get_pid('name'),
                organization['name'].strip(), self.reference
            ],
        ]

        # set property website
        if organization['website']:
            statements.append([
                self.wh.get_pid('website'), organization['website'],
                self.reference
            ])

        # set property country
        if organization['country'] in iso3166.countries_by_alpha2:
            # NOTE(review): lookup argument lost in extraction; the country
            # name from iso3166 is assumed to be the item label -- confirm.
            country_qid = self.wh.get_qid(
                iso3166.countries_by_alpha2[organization['country']].name)
            if country_qid is not None:
                statements.append(
                    [self.wh.get_pid('country'), country_qid, self.reference])

        # Update name, website, and country for this organization
        org_qid = self.org_qid(organization)
        self.wh.upsert_statements('update peeringDB organization', org_qid,
                                  statements)

        return org_qid

    def org_qid(self, organization):
        """Find the organization QID or add it to wikibase if it is not yet there.
        Return the organization QID."""

        org_id = str(organization['id'])

        # Check if the organization is in the wikibase
        if org_id not in self.orgid2qid:
            # Set properties for this new organization
            org_qualifier = [
                (self.wh.get_pid('instance of'), self.wh.get_qid(ORGID_LABEL)),
            ]
            statements = [[
                self.wh.get_pid('external ID'), org_id, [], org_qualifier
            ]]

            # Add this organization to the wikibase
            # NOTE(review): keyword arguments lost in extraction; assumed to
            # be the item label and its initial statements -- confirm.
            org_qid = self.wh.add_item('add new peeringDB organization',
                                       label=organization['name'],
                                       statements=statements)
            # keep track of this QID
            self.orgid2qid[org_id] = org_qid

        return self.orgid2qid[org_id]
class Crawler(object):
    """Flag prefixes listed in a Spamhaus DROP list in the wikibase."""

    def __init__(self, url=URL):
        """Fetch QIDs for Spamhaus and its DROP list (create them if they are
        not in the wikibase).

        ``url`` is the location of the block list to download.
        """

        #API endpoint
        self.url = url

        # Helper for wiki access
        self.wh = Wikihandy(preload=True)

        # Get the QID for Spamhaus organization
        # NOTE(review): the item label and the 'instance of' target were lost
        # in extraction; 'Spamhaus' and 'organization' are assumed -- confirm.
        self.spamhaus_qid = self.wh.get_qid(
            'Spamhaus',
            create={  # Create it if it doesn't exist
                'summary': 'add Spamhaus organization',  # Commit message
                'description':
                'The Spamhaus Project is an international organisation to track email spammers and spam-related activity',  # Item description
                'aliases': 'The Spamhaus Project|the spamhaus project',
                'statements': [[
                    self.wh.get_pid('instance of'),
                    self.wh.get_qid('organization')
                ]]
            })

        # Get the QID for Spamhaus DROP project
        self.drop_qid = self.wh.get_qid(
            'Spamhaus DROP lists',
            create={  # Create it if it doesn't exist
                'summary': 'add Spamhaus block list',  # Commit message
                'description':
                "The Spamhaus Don't Route Or Peer Lists",  # Item description
                'statements':
                [[self.wh.get_pid('managed by'), self.spamhaus_qid]]
            })

        # Get the QID for Spamhaus DROP list
        # (drop_qid still holds the project QID while the dict is built, so
        # the 'part of' statement points at the project item)
        self.drop_qid = self.wh.get_qid(
            'Spamhaus DROP list',
            create={  # Create it if it doesn't exist
                'summary': 'add Spamhaus block list',  # Commit message
                'description':
                'The DROP list only include netblocks allocated directly by an established RIR or NIR.',
                'statements':
                [[self.wh.get_pid('managed by'), self.spamhaus_qid],
                 [self.wh.get_pid('part of'), self.drop_qid]]
            })

        # Added properties will have this additional information
        today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'), self.spamhaus_qid),
                          (self.wh.get_pid('reference URL'), self.url),
                          (self.wh.get_pid('point in time'), today)]

    def run(self):
        """Fetch blocklist from Spamhaus and push to wikibase. """

        req = requests.get(self.url)
        if req.status_code != 200:
            sys.exit('Error while fetching the blocklist')

        for i, row in enumerate(req.text.splitlines()):
            # Skip the header (comment lines) and blank lines
            if row.startswith(';') or not row.strip():
                continue

            self.update_net(row)
            sys.stderr.write(f'\rProcessed {i+1} prefixes')

    def update_net(self, one_line):
        """Add the prefix to wikibase if it's not already there and update its
        properties.

        ``one_line`` is one row of the list; everything before the first ';'
        is taken as the prefix.
        """

        prefix, _, _ = one_line.partition(';')
        # Drop the whitespace that separates the prefix from the ';'
        prefix = prefix.strip()

        # Properties for this prefix
        statements = [
            [self.wh.get_pid('reported in'), self.drop_qid, self.reference],
        ]

        # Commit to wikibase
        # Get the prefix QID (create if prefix is not yet registered) and commit changes
        net_qid = self.wh.prefix2qid(prefix, create=True)
        self.wh.upsert_statements('update from Spamhaus DROP list', net_qid,
                                  statements)
class Crawler(object):
    """Push IHR per-country AS rankings to the wikibase."""

    def __init__(self):
        """Initialize wikihandy """

        # Helper for wiki access
        self.wh = Wikihandy()

        # Added properties will have this additional information
        self.org_qid = self.wh.get_qid(ORG)
        self.countries = iso3166.countries_by_alpha2

        # Session object to fetch peeringdb data
        retries = Retry(total=15,
                        status_forcelist=[104, 500, 502, 503, 504])

        self.http_session = requests.Session()
        self.http_session.mount('https://', HTTPAdapter(max_retries=retries))

    def run(self):
        """Fetch data from API and push to wikibase. """

        for cc, country in self.countries.items():
            # Query IHR
            self.url = URL_API.format(country=cc)
            req = self.http_session.get(self.url + '&format=json')
            if req.status_code != 200:
                sys.exit('Error while fetching data for ' + cc)
            data = json.loads(req.text)
            ranking = data['results']

            # Setup references
            today = self.wh.today()
            self.references = [
                (self.wh.get_pid('source'), self.org_qid),
                (self.wh.get_pid('reference URL'), self.url),
                (self.wh.get_pid('point in time'), today),
            ]

            # Setup qualifiers
            country_qid = self.wh.country2qid(country.name)
            if country_qid is not None:
                self.qualifiers = [(self.wh.get_pid('country'), country_qid)]
            else:
                self.qualifiers = []

            # Find the latest timebin in the data
            last_timebin = '1970-01-01'
            for r in ranking:
                if arrow.get(r['timebin']) > arrow.get(last_timebin):
                    last_timebin = r['timebin']

            # Make ranking and push data
            for metric, weight in [('Total eyeball', 'eyeball'),
                                   ('Total AS', 'as')]:

                # Get the QID of the selected country / create this country if needed
                # NOTE(review): the end of the description was lost in
                # extraction; the country name is assumed -- confirm.
                self.countryrank_qid = self.wh.get_qid(
                    f'IHR country ranking: {metric} ({cc})',
                    create={  # Create it if it doesn't exist
                        'summary':
                        f'add IHR {metric} ranking for ' + cc,
                        'description':
                        f"IHR's ranking of networks ({metric}) for " +
                        country.name,
                        'statements':
                        [[self.wh.get_pid('managed by'), self.org_qid]]
                    })

                # Filter out unnecessary data
                selected = [
                    r for r in ranking if
                    (r['weightscheme'] == weight and r['transitonly'] == False
                     and r['hege'] > MIN_HEGE and r['timebin'] == last_timebin)
                ]

                # Make sure the ranking is sorted and add rank field
                # NOTE(review): ranks are 0-based here (the top network gets
                # rank 0) -- confirm this is what consumers expect.
                selected.sort(key=lambda x: x['hege'], reverse=True)
                for i, asn in enumerate(selected):
                    asn['rank'] = i

                # Push data to wiki
                for i, _ in enumerate(map(self.update_entry, selected)):
                    sys.stderr.write(
                        f'\rProcessing {country.name}... {i+1}/{len(selected)}'
                    )

    def update_entry(self, asn):
        """Add the network to wikibase if it's not already there and update its
        properties.

        ``asn`` is one ranking entry; the keys read here are 'asn' and 'rank'.
        Uses the references, qualifiers and ranking item prepared by run().
        """

        # Properties
        statements = []

        # set rank
        statements.append([
            self.wh.get_pid('ranking'), {
                'amount': asn['rank'],
                'unit': self.countryrank_qid,
            }, self.references, self.qualifiers
        ])

        # Commit to wikibase
        # Get the AS QID (create if AS is not yet registered) and commit changes
        net_qid = self.wh.asn2qid(asn['asn'], create=True)
        self.wh.upsert_statements('update from IHR country ranking',
                                  net_qid, statements)
class Crawler(object):
    """Push the Tranco top-1M domain ranking to the wikibase."""

    def __init__(self):
        """Fetch QIDs for Tranco and affiliation (create them if they are not
        in the wikibase)."""

        # Helper for wiki access
        self.wh = Wikihandy()

        # NOTE(review): the repository URL was lost in extraction; the value
        # below is assumed, and other statements may also be missing -- confirm.
        self.tranco_qid = self.wh.get_qid(
            'Tranco Top 1M',
            create={  # Create it if it doesn't exist
                'summary': 'add Tranco ranking',  # Commit message
                'description':
                'A Research-Oriented Top Sites Ranking Hardened Against Manipulation',  # Item description
                'statements': [
                    [self.wh.get_pid('website'), 'https://tranco-list.eu/'],
                    [
                        self.wh.get_pid('source code repository'),
                        'https://github.com/DistriNet/tranco-list'
                    ],
                ]
            })

        # NOTE(review): the item label and its statements were lost in
        # extraction; the label is inferred from the description and the
        # statement list is left empty rather than invented -- confirm.
        self.org_qid = self.wh.get_qid(
            'imec-DistriNet',
            create={  # Create it if it doesn't exist
                'summary': 'add Tranco ranking',  # Commit message
                'description':
                'The imec-DistriNet research group is part of the Department of Computer Science at the KU Leuven and part of the imec High Impact Initiative Distributed Trust.',  # Item description
                'statements': []
            })

        # Added properties will have this additional information
        today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'), self.org_qid),
                          (self.wh.get_pid('reference URL'), URL),
                          (self.wh.get_pid('point in time'), today)]

    def run(self):
        """Fetch Tranco top 1M and push to wikibase. """

        sys.stderr.write('Downloading latest list...\n')
        req = requests.get(URL)
        if req.status_code != 200:
            sys.exit('Error while fetching Tranco csv file')

        # open zip file and read top list
        with ZipFile(io.BytesIO(req.content)) as z:
            with z.open('top-1m.csv') as top_list:
                for i, row in enumerate(io.TextIOWrapper(top_list)):
                    row = row.rstrip()
                    if not row:
                        continue  # ignore blank lines
                    self.update(row)
                    sys.stderr.write(f'\rProcessed {i} domains \t {row}')

    def update(self, one_line):
        """Add the domain name to wikibase if it's not already there and update
        its ranking.

        ``one_line`` is one CSV row of the form ``'<rank>,<domain>'``.
        """

        rank, domain = one_line.split(',', 1)

        # set rank
        statements = [[
            self.wh.get_pid('ranking'), {
                'amount': rank,
                'unit': self.tranco_qid,
            }, self.reference
        ]]

        # Commit to wikibase
        # Get the domain name QID (create if it is not yet registered) and commit changes
        dn_qid = self.wh.get_qid(domain,
                                 create={
                                     'summary': 'add Tranco ranking',
                                     'statements': [[
                                         self.wh.get_pid('instance of'),
                                         self.wh.get_qid('domain name')
                                     ]]
                                 })
        self.wh.upsert_statements('update from tranco top 1M', dn_qid,
                                  statements)
# Example no. 7
class Crawler(object):
    """Push RPKI route origin authorizations published by RIPE to the wikibase."""

    def __init__(self):
        """Initialize wikihandy and qualifiers for pushed data"""
        # Helper for wiki access
        self.wh = Wikihandy()

        # Added properties will have this additional information
        self.org_qid = self.wh.get_qid('RIPE NCC')
        self.url = URL_API  # url will change for each ROA file
        self.reference = self._build_reference(self.url)

    def _build_reference(self, url):
        """Return the reference list for statements taken from ``url``."""
        return [
            (self.wh.get_pid('source'), self.org_qid),
            (self.wh.get_pid('reference URL'), url),
            (self.wh.get_pid('point in time'), self.wh.today()),
        ]

    def get_last_line(self, line):
        """Keep the end of the last given line"""

        self.last_line = line.rpartition(' ')[2]

    def get_all_lines(self, line):
        """Keep the end of each given lines"""

        self.all_lines.append(line.rpartition(' ')[2])

    def _find_roa_files(self, today):
        """Walk the FTP tree and return paths of today's 'roas.csv' files.

        ``today`` is a 'YYYY/MM/DD' string expected to appear in the path.
        """
        filepaths = []

        # NOTE(review): the login/cwd calls were lost in extraction; they are
        # rebuilt from the path bookkeeping below -- confirm against the
        # server layout.
        with FTP(FTP_URL) as ftp:
            ftp.login()
            ftp.cwd(FTP_ROOT)

            self.all_lines = []
            self.last_line = ''
            ftp.retrlines('LIST', callback=self.get_all_lines)

            logging.info('Listing directories...')
            for directory in self.all_lines:
                path = FTP_ROOT + '/' + directory
                ftp.cwd(path)
                self.last_line = ''
                # Always descend into the last listed entry until a file is hit
                while self.last_line not in ['roas.csv', 'repo.tar.gz']:
                    if self.last_line:
                        ftp.cwd(self.last_line)
                    path += self.last_line + '/'
                    self.last_line = None
                    ftp.retrlines('LIST', callback=self.get_last_line)
                    if self.last_line is None:
                        # Empty directory: stop instead of looping forever
                        break

                if self.last_line == 'roas.csv' and today in path:
                    path += 'roas.csv'
                    logging.info(f'Found ROA file: {path}')
                    filepaths.append(path)

        return filepaths

    def run(self):
        """Fetch data from RIPE and push to wikibase. """

        now = date.today()
        today = f'{now.year}/{now.month:02d}/{now.day:02d}'

        logging.info('Connecting to the FTP server..')
        # Find latest roa files
        filepaths = self._find_roa_files(today)

        for filepath in filepaths:
            self.url = URL_API+filepath
            # Keep the reference URL in sync with the file being processed
            self.reference = self._build_reference(self.url)
            logging.info(f'Fetching ROA file: {self.url}')
            req = requests.get( self.url )
            if req.status_code != 200:
                sys.exit('Error while fetching data for '+filepath)

            # Aggregate data per prefix
            prefix_info = defaultdict(list)
            for line in req.text.splitlines():
                url, asn, prefix, max_length, start, end = line.split(',')
                # Skip header
                if url=='URI':
                    continue

                prefix_info[prefix].append({
                    'url': url,
                    'asn': asn,
                    'max_length': max_length,
                    'start': start,
                    'end': end})

            for i, (prefix, attributes) in enumerate(prefix_info.items()):
                self.update(prefix, attributes)
                sys.stderr.write(f'\rProcessing {filepath}... {i+1} prefixes ({prefix})     ')

    def update(self, prefix, attributes):
        """Add the prefix to wikibase if it's not already there and update its
        properties.

        ``attributes`` is a list of dicts with the keys 'url', 'asn',
        'max_length', 'start' and 'end' (one per ROA covering ``prefix``).
        ROAs whose AS cannot be resolved are reported and skipped; nothing is
        committed when no ROA is left.
        """

        statements = []
        for att in attributes:
            qualifiers = [
                    [self.wh.get_pid('start time'), self.wh.to_wbtime(att['start'])],
                    [self.wh.get_pid('end time'), self.wh.to_wbtime(att['end'])],
            ]

            if att['max_length']:
                qualifiers.append( [self.wh.get_pid('maxLength'), {'amount': att['max_length']} ] )

            # Properties
            asn_qid = self.wh.asn2qid(att['asn'], create=True)
            if asn_qid is None:
                # NOTE(review): whether the original aborted the whole prefix
                # here is unknown; only the unresolved ROA is skipped.
                print('Error: ', prefix, att)
                continue

            statements.append(
                        [ self.wh.get_pid('route origin authorization'),
                            asn_qid,
                            self.reference,
                            qualifiers
                        ])

        if not statements:
            return

        # Commit to wikibase
        # Get the prefix QID (create if prefix is not yet registered) and commit changes
        prefix_qid = self.wh.prefix2qid(prefix, create=True)
        self.wh.upsert_statements('update from RIPE RPKI data', prefix_qid, statements )
class Crawler(object):
    """Push PeeringDB networks (name, organization, website, IX membership)."""

    def __init__(self):
        """Create an item representing the PeeringDB network ID class if
        doesn't already exist. And fetch QIDs for networks already in the
        wikibase."""

        # Helper for wiki access
        self.wh = Wikihandy()

        # Get the QID of the item representing PeeringDB network IDs
        # NOTE(review): the label argument was lost in extraction; NETID_LABEL
        # is the label net_qid() uses for the same item -- confirm.
        netid_qid = self.wh.get_qid(
            NETID_LABEL,
            create={  # Create it if it doesn't exist
                'summary': 'add PeeringDB net IDs',  # Commit message
                'description':
                'Identifier for a network in the PeeringDB database'  # Description
            })

        # Load the QIDs for networks already available in the wikibase
        self.netid2qid = self.wh.extid2qid(qid=netid_qid)
        # Load the QIDs for peeringDB organizations
        self.orgid2qid = self.wh.extid2qid(label=ORGID_LABEL)
        # Load the QIDs for peeringDB IXs
        self.ixid2qid = self.wh.extid2qid(label=IXID_LABEL)

        # Added properties will have this reference information
        today = self.wh.today()
        # NOTE(review): the source item label was lost in extraction;
        # 'PeeringDB' is assumed -- confirm.
        self.reference = [(self.wh.get_pid('source'),
                           self.wh.get_qid('PeeringDB')),
                          (self.wh.get_pid('reference URL'), URL_PDB_NETS),
                          (self.wh.get_pid('point in time'), today)]

        # Session object to fetch peeringdb data
        retries = Retry(total=5,
                        status_forcelist=[104, 500, 502, 503, 504])

        self.http_session = requests.Session()
        self.http_session.mount('https://', HTTPAdapter(max_retries=retries))

    def run(self):
        """Fetch networks information from PeeringDB and push to wikibase,
        one network after the other."""

        req = self.http_session.get(URL_PDB_NETS)
        if req.status_code != 200:
            sys.exit('Error while fetching data from API')
        networks = json.loads(req.text)['data']

        self.wh.login()  # Login once for all threads

        for i, _ in enumerate(map(self.update_net, networks)):
            sys.stderr.write(f'\rProcessing... {i+1}/{len(networks)}')

    def update_net(self, network):
        """Add the network to wikibase if it's not already there and update its
        properties.

        ``network`` is one entry of the PeeringDB 'data' list; the keys read
        here are 'id', 'name', 'org_id', 'website' and 'asn'.

        Return the network QID.
        """

        # set property name
        statements = [[
            self.wh.get_pid('name'), network['name'].strip(), self.reference
        ]]

        # link to corresponding organization
        org_qid = self.orgid2qid.get(str(network['org_id']))
        if org_qid is not None:
            statements.append(
                [self.wh.get_pid('managed by'), org_qid, self.reference])
        else:
            print('Error this organization is not in wikibase: ',
                  network['org_id'])

        # set property website
        if network['website']:
            statements.append([
                self.wh.get_pid('website'), network['website'], self.reference
            ])

        # Update IX membership
        # Fetch membership for this network
        netixlan_url = URL_PDB_NETS + f'/{network["id"]}'

        req = self.http_session.get(netixlan_url)
        if req.status_code != 200:
            sys.exit(f'Error while fetching network data (id={network["id"]})')

        net_details = json.loads(req.text)['data']
        if len(net_details) > 1:
            # NOTE(review): the body of this branch was lost in extraction;
            # dumping the unexpected payload is assumed -- confirm.
            print(net_details)

        # Only the first entry is used; tolerate an empty answer
        netixlan_set = net_details[0]['netixlan_set'] if net_details else []

        # Push membership to wikidata
        today = self.wh.today()
        netixlan_ref = [(self.wh.get_pid('source'),
                         self.wh.get_qid('PeeringDB')),
                        (self.wh.get_pid('reference URL'), netixlan_url),
                        (self.wh.get_pid('point in time'), today)]

        for ixlan in netixlan_set:
            ix_qid = self.ixid2qid.get(str(ixlan['ix_id']))
            if ix_qid is None:
                print(f'Unknown IX: ix_id={ixlan["ix_id"]}')
                continue
            statements.append(
                [self.wh.get_pid('member of'), ix_qid, netixlan_ref])

        # Update name, website, and organization for this network
        net_qid = self.net_qid(network)
        self.wh.upsert_statements('update peeringDB networks', net_qid,
                                  statements)

        return net_qid

    def net_qid(self, network):
        """Find the network QID for the given network.
        If this network is not yet registered in the wikibase then find (or
        create) the item corresponding to the network ASN and register
        the peeringDB network ID with this item.

        Return the network QID."""

        net_id = str(network['id'])

        # Check if the network is in the wikibase
        if net_id not in self.netid2qid:
            # Find or create the corresponding ASN item
            net_qid = self.wh.asn2qid(network['asn'], create=True)
            # Set properties for this new network
            net_qualifiers = [
                (self.wh.get_pid('instance of'), self.wh.get_qid(NETID_LABEL)),
            ]
            statements = [[
                self.wh.get_pid('external ID'), net_id, [], net_qualifiers
            ]]

            # Add this network to the wikibase
            self.wh.upsert_statements('add new peeringDB network', net_qid,
                                      statements)
            # keep track of this QID
            self.netid2qid[net_id] = net_qid

        return self.netid2qid[net_id]
class Crawler(object):
    """Push prefix origin ASes seen in BGP collector data to the wikibase."""

    def __init__(self):
        """Initialize wikihandy and the values shared by all references."""

        # Helper for wiki access
        self.wh = Wikihandy(preload=True)

        # Get the QID for Routeviews organization
        self.org_qid = self.wh.get_qid('Route Views')
        self.today = self.wh.today()

    def run(self):
        """Fetch BGP data from collectors and push to wikibase. """

        today = arrow.now().replace(hour=0, minute=0)
        start = today.shift(hours=-1)
        end = today.shift(hours=1)
        # NOTE(review): the BGPStream arguments were lost in extraction; a RIB
        # dump around midnight is assumed from the commit summary used in
        # update_entry -- confirm.
        stream = pybgpstream.BGPStream(
            from_time=int(start.datetime.timestamp()),
            until_time=int(end.datetime.timestamp()),
            record_type='ribs',
        )

        rtree = radix.Radix()

        sys.stderr.write(f'\nReading BGP data:\n')
        for i, elem in enumerate(stream):
            # Extract the prefix and origin ASN
            msg = elem.fields
            prefix = msg['prefix']
            origin_asn_str = msg['as-path'].split(' ')[-1]
            if '{' in origin_asn_str:
                # AS set such as '{1,2}': every member counts as an origin
                origin_asns = origin_asn_str[1:-1].split(',')
            else:
                origin_asns = [origin_asn_str]

            # Store origin ASN in radix tree
            rnode = rtree.search_exact(prefix)
            if rnode is None:
                rnode = rtree.add(prefix)
                rnode.data['origin'] = defaultdict(set)

            for asn in origin_asns:
                # NOTE(review): line lost in extraction; update_entry expects a
                # mapping from origin ASN to the collectors that saw it.
                rnode.data['origin'][asn].add(elem.collector)

            sys.stderr.write(f'\rProcessed {i+1} BGP messages')

        sys.stderr.write(f'\nPushing data to IYP...\n')

        # Push all prefixes data to IYP
        for i, rnode in enumerate(rtree):
            data = rnode.data['origin']
            self.update_entry(rnode.prefix, data)
            sys.stderr.write(f'\rProcessed {i+1} prefixes')

    def update_entry(self, prefix, originasn_collector):
        """Add the prefix to wikibase if it's not already there and update its properties.

        ``originasn_collector`` maps each origin ASN to the collection of
        collector names that reported it; one statement is added per
        (ASN, collector) pair.
        """

        statements = []

        # set origin AS
        for asn, collectors in originasn_collector.items():
            for collector in collectors:
                # Added properties will have this additional information
                url = URL_RV
                if 'rrc' in collector:
                    url = URL_RIS

                # NOTE(review): the source item stays 'Route Views' even when
                # the collector name contains 'rrc' -- confirm intended.
                self.reference = [
                    (self.wh.get_pid('source'), self.org_qid),
                    (self.wh.get_pid('reference URL'), url.format(collector)),
                    (self.wh.get_pid('point in time'), self.today)
                ]

                as_qid = self.wh.asn2qid(asn, create=True)
                statements.append(
                    [self.wh.get_pid('originated by'), as_qid, self.reference])

        # Commit to wikibase
        # Get the prefix QID (create if prefix is not yet registered) and commit changes
        prefix_qid = self.wh.prefix2qid(prefix, create=True)
        self.wh.upsert_statements('update from RIS/Routeviews RIBs',
                                  prefix_qid, statements)
# Example no. 10
class Crawler(object):
    """Push CAIDA ASRank names, countries and ranks to the wikibase."""

    def __init__(self):
        """Initialize wikihandy and qualifiers for pushed data"""

        # Helper for wiki access
        self.wh = Wikihandy()

        # Added properties will have this additional information
        today = self.wh.today()
        self.caida_qid = self.wh.get_qid('CAIDA')

        # Get the QID for ASRank project
        self.asrank_qid = self.wh.get_qid(
            'CAIDA ASRank',
            create={  # Create it if it doesn't exist
                'summary': 'add CAIDA ASRank',  # Commit message
                'description':
                "CAIDA's AS ranking derived from topological data collected by CAIDA's Archipelago Measurement Infrastructure and BGP routing data collected by the Route Views Project and RIPE NCC.",  # Item description
                'statements': [[self.wh.get_pid('managed by'), self.caida_qid]]
            })

        self.reference = [(self.wh.get_pid('source'), self.caida_qid),
                          (self.wh.get_pid('reference URL'), URL_API),
                          (self.wh.get_pid('point in time'), today)]

    def run(self):
        """Fetch networks information from ASRank and push to wikibase. """

        self.wh.login()  # Login once for all threads
        has_next = True
        i = 0
        # The context manager waits for the workers and releases them on exit
        with ThreadPoolExecutor() as pool:
            while has_next:
                # The offset is the number of entries processed so far
                req = requests.get(URL_API + f'?offset={i}')
                if req.status_code != 200:
                    sys.exit('Error while fetching data from API')

                ranking = json.loads(req.text)['data']['asns']
                has_next = ranking['pageInfo']['hasNextPage']

                for _ in pool.map(self.update_net, ranking['edges']):
                    sys.stderr.write(
                        f'\rProcessing... {i+1}/{ranking["totalCount"]}')
                    i += 1

    def update_net(self, asn):
        """Add the network to wikibase if it's not already there and update its
        properties.

        ``asn`` is one edge of the API answer; its 'node' entry is read for
        the keys 'asn', 'asnName', 'country' and 'rank'.
        """

        asn = asn['node']

        # Properties
        statements = []

        if asn['asnName']:
            statements.append(
                [self.wh.get_pid('name'), asn['asnName'], self.reference])

        # set countries
        country = asn['country'] or {}  # tolerate a missing country object
        cc = country.get('iso')
        if cc:
            country_qid = self.wh.country2qid(cc)
            if country_qid is not None:
                statements.append(
                    [self.wh.get_pid('country'), country_qid, self.reference])

        # set rank
        statements.append([
            self.wh.get_pid('ranking'), {
                'amount': asn['rank'],
                'unit': self.asrank_qid,
            }, self.reference
        ])

        # Commit to wikibase
        # Get the AS QID (create if AS is not yet registered) and commit changes
        net_qid = self.wh.asn2qid(asn['asn'], create=True)
        self.wh.upsert_statements('update from CAIDA ASRank', net_qid,
                                  statements)
class Crawler(object):
    def __init__(self):
        """Create an item representing the PeeringDB exchange point ID class if
        doesn't already exist. And fetch QIDs for exchange points already in the
        wikibase."""

        # Helper for wiki access
        self.wh = Wikihandy()

        # Get the QID of the item representing PeeringDB IX IDs
        # NOTE(review): the label argument and the 'summary'/'description' keys
        # were missing from the source. IXID_LABEL is the label ix_qid() looks
        # up and the keys follow the other create={...} dicts -- confirm.
        ixid_qid = self.wh.get_qid(
            IXID_LABEL,
            create={  # Create it if it doesn't exist
                'summary':
                'add PeeringDB ix IDs',  # Commit message
                'description':
                'Identifier for an exchange point in the PeeringDB database'  # Description
            })

        # Load the QIDs for ix already available in the wikibase
        self.ixid2qid = self.wh.extid2qid(qid=ixid_qid)
        # Load the QIDs for peeringDB organizations
        self.orgid2qid = self.wh.extid2qid(label=ORGID_LABEL)

        # Added properties will have this reference information
        self.today = self.wh.today()
        # NOTE(review): the source item was missing; 'PeeringDB' is assumed
        self.reference = [(self.wh.get_pid('source'),
                           self.wh.get_qid('PeeringDB')),
                          (self.wh.get_pid('reference URL'), URL_PDB_IXS),
                          (self.wh.get_pid('point in time'), self.today)]

    def run(self):
        """Fetch ixs information from PeeringDB and push to wikibase.

        Exits the process when one of the API requests does not return a 200
        status code."""

        req = requests.get(URL_PDB_IXS)
        if req.status_code != 200:
            sys.exit('Error while fetching IXs data')
        ixs = json.loads(req.text)['data']

        self.wh.login()  # Login once for all threads

        for i, ix in enumerate(ixs):

            # Get more info for this IX
            req = requests.get(f'{URL_PDB_IXS}/{ix["id"]}')
            if req.status_code != 200:
                sys.exit('Error while fetching IXs data')
            ix_info = json.loads(req.text)['data'][0]

            # Update info in wiki
            self.update_ix(ix_info)

            sys.stderr.write(f'\rProcessing... {i+1}/{len(ixs)}')

    def update_ix(self, ix):
        """Add the ix to wikibase if it's not already there and update its
        properties.

        Also pushes one item update per prefix of the LANs listed under the
        optional 'ixlan_set' key.

        Return the ix QID."""

        # set property name
        statements = [[
            self.wh.get_pid('instance of'),
            self.wh.get_qid('Internet exchange point')
        ], [self.wh.get_pid('name'), ix['name'].strip(), self.reference]]

        # link to corresponding organization
        org_qid = self.orgid2qid.get(str(ix['org_id']))
        if org_qid is not None:
            statements.append(
                [self.wh.get_pid('managed by'), org_qid, self.reference])
        else:
            print('Error this organization is not in wikibase: ', ix['org_id'])

        # set property country
        if ix['country']:
            country_qid = self.wh.country2qid(ix['country'])
            if country_qid is not None:
                statements.append(
                    [self.wh.get_pid('country'), country_qid, self.reference])

        # set property website
        if ix['website']:
            statements.append(
                [self.wh.get_pid('website'), ix['website'], self.reference])

        # set traffic webpage
        if ix['url_stats']:
            # NOTE(review): the property line was missing from the source;
            # 'website' qualified as traffic statistics is assumed
            statements.append([
                self.wh.get_pid('website'),
                ix['url_stats'],  # statement
                self.reference,  # reference
                [
                    (self.wh.get_pid('instance of'),
                     self.wh.get_qid('traffic statistics')),
                ]  # qualifier
            ])

        ix_qid = self.ix_qid(ix)
        # Update name, website, and organization for this IX
        self.wh.upsert_statements('update peeringDB ixs', ix_qid, statements)

        # update LAN corresponding to this IX
        if 'ixlan_set' in ix:
            for ixlan in ix['ixlan_set']:
                pfx_url = f'{URL_PDB_LAN}/{ixlan["id"]}'
                # NOTE(review): source item assumed to be 'PeeringDB', as above
                pfx_ref = [(self.wh.get_pid('source'),
                            self.wh.get_qid('PeeringDB')),
                           (self.wh.get_pid('reference URL'), pfx_url),
                           (self.wh.get_pid('point in time'), self.today)]

                req = requests.get(pfx_url)
                if req.status_code != 200:
                    sys.exit('Error while fetching IXs data')
                lans = json.loads(req.text)['data']

                for lan in lans:
                    for prefix in lan['ixpfx_set']:
                        pfx_qid = self.wh.prefix2qid(prefix['prefix'],
                                                     create=True)

                        pfx_stmts = [[
                            self.wh.get_pid('instance of'),
                            self.wh.get_qid('peering LAN'), pfx_ref
                        ], [self.wh.get_pid('managed by'), ix_qid, pfx_ref]]

                        self.wh.upsert_statements('update peeringDB ixlan',
                                                  pfx_qid, pfx_stmts)

        return ix_qid

    def ix_qid(self, ix):
        """Find the ix QID for the given ix.
        If this ix is not yet registered in the wikibase then add it.

        Return the ix QID."""

        ix_id = str(ix['id'])

        # Check if the IX is in the wikibase
        if ix_id not in self.ixid2qid:
            # Set properties for this new ix
            ix_qualifiers = [
                (self.wh.get_pid('instance of'), self.wh.get_qid(IXID_LABEL)),
            ]
            statements = [(self.wh.get_pid('instance of'),
                           self.wh.get_qid('Internet exchange point')),
                          (self.wh.get_pid('external ID'), ix_id, [],
                           ix_qualifiers)]

            # Add this ix to the wikibase
            # NOTE(review): every argument after the summary was missing from
            # the source; label/description keys are assumed -- confirm.
            ix_qid = self.wh.add_item('add new peeringDB IX',
                                      label=ix['name'],
                                      description=ix['name_long'],
                                      statements=statements)
            # keep track of this QID
            self.ixid2qid[ix_id] = ix_qid

        return self.ixid2qid[ix_id]
class Crawler(object):
    def __init__(self):
        """Initialize wikihandy and qualifiers for pushed data"""

        # Helper for wiki access
        self.wh = Wikihandy()

        # Added properties will have this additional information
        today = self.wh.today()
        self.apnic_qid = self.wh.get_qid('APNIC')
        self.url = URL_API  # url will change for each country
        self.reference = [(self.wh.get_pid('source'), self.apnic_qid),
                          (self.wh.get_pid('reference URL'), self.url),
                          (self.wh.get_pid('point in time'), today)]

        self.countries = iso3166.countries_by_alpha2

    def run(self):
        """Fetch data from APNIC and push to wikibase. """

        self.wh.login()  # Login once for all threads
        pool = ThreadPoolExecutor()

        for cc, country in self.countries.items():

            # Get the QID of the selected country / create this country if needed
            self.countryrank_qid = self.wh.get_qid(
                f'APNIC eyeball estimates ({cc})',
                create={  # Create it if it doesn't exist
                    'add APNIC eyeball estimates for ' + cc,
                    "APNIC's AS population estimates" +
                    "based on advertisement for " + country.name,
                    'statements': [
                        [self.wh.get_pid('managed by'), self.apnic_qid],
                        [self.wh.get_pid('website'), URL_API],

            self.countrypercent_qid = self.wh.get_qid(
                f'% of Internet users in {country.name}',
                create={  # Create it if it doesn't exist
                    'add APNIC eyeball estimates for ' + cc,
                    "APNIC's AS population estimates" +
                    "based on advertisement for " + country.name,
                    'statements': [
                        [self.wh.get_pid('managed by'), self.apnic_qid],
                        [self.wh.get_pid('website'), URL_API],

            self.url = URL_API + f'{cc}/{cc}.asns.json?m={MIN_POP_PERC}'
            req = requests.get(self.url)
            if req.status_code != 200:
                sys.exit('Error while fetching data for ' + cc)

            ranking = json.loads(req.text)
            # Make sure the ranking is sorted and add rank field
            ranking.sort(key=lambda x: x['percent'], reverse=True)
            for i, asn in enumerate(ranking):
                asn['rank'] = i

            # Push data to wiki
            for i, res in enumerate(pool.map(self.update_net, ranking)):
                    f'\rProcessing {country.name}... {i+1}/{len(ranking)}')


    def update_net(self, asn):
        """Add the network to wikibase if it's not already there and update its

        # Properties
        statements = []

        # set name
        if asn['autnum']:
                [self.wh.get_pid('name'), asn['autnum'], self.reference])

        # set country
        if asn['cc']:
                self.wh.country2qid(asn['cc']), self.reference

        # set rank
            self.wh.get_pid('ranking'), {
                'amount': asn['rank'],
                'unit': self.countryrank_qid,
            }, self.reference

        # set population
            self.wh.get_pid('population'), {
                'amount': asn['percent'],
                'unit': self.countrypercent_qid,
            }, self.reference

        # Commit to wikibase
        # Get the AS QID (create if AS is not yet registered) and commit changes
        net_qid = self.wh.asn2qid(asn['as'], create=True)
        self.wh.upsert_statements('update from APNIC eyeball ranking', net_qid,
class Crawler(object):
    def __init__(self):
        """Initialize wikihandy, the RIPE Atlas items, and the reference and
        qualifiers attached to pushed data."""

        # Helper for wiki access
        self.wh = Wikihandy(preload=True)

        # NOTE(review): several dict keys ('summary', 'description',
        # 'statements') and closing brackets were missing from the source and
        # are restored following the complete create={...} dicts -- confirm.

        # Get the QID for RIPE Atlas
        self.atlas_qid = self.wh.get_qid(
            'RIPE Atlas',
            create={  # Create it if it doesn't exist
                'summary':
                'add RIPE Atlas',  # Commit message
                'description':
                'RIPE Atlas is a global, open, distributed Internet measurement platform, consisting of thousands of measurement devices that measure Internet connectivity in real time.',  # Item description
                'statements':
                [[self.wh.get_pid('managed by'),
                  self.wh.get_qid('RIPE NCC')]]
            })

        # Get the QID for Atlas Probe
        self.atlas_probe_qid = self.wh.get_qid(
            'Atlas probe',
            create={  # Create it if it doesn't exist
                'summary': 'add RIPE Atlas',  # Commit message
                'description':
                'RIPE Atlas probes form the backbone of the RIPE Atlas infrastructure.',  # Item description
                'aliases': 'RIPE Atlas probe|atlas probe|RIPE atlas probe',
                'statements': [[self.wh.get_pid('part of'), self.atlas_qid]]
            })

        # Get the QID for Atlas Anchor
        self.atlas_anchor_qid = self.wh.get_qid(
            'Atlas anchor',
            create={  # Create it if it doesn't exist
                'summary': 'add RIPE Atlas',  # Commit message
                'description':
                'RIPE Atlas Anchors are located at hosts that can provide sufficient bandwidth to support a large number of incoming and outgoing measurements.',  # Item description
                'aliases': 'RIPE Atlas anchor|atlas anchor|RIPE atlas anchor',
                'statements': [[self.wh.get_pid('part of'), self.atlas_qid]]
            })

        # Get the QID of the item representing RIPE Atlas probe IDs
        # NOTE(review): the label argument was missing from the source; a
        # PROBEID_LABEL module constant is assumed -- confirm its name.
        self.probeid_qid = self.wh.get_qid(
            PROBEID_LABEL,
            create={  # Create it if it doesn't exist
                'summary':
                'add RIPE Atlas probes',  # Commit message
                'description':
                'Identifier for a probe in the RIPE Atlas measurement platform'  # Description
            })

        # Load the QIDs for probes already available in the wikibase
        self.probeid2qid = self.wh.extid2qid(qid=self.probeid_qid)

        # Added properties will have this additional information
        today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'),
                           self.wh.get_qid('RIPE NCC')),
                          (self.wh.get_pid('reference URL'), URL),
                          (self.wh.get_pid('point in time'), today)]

        # NOTE(review): the qualifier values were missing from the source;
        # items labelled 'IPv4' / 'IPv6' are assumed.
        self.v4_qualifiers = [(self.wh.get_pid('IP version'),
                               self.wh.get_qid('IPv4'))]

        self.v6_qualifiers = [(self.wh.get_pid('IP version'),
                               self.wh.get_qid('IPv6'))]

    def run(self):
        """Fetch probe information from Atlas API and push to wikibase.

        Follows the 'next' link of every response until it is None.  Exits
        the process when a request does not return a 200 status code.
        """

        next_page = URL
        processed = 0  # Running total, so the counter does not reset per page

        while next_page is not None:
            req = requests.get(next_page)
            if req.status_code != 200:
                # The original message mentioned a blocklist (copy-paste)
                sys.exit('Error while fetching the probes')

            info = json.loads(req.text)
            next_page = info['next']

            for probe in info['results']:
                self.update_probe(probe)
                processed += 1
                sys.stderr.write(f'\rProcessed {processed} probes')

    def update_probe(self, probe):
        """Add the probe to wikibase if it's not already there and update its
        properties."""

        # TODO add status, geometry (geo-location) and IPs?

        # Properties for this probe
        statements = []

        if probe['is_anchor']:
            # NOTE(review): no 'else:' is visible between these two statements
            # in the source, so both are added for anchors -- confirm.
            statements.append(
                [self.wh.get_pid('instance of'), self.atlas_probe_qid])
            statements.append(
                [self.wh.get_pid('instance of'), self.atlas_anchor_qid])
        if probe['asn_v4']:
            as_qid = self.wh.asn2qid(probe['asn_v4'])
            if as_qid:
                statements.append([
                    self.wh.get_pid('part of'), as_qid, self.reference,
                    self.v4_qualifiers
                ])
        if probe['asn_v6']:
            as_qid = self.wh.asn2qid(probe['asn_v6'])
            if as_qid:
                statements.append([
                    self.wh.get_pid('part of'), as_qid, self.reference,
                    self.v6_qualifiers
                ])
        if probe['prefix_v4']:
            prefix_qid = self.wh.prefix2qid(probe['prefix_v4'])
            if prefix_qid:
                statements.append(
                    [self.wh.get_pid('part of'), prefix_qid, self.reference])
        if probe['prefix_v6']:
            prefix_qid = self.wh.prefix2qid(probe['prefix_v6'])
            if prefix_qid:
                statements.append(
                    [self.wh.get_pid('part of'), prefix_qid, self.reference])
        if probe['country_code']:
            cc_qid = self.wh.country2qid(probe['country_code'])
            # Skip countries unknown to the wikibase, as the other crawlers do
            if cc_qid is not None:
                statements.append(
                    [self.wh.get_pid('country'), cc_qid, self.reference])
        if probe['first_connected']:
            statements.append([
                self.wh.get_pid('start time'),
                self.wh.to_wbtime(probe['first_connected']), self.reference
            ])

        if 'name' in probe['status']:
            # Get the QIDs for probes status
            status_qid = self.wh.get_qid(
                f'RIPE Atlas probe status: {probe["status"]["name"]}',
                create={  # Create it if it doesn't exist
                    'summary': 'add RIPE Atlas probe status',  # Commit message
                })

            if probe['status_since']:
                # NOTE(review): the qualifier value was missing from the
                # source; the 'status_since' timestamp is assumed.
                statements.append([
                    self.wh.get_pid('status'), status_qid, self.reference,
                    [(self.wh.get_pid('start time'),
                      self.wh.to_wbtime(probe['status_since']))]
                ])

            # set end time if the probe is abandoned
            if probe['status']['name'] == 'Abandoned' and probe['status_since']:
                # NOTE(review): value missing from the source, assumed to be
                # the 'status_since' timestamp without a reference.
                statements.append([
                    self.wh.get_pid('end time'),
                    self.wh.to_wbtime(probe['status_since'])
                ])

        # Add probe tags
        # NOTE(review): only the 'summary' line survived in the source; the
        # 'tag' property and the tag['name'] label are assumptions.
        for tag in probe['tags']:
            statements.append([
                self.wh.get_pid('tag'),
                self.wh.get_qid(tag['name'],
                                create={
                                    'summary': 'Add RIPE Atlas tag',
                                })
            ])

        # Commit to wikibase
        # Get the probe QID (create if probe is not yet registered) and commit changes
        probe_qid = self.probe_qid(probe)
        self.wh.upsert_statements('update from RIPE Atlas probes', probe_qid,
                                  statements)

    def probe_qid(self, probe):
        """Find the probe QID for the given probe.
        If this probe is not yet registered in the wikibase then add it.

        Return the probe QID."""

        probe_id = str(probe['id'])  # renamed: 'id' shadowed the builtin

        # Check if the probe is in the wikibase
        if probe_id not in self.probeid2qid:
            # Set properties for this new probe
            probeid_qualifiers = [
                (self.wh.get_pid('instance of'), self.probeid_qid),
            ]
            statements = [
                (self.wh.get_pid('instance of'), self.atlas_probe_qid),
                (self.wh.get_pid('external ID'), probe_id, [],
                 probeid_qualifiers)
            ]

            # Add this probe to the wikibase
            # NOTE(review): the arguments after 'label' were missing from the
            # source; the 'description' key is assumed -- confirm.
            probe_qid = self.wh.add_item('add new RIPE Atlas probe',
                                         label=f'RIPE Atlas probe #{probe_id}',
                                         description=probe['description'],
                                         statements=statements)
            # keep track of this QID
            self.probeid2qid[probe_id] = probe_qid

        return self.probeid2qid[probe_id]
class Crawler(object):
    def __init__(self, fdns_url=URL):
        """Fetch QID for Rapid7 and initialize wikihandy."""

        self.fdns_url = fdns_url
        # Helper for wiki access
        self.wh = Wikihandy()

        # NOTE(review): the item label, the 'summary'/'description' keys and
        # the 'instance of' value were missing from the source; 'Rapid7' and
        # 'organization' are assumptions -- confirm.
        self.org_qid = self.wh.get_qid(
            'Rapid7',
            create={  # Create it if it doesn't exist
                'summary':
                'add Rapid7 forward DNS data',  # Commit message
                'description':
                'Rapid7, a security company that provides unified vulnerability management solutions',  # Item description
                'statements': [
                    [
                        self.wh.get_pid('instance of'),
                        self.wh.get_qid('organization')
                    ],
                    [self.wh.get_pid('website'), 'https://www.rapid7.com/'],
                ]
            })

        # Added properties will have this additional information
        today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'), self.org_qid),
                          (self.wh.get_pid('reference URL'), fdns_url),
                          (self.wh.get_pid('point in time'), today)]

        self.ia = ip2asn(wikihandy=self.wh)

        # keep track of all resolved prefixes so we just make one push per
        # domain
        self.tld_pfx = defaultdict(set)

    def match_domain_prefix(self, line):
        """Parse a line from the rapid7 dataset, extract the domain and ip, and
        find the corresponding IP prefix.

        return: (domain name, prefix). The prefix is None if the domain is
        not in the wiki or the IP lookup fails; both are None if the record
        is not an 'a'/'aaaa' entry with 'name' and 'value' fields.
        """

        tld = None
        prefix = None

        datapoint = json.loads(line)
        if (datapoint['type'] in ['a', 'aaaa'] and 'value' in datapoint
                and 'name' in datapoint):

            ext = tldextract.extract(datapoint['name'])
            # Use the named fields: positional access (ext[-2], ext[-1])
            # depends on the number and order of fields of the result type
            tld = ext.domain + '.' + ext.suffix

            # skip domains not in the wiki
            if self.wh.domain2qid(tld) is None:
                return tld, None

            ip_info = self.ia.lookup(datapoint['value'])
            if ip_info is None:
                return tld, None

            prefix = ip_info['prefix']

        return tld, prefix

    def run(self):
        """Fetch Rapid7 DNS forward data, find corresponding BGP prefixes
        and push resolution for domains already in the wikibase. """

        # download rapid7 data and find corresponding prefixes
        sys.stderr.write('Downloading Rapid7 dataset...\n')
        fname = self.fdns_url.split('/')[-1]
        if not os.path.exists(fname):
            fname = download_file(self.fdns_url, fname)

        sys.stderr.write('Processing dataset...\n')
        cache_fname = fname + '.pickle'
        if os.path.exists(cache_fname):
            sys.stderr.write('Load data from cache!')
            # NOTE(review): pickle.load can execute arbitrary code; only safe
            # as long as the cache file is written by this crawler alone.
            with open(cache_fname, 'rb') as cache:
                self.tld_pfx = pickle.load(cache)
        else:
            with gzip.open(fname, 'rt') as finput:
                for line in finput:
                    # NOTE(review): the loop body was missing from the source;
                    # nothing else fills self.tld_pfx, so matches are
                    # collected here from the returned pair.
                    tld, prefix = self.match_domain_prefix(line)
                    if tld is not None and prefix is not None:
                        self.tld_pfx[tld].add(prefix)

            with open(cache_fname, 'wb') as cache:
                pickle.dump(self.tld_pfx, cache)

        sys.stderr.write(
            f'Found {len(self.tld_pfx)} domain names in Rapid7 dataset out of the {len(self.wh._domain2qid)} domain names in wiki\n'
        )
        # push data to wiki
        for i, (tld, pfxs) in enumerate(self.tld_pfx.items()):
            sys.stderr.write(
                f'\33[2K\rUpdating iyp... {i+1}/{len(self.tld_pfx)}\t{tld} {len(pfxs)} prefixes'
            )
            self.update(tld, pfxs)

    def update(self, tld, pfxs):
        """Update statements for the given domain name.

        Failures while pushing are logged and do not stop the crawl.
        """

        # make all statements
        statements = []
        for pfx in pfxs:
            pfx_qid = self.wh.prefix2qid(pfx, create=True)
            statements.append(
                [self.wh.get_pid('forward DNS'), pfx_qid, self.reference])

        # Commit to wikibase
        # Get the domain name QID  and commit changes
        dn_qid = self.wh.domain2qid(tld)
        try:
            # TODO remove old data with URL regex
            self.wh.upsert_statements('update from Rapid7 forward DNS data',
                                      dn_qid, statements)
        except Exception as e:
            # Best effort: report the failure, including its cause, and go on
            logging.error(f"Could not update domain {dn_qid}")
            logging.error(str(e))
Ejemplo n.º 15
class Crawler(object):
    def __init__(self):
        """Initialize wikihandy, the Spamhaus items, and the reference attached
        to pushed data."""

        # Helper for wiki access
        self.wh = Wikihandy(preload=True)

        # NOTE(review): the organization label, several dict keys and the
        # 'instance of' value were missing from the source; 'Spamhaus' and
        # 'organization' are assumptions -- confirm.

        # Get the QID for Spamhaus organization
        self.spamhaus_qid = self.wh.get_qid(
            'Spamhaus',
            create={  # Create it if it doesn't exist
                'summary':
                'add Spamhaus organization',  # Commit message
                'description':
                'The Spamhaus Project is an international organisation to track email spammers and spam-related activity',  # Item description
                'aliases':
                'The Spamhaus Project|the spamhaus project',
                'statements': [[
                    self.wh.get_pid('instance of'),
                    self.wh.get_qid('organization')
                ]]
            })

        # Get the QID for Spamhaus DROP project
        self.drop_qid = self.wh.get_qid(
            'Spamhaus DROP lists',
            create={  # Create it if it doesn't exist
                'summary': 'add Spamhaus block list',  # Commit message
                'description':
                "The Spamhaus Don't Route Or Peer Lists",  # Item description
                'statements':
                [[self.wh.get_pid('managed by'), self.spamhaus_qid]]
            })

        # Get the QID for Spamhaus ASN-DROP list
        self.asn_drop_qid = self.wh.get_qid(
            'Spamhaus ASN-DROP list',
            create={  # Create it if it doesn't exist
                'summary':
                'add Spamhaus block list',  # Commit message
                'description':
                'ASN-DROP contains a list of Autonomous System Numbers controlled by spammers or cyber criminals, as well as "hijacked" ASNs. ',  # Item description
                'statements':
                [[self.wh.get_pid('managed by'), self.spamhaus_qid],
                 [self.wh.get_pid('part of'), self.drop_qid]]
            })

        # Added properties will have this additional information
        today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'), self.spamhaus_qid),
                          (self.wh.get_pid('reference URL'), URL),
                          (self.wh.get_pid('point in time'), today)]

    def run(self):
        """Fetch blocklist from Spamhaus and push to wikibase.

        Exits the process when the request does not return a 200 status code.
        """

        req = requests.get(URL)
        if req.status_code != 200:
            sys.exit('Error while fetching the blocklist')

        for i, row in enumerate(req.text.splitlines()):
            # Skip the header, and blank rows that update_net cannot parse
            if row.startswith(';') or not row.strip():
                continue

            self.update_net(row)
            sys.stderr.write(f'\rProcessed {i+1} ASes')

    def update_net(self, one_line):
        """Add the network to wikibase if it's not already there and update its
        properties.

        ``one_line`` is parsed as 'AS<number> ; <country code> | <name>'.
        """

        asn, _, cc_name = one_line.partition(';')
        asn = int(asn[2:])  # drop the leading 'AS'
        # Split on the first '|' only, so a name containing '|' stays whole
        cc, name = [word.strip() for word in cc_name.split('|', 1)]

        # Properties for this AS
        statements = [
            [
                self.wh.get_pid('reported in'), self.asn_drop_qid,
                self.reference
            ],
            [self.wh.get_pid('name'), name, self.reference],
        ]

        # set countries
        if len(cc) == 2:
            cc_qid = self.wh.country2qid(cc)
            if cc_qid is not None:
                statements.append(
                    [self.wh.get_pid('country'), cc_qid, self.reference])

        # Commit to wikibase
        # Get the AS QID (create if AS is not yet registered) and commit changes
        net_qid = self.wh.asn2qid(asn, create=True)
        self.wh.upsert_statements('update from Spamhaus ASN DROP list',
                                  net_qid, statements)