Beispiel #1
0
class Crawler(object):
    def __init__(self):
        """Initialize wikihandy and qualifiers for pushed data"""
    
        # Helper for wiki access
        self.wh = Wikihandy()

        # Added properties will have this additional information
        today = self.wh.today()
        self.org_qid = self.wh.get_qid('RIPE NCC')
        self.url = URL_API  # url will change for each country
        self.reference = [
                (self.wh.get_pid('source'), self.org_qid),
                (self.wh.get_pid('reference URL'), self.url),
                (self.wh.get_pid('point in time'), today)
                ]

    def get_last_line(self,line):
        """Keep the end of the last given line"""

        self.last_line = line.rpartition(' ')[2]

    def get_all_lines(self, line):
        """Keep the end of each given lines"""

        self.all_lines.append(line.rpartition(' ')[2])

    def run(self):
        """Fetch data from RIPE and push to wikibase. """

        now = date.today()
        today = f'{now.year}/{now.month:02d}/{now.day:02d}'

        logging.info('Connecting to the FTP server..')
        # Find latest roa files
        filepaths = []
        ftp = FTP(FTP_URL)
        ftp.login()
        ftp.cwd(FTP_ROOT)

        self.all_lines = []
        self.last_line = ''
        ftp.retrlines('LIST', callback=self.get_all_lines)

        logging.info('Listing directories...')
        logging.info(f'{self.all_lines}')
        for dir in self.all_lines:
            path = FTP_ROOT+'/'+dir
            ftp.cwd(path)
            self.last_line = ''
            while self.last_line not in ['roas.csv', 'repo.tar.gz']:
                ftp.cwd(self.last_line)
                path += self.last_line + '/'
                ftp.retrlines('LIST', callback=self.get_last_line)

            if self.last_line == 'roas.csv' and today in path:
                path += 'roas.csv'
                logging.info(f'Found ROA file: {path}')
                filepaths.append(path)

        for filepath in filepaths:
            self.url = URL_API+filepath
            logging.info(f'Fetching ROA file: {self.url}')
            req = requests.get( self.url )
            if req.status_code != 200:
                sys.exit('Error while fetching data for '+filepath)
            
            # Aggregate data per prefix
            prefix_info = defaultdict(list)
            for line in req.text.splitlines():
                url, asn, prefix, max_length, start, end = line.split(',')
                
                # Skip header
                if url=='URI':
                    continue

                prefix_info[prefix].append({
                    'url': url, 
                    'asn': asn, 
                    'max_length': max_length, 
                    'start': start, 
                    'end': end})

            for i, (prefix, attributes) in enumerate(prefix_info.items()):
                self.update(prefix, attributes)
                sys.stderr.write(f'\rProcessing {filepath}... {i+1} prefixes ({prefix})     ')

    def update(self, prefix, attributes):
        """Add the prefix to wikibase if it's not already there and update its
        properties."""

        statements = []
        for att in attributes:
        
            qualifiers = [
                    [self.wh.get_pid('start time'), self.wh.to_wbtime(att['start'])],
                    [self.wh.get_pid('end time'), self.wh.to_wbtime(att['end'])],
                #    [self.wh.get_pid('reference URL'), url ] 
                    ]

            if att['max_length']:
                qualifiers.append( [self.wh.get_pid('maxLength'), {'amount': att['max_length']} ] )

            # Properties
            asn_qid = self.wh.asn2qid(att['asn'], create=True)
            if asn_qid is None:
                print('Error: ', line)
                return

            statements.append(
                        [ self.wh.get_pid('route origin authorization'), 
                            asn_qid,
                            self.reference,
                            qualifiers
                        ]
                    )

        # Commit to wikibase
        # Get the prefix QID (create if prefix is not yet registered) and commit changes
        prefix_qid = self.wh.prefix2qid(prefix, create=True) 
        self.wh.upsert_statements('update from RIPE RPKI data', prefix_qid, statements )
class Crawler(object):
    def __init__(self):
        """
        """

        # Helper for wiki access
        self.wh = Wikihandy(preload=True)

        # Get the QID for RIPE Atlas
        self.atlas_qid = self.wh.get_qid(
            'RIPE Atlas',
            create={  # Create it if it doesn't exist
                'summary':
                'add RIPE Atlas',  # Commit message
                'description':
                'RIPE Atlas is a global, open, distributed Internet measurement platform, consisting of thousands of measurement devices that measure Internet connectivity in real time.',  # Item description
                'aliases':
                'Atlas|atlas',
                'statements':
                [[self.wh.get_pid('managed by'),
                  self.wh.get_qid('RIPE NCC')]]
            })

        # Get the QID for Atlas Probe
        self.atlas_probe_qid = self.wh.get_qid(
            'Atlas probe',
            create={  # Create it if it doesn't exist
                'summary': 'add RIPE Atlas',  # Commit message
                'description':
                'RIPE Atlas probes form the backbone of the RIPE Atlas infrastructure.',  # Item description
                'aliases': 'RIPE Atlas probe|atlas probe|RIPE atlas probe',
                'statements': [[self.wh.get_pid('part of'), self.atlas_qid]]
            })

        # Get the QID for Atlas Anchor
        self.atlas_anchor_qid = self.wh.get_qid(
            'Atlas anchor',
            create={  # Create it if it doesn't exist
                'summary': 'add RIPE Atlas',  # Commit message
                'description':
                'RIPE Atlas Anchors are located at hosts that can provide sufficient bandwidth to support a large number of incoming and outgoing measurements.',  # Item description
                'aliases': 'RIPE Atlas anchor|atlas anchor|RIPE atlas anchor',
                'statements': [[self.wh.get_pid('part of'), self.atlas_qid]]
            })

        # Get the QID of the item representing PeeringDB IX IDs
        self.probeid_qid = self.wh.get_qid(
            PROBEID_LABEL,
            create={  # Create it if it doesn't exist
                'summary':
                'add RIPE Atlas probes',  # Commit message
                'description':
                'Identifier for a probe in the RIPE Atlas measurement platform'  # Description
            })

        # Load the QIDs for probes already available in the wikibase
        self.probeid2qid = self.wh.extid2qid(qid=self.probeid_qid)

        # Added properties will have this additional information
        today = self.wh.today()
        self.reference = [(self.wh.get_pid('source'),
                           self.wh.get_qid('RIPE NCC')),
                          (self.wh.get_pid('reference URL'), URL),
                          (self.wh.get_pid('point in time'), today)]

        self.v4_qualifiers = [(self.wh.get_pid('IP version'),
                               self.wh.get_qid('IPv4'))]

        self.v6_qualifiers = [(self.wh.get_pid('IP version'),
                               self.wh.get_qid('IPv6'))]

    def run(self):
        """Fetch probe information from Atlas API and push to wikibase. """

        next_page = URL

        while next_page is not None:
            req = requests.get(next_page)
            if req.status_code != 200:
                sys.exit('Error while fetching the blocklist')

            info = json.loads(req.text)
            next_page = info['next']

            for i, probe in enumerate(info['results']):

                self.update_probe(probe)
                sys.stderr.write(f'\rProcessed {i+1} probes')
            sys.stderr.write(f'\n')

    def update_probe(self, probe):
        """Add the probe to wikibase if it's not already there and update its
        properties."""

        # TODO add status, geometry (geo-location) and IPs?

        # Properties for this probe
        statements = []

        if probe['is_anchor']:
            statements.append(
                [self.wh.get_pid('instance of'), self.atlas_probe_qid])
            statements.append(
                [self.wh.get_pid('instance of'), self.atlas_anchor_qid])
        if probe['asn_v4']:
            as_qid = self.wh.asn2qid(probe['asn_v4'])
            if as_qid:
                statements.append([
                    self.wh.get_pid('part of'), as_qid, self.reference,
                    self.v4_qualifiers
                ])
        if probe['asn_v6']:
            as_qid = self.wh.asn2qid(probe['asn_v6'])
            if as_qid:
                statements.append([
                    self.wh.get_pid('part of'), as_qid, self.reference,
                    self.v6_qualifiers
                ])
        if probe['prefix_v4']:
            prefix_qid = self.wh.prefix2qid(probe['prefix_v4'])
            if prefix_qid:
                statements.append(
                    [self.wh.get_pid('part of'), prefix_qid, self.reference])
        if probe['prefix_v6']:
            prefix_qid = self.wh.prefix2qid(probe['prefix_v6'])
            if prefix_qid:
                statements.append(
                    [self.wh.get_pid('part of'), prefix_qid, self.reference])
        if probe['country_code']:
            statements.append([
                self.wh.get_pid('country'),
                self.wh.country2qid(probe['country_code']), self.reference
            ])
        if probe['first_connected']:
            statements.append([
                self.wh.get_pid('start time'),
                self.wh.to_wbtime(probe['first_connected']), self.reference
            ])

        if 'name' in probe['status']:
            # Get the QIDs for probes status
            status_qid = self.wh.get_qid(
                f'RIPE Atlas probe status: {probe["status"]["name"]}',
                create={  # Create it if it doesn't exist
                    'summary': 'add RIPE Atlas probe status',  # Commit message
                })

            if probe['status_since']:
                statements.append([
                    self.wh.get_pid('status'), status_qid, self.reference,
                    [(self.wh.get_pid('start time'),
                      self.wh.to_wbtime(probe['status_since']))]
                ])

            # set end time if the probe is abandonned
            if probe['status']['name'] == 'Abandoned' and probe['status_since']:
                statements.append([
                    self.wh.get_pid('end time'),
                    self.wh.to_wbtime(probe['status_since'])
                ])

        # Add probe tags
        for tag in probe['tags']:
            statements.append([
                self.wh.get_pid('tag'),
                self.wh.get_qid(tag['name'],
                                create={
                                    'summary': 'Add RIPE Atlas tag',
                                })
            ])

        # Commit to wikibase
        # Get the probe QID (create if probe is not yet registered) and commit changes
        probe_qid = self.probe_qid(probe)
        self.wh.upsert_statements('update from RIPE Atlas probes', probe_qid,
                                  statements)

    def probe_qid(self, probe):
        """Find the ix QID for the given probe ID.
        If this probe is not yet registered in the wikibase then add it.

        Return the probe QID."""

        id = str(probe['id'])

        # Check if the IX is in the wikibase
        if id not in self.probeid2qid:
            # Set properties for this new probe
            probeid_qualifiers = [
                (self.wh.get_pid('instance of'), self.probeid_qid),
            ]
            statements = [
                (self.wh.get_pid('instance of'), self.atlas_probe_qid),
                (self.wh.get_pid('external ID'), id, [], probeid_qualifiers)
            ]

            # Add this probe to the wikibase
            probe_qid = self.wh.add_item('add new RIPE Atlas probe',
                                         label=f'RIPE Atlas probe #{id}',
                                         description=probe['description'],
                                         statements=statements)
            # keep track of this QID
            self.probeid2qid[id] = probe_qid

        return self.probeid2qid[id]