def remove_element(self, data, source_url=None, whitelist=False):
    '''
    Remove a single element from the data table.

    By default all entries matching 'data' are removed regardless of
    source_url; pass source_url for a more selective removal.

    data       -- element name to remove (string, trailing whitespace ignored)
    source_url -- optional source to restrict the removal to (string or None)
    whitelist  -- remove from the exceptions table instead of data

    Raises Exceptions.NotString on bad argument types and
    Exceptions.DatabaseError when source_url is combined with whitelist.
    Does not commit; the caller is responsible for committing.
    '''
    if not isinstance(data, str):
        raise Exceptions.NotString('address must be a string')
    # BUG FIX: isinstance(source_url, None) raised TypeError because None
    # is not a type; an identity check is the correct way to allow None
    if source_url is not None and not isinstance(source_url, str):
        raise Exceptions.NotString('source_url must be a string or None')
    element = data.rstrip()
    if not source_url and not whitelist:
        data_remove = (element, )
        remove_line = ('''DELETE FROM data WHERE name=?''')
    elif not source_url and whitelist:
        data_remove = (element, )
        remove_line = ('''DELETE FROM exceptions WHERE name=?''')
    elif source_url and whitelist:
        errmsg = 'Can not operate on whitelist with source url'
        raise Exceptions.DatabaseError(errmsg)
    else:
        data_remove = (element, source_url)
        remove_line = ('''DELETE FROM data WHERE name=? AND source=?''')
    self.db_cur.execute(remove_line, data_remove)
Beispiel #2
0
    def action_output(self):
        '''
        Validate everything from the db then format and finally write output.

        Pulls names from the db, re-validates each one, formats the valid
        entries with the configured formatter and writes the result to
        self.args.output, preserving the destination file's previous mode
        bits when it already existed.

        Raises Exceptions.DatabaseError when the db returns nothing and
        Exceptions.UnsuccessfulExit when there is nothing to write.
        '''
        self.logger.log.info('Started output module')
        err = 0  # invalid lines
        valid = 0  # valid lines
        results = self.db.pull_names_2(self.args.expiry, self.base_type)
        if not results:
            raise Exceptions.DatabaseError('No results from db found')

        # re-validate every name before it is written out; Data.VALIDATOR
        # maps a base type to its validator callable
        pending = []
        for result in results:
            if Data.VALIDATOR[self.base_type](result[0]):
                valid += 1
                pending.append(result[0])
            else:
                err += 1
        ## LOG errors and valid counts
        icountmsg = ('Counted ' + str(err) + ' invalid addresses')
        # log how many addresses where dropped
        self.logger.log.debug(icountmsg)
        countmsg = ('Counted ' + str(valid) + ' valid addresses')
        # log how many addresses are valid
        self.logger.log.debug(countmsg)

        if len(pending) < 1:
            self.logger.log.error('No addresses found. Exiting.')
            raise Exceptions.UnsuccessfulExit()

        # format the page
        output = Data.FORMAT[self.args.format](pending)
        if not output:
            self.logger.log.error('Nothing to output, exiting non-zero')
            raise Exceptions.UnsuccessfulExit()

        # gather existing filemode so it can be restored after the copy
        if os.path.exists(self.args.output):
            stats = os.stat(self.args.output)
        else:
            stats = None

        # write via a temp file, then copy over the destination
        with NamedTemporaryFile(mode='w+', delete=True) as tmp:
            tmp.write(output)
            tmp.flush()
            copy(tmp.name, self.args.output)
            self.logger.log.warning('Wrote to ' + str(self.args.output))
        # attempt to set old filemode
        if stats:
            try:
                os.chmod(self.args.output, mode=stats.st_mode)
            except OSError:
                # NOTE(review): this rebinds the `err` counter to a string;
                # harmless here since the counter is no longer used after
                err = 'Failed to chmod permissions from original file'
                self.logger.log.error(err)
        return
    def extract_data(data):
        '''
        Use re.findall to pull domains out of ABP filters and return them
        as a list of strings.

        matches AdblockPlus (ABP) filter syntax for domain names
        eg. ||google.com^
        eg. ||google.com^$third-party

        Raises Exceptions.NoMatchesFound when no filter lines match.
        '''
        # regex string for ABP domains lives in Regex.ABP_DOMAIN
        abp_pattern = re.compile(Regex.ABP_DOMAIN, re.MULTILINE)

        # exclude third party rules
        #   if third_party is not True:
        #       pattern = re.compile(Regex.abp_domain_nothird, re.MULTILINE)

        found = re.findall(abp_pattern, data)
        if not found:
            raise Exceptions.NoMatchesFound("No ABP syntax domains found.")
        # findall returns one tuple per match (one entry per regex group);
        # keep only the first group, which holds the domain name
        return [groups[0] for groups in found]
 def pull_active_source_urls(self):
     '''
     Return a list of blacklist source urls whose cooldown
     (last_updated + timeout) has expired and that need updating.

     Each entry is a dict with 'url', 'page_format' and 'last_modified'.
     Raises Exceptions.NoMatchesFound when no url is ready.
     '''
     query = ('''SELECT url, page_format, last_modified_head FROM''' +
              ''' sources WHERE ? > last_updated + timeout''')
     self.db_cur.execute(query, (time(), ))
     invalid = 0  # rows that came back empty/falsy
     ready = []
     for row in self.db_cur.fetchall():
         if not row:
             invalid += 1
             continue
         ready.append({
             'url': row[0],
             'page_format': row[1],
             'last_modified': row[2],
         })
     if not ready:
         errmsg = ('All urls on cooldown or none in database.' +
                   ' Invalid urls found in db: ' + str(invalid))
         raise Exceptions.NoMatchesFound(errmsg)
     return ready
    def add_element(self, data, data_type, source_url, whitelist=False):
        '''
        Add a single element to the db (or to the exceptions table when
        whitelist=True) without committing.

        data       -- element name (string, trailing whitespace stripped)
        data_type  -- type tag stored alongside the name
        source_url -- source the element came from
        whitelist  -- insert into exceptions instead of data

        Raises Exceptions.NotString when data is not a string.
        NOTE: this checks time.time() every time it is executed, probably
        going to be very slow if it's called lots
        '''
        if not isinstance(data, str):
            raise Exceptions.NotString('address must be a string')
        current_time = time()
        element = data.rstrip()  # strip once instead of three times
        data_insert = (element, data_type, current_time, current_time,
                       source_url)
        time_update = (current_time, source_url, element)
        whitelist_insert = (element, data_type)

        # BUG FIX: the two halves previously concatenated into
        # 'INTO dataVALUES' (missing space), which is invalid SQL
        line = ('''INSERT OR IGNORE INTO data ''' +
                '''VALUES ( ?, ?, ?, ?, ? ) ''')
        time_line = (''' UPDATE data ''' +
                     '''SET last_seen=?, source_url=? WHERE name=?''')

        white_line = ('''INSERT OR IGNORE INTO exceptions ''' +
                      '''VALUES (?, ?)''')

        if whitelist:
            self.db_cur.execute(white_line, whitelist_insert)
        else:
            self.db_cur.execute(line, data_insert)
            self.db_cur.execute(time_line, time_update)
    def bulk_add(self, data_lst, data_type, source_url):
        '''
        Add a list of items to the db using executemany.

        ! Does not validate - do it elsewhere TODO integrate val here
        ! Does not explicitly commit

        data_lst   -- list of element names (trailing whitespace stripped)
        data_type  -- type tag stored with every element
        source_url -- source all the elements came from

        Returns True on success; raises Exceptions.EmptyList when
        data_lst is empty. sqlite errors propagate to the caller.
        '''
        if not data_lst:
            errmsg = 'No items to add.'
            raise Exceptions.EmptyList(errmsg)

        current_time = time()
        data_insert = []
        time_update = []
        for each in data_lst:
            data = each.rstrip()
            data_insert.append(
                (data, data_type, current_time, current_time, source_url))
            time_update.append((current_time, data, source_url))
        # NOTE: the update only refreshes rows whose stored source_url
        # matches, so an address keeps the source it was first seen with

        iline = (''' INSERT OR IGNORE INTO data''' +
                 ''' VALUES ( ?, ?, ?, ?, ? )''')
        tline = (''' UPDATE data''' +
                 ''' SET last_seen=? WHERE name=? AND source_url=?''')
        # the previous bare try/except that only re-raised added nothing
        # and has been removed
        self.db_cur.executemany(iline, data_insert)
        self.db_cur.executemany(tline, time_update)

        return True
    def extract_data(data):
        '''
        Return every IPv4 address found in data as a list.

        Raises Exceptions.NoMatchesFound when none are present.
        '''
        ipv4_pattern = re.compile(Regex.IPV4_ADDR, re.MULTILINE)
        found = re.findall(ipv4_pattern, data)
        if not found:
            raise Exceptions.NoMatchesFound("No ip addreses found.")
        return found
 def __init__(self, db_path=None):
     '''
     - opens a connection to a sqlite3 db (or creates a new one)
     - db_path is the pathname of the sqlite3 database

     Raises Exceptions.BadFileType when an existing file is not a
     sqlite3 database or does not carry the expected application_id.
     '''
     # NOTE(review): the db_path=None default looks unusable --
     # path.isfile(None) / connect(None) would raise TypeError; confirm
     # callers always pass a real pathname
     if path.isfile(db_path):
         # check file is sqlite3 format
         if self.sqlite3_db_file_type(db_path) is False:
             errmsg = 'Existing file ' + str(db_path) + ' not a sqlite3 db'
             raise Exceptions.BadFileType(errmsg)
         # check file has the right application_id for blocklistparser
         if self.sqlite3_db_application_id(db_path) is False:
             errmsg = 'File is a sqlite3 db, but the application_id is wrong'
             raise Exceptions.BadFileType(errmsg)
     # connect() creates the db file if it does not already exist
     self.db_conn = connect(db_path)
     self.db_cur = self.db_conn.cursor()
     self.init_db()
 def test_source_url(self, url):
     '''
     Check that url exists in the sources table.

     Returns True when found; raises Exceptions.NoMatchesFound when the
     url is not present. sqlite DatabaseError propagates to the caller
     (the previous try/except only re-raised it, so it was removed).
     '''
     line = '''SELECT * FROM sources WHERE url=?'''
     self.db_cur.execute(line, (str(url), ))
     if self.db_cur.fetchone() is None:
         errmsg = 'No source urls matching input found'
         raise Exceptions.NoMatchesFound(errmsg)
     return True
 def add_to_db(self, db_manager):
     '''
     Add this list to a database via db connection.

     db_manager -- object exposing bulk_add(data, base_type, source_url)

     Raises Exceptions.ExtractorError when the bulk insert does not
     report success.
     '''
     # re-validate before entering into db; a guard clause replaces the
     # old `if ... is True: pass / else: raise` construct
     if db_manager.bulk_add(self.data, self.base_type,
                            self.source_url) is not True:
         errmsg = 'Error adding list to database'
         raise Exceptions.ExtractorError(errmsg)
 def extract_data(data):
     '''
     Extract newline-separated domains from data into a list.
     eg.
     google.com
     wikipedia.org

     Raises Exceptions.NoMatchesFound when nothing matches.
     '''
     newline_pattern = re.compile(Regex.NEWLINE_DOMAIN, re.MULTILINE)
     found = re.findall(newline_pattern, data)
     if not found:
         raise Exceptions.NoMatchesFound(
             "No newline formatted domains found.")
     return found
    def __init__(self, data, datatype, source=None, raise_errors=False):
        '''
        Build a list of validated addresses from raw input lines.

        data         -- iterable of raw lines; invalid entries are dropped
        datatype     -- key into the VALIDATOR/BASE_TYPE mappings
        source       -- optional source url recorded for this list
        raise_errors -- accepted for interface compatibility; invalid
                        lines are currently always skipped silently

        Raises Exceptions.IncorrectDataType for an unknown datatype and
        AssertionError when data is empty.
        '''
        self.data = []
        self.index = -1  # start index at -1 b/c it is inc before return
        self.source_url = source

        if datatype not in VALIDATOR:
            errmsg = 'data type ' + str(datatype) + ' not supported'
            raise Exceptions.IncorrectDataType(errmsg)
        self.datatype = datatype
        self.base_type = BASE_TYPE[self.datatype]

        # raise explicitly instead of `assert` so the check survives -O
        if len(data) == 0:
            raise AssertionError('DataList argument data is empty')

        validator = VALIDATOR[self.datatype]
        for line in data:
            # keep only lines the validator accepts; the old
            # assert/except-AssertionError control flow is replaced with
            # a plain conditional
            entry = str(line)
            if validator(entry):
                self.data.append(entry)
Beispiel #13
0
def get_webpage(url, proxy=False, fake_user_agent=True, last_modified=None):
    '''
    Open a webpage with urllib and return the response object.

    - proxy=True installs a ProxyHandler so urllib detects the system
      proxy (NOTE: python3 docs say urllib looks for a proxy anyway,
      so this may still proxy even when proxy is set to False)
    - fake_user_agent is on by default because some blacklists reject
      urllib user agents (spoofs a windows/ff ua)
    - last_modified should be the exact string returned in the server's
      Last-Modified header; it is sent back as If-Modified-Since

    Raises Exceptions.NetError if the opener returns nothing.
    '''
    # build the opener, with or without an explicit proxy handler
    if proxy:
        opener = build_opener(ProxyHandler())
    else:
        opener = build_opener()

    # spoof the user agent
    # TODO: add more user agents
    spoofed_ua = "Mozilla/5.0 (Windows NT 6.2; rv:10.0) Gecko/20100101 Firefox/33.0)"
    extra_headers = []
    if fake_user_agent:
        extra_headers.append(('User-Agent', spoofed_ua))
    if last_modified:
        extra_headers.append(('If-Modified-Since', last_modified))  # TODO VALIDATE
    opener.addheaders = extra_headers

    # try and get the webpage
    page = opener.open(url)

    # check the page is something before handing it back
    if not page:
        raise Exceptions.NetError('Unknown problem opening webpage')
    return page
def format_detector(data):
    '''
    Automatically detect the data type of the input.

    Returns a string naming the content type or raises an
    Exceptions.IncorrectDataType exception.
    Currently supported types and return values are below.
    supported, return_value
        - adblock plus filter format, 'adblock'
        - domain per line, 'newline'
    '''
    # try each known parser in order; the first one that finds matches
    # decides the format
    candidates = (
        (ABPParser, 'adblock'),
        (NewlineParser, 'newline'),
    )
    for parser, format_name in candidates:
        try:
            parser.extract_data(data)
            return format_name
        except Exceptions.NoMatchesFound:
            continue

    raise Exceptions.IncorrectDataType(
        'Unable to detect format of input data.')
Beispiel #15
0
    def action_update(self):
        '''
        Fetch every source url that is due for an update, parse the pages
        and commit the extracted addresses to the db.

        Raises Exceptions.UnsuccessfulExit when no source is ready or no
        webpage could be retrieved; re-raises sqlite errors on commit
        failure.
        '''
        self.logger.log.info('Started update module')
        retr = []
        try:
            # this will contain a tuple of url, last_modified
            # the last_modified header will be None or a Last-Modified header
            to_be_updated = self.db.pull_active_source_urls()
        except Exceptions.NoMatchesFound:
            self.logger.log.error('No sources ready to update. Exiting.')
            raise Exceptions.UnsuccessfulExit()
        self.logger.log.debug(
            str(len(to_be_updated)) + ' sources to be updated')

        # GET THE WEBPAGES
        self.logger.log.debug('Started retrieving webpages')
        for entry in to_be_updated:  # get the webpages
            self.logger.log.debug('URL ' + str(entry['url']) +
                                  ' last updated ' +
                                  str(entry['last_modified']))
            try:
                response = Net.get_webpage(
                    url=entry['url'], last_modified=entry['last_modified'])
                result = {
                    'web_response': response,
                    'source_config': entry,
                    'url': entry['url']
                }
                retr.append(result)
            except error.HTTPError as ue:
                # 304 means the cached copy is still current -- not an error
                if ue.code == 304:
                    self.logger.log.debug('Not Modified ' + str(entry['url']))
                else:
                    self.logger.log.error(
                        str(ue.code) + ' Error ' + str(entry['url']))
            except error.URLError as ue:
                self.logger.log.error('ERROR ' + str(ue))

        if not retr:
            self.logger.log.warning('No webpages to parse. Exiting.')
            raise Exceptions.UnsuccessfulExit()

        # Process webpages into data
        self.logger.log.info('Processing webpages')
        # NOTE(review): db_modified is assigned but never read in this
        # method -- confirm whether it was meant to gate the commit below
        db_modified = True
        for result in retr:
            try:
                page = result['web_response'].read().decode('utf-8')
                lines = page.splitlines()

                # NOTE(review): len(page) counts characters, not lines;
                # len(lines) would match the message text
                self.logger.log.debug(str(len(page)) + ' lines in page.')
                self.logger.log.debug(str(result['web_response'].info()))

                # check page actually contains something
                assert len(lines) > 0

            except URLError:
                # NOTE(review): decode('utf-8') raises UnicodeDecodeError,
                # not URLError -- this fallback looks unreachable; confirm
                self.logger.log.debug('Webpage failed to decode into utf-8')
                page = result['web_response'].read()
            except AssertionError:
                self.logger.log.error('page was empty')
            else:  # try and enter data into db and update values only if success
                # IPList will only put validated data in self.data
                processed_data = Data.DataList(
                    lines,
                    datatype=result['source_config']['page_format'],
                    source=result['web_response'].geturl())
                # Add data to DB
                try:
                    processed_data.add_to_db(self.db)
                    self.logger.log.debug('Added uncommitted content to db')
                except Exceptions.ExtractorError:
                    self.logger.log.error('Failed to add page content to db')
                    # raise # this causes bugs when page has no valid content
                # Update Last-Modified into DB
                else:
                    try:
                        wurl = result['web_response'].geturl()
                        lmod = result['web_response'].info()['Last-Modified']
                        self.db.update_last_modified(wurl, lmod)
                        self.logger.log.debug('Last-Modified updated for ' +
                                              str(wurl) + ' to ' + str(lmod))
                        # Update last_updated into sources
                        self.db.touch_source_url(result['url'])
                    except SQLError:
                        self.logger.log.error('Failed to update Last-Modified')
                        self.logger.log.error(
                            'Failed to update source last updated')
                        self.logger.log.error('Aborting without commit')
        # COMMIT everything accumulated above in one transaction
        try:
            self.db.db_conn.commit()
            self.logger.log.debug('Commit to sqlite3 db success')
        except SQLError:
            self.logger.log.error('Commit to sqlite3 db FAILED')
            raise
Beispiel #16
0
    def action_source(self):
        '''
        Handle the source sub-command: add or remove a source url.

        Exactly one of self.args.add / self.args.remove must be set,
        otherwise the argument parser error is triggered. Changes are
        committed and then verified against the db before returning.
        '''
        self.logger.log.debug('starting source action')
        if self.args.add is not None:
            # attempt to add a url
            logmsg = ('attempting to add source url: ' + self.args.add)
            self.logger.log.debug(logmsg)
            try:
                Database.Manager.test_source_url(self.db, self.args.add)
                self.logger.log.info('source url already present in database')
                if self.args.group:
                    self._action_group()
                    self.db.db_conn.commit()
                # success
                return
            except Exceptions.NoMatchesFound:
                # expected when adding a new url
                pass

            Database.Manager.add_source_url(self.db, self.args.add,
                                            self.args.format,
                                            self.args.interval)
            # commit
            self.db.db_conn.commit()
            # check url is added
            try:
                Database.Manager.test_source_url(self.db, self.args.add)
                self.logger.log.info('source added to database OK')
                if self.args.group:
                    self._action_group()
                    self.db.db_conn.commit()
                # success
                return
            except Exceptions.NoMatchesFound as error:
                # NOTE(review): `error` shadows the urllib `error` module
                # name used elsewhere in this class -- harmless in this
                # scope but worth renaming
                self.logger.log.error('FAILED to add source url to database!')
                # fail
                raise Exceptions.UnsuccessfulExit(str(error))

        # remove a url
        elif self.args.remove is not None:
            # check url is actually in db
            try:
                self.db.test_source_url(self.args.remove)
            except Exceptions.NoMatchesFound:
                msg = 'Entry does not exist in the database.'
                self.logger.log.info(msg)
                return
            # attempt to remove url
            self.db.delete_source_url(self.args.remove)
            self.db.db_conn.commit()
            # check removal is ok
            try:
                self.db.test_source_url(self.args.remove)
                msg = 'FAILED removing source url from database!'
                self.logger.log.error(msg)
            except Exceptions.NoMatchesFound:
                # success removing url
                self.logger.log.info('Removed source url from database OK')

        else:
            # NOTE(review): argparse's parser.error() raises SystemExit
            # itself, so the surrounding `raise` never receives a value
            msg = 'source action must include --add or --remove'
            raise self.source_parser.error(msg)