def check_diff(self):
        idx_list = [
            idx.id for idx in History.select(History.id).order_by(
                History.id.desc()).limit(self.cfg.DiffCount())
        ]
        ip_diff_add_sql = IP.select(fn.Count(fn.Distinct(
            IP.ip))).join(Item).where(IP.add == idx_list[0]).scalar()
        ip_diff_purge_sql = IP.select(fn.Count(fn.Distinct(
            IP.ip))).join(Item).where(IP.purge == idx_list[0]).scalar()
        domain_diff_add_sql = Domain.select(fn.Count(fn.Distinct(Domain.domain)))\
            .join(Item).where(Domain.add == idx_list[0]).scalar()
        domain_diff_purge_sql = Domain.select(fn.Count(fn.Distinct(Domain.domain)))\
            .join(Item).where(Domain.purge == idx_list[0]).scalar()
        url_diff_add_sql = URL.select(fn.Count(fn.Distinct(URL.url)))\
            .join(Item).where(URL.add == idx_list[0]).scalar()
        url_diff_purge_sql = URL.select(fn.Count(fn.Distinct(URL.url)))\
            .join(Item).where(URL.purge == idx_list[0]).scalar()

        if ip_diff_add_sql or ip_diff_purge_sql or domain_diff_add_sql or \
                domain_diff_purge_sql or url_diff_add_sql or url_diff_purge_sql:
            History.update(dump=True).where(
                History.id == idx_list[0]).execute()
            return True
        else:
            # History.update(dump=False).where(History.id == idx_list[0]).execute()
            return False
Esempio n. 2
0
 def cleaner(self):
     logger.info('cleaner run')
     # history = History.select(History.id).order_by(History.id.desc()).limit(self.cfg.DiffCount())
     # Item.delete().where(~(Item.purge << history)).execute()
     history_del = History.select(History.id).order_by(History.id.desc()).offset(self.cfg.DiffCount())
     Item.delete().where(Item.purge << history_del).execute()
     IP.delete().where(IP.purge << history_del).execute()
     Domain.delete().where(Domain.purge << history_del).execute()
     URL.delete().where(URL.purge << history_del).execute()
Esempio n. 3
0
File: tests.py Progetto: xecgr/kafka
    def test_01_insert_init_data(self):
        """sample domain insertion test"""
        _domains = dict(domains)
        for name, url in _domains.items():
            d = Domain(domain=name, url=url)
            d.save()

        for d in Domain.get_by_filters():
            _domains.pop(d.domain, None)
        self.assertEqual(_domains, {})
Esempio n. 4
0
 def _domain_rollback_sql(self, rollback, bt):
     rb_list = self.idx_list[:rollback]
     if bt == 'ignore':
         domain_sql = Domain.select(fn.Distinct(Domain.domain))\
             .where(~(Domain.add << rb_list) & ((Domain.purge >> None) | (Domain.purge << rb_list)))
         return domain_sql
     elif bt == 'ip' or bt == 'default' or bt == 'domain' or bt == 'domain-mask':
         domain_sql = Domain.select(fn.Distinct(Domain.domain))\
             .join(Item).where((Item.blockType == bt) & ~(Domain.add << rb_list) &
                               ((Domain.purge >> None) | (Domain.purge << rb_list)))
         return domain_sql
Esempio n. 5
0
 def _domain_diff_sql(self, diff, bt, stat):
     if stat and (bt == 'ip' or bt == 'default' or bt == 'domain' or bt == 'domain-mask'):
         domain_sql = Domain.select(fn.Distinct(Domain.domain)).join(Item)\
             .where(Item.blockType == bt, Domain.add == self.idx_list[diff])
         return domain_sql
     elif not stat and (bt == 'ip' or bt == 'default' or bt == 'domain' or bt == 'domain-mask'):
         domain_sql = Domain.select(fn.Distinct(Domain.domain)).join(Item)\
             .where(Item.blockType == bt, Domain.purge == self.idx_list[diff])
         return domain_sql
     elif stat and bt == 'ignore':
         domain_sql = Domain.select(fn.Distinct(Domain.domain)).where(Domain.add == self.idx_list[diff])
         return domain_sql
     elif not stat and bt == 'ignore':
         domain_sql = Domain.select(fn.Distinct(Domain.domain)).where(Domain.purge == self.idx_list[diff])
         return domain_sql
 def cleaner(self):
     private_nets = [
         '0.%', '127.%', '192.168.%', '10.%', '172.16.%', '172.17.%',
         '172.18.%', '172.19.%', '172.20.%', '172.21.%', '172.22.%',
         '172.23.%', '172.24.%', '172.25.%', '172.26.%', '172.27.%',
         '172.28.%', '172.29.%', '172.30.%', '172.31.%'
     ]
     logger.info('Dump cleaner run')
     # history = History.select(History.id).order_by(History.id.desc()).limit(self.cfg.DiffCount())
     # Item.delete().where(~(Item.purge << history)).execute()
     history_clear = History.select(History.id).order_by(
         History.id.desc()).offset(self.cfg.DiffCount())
     item_del = Item.delete().where(Item.purge << history_clear).execute()
     logger.info('Item deleted: %d', item_del)
     ip_del = IP.delete().where(IP.purge << history_clear).execute()
     logger.info('IP deleted: %d', ip_del)
     domain_del = Domain.delete().where(
         Domain.purge << history_clear).execute()
     logger.info('Domain deleted: %d', domain_del)
     url_del = URL.delete().where(URL.purge << history_clear).execute()
     logger.info('URL deleted: %d', url_del)
     history_rm = History.select(History.id).order_by(
         History.id.desc()).offset(self.cfg.HistoryCount())
     hist_del = History.delete().where(History.id << history_rm).execute()
     logger.info('History deleted: %d', hist_del)
     for net in private_nets:
         ip_count = IP.delete().where(IP.ip % net).execute()
         if ip_count:
             logger.info('IP error LIKE %s, count %d', net, ip_count)
Esempio n. 7
0
File: tests.py Progetto: xecgr/kafka
 def test_03_regexp(self):
     """regexp process test"""
     d = Domain(domain='dummy', url='dummy')
     d.save()
     s = Snapshot(
         domain_id=d.id,
         pulled_at=datetime.utcnow(),
         html="find me with a regexp!!",
     )
     s.save()
     dc = DomainCheck(domain_id=d.id,
                      name="dummy_check",
                      regexp="find (me|you)")
     dc.save()
     run_regexp(snapshot_id=s.id)
     scd = SnapshotCheckData.get_by_filters(check_id=dc.id)[0]
     self.assertEqual(json.loads(scd.check_value), ["me"])
Esempio n. 8
0
File: tests.py Progetto: xecgr/kafka
 def test_02_collector(self):
     """collector process test"""
     done_snapshots = Snapshot.count_by_filters()
     for d in Domain.get_by_filters():
         ok = run_collector(domain_id=d.id)
         if ok:
             done_snapshots += 1
     total_snapshots = Snapshot.count_by_filters()
     self.assertEqual(done_snapshots, total_snapshots)
Esempio n. 9
0
    def run(self):
        """
        Main function for thread that will
        """

        # It might be good to create a process rather than a thread per file, and to create a multithreading environment
        # to process the list rather than a single thread for all of it (test under big file environment).

        # One json per line
        my_list = self.buf_file.splitlines()

        for line in my_list:
            # Ignoring if there is an empty entrance in the list
            if line:

                # load json
                json_line = json.loads(line)

                # we are going to look for creative_size, if it does not exist we will get the information from ad_width
                # and ad_height
                creative_size = find_key("creative_size", json_line)
                if not creative_size:
                    value_width = find_key("ad_width", json_line)
                    value_height = find_key("ad_height", json_line)
                    if value_width and value_height:
                        creative_size = [value_width[0] + "x" + value_height[0]]

                # We are going to look for the keys page_url and Referer
                referer = find_key("Referer", json_line)
                url = find_key("page_url", json_line)

                # If the three elements were found, introduce them in the DB.
                if creative_size and referer and url:
                    with transaction() as session:
                        added = False
                        # Check the existence of the entrance before introducing a repetitive one
                        if not session.query(Domain).filter(Domain.url==url[0]).first():
                            session.add(Domain(url[0]))
                            added = True
                        if not session.query(Referer).filter(Referer.url==referer[0]).first():
                            session.add(Referer(referer[0]))
                            added = True

                        session.flush()

                        # If one of the previous tables has a new entry. No need to check of existence in here.
                        if added:
                            session.add(Information(domain_url=url[0], referer_url=referer[0], creative_size=creative_size[0]))

                        elif not session.query(Information).filter(Information.domain_url==url[0])\
                                .filter(Information.referer_url==referer[0])\
                                .filter(Information.creative_size==creative_size[0]).first():
                            session.add(Information(domain_url=url[0], referer_url=referer[0], creative_size=creative_size[0]))

        print "Database updated with information from file: %s" % self.f_name
Esempio n. 10
0
    def check_diff(self):
        idx_list = [idx.id for idx in History.select(History.id).where(History.diff == True)
                    .order_by(History.id.desc()).limit(self.cfg.DiffCount())]
        ip_diff_add_sql = IP.select(fn.Count(fn.Distinct(IP.ip))).join(Item).where(IP.add == idx_list[0]).scalar()
        ip_diff_purge_sql = IP.select(fn.Count(fn.Distinct(IP.ip))).join(Item).where(IP.purge == idx_list[0]).scalar()
        domain_diff_add_sql = Domain.select(fn.Count(fn.Distinct(Domain.domain)))\
            .join(Item).where(Domain.add == idx_list[0]).scalar()
        domain_diff_purge_sql = Domain.select(fn.Count(fn.Distinct(Domain.domain)))\
            .join(Item).where(Domain.purge == idx_list[0]).scalar()
        url_diff_add_sql = URL.select(fn.Count(fn.Distinct(URL.url)))\
            .join(Item).where(URL.add == idx_list[0]).scalar()
        url_diff_purge_sql = URL.select(fn.Count(fn.Distinct(URL.url)))\
            .join(Item).where(URL.purge == idx_list[0]).scalar()

        if ip_diff_add_sql or ip_diff_purge_sql or domain_diff_add_sql or \
                domain_diff_purge_sql or url_diff_add_sql or url_diff_purge_sql:
            return True
        else:
            History.update(diff=False).where(History.id == idx_list[0]).execute()
            return False
Esempio n. 11
0
def run(*args, domain_id='', **kwargs):
    """Collector execution, get domains HTML running an HTTP GET request
    
    Keyword arguments:
    domain_id  -- domain_id that should be retrieved
    """
    logging.info("start collector run")
    domain = Domain.get_by_filters(id=domain_id)
    if domain:
        domain = domain[0]
        headers = {
            #custom headers
        }
        proxies = {
            #custom proxies
        }
        params = {
            #custom query string
        }
        res = None

        try:
            res = requests.get(
                domain.url,
                headers=headers,
                proxies=proxies,
                data=params,
                timeout=
                10  #avoid hangs! -> https://requests.readthedocs.io/en/master/user/advanced/#timeouts
            )
        except:
            #bad things can happen with users' input and i/o operations
            #here a good error callback function will be super!
            logging.exception(f"Error fething {domain.url}")
        snapshot_id = None
        if res:
            s = Snapshot(domain_id=domain_id,
                         pulled_at=datetime.utcnow(),
                         html=res.text)
            s.save()

            domain.last_snapshot_at = datetime.utcnow()
            domain.save()

            snapshot_id = s.id
            SnapshotMetadata(
                snapshot_id=s.id,
                domain_id=domain_id,
                request_time=res.elapsed.total_seconds(),
                request_status=res.status_code,
            ).save()
        return snapshot_id
Esempio n. 12
0
  def message_handler(self, channel, method, properties, body):
    if self.verbose:
      logger.info("received %r" % (body,))

    message = json.loads(body)
    domain = Domain.normalize(message.get('domain'))
    response = message.get('response')    
    query_time = message.get('time', time.time())
        
    # parse dns message if there was one
    if response:
      try:
        response = wdns.parse_message(response.decode('base64'))
      except wdns.MessageParseException, e:
        response = None
Esempio n. 13
0
    def message_handler(self, channel, method, properties, body):
        if self.verbose:
            logger.info("received %r" % (body, ))

        message = json.loads(body)
        domain = Domain.normalize(message.get('domain'))
        response = message.get('response')
        query_time = message.get('time', time.time())

        # parse dns message if there was one
        if response:
            try:
                response = wdns.parse_message(response.decode('base64'))
            except wdns.MessageParseException, e:
                response = None
Esempio n. 14
0
    def statistics_show(self, diff=0, stdout=False):

        date_time = datetime.fromtimestamp(int(Dump.get(Dump.param == 'lastDumpDate')
                                               .value)).strftime('%Y-%m-%d %H:%M:%S')

        message = 'vigruzki.rkn.gov.ru update: ' + date_time + '\n'

        url_add_sql = self._url_diff_sql(diff, 'ignore', 1)
        message += '\nURLs added: \n\n'
        for url_add in url_add_sql:
            message += url_add.url + '\n'

        ip_add_sql = self._ip_diff_sql(diff, 'ignore', 1)
        message += '\nIPs added: \n\n'
        for ip_add in ip_add_sql:
            if ip_add.mask < 32:
                message += ip_add.ip + '/' + str(ip_add.mask)
            else:
                message += ip_add.ip + '\n'

        domain_add_sql = self._domain_diff_sql(diff, 'ignore', 1)
        message += '\nDOMAINs added: \n\n'
        for domain_add in domain_add_sql:
            message += domain_add.domain + '\n'

        url_del_sql = self._url_diff_sql(diff, 'ignore', 0)
        message += '\nURLs deleted: \n\n'
        for url_del in url_del_sql:
            message += url_del.url + '\n'

        ip_del_sql = self._ip_diff_sql(diff, 'ignore', 0)
        message += '\nIPs deleted: \n\n'
        for ip_del in ip_del_sql:
            if ip_del.mask < 32:
                message += ip_del.ip + '/' + str(ip_del.mask)
            else:
                message += ip_del.ip + '\n'

        domain_del_sql = self._domain_diff_sql(diff, 'ignore', 0)
        message += '\nDOMAINs deleted: \n\n'
        for domain_del in domain_del_sql:
            message += domain_del.domain + '\n'

        rb_list = self.idx_list[:diff]
        domain_count = Domain.select(fn.Count(fn.Distinct(Domain.domain)))\
            .where(~(Domain.add << rb_list) & ((Domain.purge >> None) | (Domain.purge << rb_list))).scalar()
        url_count = URL.select(fn.Count(fn.Distinct(URL.url)))\
            .where(~(URL.add << rb_list) & ((URL.purge >> None) | (URL.purge << rb_list))).scalar()
        ip_count = IP.select(fn.Count(fn.Distinct(IP.ip)))\
            .where(~(IP.add << rb_list) & ((IP.purge >> None) | (IP.purge << rb_list))).scalar()
        id_count = Item.select(fn.Count(fn.Distinct(Item.content_id)))\
            .where(~(Item.add << rb_list) & ((Item.purge >> None) | (Item.purge << rb_list))).scalar()

        message += '\nURLs count: ' + str(url_count) + '\n'
        message += 'IPs count: ' + str(ip_count) + '\n'
        message += 'DOMAINs count: ' + str(domain_count) + '\n'
        message += 'Item count: ' + str(id_count) + '\n'

        if stdout:
            print(message)
            return False
        else:
            return message
Esempio n. 15
0
    def parse_dump(self):
        if not os.path.exists(self.path_py + '/dump.xml'):
            logger.info('dump.xml not found: s%', self.path_py + '/dump.xml')
            return 0
        logger.info('dump.xml already exists.')
        tree_xml = ElementTree().parse(self.path_py + '/dump.xml')

        dt = datetime.strptime(tree_xml.attrib['updateTime'][:19],
                               '%Y-%m-%dT%H:%M:%S')
        update_time = int(time.mktime(dt.timetuple()))
        Dump.update(value=update_time).where(
            Dump.param == 'lastDumpDate').execute()
        logger.info('Got updateTime: %s.', update_time)

        dt = datetime.strptime(tree_xml.attrib['updateTimeUrgently'][:19],
                               '%Y-%m-%dT%H:%M:%S')
        update_time_urgently = int(time.mktime(dt.timetuple()))
        Dump.update(value=update_time_urgently).where(
            Dump.param == 'lastDumpDateUrgently').execute()
        logger.info('Got updateTimeUrgently: %s.', update_time_urgently)

        list_xml = tree_xml.findall(".//*[@id]")
        id_set_dump = set()
        id_set_db = set()
        for content_xml in list_xml:
            # print(content_xml.tag, content_xml.attrib, content_xml.text)
            id_set_dump.add(int(content_xml.attrib['id']))

        select_content_id_db = Item.select(
            Item.content_id).where(Item.purge >> None)
        for content_db in select_content_id_db:
            id_set_db.add(content_db.content_id)

        common_id_set = id_set_dump.intersection(id_set_db)
        delete_id_set = id_set_db.difference(common_id_set)
        add_id_set = id_set_dump.difference(common_id_set)
        # print(delete_id_set)
        # print(add_id_set)

        if len(delete_id_set) > 0:
            with self.transact.atomic():
                for del_item in delete_id_set:
                    logger.info('Full delete Item, IP, Domain, URL id: %s.',
                                del_item)

                    Item.update(purge=self.code_id).where(
                        Item.content_id == del_item,
                        Item.purge >> None).execute()
                    Domain.update(purge=self.code_id).where(
                        Domain.content_id == del_item,
                        Domain.purge >> None).execute()
                    URL.update(purge=self.code_id).where(
                        URL.content_id == del_item,
                        URL.purge >> None).execute()
                    IP.update(purge=self.code_id).where(
                        IP.content_id == del_item, IP.purge >> None).execute()

        if len(add_id_set) > 0:
            include_time = str()
            urgency_type = int()
            entry_type = int()
            block_type = str()
            hash_value = str()
            with self.transact.atomic():
                for new_item in add_id_set:
                    logger.info('New Item, IP, Domain, URL id: %s.', new_item)
                    new_item_xml = tree_xml.find(".//content[@id='" +
                                                 str(new_item) + "']")
                    for data_xml in new_item_xml.iter():
                        if data_xml.tag == 'content':
                            content_id = int(data_xml.attrib['id'])
                            try:
                                urgency_type = int(
                                    data_xml.attrib['urgencyType'])
                            except KeyError:
                                urgency_type = 0
                            include_time = self.date_time_xml_to_db(
                                data_xml.attrib['includeTime'])
                            try:
                                block_type = data_xml.attrib['blockType']
                            except KeyError:
                                block_type = 'default'
                            entry_type = int(data_xml.attrib['entryType'])
                            hash_value = data_xml.attrib['hash']
                        if data_xml.tag == 'decision':
                            decision_date = data_xml.attrib['date']
                            decision_number = data_xml.attrib['number']
                            decision_org = data_xml.attrib['org']
                            item_new = Item(content_id=content_id,
                                            includeTime=include_time,
                                            urgencyType=urgency_type,
                                            entryType=entry_type,
                                            blockType=block_type,
                                            hashRecord=hash_value,
                                            decision_date=decision_date,
                                            decision_num=decision_number,
                                            decision_org=decision_org,
                                            add=self.code_id)
                            item_new.save()
                        if data_xml.tag == 'url':
                            if not self.only_ascii(data_xml.text):
                                url_split = str(data_xml.text).split(':')
                                url = url_split[0] + ':' + urllib.parse.quote(
                                    url_split[1])
                            else:
                                url = data_xml.text
                            URL.create(item=item_new.id,
                                       content_id=content_id,
                                       url=url,
                                       add=self.code_id)
                        if data_xml.tag == 'domain':
                            if not self.only_ascii(data_xml.text):
                                domain = (str(
                                    data_xml.text).encode('idna')).decode()
                            else:
                                domain = data_xml.text
                            Domain.create(item=item_new.id,
                                          content_id=content_id,
                                          domain=domain,
                                          add=self.code_id)
                        if data_xml.tag == 'ip':
                            ip = data_xml.text
                            IP.create(item=item_new.id,
                                      content_id=content_id,
                                      ip=ip,
                                      add=self.code_id)
                        if data_xml.tag == 'ipSubnet':
                            net = data_xml.text.split('/')
                            ip = net[0]
                            mask = net[1]
                            IP.create(item=item_new.id,
                                      content_id=content_id,
                                      ip=ip,
                                      mask=mask,
                                      add=self.code_id)

        url_db_set = set()
        url_xml_set = set()
        ip_db_set = set()
        ip_xml_set = set()
        sub_ip_xml_set = set()
        sub_ip_db_set = set()
        domain_db_set = set()
        domain_xml_set = set()
        data_update = False
        with self.transact.atomic():
            for item_xml in list_xml:
                for data_xml in item_xml.iter():
                    # print(data_xml.tag, data_xml.attrib, data_xml.text)
                    if data_xml.tag == 'content':
                        content_id = int(data_xml.attrib['id'])
                        hash_value = data_xml.attrib['hash']
                        item_db = Item.get(Item.content_id == content_id,
                                           Item.purge >> None)

                        if hash_value != item_db.hashRecord:
                            logger.info('Hashes not equal, update hash id: %s',
                                        content_id)
                            try:
                                urgency_type = int(
                                    data_xml.attrib['urgencyType'])
                            except KeyError:
                                urgency_type = 0
                            include_time = self.date_time_xml_to_db(
                                data_xml.attrib['includeTime'])
                            try:
                                block_type = data_xml.attrib['blockType']
                            except KeyError:
                                block_type = 'default'
                            entry_type = int(data_xml.attrib['entryType'])
                            item_db.hashRecord = hash_value
                            # Item.update(purge=None).where(Item.content_id == content_id).execute()
                            data_update = True
                        else:
                            data_update = False
                            break

                    if data_xml.tag == 'decision':
                        decision_date = data_xml.attrib['date']
                        decision_number = data_xml.attrib['number']
                        decision_org = data_xml.attrib['org']
                        # print(item_db)
                        if str(item_db.includeTime) != include_time:
                            logger.info('content_id: %s.', content_id)
                            logger.info('XML includeTime: %s.', include_time)
                            logger.info('DB includeTime: %s.',
                                        item_db.includeTime)
                            item_db.includeTime = include_time
                            # Item.update(includeTime=include_time).where(Item.content_id == content_id,
                            #                                             Item.purge >> None).execute()
                        if item_db.urgencyType != urgency_type:
                            logger.info('content_id: %s.', content_id)
                            logger.info('XML urgencyType: %s.', urgency_type)
                            logger.info('DB urgencyType: %s.',
                                        item_db.urgencyType)
                            item_db.urgencyType = urgency_type
                            # Item.update(urgencyType=urgency_type).where(Item.content_id == content_id,
                            #                                             Item.purge >> None).execute()
                        if item_db.blockType != block_type:
                            logger.info('content_id: %s.', content_id)
                            logger.info('XML blockType: %s.', block_type)
                            logger.info('DB blockType: %s.', item_db.blockType)
                            item_db.blockType = block_type
                            # Item.update(blockType=block_type).where(Item.content_id == content_id,
                            #                                         Item.purge >> None).execute()
                        if item_db.entryType != entry_type:
                            logger.info('content_id: %s.', content_id)
                            logger.info('XML entryType: %s.', entry_type)
                            logger.info('DB entryType: %s.', item_db.entryType)
                            item_db.entryType = entry_type
                            # Item.update(entryType=entry_type).where(Item.content_id == content_id,
                            #                                         Item.purge >> None).execute()
                        if str(item_db.decision_date) != decision_date:
                            logger.info('content_id: %s.', content_id)
                            logger.info('XML date: %s.', decision_date)
                            logger.info('DB date: %s.',
                                        str(item_db.decision_date))
                            item_db.decision_date = decision_date
                            # Item.update(decision_date=decision_date).where(Item.content_id == content_id,
                            #                                                Item.purge >> None).execute()
                        if item_db.decision_num != decision_number:
                            logger.info('content_id: %s.', content_id)
                            logger.info('XML number: %s.', decision_number)
                            logger.info('DB number: %s.', item_db.decision_num)
                            item_db.decision_num = decision_number
                            # Item.update(decision_num=decision_number).where(Item.content_id == content_id,
                            #                                                 Item.purge >> None).execute()
                        if item_db.decision_org != decision_org:
                            logger.info('content_id: %s.', content_id)
                            logger.info('XML org: %s.', decision_org)
                            logger.info('DB org: %s.', item_db.decision_org)
                            item_db.decision_org = decision_org
                            # Item.update(decision_org=decision_org).where(Item.content_id == content_id,
                            #                                              Item.purge >> None).execute()

                    if data_xml.tag == 'url':
                        if not self.only_ascii(data_xml.text):
                            url_split = str(data_xml.text).split(':')
                            url = url_split[0] + ':' + urllib.parse.quote(
                                url_split[1])
                        else:
                            url = data_xml.text
                        url_xml_set.add(url)

                    if data_xml.tag == 'domain':
                        if not self.only_ascii(data_xml.text):
                            domain = (str(
                                data_xml.text).encode('idna')).decode()
                        else:
                            domain = data_xml.text
                        domain_xml_set.add(domain)

                    if data_xml.tag == 'ip':
                        ip_xml_set.add(data_xml.text)

                    if data_xml.tag == 'ipSubnet':
                        sub_ip_xml_set.add(data_xml.text)

                if data_update:
                    url_db = URL.select().where(URL.item == item_db.id,
                                                URL.purge >> None)

                    for url_item in url_db:
                        url_db_set.add(url_item.url)
                    if url_db_set != url_xml_set:
                        common_url_set = url_xml_set.intersection(url_db_set)
                        delete_url_set = url_db_set.difference(common_url_set)
                        add_url_set = url_xml_set.difference(common_url_set)
                        if len(delete_url_set) > 0:
                            logger.info('Delete id %s URL: %s', content_id,
                                        delete_url_set)
                            for delete_url in delete_url_set:
                                URL.update(purge=self.code_id).where(
                                    URL.item == item_db.id,
                                    URL.url == delete_url,
                                    URL.purge >> None).execute()
                        if len(add_url_set) > 0:
                            logger.info('Add id %s URL: %s', content_id,
                                        add_url_set)
                            for add_url in add_url_set:
                                URL.create(item=item_db.id,
                                           content_id=item_db.content_id,
                                           url=add_url,
                                           add=self.code_id)
                    url_db_set.clear()
                    url_xml_set.clear()

                    domain_db = Domain.select().where(
                        Domain.item == item_db.id, Domain.purge >> None)

                    for domain_item in domain_db:
                        domain_db_set.add(domain_item.domain)
                    if domain_db_set != domain_xml_set:
                        common_domain_set = domain_xml_set.intersection(
                            domain_db_set)
                        delete_domain_set = domain_db_set.difference(
                            common_domain_set)
                        add_domain_set = domain_xml_set.difference(
                            common_domain_set)
                        if len(delete_domain_set) > 0:
                            logger.info('Delete id %s Domain: %s', content_id,
                                        delete_domain_set)
                            for delete_domain in delete_domain_set:
                                Domain.update(purge=self.code_id).where(
                                    Domain.item == item_db.id,
                                    Domain.domain == delete_domain,
                                    Domain.purge >> None).execute()
                        if len(add_domain_set) > 0:
                            logger.info('Add id %s Domain: %s', content_id,
                                        add_domain_set)
                            for add_domain in add_domain_set:
                                Domain.create(item=item_db.id,
                                              content_id=item_db.content_id,
                                              domain=add_domain,
                                              add=self.code_id)
                    domain_db_set.clear()
                    domain_xml_set.clear()

                    ip_db = IP.select().where(IP.item == item_db.id,
                                              IP.mask == 32, IP.purge >> None)

                    for ip_item in ip_db:
                        ip_db_set.add(ip_item.ip)
                    if ip_db_set != ip_xml_set:
                        common_ip_set = ip_xml_set.intersection(ip_db_set)
                        delete_ip_set = ip_db_set.difference(common_ip_set)
                        add_ip_set = ip_xml_set.difference(common_ip_set)
                        if len(delete_ip_set) > 0:
                            logger.info('Delete id %s ip: %s', content_id,
                                        delete_ip_set)
                            for delete_ip in delete_ip_set:
                                IP.update(purge=self.code_id).where(
                                    IP.item == item_db.id, IP.ip == delete_ip,
                                    IP.mask == 32, IP.purge >> None).execute()
                        if len(add_ip_set) > 0:
                            logger.info('Add id %s ip: %s', content_id,
                                        add_ip_set)
                            for add_ip in add_ip_set:
                                IP.create(item=item_db.id,
                                          content_id=item_db.content_id,
                                          ip=add_ip,
                                          add=self.code_id)
                    ip_db_set.clear()
                    ip_xml_set.clear()

                    sub_ip_db = IP.select().where(IP.item == item_db.id,
                                                  IP.mask < 32,
                                                  IP.purge >> None)

                    for sub_ip_item in sub_ip_db:
                        sub_ip_db_set.add(
                            str(sub_ip_item.ip) + '/' + str(sub_ip_item.mask))
                    if sub_ip_db_set != sub_ip_xml_set:
                        common_sub_ip_set = sub_ip_xml_set.intersection(
                            sub_ip_db_set)
                        delete_sub_ip_set = sub_ip_db_set.difference(
                            common_sub_ip_set)
                        add_sub_ip_set = sub_ip_xml_set.difference(
                            common_sub_ip_set)
                        if len(delete_sub_ip_set) > 0:
                            logger.info('Delete id %s subnet: %s', content_id,
                                        delete_sub_ip_set)
                            for delete_sub_ip in delete_sub_ip_set:
                                del_subnet = str(delete_sub_ip).split('/')
                                del_ip = del_subnet[0]
                                del_mask = del_subnet[1]
                                IP.update(purge=self.code_id).where(
                                    IP.item == item_db.id, IP.ip == del_ip,
                                    IP.mask == del_mask,
                                    IP.purge >> None).execute()
                        if len(add_sub_ip_set) > 0:
                            logger.info('Add id %s subnet: %s', content_id,
                                        add_sub_ip_set)
                            for add_sub_ip in add_sub_ip_set:
                                add_subnet = str(add_sub_ip).split('/')
                                add_ip = add_subnet[0]
                                add_mask = add_subnet[1]
                                IP.create(item=item_db.id,
                                          content_id=item_db.content_id,
                                          ip=add_ip,
                                          mask=add_mask,
                                          add=self.code_id)
                    item_db.save()
                    sub_ip_db_set.clear()
                    sub_ip_xml_set.clear()

        if self.check_diff():
            self.cleaner()
            return 1
        else:
            logger.info('no updates')
            # print('no updates')
            return 2
Esempio n. 16
0
    def parse_dump(self):
        if not os.path.exists(self.path_py + '/dump.xml'):
            logger.info('dump.xml not found: s%', self.path_py + '/dump.xml')
            return 0
        logger.info('dump.xml already exists.')
        tree_xml = ElementTree().parse(self.path_py + '/dump.xml')

        dt = datetime.strptime(tree_xml.attrib['updateTime'][:19], '%Y-%m-%dT%H:%M:%S')
        update_time = int(time.mktime(dt.timetuple()))
        Dump.update(value=update_time).where(Dump.param == 'lastDumpDate').execute()
        logger.info('Got updateTime: %s.', update_time)

        dt = datetime.strptime(tree_xml.attrib['updateTimeUrgently'][:19], '%Y-%m-%dT%H:%M:%S')
        update_time_urgently = int(time.mktime(dt.timetuple()))
        Dump.update(value=update_time_urgently).where(Dump.param == 'lastDumpDateUrgently').execute()
        logger.info('Got updateTimeUrgently: %s.', update_time_urgently)

        list_xml = tree_xml.findall(".//*[@id]")
        id_set_dump = set()
        id_set_db = set()
        for content_xml in list_xml:
            # print(content_xml.tag, content_xml.attrib, content_xml.text)
            id_set_dump.add(int(content_xml.attrib['id']))

        select_content_id_db = Item.select(Item.content_id).where(Item.purge >> None)
        for content_db in select_content_id_db:
            id_set_db.add(content_db.content_id)

        common_id_set = id_set_dump.intersection(id_set_db)
        delete_id_set = id_set_db.difference(common_id_set)
        add_id_set = id_set_dump.difference(common_id_set)
        # print(delete_id_set)
        # print(add_id_set)

        if len(delete_id_set) > 0:
            with self.transact.atomic():
                for del_item in delete_id_set:
                    logger.info('Full delete Item, IP, Domain, URL id: %s.', del_item)

                    Item.update(purge=self.code_id).where(Item.content_id == del_item, Item.purge >> None).execute()
                    Domain.update(purge=self.code_id).where(Domain.content_id == del_item,
                                                            Domain.purge >> None).execute()
                    URL.update(purge=self.code_id).where(URL.content_id == del_item, URL.purge >> None).execute()
                    IP.update(purge=self.code_id).where(IP.content_id == del_item, IP.purge >> None).execute()

        if len(add_id_set) > 0:
            include_time = str()
            urgency_type = int()
            entry_type = int()
            block_type = str()
            hash_value = str()
            with self.transact.atomic():
                for new_item in add_id_set:
                    logger.info('New Item, IP, Domain, URL id: %s.', new_item)
                    new_item_xml = tree_xml.find(".//content[@id='" + str(new_item) + "']")
                    for data_xml in new_item_xml.iter():
                        if data_xml.tag == 'content':
                            content_id = int(data_xml.attrib['id'])
                            try:
                                urgency_type = int(data_xml.attrib['urgencyType'])
                            except KeyError:
                                urgency_type = 0
                            include_time = self.date_time_xml_to_db(data_xml.attrib['includeTime'])
                            try:
                                block_type = data_xml.attrib['blockType']
                            except KeyError:
                                block_type = 'default'
                            entry_type = int(data_xml.attrib['entryType'])
                            hash_value = data_xml.attrib['hash']
                        if data_xml.tag == 'decision':
                            decision_date = data_xml.attrib['date']
                            decision_number = data_xml.attrib['number']
                            decision_org = data_xml.attrib['org']
                            item_new = Item(content_id=content_id, includeTime=include_time,
                                            urgencyType=urgency_type, entryType=entry_type, blockType=block_type,
                                            hashRecord=hash_value, decision_date=decision_date,
                                            decision_num=decision_number, decision_org=decision_org,
                                            add=self.code_id)
                            item_new.save()
                        if data_xml.tag == 'url':
                            if not self.only_ascii(data_xml.text):
                                url_split = str(data_xml.text).split(':')
                                url = url_split[0] + ':' + urllib.parse.quote(url_split[1])
                            else:
                                url = data_xml.text
                            URL.create(item=item_new.id, content_id=content_id, url=url, add=self.code_id)
                        if data_xml.tag == 'domain':
                            if not self.only_ascii(data_xml.text):
                                domain = (str(data_xml.text).encode('idna')).decode()
                            else:
                                domain = data_xml.text
                            Domain.create(item=item_new.id, content_id=content_id, domain=domain, add=self.code_id)
                        if data_xml.tag == 'ip':
                            ip = data_xml.text
                            IP.create(item=item_new.id, content_id=content_id, ip=ip, add=self.code_id)
                        if data_xml.tag == 'ipSubnet':
                            net = data_xml.text.split('/')
                            ip = net[0]
                            mask = net[1]
                            IP.create(item=item_new.id, content_id=content_id, ip=ip, mask=mask, add=self.code_id)

        url_db_set = set()
        url_xml_set = set()
        ip_db_set = set()
        ip_xml_set = set()
        sub_ip_xml_set = set()
        sub_ip_db_set = set()
        domain_db_set = set()
        domain_xml_set = set()
        data_update = False
        with self.transact.atomic():
            for item_xml in list_xml:
                for data_xml in item_xml.iter():
                    # print(data_xml.tag, data_xml.attrib, data_xml.text)
                    if data_xml.tag == 'content':
                        content_id = int(data_xml.attrib['id'])
                        hash_value = data_xml.attrib['hash']
                        item_db = Item.get(Item.content_id == content_id, Item.purge >> None)

                        if hash_value != item_db.hashRecord:
                            logger.info('Hashes not equal, update hash id: %s', content_id)
                            try:
                                urgency_type = int(data_xml.attrib['urgencyType'])
                            except KeyError:
                                urgency_type = 0
                            include_time = self.date_time_xml_to_db(data_xml.attrib['includeTime'])
                            try:
                                block_type = data_xml.attrib['blockType']
                            except KeyError:
                                block_type = 'default'
                            entry_type = int(data_xml.attrib['entryType'])
                            item_db.hashRecord = hash_value
                            # Item.update(purge=None).where(Item.content_id == content_id).execute()
                            data_update = True
                        else:
                            data_update = False
                            break

                    if data_xml.tag == 'decision':
                        decision_date = data_xml.attrib['date']
                        decision_number = data_xml.attrib['number']
                        decision_org = data_xml.attrib['org']
                        # print(item_db)
                        if str(item_db.includeTime) != include_time:
                            logger.info('content_id: %s.', content_id)
                            logger.info('XML includeTime: %s.', include_time)
                            logger.info('DB includeTime: %s.', item_db.includeTime)
                            item_db.includeTime = include_time
                            # Item.update(includeTime=include_time).where(Item.content_id == content_id,
                            #                                             Item.purge >> None).execute()
                        if item_db.urgencyType != urgency_type:
                            logger.info('content_id: %s.', content_id)
                            logger.info('XML urgencyType: %s.', urgency_type)
                            logger.info('DB urgencyType: %s.', item_db.urgencyType)
                            item_db.urgencyType = urgency_type
                            # Item.update(urgencyType=urgency_type).where(Item.content_id == content_id,
                            #                                             Item.purge >> None).execute()
                        if item_db.blockType != block_type:
                            logger.info('content_id: %s.', content_id)
                            logger.info('XML blockType: %s.', block_type)
                            logger.info('DB blockType: %s.', item_db.blockType)
                            item_db.blockType = block_type
                            # Item.update(blockType=block_type).where(Item.content_id == content_id,
                            #                                         Item.purge >> None).execute()
                        if item_db.entryType != entry_type:
                            logger.info('content_id: %s.', content_id)
                            logger.info('XML entryType: %s.', entry_type)
                            logger.info('DB entryType: %s.', item_db.entryType)
                            item_db.entryType = entry_type
                            # Item.update(entryType=entry_type).where(Item.content_id == content_id,
                            #                                         Item.purge >> None).execute()
                        if str(item_db.decision_date) != decision_date:
                            logger.info('content_id: %s.', content_id)
                            logger.info('XML date: %s.', decision_date)
                            logger.info('DB date: %s.', str(item_db.decision_date))
                            item_db.decision_date = decision_date
                            # Item.update(decision_date=decision_date).where(Item.content_id == content_id,
                            #                                                Item.purge >> None).execute()
                        if item_db.decision_num != decision_number:
                            logger.info('content_id: %s.', content_id)
                            logger.info('XML number: %s.', decision_number)
                            logger.info('DB number: %s.', item_db.decision_num)
                            item_db.decision_num = decision_number
                            # Item.update(decision_num=decision_number).where(Item.content_id == content_id,
                            #                                                 Item.purge >> None).execute()
                        if item_db.decision_org != decision_org:
                            logger.info('content_id: %s.', content_id)
                            logger.info('XML org: %s.', decision_org)
                            logger.info('DB org: %s.', item_db.decision_org)
                            item_db.decision_org = decision_org
                            # Item.update(decision_org=decision_org).where(Item.content_id == content_id,
                            #                                              Item.purge >> None).execute()

                    if data_xml.tag == 'url':
                        if not self.only_ascii(data_xml.text):
                            url_split = str(data_xml.text).split(':')
                            url = url_split[0] + ':' + urllib.parse.quote(url_split[1])
                        else:
                            url = data_xml.text
                        url_xml_set.add(url)

                    if data_xml.tag == 'domain':
                        if not self.only_ascii(data_xml.text):
                            domain = (str(data_xml.text).encode('idna')).decode()
                        else:
                            domain = data_xml.text
                        domain_xml_set.add(domain)

                    if data_xml.tag == 'ip':
                        ip_xml_set.add(data_xml.text)

                    if data_xml.tag == 'ipSubnet':
                        sub_ip_xml_set.add(data_xml.text)

                if data_update:
                    url_db = URL.select().where(URL.item == item_db.id, URL.purge >> None)

                    for url_item in url_db:
                        url_db_set.add(url_item.url)
                    if url_db_set != url_xml_set:
                        common_url_set = url_xml_set.intersection(url_db_set)
                        delete_url_set = url_db_set.difference(common_url_set)
                        add_url_set = url_xml_set.difference(common_url_set)
                        if len(delete_url_set) > 0:
                            logger.info('Delete id %s URL: %s', content_id, delete_url_set)
                            for delete_url in delete_url_set:
                                URL.update(purge=self.code_id).where(URL.item == item_db.id, URL.url == delete_url,
                                                                     URL.purge >> None).execute()
                        if len(add_url_set) > 0:
                            logger.info('Add id %s URL: %s', content_id, add_url_set)
                            for add_url in add_url_set:
                                URL.create(item=item_db.id, content_id=item_db.content_id, url=add_url,
                                           add=self.code_id)
                    url_db_set.clear()
                    url_xml_set.clear()

                    domain_db = Domain.select().where(Domain.item == item_db.id, Domain.purge >> None)

                    for domain_item in domain_db:
                        domain_db_set.add(domain_item.domain)
                    if domain_db_set != domain_xml_set:
                        common_domain_set = domain_xml_set.intersection(domain_db_set)
                        delete_domain_set = domain_db_set.difference(common_domain_set)
                        add_domain_set = domain_xml_set.difference(common_domain_set)
                        if len(delete_domain_set) > 0:
                            logger.info('Delete id %s Domain: %s', content_id, delete_domain_set)
                            for delete_domain in delete_domain_set:
                                Domain.update(purge=self.code_id).where(Domain.item == item_db.id,
                                                                        Domain.domain == delete_domain,
                                                                        Domain.purge >> None).execute()
                        if len(add_domain_set) > 0:
                            logger.info('Add id %s Domain: %s', content_id, add_domain_set)
                            for add_domain in add_domain_set:
                                Domain.create(item=item_db.id, content_id=item_db.content_id, domain=add_domain,
                                              add=self.code_id)
                    domain_db_set.clear()
                    domain_xml_set.clear()

                    ip_db = IP.select().where(IP.item == item_db.id, IP.mask == 32, IP.purge >> None)

                    for ip_item in ip_db:
                        ip_db_set.add(ip_item.ip)
                    if ip_db_set != ip_xml_set:
                        common_ip_set = ip_xml_set.intersection(ip_db_set)
                        delete_ip_set = ip_db_set.difference(common_ip_set)
                        add_ip_set = ip_xml_set.difference(common_ip_set)
                        if len(delete_ip_set) > 0:
                            logger.info('Delete id %s ip: %s', content_id, delete_ip_set)
                            for delete_ip in delete_ip_set:
                                IP.update(purge=self.code_id).where(IP.item == item_db.id, IP.ip == delete_ip,
                                                                    IP.mask == 32, IP.purge >> None).execute()
                        if len(add_ip_set) > 0:
                            logger.info('Add id %s ip: %s', content_id, add_ip_set)
                            for add_ip in add_ip_set:
                                IP.create(item=item_db.id, content_id=item_db.content_id, ip=add_ip,
                                          add=self.code_id)
                    ip_db_set.clear()
                    ip_xml_set.clear()

                    sub_ip_db = IP.select().where(IP.item == item_db.id, IP.mask < 32, IP.purge >> None)

                    for sub_ip_item in sub_ip_db:
                        sub_ip_db_set.add(str(sub_ip_item.ip) + '/' + str(sub_ip_item.mask))
                    if sub_ip_db_set != sub_ip_xml_set:
                        common_sub_ip_set = sub_ip_xml_set.intersection(sub_ip_db_set)
                        delete_sub_ip_set = sub_ip_db_set.difference(common_sub_ip_set)
                        add_sub_ip_set = sub_ip_xml_set.difference(common_sub_ip_set)
                        if len(delete_sub_ip_set) > 0:
                            logger.info('Delete id %s subnet: %s', content_id, delete_sub_ip_set)
                            for delete_sub_ip in delete_sub_ip_set:
                                del_subnet = str(delete_sub_ip).split('/')
                                del_ip = del_subnet[0]
                                del_mask = del_subnet[1]
                                IP.update(purge=self.code_id).where(IP.item == item_db.id, IP.ip == del_ip,
                                                                    IP.mask == del_mask, IP.purge >> None).execute()
                        if len(add_sub_ip_set) > 0:
                            logger.info('Add id %s subnet: %s', content_id, add_sub_ip_set)
                            for add_sub_ip in add_sub_ip_set:
                                add_subnet = str(add_sub_ip).split('/')
                                add_ip = add_subnet[0]
                                add_mask = add_subnet[1]
                                IP.create(item=item_db.id, content_id=item_db.content_id, ip=add_ip, mask=add_mask,
                                          add=self.code_id)
                    item_db.save()
                    sub_ip_db_set.clear()
                    sub_ip_xml_set.clear()

        if self.check_diff():
            self.cleaner()
            return 1
        else:
            logger.info('no updates')
            self.cleaner()
            return 2
Esempio n. 17
0
    def _domain_dedup_sql(self, diff, bt, stat):
        rb_list_add = self.idx_list[:diff+1]
        rb_list_purge = self.idx_list[:diff]
        if stat and bt == 'ignore':
            domain_diff_sql = Domain.select(fn.Distinct(Domain.domain)).where(Domain.add == self.idx_list[diff])
            domain_dup_sql = Domain.select(fn.Distinct(Domain.domain))\
                .where(~(Domain.add << rb_list_add) & ((Domain.purge >> None) | (Domain.purge << rb_list_add)) &
                       (Domain.domain << domain_diff_sql))
            domain_dedup_sql = Domain.select(fn.Distinct(Domain.domain)).where((Domain.add == self.idx_list[diff]) &
                                                                               ~(Domain.domain << domain_dup_sql))
            return domain_dedup_sql
        elif not stat and bt == 'ignore':
            domain_diff_sql = Domain.select(fn.Distinct(Domain.domain)).where(Domain.purge == self.idx_list[diff])
            domain_dup_sql = Domain.select(fn.Distinct(Domain.domain))\
                .where(~(Domain.add << rb_list_purge) & (Domain.purge >> None) &
                       (Domain.domain << domain_diff_sql))
            domain_dedup_sql = Domain.select(fn.Distinct(Domain.domain)).where((Domain.purge == self.idx_list[diff]) &
                                                                               ~(Domain.domain << domain_dup_sql))
            return domain_dedup_sql
        elif stat and (bt == 'ip' or bt == 'default' or bt == 'domain' or bt == 'domain-mask'):
            domain_diff_sql = Domain.select(fn.Distinct(Domain.domain)).join(Item)\
                .where(Item.blockType == bt, Domain.add == self.idx_list[diff])
            domain_dup_sql = Domain.select(fn.Distinct(Domain.domain)).join(Item)\
                .where((Item.blockType == bt) & ~(Domain.add << rb_list_add) &
                       (Domain.purge >> None) & (Domain.domain << domain_diff_sql))

            domain_dedup_sql = Domain.select(fn.Distinct(Domain.domain)).join(Item)\
                .where((Item.blockType == bt) & (Domain.add == self.idx_list[diff]) &
                       ~(Domain.domain << domain_dup_sql))
            return domain_dedup_sql
        elif not stat and (bt == 'ip' or bt == 'default' or bt == 'domain' or bt == 'domain-mask'):
            domain_diff_sql = Domain.select(fn.Distinct(Domain.domain)).join(Item)\
                .where(Item.blockType == bt, Domain.purge == self.idx_list[diff])
            domain_dup_sql = Domain.select(fn.Distinct(Domain.domain)).join(Item)\
                .where((Item.blockType == bt) & ~(Domain.add << rb_list_purge) &
                       (Domain.purge >> None) & (Domain.domain << domain_diff_sql))
            domain_dedup_sql = Domain.select(fn.Distinct(Domain.domain)).join(Item)\
                .where((Item.blockType == bt) & (Domain.purge == self.idx_list[diff]) &
                       ~(Domain.domain << domain_dup_sql))
            return domain_dedup_sql
Esempio n. 18
0
if __name__ == "__main__":
    domains = {
        "helsinkitimes": "https://www.helsinkitimes.fi/",
        "berlin": "https://www.berlin.de/en/news/",
        "9news": "https://www.9news.com.au/sydney",
        "fail!": "fail://fail.com",
    }
    domain_checks = {
        "helsinkitimes": ["covid(\\d+) ", "govern(\\w+)"],
    }
    delete_tables()
    create_tables()
    domain_ids = []
    for name, url in domains.items():
        d = Domain(domain=name, url=url)
        d.save()
        domain_ids.append(d.id)
        for idx, regexp in enumerate(domain_checks.get(name, [])):
            DomainCheck(
                domain_id=d.id,
                name=f"{name}-{idx}",
                regexp=regexp,
            ).save()

    for x in range(n_pulls):
        for d_id in domain_ids:
            print(f"sending collector task for {d_id}")
            send_task(topic=collector_topic, domain_id=d_id)
        time.sleep(sleep_between_pulls)