def test_add_same_swf_to_db(self):
        """Insert the same SWF twice and verify both stored rows.

        The SWF is first added, then looked up by hash; the stored
        ``local_path`` and occurrence vector are copied back onto
        ``swf_info`` before it is added a second time. Both resulting rows
        are then compared against ``swf_info``.
        """
        swf_info = self.populate_swf_info()
        swf_info.duplicate = 1  # we've seen this swf before

        swf_id = swfu.add_swf_to_db(swf_info, self.db_conn)

        rows = swfu.get_swf_obj_from_db('hash', swf_info.hash, self.db_cursor)
        if not rows:
            # Fail explicitly: the values read from the row below would
            # otherwise be undefined and surface as a NameError.
            self.fail("Cannot find SWF by hash in db")

        stored = rows[0]
        # NOTE(review): other code in this project sets ``local_path`` on the
        # info object rather than ``filename`` -- confirm which one is read.
        swf_info.filename = stored['local_path']
        swf_info.duplicate = 1
        swf_info.occ_vector = swfu.str_to_vector(stored['occ_vector'])

        swf_id_2 = swfu.add_swf_to_db(swf_info, self.db_conn)

        lookups = ((swf_id, "Cannot find SWF in db"),
                   (swf_id_2, "Cannot find second SWF in db"))
        fetched = []
        for row_id, failure_msg in lookups:
            # execute() is treated as falsy when no row matched.
            if not self.db_cursor.execute(
                    'SELECT * FROM swf_obj WHERE id = %s', (row_id, )):
                self.fail(failure_msg)
            fetched.append(self.db_cursor.fetchone())

        row_orig, row_second = fetched
        self.compare_swf_info(row_orig, swf_info)
        self.compare_swf_info(row_second, swf_info)
Esempio n. 2
0
    def should_crawl_and_find_swfs(self, url, expected_strings=(None, None), unexpected_strings=(None, None), crawler_type='chrome_lazy'):
        expected_fields, expected_values = expected_strings
        unexpected_fields, unexpected_values = unexpected_strings
        
        crawl_id = ag.crawl_sites([(1, url),], crawler_type, num_crawl_urls=1)
        
        db_conn = dbu.mysql_init_db()
        db_curs = db_conn.cursor(mdb.cursors.DictCursor)
        
        rows = swu.get_swf_obj_from_db('crawl_id', int(crawl_id), db_curs)
        
        if expected_fields:
            found_dict = {}
            for field in expected_fields:
                found_dict[field] = False
        
        for row in rows:
            if expected_fields:
                for expected_field, expected_value in zip(expected_fields, expected_values):
                    if expected_value in row[expected_field]:
                        found_dict[expected_field] = True
                        print 'found in ',  row[expected_field]
                    
            if unexpected_values:
                for unexpected_field, unexpected_value in zip(unexpected_fields, unexpected_values) :
                    if unexpected_value in row[unexpected_field]:
                        self.fail('Unexpected field %s with unexpected value %s found' %(unexpected_field, unexpected_value))                    
        if expected_fields:
            for field, found in found_dict.iteritems():
                if not found:
                    self.fail('Cannot find %s' % field)

        db_curs.close()
        db_conn.close()
 def test_get_swf_obj_from_db(self):
     """Add one SWF and check it can be read back by its id.

     Fails when no row comes back, or when a returned row's ``swf_url``
     does not contain ``'http'``.
     """
     swf_info = self.populate_swf_info()
     swf_id = swfu.add_swf_to_db(swf_info, self.db_conn)
     rows = swfu.get_swf_obj_from_db('id', swf_id, self.db_cursor)
     # assertTrue replaces the deprecated ``assert_`` alias.
     self.assertTrue(len(rows), 'No SWF can be found in DB')
     for row in rows:
         self.assertTrue('http' in row['swf_url'],
                         'swf url is does not have http in it')
Esempio n. 4
0
def store_swfs(msg, crawl_id, dir_path='/tmp', prefix='?'):
    
    referer = msg.request.headers['Referer'][0] if msg.request.headers['Referer'] else ""
    
    if msg.response and msg.response.content:
        print msg.request.get_url()
        if (msg.response.content[:3] in SWF_MAGIC_NUMBERS): # to wide, but decompiler will discard them
            
            swf_hash = ut.hash_text(msg.response.content)
            swf_url = msg.request.get_url()
            
            db_conn = dbu.mysql_init_db()
            db_cursor = db_conn.cursor(dbu.mdb.cursors.DictCursor)
            rows = swu.get_swf_obj_from_db('hash', swf_hash, db_cursor)
            
            if not rows:
                swf_filename = os.path.join(dir_path, "%s-%s" % (prefix, msg.request.path.split('/')[-1]))
                swf_filename = swf_filename[:MAX_FILENAME_LEN]
                if not swf_filename.endswith('.swf'):
                    swf_filename += '.swf'
                    
                wl_log.info("SWF saved %s referrer: %s" % (os.path.basename(swf_filename), referer))
                
                fu.write_to_file(swf_filename, msg.response.content)
                vector = swu.get_occurence_vector_from_swf(swf_filename, os.path.join(dir_path, prefix))
                duplicate_swf = 0
            else:
                wl_log.info("A swf with same hash exists in DB: %s %s" % (swf_hash, swf_url))
                vector = swu.str_to_vector(rows[0]['occ_vector'])
                swf_filename = rows[0]['local_path']
                duplicate_swf = 1
            
            rank, domain = prefix.rsplit('/')[-1].split('-', 1)
            swf_info = swu.SwfInfo()
            
            swf_info.rank = rank # this might be fake
            swf_info.domain = domain
            swf_info.local_path = swf_filename
            swf_info.occ_vector = vector
            swf_info.hash = swf_hash
            swf_info.url = swf_url
            swf_info.referer = referer        
            swf_info.duplicate = duplicate_swf # !!! Y for repeated swfs(that we know before) 
            swf_info.feat_vector = []
            swf_info.page_url = ''
            swf_info.occ_string = ' '.join(swu.human_readable_occ_vector(vector))
            swf_info.crawl_id = crawl_id
            
            swu.add_swf_to_db(swf_info, db_conn)
            db_conn.commit()
            db_cursor.close()
            db_conn.close()
            
            
        elif '.swf' in msg.request.path:
            wl_log.warning(".swf in path but content seems non-swf %s %s" % (msg.request.path, msg.response.content[:100]))
        else:
            pass
Esempio n. 5
0
    def should_crawl_and_find_swfs(self,
                                   url,
                                   expected_strings=(None, None),
                                   unexpected_strings=(None, None),
                                   crawler_type='chrome_lazy'):
        expected_fields, expected_values = expected_strings
        unexpected_fields, unexpected_values = unexpected_strings

        crawl_id = ag.crawl_sites([
            (1, url),
        ], crawler_type, num_crawl_urls=1)

        db_conn = dbu.mysql_init_db()
        db_curs = db_conn.cursor(mdb.cursors.DictCursor)

        rows = swu.get_swf_obj_from_db('crawl_id', int(crawl_id), db_curs)

        if expected_fields:
            found_dict = {}
            for field in expected_fields:
                found_dict[field] = False

        for row in rows:
            if expected_fields:
                for expected_field, expected_value in zip(
                        expected_fields, expected_values):
                    if expected_value in row[expected_field]:
                        found_dict[expected_field] = True
                        print 'found in ', row[expected_field]

            if unexpected_values:
                for unexpected_field, unexpected_value in zip(
                        unexpected_fields, unexpected_values):
                    if unexpected_value in row[unexpected_field]:
                        self.fail(
                            'Unexpected field %s with unexpected value %s found'
                            % (unexpected_field, unexpected_value))
        if expected_fields:
            for field, found in found_dict.iteritems():
                if not found:
                    self.fail('Cannot find %s' % field)

        db_curs.close()
        db_conn.close()
Esempio n. 6
0
def store_swfs(msg, crawl_id, dir_path='/tmp', prefix='?'):

    referer = msg.request.headers['Referer'][0] if msg.request.headers[
        'Referer'] else ""

    if msg.response and msg.response.content:
        print msg.request.get_url()
        if (msg.response.content[:3] in SWF_MAGIC_NUMBERS
            ):  # to wide, but decompiler will discard them

            swf_hash = ut.hash_text(msg.response.content)
            swf_url = msg.request.get_url()

            db_conn = dbu.mysql_init_db()
            db_cursor = db_conn.cursor(dbu.mdb.cursors.DictCursor)
            rows = swu.get_swf_obj_from_db('hash', swf_hash, db_cursor)

            if not rows:
                swf_filename = os.path.join(
                    dir_path,
                    "%s-%s" % (prefix, msg.request.path.split('/')[-1]))
                swf_filename = swf_filename[:MAX_FILENAME_LEN]
                if not swf_filename.endswith('.swf'):
                    swf_filename += '.swf'

                wl_log.info("SWF saved %s referrer: %s" %
                            (os.path.basename(swf_filename), referer))

                fu.write_to_file(swf_filename, msg.response.content)
                vector = swu.get_occurence_vector_from_swf(
                    swf_filename, os.path.join(dir_path, prefix))
                duplicate_swf = 0
            else:
                wl_log.info("A swf with same hash exists in DB: %s %s" %
                            (swf_hash, swf_url))
                vector = swu.str_to_vector(rows[0]['occ_vector'])
                swf_filename = rows[0]['local_path']
                duplicate_swf = 1

            rank, domain = prefix.rsplit('/')[-1].split('-', 1)
            swf_info = swu.SwfInfo()

            swf_info.rank = rank  # this might be fake
            swf_info.domain = domain
            swf_info.local_path = swf_filename
            swf_info.occ_vector = vector
            swf_info.hash = swf_hash
            swf_info.url = swf_url
            swf_info.referer = referer
            swf_info.duplicate = duplicate_swf  # !!! Y for repeated swfs(that we know before)
            swf_info.feat_vector = []
            swf_info.page_url = ''
            swf_info.occ_string = ' '.join(
                swu.human_readable_occ_vector(vector))
            swf_info.crawl_id = crawl_id

            swu.add_swf_to_db(swf_info, db_conn)
            db_conn.commit()
            db_cursor.close()
            db_conn.close()

        elif '.swf' in msg.request.path:
            wl_log.warning(".swf in path but content seems non-swf %s %s" %
                           (msg.request.path, msg.response.content[:100]))
        else:
            pass