Example #1
class Hashedbitset():

    def __init__(self, size):
        self._value = Bitset(size)
        self._size = size
        self._hasher = Hasher()

    def Add(self, item):
        self._value.Add(self._hasher.hash(item, 1)[0])

    def Size(self):
        return self._value.Size()

    def Contains(self, item):
        n = self._hasher.hash(item, 1)[0]
        return self._value.Contains(n)
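
A minimal usage sketch, assuming Bitset and Hasher come from the surrounding project and that Hasher.hash(item, n) returns a list of n bucket indices:

    hbs = Hashedbitset(1024)
    hbs.Add('apple')
    hbs.Contains('apple')   # True: added items are always found
    hbs.Contains('banana')  # usually False, though a hash collision can
                            # yield a false positive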
Example #2
import pandas as pd
import tensorflow as tf
from multiprocessing import Pool


def to_data(filename):
    # df, user_df, item_df and to_p are assumed to be defined at module level
    h = Hasher(2 ** 20)
    df_all = pd.merge(pd.merge(df, user_df, on='u_st_2_uid', how='left'),
                      item_df, on='d_st_2_did', how='left')
    print(df_all.dtypes)
    print('*' * 18)
    print(df_all.count())
    print('*' * 18)
    df_json = df_all.to_dict('records')

    # multiprocessing
    p = Pool(10)
    results = []

    for i, feature_value_dict in enumerate(df_json):
        # if i > 100:
        #     break
        results.append(p.apply_async(to_p, args=(i, feature_value_dict, h)))
    print('*' * 18)
    print(len(results))
    print('*' * 18)
    p.close()
    p.join()
    writer = tf.python_io.TFRecordWriter(filename)
    for r in results:
        writer.write(r.get())
    writer.close()
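
Each to_p call must return an already-serialized record, because the parent process writes r.get() straight into the TFRecordWriter. A hypothetical worker compatible with this pattern (the real to_p is not shown in the snippet, and the h.hash call is an assumption about this project's Hasher):

    def to_p(i, feature_value_dict, h):
        # hash each raw feature value into a bounded integer id,
        # then serialize the row as a tf.train.Example
        feature = {
            k: tf.train.Feature(int64_list=tf.train.Int64List(value=[h.hash(str(v))]))
            for k, v in feature_value_dict.items()
        }
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        return example.SerializeToString()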
Example #3
    def write_to_cache_without_js(self):
        process = CrossPlatformProcess(self)
        (stdout, stderr) = process.run_sync(r'gulp -v')

        if process.failed or not GulpVersion(stdout).supports_tasks_simple():
            raise Exception(
                "Gulp: Could not get the current gulp version or your gulp CLI version is lower than 3.7.0"
            )

        (stdout, stderr) = process.run_sync(r'gulp --tasks-simple')

        gulpfile = self.get_gulpfile_path(self.working_dir)

        if not stdout:
            raise Exception(
                "Gulp: The result of `gulp --tasks-simple` was empty")

        self.write_cache_file({
            gulpfile: {
                "sha1": Hasher.sha1(gulpfile),
                "tasks": dict((task, {"name": task, "dependencies": ""})
                              for task in stdout.split("\n") if task)
            }
        })
Example #4
    def fetch_json(self):
        cache_file = CacheFile(self.working_dir)
        gulpfile = self.get_gulpfile_path(self.working_dir)
        data = None

        if cache_file.exists():
            filesha1 = Hasher.sha1(gulpfile)
            data = cache_file.read()

            if gulpfile in data and data[gulpfile]["sha1"] == filesha1:
                return data[gulpfile]["tasks"]

        self.callcount += 1

        if self.callcount == 1:
            return self.write_to_cache()

        if data is None:
            raise Exception("Could not write to cache gulpfile.")

        if gulpfile in data:
            raise Exception(
                "Sha1 from gulp cache ({0}) is not equal to calculated ({1}).\nTry erasing the cache and running Gulp again."
                .format(data[gulpfile]["sha1"], filesha1))
        else:
            raise Exception(
                "Have you renamed a folder?\nSometimes Sublime doesn't update the project path; try removing the folder from the project and adding it again."
            )
Example #5
    def fetch_json(self):
        jsonfilename = os.path.join(self.working_dir, GulpCommand.cache_file_name)
        gulpfile = self.get_gulpfile_path(self.working_dir)
        data = None

        if os.path.exists(jsonfilename):
            filesha1 = Hasher.sha1(gulpfile)
            json_data = codecs.open(jsonfilename, "r", "utf-8", errors='replace')

            try:
                data = json.load(json_data)
                if gulpfile in data and data[gulpfile]["sha1"] == filesha1:
                    return data[gulpfile]["tasks"]
            finally:
                json_data.close()

        self.callcount += 1

        if self.callcount == 1:
            return self.write_to_cache()

        if data is None:
            raise Exception("Could not write to cache gulpfile.")

        raise Exception("Sha1 from gulp cache ({0}) is not equal to calculated ({1}).\nTry erasing the cache and running Gulp again.".format(data[gulpfile]["sha1"], filesha1))
Example #6
    def __init__(self, init_dic):
        self.logger = getLogger()
        if not self.__isValidInfo(init_dic):
            self.logger.error(
                "Failed to init RequestURLCrawler : Invalid input information")
            exit(1)

        self.info_dic = init_dic
        self.cursor = None
        self.req_url_queue = [
        ]  # unvisited seeds (minimum heap ordered by page no.)
        # heappush(req_url_queue, (guid_hash, url_data))
        self.url_data_dic = dict(
        )  # visited + fully parsed data, dic[view_guid_hash] = URLData()
        self.hasher = Hasher()
        self.url_factory = None
        self.html_parser = None
        self.xml_producer = XMLPrinter(OUTPUT_PATH)
Example #7
    def spi_response(self, response, *args, **kwargs):
        '''Responses to the spi_request are handled here
        '''

        if 'text/html' in response.headers['Content-Type']:
            hash_val = Hasher.HashMD5(response.content)
            if hash_val not in self.URLhash:
                self.URLhash.add(hash_val)
                # set.union() returns a new set, so update() is needed to
                # actually record the parsed links
                self.URLset.update(Links.parse_link(response))
Example #8
class TestHash(unittest.TestCase):
    def setUp(self):
        self.hasher = Hasher("words.txt", nwords=3, delimeter="-")

    def test_smoke_test(self):
        data = """Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi"""
        self.assertEqual(self.hasher.process(data),
                         "Isaac-Bremely-Trueheartedly")

    def test_long_text(self):
        data = """Lorem ipsum dolor sit amet, consectetur adipiscing elit. In non egestas dolor. Nulla molestie sed justo sed elementum. Aliquam erat volutpat. Morbi odio lectus, consequat nec nisl eu, vulputate convallis ex. Etiam faucibus lorem non tempus malesuada. Praesent aliquet, ligula et fringilla euismod, nulla felis accumsan est, bibendum semper ligula ligula ut leo. Donec nibh metus, fermentum in fermentum id, vulputate vel sapien. Quisque gravida eros in rhoncus convallis.
Proin eu dui finibus, maximus nisl sed, dignissim odio. Fusce vel est eu justo imperdiet suscipit eu mattis turpis. Nam sed odio sollicitudin, pulvinar purus non, mollis nulla. Nam sed euismod orci, sed vestibulum mauris. Curabitur cursus est in ornare mollis. Nulla urna turpis, tincidunt non tempor eu, auctor et nisi. Vivamus lobortis elit vel dolor pharetra blandit. Morbi in feugiat odio.
In nec augue velit. Suspendisse interdum purus in metus luctus, eu rhoncus mauris porta. Aliquam pharetra, elit vitae convallis congue, libero velit malesuada felis, sed sodales turpis enim sed leo. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Etiam auctor tortor ut semper pretium. Aenean sed malesuada nisi, eget venenatis enim. Suspendisse in sagittis arcu, eu tristique turpis. Mauris dignissim eget ex sit amet egestas. Donec blandit dolor quis sapien aliquet, id rutrum lorem ultrices. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos.
Ut id scelerisque sem. Cras bibendum, lorem vel dapibus placerat, odio mauris finibus sapien, a efficitur purus massa et metus. Vestibulum dolor elit, ultrices quis enim in, convallis sagittis metus. Phasellus lacinia justo elit, non elementum augue pellentesque eu. Sed nec eleifend enim. Quisque blandit felis quis porta sodales. Morbi id rutrum tellus. Integer varius felis non luctus placerat. Praesent a lacus est. Nulla sollicitudin volutpat erat, pulvinar sagittis dui imperdiet sed. Nulla tempor, leo vel malesuada ullamcorper, libero eros rutrum sem, non ullamcorper tortor ex sed nibh. Cras ac lectus vitae elit dignissim rutrum. Etiam non semper mauris. Donec sem velit, elementum sit amet nibh a, pellentesque maximus velit. Nam ac velit ligula.
"""

        self.assertEqual(self.hasher.process(data),
                         "Blips-Laggingly-Trochilidae")

    def test_short_text(self):
        data = """s"""
        self.assertEqual(self.hasher.process(data), "Abaca-Abusage-Blisses")

    def test_non_text_data(self):
        data = {"hello": 1, "other": "lorem"}
        self.assertEqual(self.hasher.process(data),
                         "Interindividual-Fastbacks-Allochetite")

    def test_hash_distribution(self):
        results = defaultdict(list)
        collisions = 0
        with open("words.txt") as input:
            for line in input:
                hashed = self.hasher.process(line)
                if hashed in results:
                    collisions += 1
                results[hashed].append(line)

        print("Collided:")
        print({k: v for k, v in results.items() if len(v) > 1})

        # words.txt has 370102 unique words, so expect very few collisions
        self.assertLessEqual(collisions, 5)
Example #9
class Bloomfilter():
    def __init__(self, size, keys):
        self._value = Bitset(size)
        self._size = size
        self._keys = keys
        self._hasher = Hasher()

    def Add(self, item):
        for i in self._hasher.hash(item, self._keys):
            self._value.Add(i)

    def Size(self):
        return self._value.Size()

    def Contains(self, item):
        n = self._hasher.hash(item, self._keys)
        for i in n:
            if not self._value.Contains(i):
                return False
        return True
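
A minimal usage sketch, assuming Hasher.hash(item, k) returns k bit positions. Added items are always reported present; an item that was never added can collide on all k positions, so Contains may return a false positive but never a false negative:

    bf = Bloomfilter(1024, 3)
    bf.Add('apple')
    bf.Contains('apple')    # True
    bf.Contains('banana')   # usually False; True only on a rare collision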
Example #10
    def spi_response(self, response):
        '''Responses to the spi_request are handled here
        '''
        if 'text/html' in response.headers['Content-Type'] and response.status_code == 200:
            hash_val = Hasher.HashMD5(response.content)
            if self.redis.getVariable(hash_val) is None:
                if self.database.isConn():
                    self.database.saveData(hash=hash_val,
                                           url=response.url,
                                           content=response)
                self.redis.setVariable(hash_val, response.url)
                for link in Links.parse_link(response):
                    self.URLset.put(link)
Example #11
    def write_to_cache_without_js(self):
        process = CrossPlatformProcess(self.working_dir)
        (stdout, stderr) = process.run_sync(r'gulp -v')

        if process.failed or not GulpVersion(stdout).supports_tasks_simple():
            raise Exception("Gulp: Could not get the current gulp version or your gulp CLI version is lower than 3.7.0")

        (stdout, stderr) = process.run_sync(r'gulp --tasks-simple')

        gulpfile = self.get_gulpfile_path(self.working_dir)

        if not stdout:
            raise Exception("Gulp: The result of `gulp --tasks-simple` was empty")

        CacheFile(self.working_dir).write({
            gulpfile: {
                "sha1": Hasher.sha1(gulpfile),
                "tasks": dict((task, { "name": task, "dependencies": "" }) for task in stdout.split("\n") if task)
            }
        })
Example #12
    def fetch_json(self):
        cache_file = CacheFile(self.working_dir)
        gulpfile = self.get_gulpfile_path(self.working_dir)
        data = None

        if cache_file.exists():
            filesha1 = Hasher.sha1(gulpfile)
            data = cache_file.read()

            if gulpfile in data and data[gulpfile]["sha1"] == filesha1:
                return data[gulpfile]["tasks"]

        self.callcount += 1

        if self.callcount == 1:
            return self.write_to_cache()

        if data is None:
            raise Exception("Could not write to cache gulpfile.")

        if gulpfile in data:
            raise Exception("Sha1 from gulp cache ({0}) is not equal to calculated ({1}).\nTry erasing the cache and running Gulp again.".format(data[gulpfile]["sha1"], filesha1))
        else:
            raise Exception("Have you renamed a folder?.\nSometimes Sublime doesn't update the project path, try removing the folder from the project and adding it again.")
Example #13
    queue_hashed = queue.Queue()
    queue_ext_path = queue.Queue()
    queue_csv = queue.Queue()
    queue_csved = queue.Queue()
    queue_blk = queue.Queue()
    queue_mem = queue.Queue()
    queue_memed = queue.Queue()
    queue_rslt = queue.Queue()
    queue_elastic = queue.Queue()

    see = Seeker(queue_dis, IN_DIR, BASE_NAME, CHECK_TIME)
    dis = Dispatcher(queue_dis, queue_extrac, queue_extraced, queue_ext_path,
                     queue_av, queue_hash, queue_hashed, queue_csv,
                     queue_csved, queue_blk, queue_mem, queue_memed,
                     queue_elastic, IN_DIR, WORK_DIR, OUT_DIR, DIR_OUT)
    has = Hasher(queue_hash, queue_hashed, IN_DIR, WORK_DIR, BLOCK_SIZE_HASH)
    ext = Extractor(queue_extrac, queue_extraced, queue_ext_path, IN_DIR,
                    WORK_DIR)
    csv = Csver(queue_csv, queue_csved, WORK_DIR, OUT_DIR)
    blk = Bulker(queue_blk, queue_extraced, WORK_DIR, OUT_DIR)
    mem = Memer(queue_mem, queue_extraced, IN_DIR, WORK_DIR, OUT_DIR)
    #tim = Timeliner(queue_extrac,WORK_DIR,OUT_DIR)
    avc = Avcheck(queue_av, WORK_DIR, OUT_DIR)
    #elas = Elasticer(queue_elastic,WORK_DIR,OUT_DIR)

    see.start()
    dis.start()
    has.start()
    ext.start()
    csv.start()
    #blk.start()
Example #14
    def generateRepostsForAll(self,
                              count_per_post=1,
                              res=None,
                              rot=None,
                              asp=None,
                              crop=None,
                              uid=None,
                              seed=None):
        '''generates reposts for every single non repost image in the image directory'''
        names = list(
            filter(lambda x: '_REPOST_' not in x, self.__imageToHash.keys()))
        self.vPrint('generating ' + str(len(names)) + ' reposts')
        interrupted = False
        try:
            for i, name in enumerate(sorted(names)):
                repname = (str(uid) if uid else '') + '_REPOST_' + name
                if count_per_post == 1:
                    if repname in self.__imageToHash and repname in self.__imageToText:
                        continue
                elif count_per_post > 1:
                    if (str(count_per_post - 1) + repname) in self.__imageToHash and \
                       (str(count_per_post - 1) + repname) in self.__imageToText:
                        continue
                else:
                    return

                if i < 30 or i % 10 == 0:
                    self.vPrint('partial: %5d/%d' % (i, len(names)))

                try:
                    target_path = join(self.img_dir, name)
                    loc = join(self.img_dir, repname)
                    bad_imgs = generate_bad_repost(target_path,
                                                   count=(count_per_post),
                                                   res=res,
                                                   rot=rot,
                                                   asp=asp,
                                                   crop=crop,
                                                   save_loc=loc,
                                                   seed=(seed + i) if seed is not None else None)
                    if not isinstance(bad_imgs, list):
                        bad_imgs = [(repname, bad_imgs)]

                    for newrepname, bad_img in bad_imgs:
                        bad_img_hash = Hasher.hashImage(
                            bad_img, self.__imagehash_method)
                        bad_img_text = OCR.read2Normalized(bad_img)
                        self.__imageToHash[newrepname] = bad_img_hash
                        self.__imageToText[newrepname] = bad_img_text
                except FileNotFoundError as e:
                    print(e)
                    print("skipped an image that doesn't exist")
                    continue
                except UnidentifiedImageError as e:
                    print(e)
                    print('skipped an unidentified image')
                    continue

            self.vPrint('done!')
        except KeyboardInterrupt:
            self.vPrint('interrupted!')
            interrupted = True
        finally:
            self.saveProcessedDataToCache()
            self.vPrint('saved!')
        return not interrupted
Example #15
from hasher import Hasher

hasher = Hasher('')

print(hasher.hash('my name is jack'))
Example #16
def main():
    args = parse_args()

    library_paths = args.paths
    if not library_paths:
        logging.error('no libraries specified')
        last_library_path = osxphotos.utils.get_last_library_path()
        system_library_path = osxphotos.utils.get_system_library_path()

        resp = input(f"use last .photoslibrary ({last_library_path}) [Y/n] ")
        if not resp or resp.lower() == 'y':
            library_paths.append(last_library_path)
        else:
            exit(2)

    db_session = fetch_or_initialize_db(args.db_path)

    applephotos, directories = fetch_libraries(library_paths, db_session)
    photos, videos, albums = fetch_photos(applephotos[0])  # TODO

    # TODO replace these dry-run guards with decorators
    if args.dry_run:
        logging.info('[dry-run] skipping photo persistence')
    else:
        logging.info('Persisting photo data')
        persist_photos(photos, db_session)

    hasher = Hasher()

    if args.dry_run:
        logging.info('[dry-run] skipping image encoding')
    else:
        logging.info("Encoding images with imagededup")
        imagededup_encodings = hasher.imagededup_encode(photos)

        logging.info("Encoding images with imagehash")
        imagehash_encodings = hasher.imagehash_encode(photos)

        logging.info('Persisting photo encodings')
        encodings = []

        for photo in photos:
            photo_id = photo.id

            for hash_name, value in imagededup_encodings[photo_id].items():
                enc = Encoding(photo_id=photo_id,
                               hash_library=HashLibrary.imagededup,
                               algorithm=get_hash_algo(hash_name),
                               value=value)
                encodings.append(enc)

            for hash_name, value in imagehash_encodings[photo_id].items():
                enc = Encoding(photo_id=photo_id,
                               hash_library=HashLibrary.imagehash,
                               algorithm=get_hash_algo(hash_name),
                               value=value)
                encodings.append(enc)

        db_session.add_all(encodings)
        db_session.commit()

    if args.dry_run:
        logging.info('[dry-run] skipping deduplication check and persistence')
    else:
        pass
Example #17
class SQL:
    
    app = Flask(__name__)
    mysql = MySQL()
    hasher = Hasher()
    
    def __init__(self):
        self.app.config['MYSQL_DATABASE_USER'] = '******'
        self.app.config['MYSQL_DATABASE_PASSWORD'] = ''
        self.app.config['MYSQL_DATABASE_DB'] = 'library'
        self.app.config['MYSQL_DATABASE_HOST'] = 'localhost'
        
    def checkUser(self, email, password):
        self.mysql.init_app(self.app)
        
        query = "SELECT * FROM `users`"
        
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        
        try:
            r = [dict((cur.description[i][0], value)
                  for i, value in enumerate(row)) for row in cur.fetchall()]
            if len(r) == 0:
                print('No username/password')
            else:
                for row in r:
                    emailUser = unicode(row['email'])
                    passwordUser = unicode(row['password'])
                    
                    if self.hasher.compareStrings(email, emailUser) and password == passwordUser:
                        return True
                return False
        except:
            print('Error CheckUser')
            
    def getUser(self, email, password):
        self.mysql.init_app(self.app)
        
        query = "SELECT * FROM `users`"
        
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        
        try:
            r = [dict((cur.description[i][0], value)
                  for i, value in enumerate(row)) for row in cur.fetchall()]
            if len(r) == 0:
                print('No username/password')
            else:
                for row in r:
                    emailUser = unicode(row['email'])
                    passwordUser = unicode(row['password'])
                    
                    if self.hasher.compareStrings(email, emailUser) and password == passwordUser:
                        user = {}
                        user['iduser'] = row['id']
                        user['user'] = unicode(row['user'])
                        user['guser'] = row['guser']
                        
                        return user
                return False
        except:
            print('Error getUser')
            
    def registerUser(self, email, password, username, gUser = 0):
        self.mysql.init_app(self.app)
        
        query = '''
            INSERT INTO `users`(`id`, `user`, `password`, `email`, `guser`) VALUES
                (NULL,''' + "'" + username  + "', '" + password + "', '" + email + "', " + str(gUser) + ")"
        
        try:
            con = self.mysql.connect()
            cur = con.cursor()
            cur.execute(query)
            con.commit()
            return True
        except:
            return False
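
    # Added sketch, not part of the original example: every query in this class
    # is built by string concatenation, which is open to SQL injection. A safer
    # variant of registerUser would pass the values as parameters and let the
    # driver escape them:
    def registerUserSafe(self, email, password, username, gUser=0):
        self.mysql.init_app(self.app)

        query = ("INSERT INTO `users`(`id`, `user`, `password`, `email`, `guser`) "
                 "VALUES (NULL, %s, %s, %s, %s)")

        try:
            con = self.mysql.connect()
            cur = con.cursor()
            cur.execute(query, (username, password, email, gUser))
            con.commit()
            return True
        except:
            return False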
            
    def updateUser(self, data, section, idUser):
        self.mysql.init_app(self.app)
        
        column = ""
        idUserStr = str(idUser)
        
        if section == 'email':
            column = '`email`'
        elif section == 'user':
            column = '`user`'
        elif section == 'pass':
            column = '`password`'

        query = "UPDATE `users` SET " + column + " = '" + data + "' WHERE id = " + idUserStr
        
        try:
            con = self.mysql.connect()
            cur = con.cursor()
            cur.execute(query)
            con.commit()
            return True
        except:
            return False
    
            
    def getHomeBook(self, idUser):
        self.mysql.init_app(self.app)

        query = '''
            SELECT `books`.`id` ,  `books`.`photo` , 
                    `books`.`bfile` ,  `books`.`bname`
                FROM  `books` 
                    INNER JOIN  `readings` ON  `books`.`id` =  `readings`.`idbook` 
                WHERE  `readings`.`iduser` =''' + str(idUser) + '''
                ORDER BY  `readings`.`lastreading` DESC
                LIMIT 1'''
        
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        
        try:
            r = [dict((cur.description[i][0], value)
                  for i, value in enumerate(row)) for row in cur.fetchall()]
            if len(r) == 0:
                print('No books')
            else:
                return r
        except:
            print('Error getHomeBook')
        
        
    def getAllBooks(self):
        self.mysql.init_app(self.app)
        
        query = '''SELECT  `id` ,  `photo` ,  `bname` FROM  `books` 
                        ORDER BY `id` DESC'''
        
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        
        try:
            r = [dict((cur.description[i][0], value)
                  for i, value in enumerate(row)) for row in cur.fetchall()]
            if len(r) == 0:
                print('No books')
            else:
                return r
        except:
            print('Error getAllBooks')
            
    def getReadingsBooks(self, iduser):
        self.mysql.init_app(self.app)
        
        idUserStr = str(iduser)
        
        query = '''SELECT  `books`.`id` , `books`.`photo` ,  `books`.`bname` FROM  `books`
                    	INNER JOIN `readings` on `books`.`id` = `readings`.`idbook`
                    		WHERE `readings`.`iduser` like ''' + idUserStr + '''
                    	ORDER BY  `readings`.`lastreading` DESC'''
        
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        
        try:
            r = [dict((cur.description[i][0], value)
                  for i, value in enumerate(row)) for row in cur.fetchall()]
            if len(r) == 0:
                print('No books')
            else:
                return r
        except:
            print('Error getReadingsBooks')
            
    def getReadLaterBooks(self, iduser):
        self.mysql.init_app(self.app)
        
        idUserStr = str(iduser)
        
        query = '''SELECT  `books`.`id` ,  `books`.`photo` ,  `books`.`bname` FROM  `books`
                    	INNER JOIN `read_later` on `books`.`id` = `read_later`.`idbook`
                    		WHERE `read_later`.`iduser` like ''' + idUserStr
        
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        
        try:
            r = [dict((cur.description[i][0], value)
                  for i, value in enumerate(row)) for row in cur.fetchall()]
            if len(r) == 0:
                print('No books')
            else:
                return r
        except:
            print('Error getReadLaterBooks')
            
    def getBook(self, idBook):
        self.mysql.init_app(self.app)
        
        query = '''SELECT `books`.`id`, `books`.`photo`, `books`.`bname`,
                    `books`.`synopsis`, `genres`.`genre`, `books`.`idauthor` 
                    	FROM `books`
                    		INNER JOIN `genres` ON `books`.`idgenre` = `genres`.`id` 
                		WHERE `books`.`id` = ''' + idBook
        
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        
        try:
            r = [dict((cur.description[i][0], value)
                  for i, value in enumerate(row)) for row in cur.fetchall()]
            if len(r) == 0:
                print('No book')
            else:
                return r
        except:
            print('Error getBook')
            
    def getAuthor(self, idAuthor):
        self.mysql.init_app(self.app)
    
        query = "SELECT * FROM `author` WHERE `id` = " + str(idAuthor)
        
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        
        try:
            r = [dict((cur.description[i][0], value)
                  for i, value in enumerate(row)) for row in cur.fetchall()]
            if len(r) == 0:
                print('No author')
            else:
                return r
        except:
            print('Error getAuthor')
            
    def getSimilarBooksByBook(self, idBook):
        self.mysql.init_app(self.app)
        
        idBookStr = str(idBook)
        
        query = '''
            SELECT t.`id`, t.`photo`, t.`bname`, t.`synopsis`, `genres`.`genre`, t.`idauthor`  from
            	((SELECT * FROM `books` as book WHERE `idgenre` like (SELECT `idgenre` from `books` WHERE `id` = ''' + idBookStr + ''')) UNION
            	(SELECT * FROM `books` as book WHERE `idauthor` like (SELECT `idauthor` from `books` where `id` = ''' + idBookStr + ''')) UNION
            	(SELECT * FROM `books` as book WHERE `idcollect` like (SELECT `idcollect` from `books` where `id` = ''' + idBookStr + '''))) as t
            		INNER JOIN `genres` ON t.`idgenre` = `genres`.`id`
            		WHERE t.`idgenre` = `genres`.`id`
            		    AND t.`id` NOT LIKE ''' + idBookStr + '''
            		ORDER BY RAND()
            		LIMIT 6
        '''
        
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        
        try:
            r = [dict((cur.description[i][0], value)
                  for i, value in enumerate(row)) for row in cur.fetchall()]
            if len(r) == 0:
                print('No Books')
            else:
                return r
        except:
            print('Error getSimilarBooksByBook')

    def getBooksByAuthor(self, idAuthor):
        self.mysql.init_app(self.app)
        
        idAuthorStr = str(idAuthor)
        
        query = "SELECT * FROM `books` WHERE `idauthor` =" + idAuthorStr + '''
                ORDER BY RAND()
         		LIMIT 6'''
        
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        
        try:
            r = [dict((cur.description[i][0], value)
                  for i, value in enumerate(row)) for row in cur.fetchall()]
            if len(r) == 0:
                print('No Books')
            else:
                return r
        except:
            print('Error getBooksByAuthor')
            
    def searchBooks(self, column, words):
        self.mysql.init_app(self.app)
        
        if column  == "name":
            query = "SELECT * FROM `books` WHERE `bname` like '%" + words + "%'"
        elif column == "genre":
            query = '''
                SELECT * FROM `books` WHERE `idgenre` like
                    (SELECT `id` FROM `genres` WHERE `genre` like '%''' + words + "%')"
        elif column == "author":
            query = '''
                SELECT * FROM `books` WHERE `idauthor` like
                    (SELECT `id` FROM `author` WHERE `first` like '%''' + words + "%' OR `last` like '%" + words + "%')"
        elif column == "collection":
            query = '''
                SELECT * FROM `books` WHERE `idcollect` like
                    (SELECT `id` FROM `collections` WHERE `namecollection` like '%''' + words + "%')"
        
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        
        try:
            r = [dict((cur.description[i][0], value)
                  for i, value in enumerate(row)) for row in cur.fetchall()]
            if len(r) == 0:
                print('No Books')
            else:
                return r
        except:
            print('Error searchBooks')
            
    def searchReadLater(self, words, idUser):
        self.mysql.init_app(self.app)
        
        idUserStr = str(idUser)
        
        query = '''
            SELECT * FROM `books`
            	INNER JOIN `read_later` ON `books`.`id` = `read_later`.`idbook`
            	WHERE `read_later`.`iduser` = ''' + idUserStr + '''
            		AND `books`.`bname` like''' + "'%" + words + "%'"
        
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        
        try:
            r = [dict((cur.description[i][0], value)
                  for i, value in enumerate(row)) for row in cur.fetchall()]
            if len(r) == 0:
                print('No Books')
            else:
                return r
        except:
            print('Error searchReadLater')
            
    def searchPendings(self, words, idUser):
        self.mysql.init_app(self.app)
        
        idUserStr = str(idUser)
        
        query = '''
            SELECT * FROM `books`
            	INNER JOIN `readings` ON `books`.`id` = `readings`.`idbook`
            	WHERE `readings`.`iduser` = ''' + idUserStr + '''
            		AND `books`.`bname` like''' + "'%" + words + "%'"
        
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        
        try:
            r = [dict((cur.description[i][0], value)
                  for i, value in enumerate(row)) for row in cur.fetchall()]
            if len(r) == 0:
                print('No Books')
            else:
                return r
        except:
            print('Error searchPendings')
            
            
    def checkReadLater(self, idUser, idBook):
        self.mysql.init_app(self.app)
        
        idBookStr = str(idBook)
        idUserStr = str(idUser)
        
        query = '''
            SELECT * FROM `read_later` WHERE
                `iduser` like ''' + idUserStr + ''' AND
                `idbook` like ''' + idBookStr
        
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        
        try:
            r = [dict((cur.description[i][0], value)
                  for i, value in enumerate(row)) for row in cur.fetchall()]
            if len(r) == 0:
                return False
            else:
                return True
        except:
            print('Error checkReadLater')
         
            
    def addReadLater(self, idUser, idBook):
        self.mysql.init_app(self.app)
        
        idBookStr = str(idBook)
        idUserStr = str(idUser)
        
        query = '''
            INSERT INTO `read_later`(`id`, `iduser`, `idbook`)
                VALUES (NULL, ''' + idUserStr + ", " + idBookStr + ")"
        
        try:
            con = self.mysql.connect()
            cur = con.cursor()
            cur.execute(query)
            con.commit()
            return True
        except:
            return False
      
            
    def removeReadLater(self, idUser, idBook):
        self.mysql.init_app(self.app)
        
        idBookStr = str(idBook)
        idUserStr = str(idUser)
        
        query = '''
            DELETE FROM `read_later` WHERE 
                `iduser` like ''' + idUserStr + ''' AND
                `idbook` like ''' + idBookStr
        
        try:
            con = self.mysql.connect()
            cur = con.cursor()
            cur.execute(query)
            con.commit()
            return True
        except:
            return False
    
    def getBfile(self, idbook):
        self.mysql.init_app(self.app)
        
        idBookStr = str(idbook)
        
        query = "SELECT `bfile` FROM `books` WHERE `id` =" + idBookStr
        
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        
        try:
            r = [dict((cur.description[i][0], value)
                  for i, value in enumerate(row)) for row in cur.fetchall()]
            if len(r) == 0:
                print('No Books')
            else:
                return r
        except:
            print('Error getBfile')
            
    def getAlines(self, idbook, iduser):
        self.mysql.init_app(self.app)
        
        idBookStr = str(idbook)
        idUserStr = str(iduser)
        
        query = '''
            SELECT `alines` FROM `readings`
            	WHERE `iduser` = ''' + idUserStr + '''
            	AND `idbook` = ''' + idBookStr
        
        cur = self.mysql.connect().cursor()
        cur.execute(query)
        
        try:
            r = [dict((cur.description[i][0], value)
                  for i, value in enumerate(row)) for row in cur.fetchall()]
            if len(r) == 0:
                return self.insertAlines(idbook,iduser)
            else:
                self.updateDateLastReading(idbook, iduser)
                return r
        except:
            print('Error getAlines')
            
    def insertAlines(self, idbook, iduser):
        self.mysql.init_app(self.app)
        
        idBookStr = str(idbook)
        idUserStr = str(iduser)
        
        query = '''
            INSERT INTO `readings`(`id`, `iduser`, `idbook`, `alines`, `lastreading`)
	            VALUES (NULL''' + ", " + idUserStr + ", " + idBookStr + ", 0, NOW())"
        
        try:
            con = self.mysql.connect()
            cur = con.cursor()
            cur.execute(query)
            con.commit()
            return self.getAlines(idbook,iduser)
        except:
            return False
            
    def updateDateLastReading(self, idbook, iduser):
        self.mysql.init_app(self.app)
        
        idBookStr = str(idbook)
        idUserStr = str(iduser)
        
        query = '''
            UPDATE `readings` SET `lastreading`= NOW()
                WHERE 
                `iduser` like ''' + idUserStr + ''' AND
                `idbook` like ''' + idBookStr
        
        try:
            con = self.mysql.connect()
            cur = con.cursor()
            cur.execute(query)
            con.commit()
            return True
        except:
            return False
            
    def updateAlines(self, idbook, iduser, alines):
        self.mysql.init_app(self.app)
        
        idBookStr = str(idbook)
        idUserStr = str(iduser)
        alinesStr = str(alines)

        query = '''
            UPDATE `readings` SET `alines`=''' + alinesStr + '''
                WHERE 
                `iduser` like ''' + idUserStr + ''' AND
                `idbook` like ''' + idBookStr
        
        try:
            con = self.mysql.connect()
            cur = con.cursor()
            cur.execute(query)
            con.commit()
            return True
        except:
            return False
Example #18
    def processData(self, only_cached_files=False, max_capacity=None):
        '''
        Processes all posts and returns two dictionaries in a tuple.
        The first maps image name to hash, and
        the second maps image name to OCR results.

        The results will also be cached in memory within the class and
        will be used in other methods for checking reposts

        Returns:
        A tuple of two dictionaries, first one containing image name to hash mappings
        and second one containing image name to OCR readings.
        '''

        if not only_cached_files:
            files = [
                f for f in listdir(self.img_dir)
                if isfile(join(self.img_dir, f)) and not f.startswith('.')
            ]
            files.sort()
            self.readProcessedDataFromCache()
        else:
            self.readProcessedDataFromCache()
            files = list(self.__imageToHash.keys())
            files.sort()

        if max_capacity is not None:
            files = files[:max_capacity]

        d = self.__imageToHash
        t = self.__imageToText

        self.vPrint("loading... " + str(len(files)) + ' items')
        for i, file in enumerate(files):
            if len(files) < 50 or i % (len(files) // 20) == 0:
                self.vPrint('partial: %5d/%d' % (i, len(files)))

            try:
                if file not in d or file not in t:
                    img = Image.open(join(self.img_dir, file))
                    d[file] = Hasher.hashImage(img, self.__imagehash_method)
                    t[file] = OCR.read2Normalized(img)
            except KeyboardInterrupt:
                self.vPrint('skipped remaining files')
                if file in d:
                    del d[file]
                if file in t:
                    del t[file]
                break
            except UnidentifiedImageError:
                self.vPrint('skipped ' + file + ' (not an image)')
                if file in d:
                    del d[file]
                if file in t:
                    del t[file]

        self.vPrint('loaded: ' + str(len(d.items())) + ' items')
        self.__imageToHash = d
        self.__imageToText = t
        self.saveProcessedDataToCache()
        return (d, t)
Example #19
    def __init__(self, size, keys):
        self._value = Bitset(size)
        self._size = size
        self._keys = keys
        self._hasher = Hasher()
Example #20
def _derive_key(key, salt=None):
    # generate the salt per call; a default argument is evaluated only once
    if salt is None:
        salt = get_random_bytes(32)
    h = Hasher(10)
    return h.hash(key, salt)[-32:], salt
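
A short usage sketch with hypothetical values: the salt returned by the first call must be stored and passed back in to re-derive the same key.

    key1, salt = _derive_key(b'password')
    key2, _ = _derive_key(b'password', salt)
    assert key1 == key2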
Example #21
    def setUp(self):
        self.hasher = Hasher("words.txt", nwords=3, delimeter="-")
Example #22
class RequestURLCrawler:
    def __init__(self, init_dic):
        self.logger = getLogger()
        if not self.__isValidInfo(init_dic):
            self.logger.error(
                "Failed to init RequestURLCrawler : Invalid input information")
            exit(1)

        self.info_dic = init_dic
        self.cursor = None
        self.req_url_queue = [
        ]  # unvisited seeds (minimum heap ordered by page no.)
        # heappush(req_url_queue, (guid_hash, url_data))
        self.url_data_dic = dict(
        )  # visited + fully parsed data, dic[view_guid_hash] = URLData()
        self.hasher = Hasher()
        self.url_factory = None
        self.html_parser = None
        self.xml_producer = XMLPrinter(OUTPUT_PATH)

    def __isValidInfo(self, init_dic):
        """
		크롤에 필요한 모든 정보가 들어왔는지 유효성 체크
		:param init_dic: 크롤 정보
		:return: valid 여부
		"""
        if "request_urls" in input_dic:
            if input_dic["request_urls"]:
                return True
        elif ("url_table" in input_dic) and ("db_info_key" in input_dic):
            return True

        return False

    def run(self, _cursor=None):
        """ 크롤러를 처음 동작시키는 entry point """
        self.cursor = _cursor
        if not self.cursor:
            self.cursor = getDBConnectionByName(self.info_dic["db_info_key"])

        self.logger.info("Start RequestURLCrawler crawler!")

        self.loadRequestURLs()  # 일정량만큼의 Request URL 추출
        while self.req_url_queue:
            self.logger.info("Loaded [%s] view URLs", len(self.req_url_queue))

            crawl_count = self.startCrawl()  # req_url_queue 안의 URL 소모
            self.logger.info("Crawled [%s] view URLs" % crawl_count)

            save_count = self.saveURLData()  # 크롤한 View URL 전체 저장 + URL상태변경
            self.logger.info("Saved [%s] view URLs", save_count)

            self.loadRequestURLs()

        self.logger.info("Finished total crawler!")

        if not _cursor:
            self.cursor.close()

    def loadRequestURLs(self, load_count=1000):
        """ 요청 URL을 일정량만큼 LOAD 하여 Queue에 채운다. """

        if "request_urls" in input_dic:
            count = 0
            while count <= load_count:
                req_url = input_dic["request_urls"].pop()
                url_info = self.url_factory.getGuid(
                    req_url)  # url_info 는 dict 타입
                if url_info:
                    if url_info["url_type"] == "view":
                        guid_hash = self.hasher.md5(url_info["guid"])
                        url_data = URLData(guid_hash)
                        url_data.data_dic.update(url_info)
                        heappush(self.req_url_queue, (guid_hash, url_data))
                        count += 1
        else:
            query = "SELECT url_md5, request_url FROM " + self.info_dic[
                "url_table"] + " WHERE visited = 'N' ORDER BY request_time LIMIT %s" % load_count
            data_list = selectQuery(self.cursor, query,
                                    [self.info_dic["domain_id"], "N"])
            for no, video_url, insert_time in data_list:
                url_info = self.url_factory.getGuid(video_url)
                if url_info:
                    if url_info["url_type"] == "view":
                        guid_hash = self.hasher.md5(url_info["guid"])
                        url_data = URLData(guid_hash)
                        url_data.data_dic.update(url_info)
                        heappush(self.req_url_queue, (guid_hash, url_data))

    def startCrawl(self):
        """ queue의 URL을 하나씩 소모하며 파싱된 최종데이터 추출 """

        count = 0
        while self.req_url_queue:
            guid_hash, url_data = heappop(self.req_url_queue)
            self.visitURL(url_data)
            self.url_data_dic[guid_hash] = url_data
            count += 1
            time.sleep(CRAWL_DELAY)

        return count

    def visitURL(self, url_data):
        """  URL 방문, 파싱하여 URL 데이터 생성 """

        down_url = url_data["url_info"]["down_url"]
        down_data = downloadPage(down_url)

        if down_data:
            http_header, http_content, real_URL = down_data
            parse_result = self.html_parser.parse(http_header, http_content,
                                                  real_URL)
            crawl_data_count = len(parse_result)
            if parse_result:
                url_data.data_dic.update(parse_result)
            self.logger.info("	Crawled URL [%s] data from URL [%s]" %
                             (crawl_data_count, down_url))

    def saveURLData(self):
        """ 추출한 View URL을 DB 및 File로 출력 """

        # 1. Update flag from load table
        update_query = "UPDATE " + self.info_dic[
            "url_table"] + " SET visited = 'Y' WHERE url_md5 = %s"
        save_count = 0
        document_dic_list = []
        for guid_hash, url_data in self.url_data_dic.items():
            try:
                ret = executeQuery(self.cursor, update_query, [url_data.id])
                document_dic_list.append(url_data.data_dic)
                save_count += 1
                self.logger.info("	Updated URL %s : %s" %
                                 (ret, url_data.get("guid")))
            except Exception as msg:
                self.logger.error("	Update Failed : %s : %s",
                                  url_data.get("guid"), msg)

        # 2. Save data into XML file
        self.xml_producer.printXML(document_dic_list)

        return save_count
Example #23
    def checkRepostDetection(self,
                             img: str,
                             img_sim_min: float = 0.8,
                             text_sim_min: float = 0.7,
                             recheck_img: bool = True,
                             generate_repost: bool = False,
                             save_generated_repost: bool = True):
        '''
        Checks whether reposts can be detected correctly using
        a naive algorithm considering image hashes and ocr text.

        This assumes the dataset is correctly labelled such that
        a reposted image is the image name prefixed with _REPOST_.

        If an image is custom crafted and you don't want it to
        make a deduction of whether it's a true positive or otherwise,
        simply avoid using the standard format name of:
            <subreddit>_<postID>.<imgExtension>
        '''
        distances = []
        name_dist_dict = {}
        d = self.__imageToHash
        t = self.__imageToText

        target_check = img
        target_path = join(self.img_dir, target_check)
        target_img = None
        self.vPrint('we\'ll process post : ' + target_check)
        if generate_repost or recheck_img:
            target_img = Image.open(target_path)
        if target_img and (recheck_img or target_check not in d
                           or target_check not in t):
            self.vPrint('computing target metadata')
            target_hash = Hasher.hashImage(target_img, self.__imagehash_method)
            target_text = OCR.read2Normalized(target_img)
            target_texthash = Hasher.hashText(target_text)
            d[target_check] = target_hash
            t[target_check] = target_text
            self.__imageToHash = d
            self.__imageToText = t
        else:
            target_hash = d[target_check]
            target_text = t[target_check]

        bad_check = '_REPOST_' + target_check
        if generate_repost:
            self.vPrint('generating dummy repost : _REPOST_' + target_check)
            bad_img = generate_bad_repost(target_path)
            bad_img_path = join(self.img_dir, bad_check)
            self.vPrint('computing target metadata')
            bad_img_hash = Hasher.hashImage(bad_img, self.__imagehash_method)
            bad_img_text = OCR.read2Normalized(bad_img)
            bad_img_texthash = Hasher.hashText(bad_img_text)
            d[bad_check] = bad_img_hash
            t[bad_check] = bad_img_text
            if save_generated_repost:
                bad_img.save(bad_img_path)
                self.__imageToHash = d
                self.__imageToText = t

        if self.update_cache:
            self.saveProcessedDataToCache()

        self.vPrint('\nchecking...')

        for key, value in d.items():
            if key == target_check:
                continue
            img_diff = Hasher.diff(value, target_hash, 'IMAGE')
            text_sim = 0.0 if text_sim_min <= 0.0 else Levenshtein.ratio(
                t[key], target_text)
            distances.append((key, img_diff, text_sim))
            name_dist_dict[key] = (img_diff, text_sim)

        def orderOfSort(x):
            '''dynamic sorting to prioritise text if image and text are both really close'''
            img_diff = x[1]
            txt_diff = 1 - x[2]
            if txt_diff <= 1 - text_sim_min and img_diff <= 1 - img_sim_min:
                return (txt_diff - 1, img_diff - 1)
            return (img_diff, txt_diff)

        distances.sort(key=orderOfSort)
        counter = 0

        results = {}
        FP = 0
        FN = 0

        self.vPrint('--- similar results ---')
        self.vPrint('  SAME?  | IMG_SIM | TEXT_SIM | IMAGE')
        for a, b, c in distances:
            standardFormat = len(a.split('.')) == 2 and len(
                a.split('.')[0].split('_REPOST_')[-1].split('_')) == 2
            is_known_same = a.split('_REPOST_')[-1] == target_check.split(
                '_REPOST_')[-1]
            is_repost = b <= 1 - img_sim_min and c >= text_sim_min
            if not standardFormat:
                validity = '??'
            else:
                if is_known_same:
                    if is_repost:
                        validity = 'TP'
                    else:
                        validity = 'FN'
                        FN += 1
                else:
                    if is_repost:
                        validity = 'FP'
                        FP += 1
                    else:
                        validity = 'TN'

            if counter < 10:
                counter += 1
                if self.verbose:
                    self.vPrint('%8s   %7.3f   %8.3f    %-50s' %
                                (('YES, ' if is_repost else ' NO, ') + validity,
                                 1 - b, c, a))

                    if standardFormat:
                        subreddit = a.split('_REPOST_')[-1].split('_')[0]
                        post_id = a.split('_REPOST_')[-1].split('_')[-1].split(
                            '.')[0]
                        self.vPrint('reddit.com/r/' + subreddit +
                                    '/comments/' + post_id + '/')
                    else:
                        self.vPrint(
                            '• this image isn\'t from the standard dataset')

                    if a == target_check:
                        self.vPrint('• this is the originally chosen image')
                    elif is_known_same:
                        self.vPrint(
                            '• this is known to be the same as the chosen image'
                        )
                    self.vPrint()

            results[a] = {
                'imgName': a,
                'isRepost': is_repost,
                'validity': validity,
                'imgDiff': b,
                'textSim': c
            }

        if FP or FN:
            self.vPrint('important notes:')
            self.vPrint(
                'we have %d known false positives and %d known false negatives for this\n'
                % (FP, FN))

        return results
Example #24
def __init_hasher__():
    global __HASHER_START_X__
    global __HASHER_START_Y__
    hasher = Hasher(__HASHER_START_X__, __HASHER_START_Y__)
    add_hasher(hasher)
Example #25
def main():
    # args is assumed to be the result of an argparse parser defined elsewhere
    hasher = Hasher(args.wordfile, nwords=3, delimeter="-")
    pprint(hasher.process(args.input))