Example #1
0
    def __init__(self, path):
        """Set up the connection and session for the database stored at ``path``.

        The connection URL is built as ``<DB_SCHEME>:///<path>`` and query
        echoing follows ``config.LOG_DATABASE_QUERIES``. No parser is attached
        initially.
        """
        self.__path = path
        """:type: str"""

        database_url = "%s:///%s" % (DB_SCHEME, path)
        self.__connection = orm.Connection(
            database_url,
            echo=config.LOG_DATABASE_QUERIES,
        )
        self.__session = orm.Session(bind=self.__connection)
        self.__is_closed = False

        self.__parser = None
        """:type: database.dbparser.GenericParser or None"""
def process_source(source_name, posts_url=None, params=None, url_format=None):
    """Import posts from a paged JSON listing until a known post is reached.

    Requests ``posts_url`` one page at a time, converts every entry that has
    a ``file_url`` with ``orm.Image.from_danbooru_response`` and merges the
    resulting image and tags into the database, committing after each post.

    Paging stops when any of the following happens:

    * ``params['page']`` reaches 5000,
    * the server answers with a status other than 200 or 421,
    * a page comes back empty,
    * a post that is already stored for ``source_name`` is encountered.

    A 421 response is retried after sleeping 60 seconds, without advancing
    the page.

    Args:
        source_name: Name stored on each imported image; ``u'danbooru'``
            selects the non-fork conversion path.
        posts_url: URL of the JSON post listing.
        params: Query parameters for the listing request. ``params['page']``
            is incremented in place after every page. When ``params`` is
            omitted, or has no ``'page'`` entry, paging starts at page 1.
        url_format: Passed as ``fork_url_format`` for non-danbooru sources.
    """
    import traceback

    # The original crashed with TypeError/KeyError when the default
    # ``params=None`` (or a dict without 'page') was used.
    if params is None:
        params = {}
    params.setdefault('page', 1)

    session = orm.Session()
    keep_paging = True
    try:
        while params['page'] < 5000 and keep_paging:
            response = requests.get(posts_url, params=params)

            if response.status_code == 421:
                # Back off and retry the same page.
                time.sleep(60)
                continue
            if response.status_code != 200:
                print("FACK GOT A %d CODE" % response.status_code)
                break

            images = response.json()
            if not images:
                print("No images, done.")
                break

            for image in images:
                if 'file_url' not in image:
                    continue

                exists_query = session.query(
                    orm.Image
                ).filter(
                    orm.Image.remote_id == image['id'],
                    orm.Image.source_name == source_name,
                ).exists()
                if session.query(exists_query).scalar():
                    # Everything from here on has been imported before.
                    print("FOUND EXISTING POST")
                    keep_paging = False
                    break

                try:
                    if source_name == u'danbooru':
                        new_img, new_tags = orm.Image.from_danbooru_response(
                            image, fork=False)
                    else:
                        new_img, new_tags = orm.Image.from_danbooru_response(
                            image, fork=True, fork_url_format=url_format,
                            fork_name=source_name
                        )
                    if not new_img:
                        continue
                    new_img.source_name = source_name
                    session.merge(new_img)
                    for tag in new_tags:
                        session.merge(tag)
                    session.commit()
                except Exception:
                    # Best effort per post: report the failure and move on.
                    # The rollback is required so the session is usable for
                    # the next post after a failed flush/commit.
                    print("Failed to import post %r from %s"
                          % (image.get('id'), source_name))
                    traceback.print_exc()
                    session.rollback()

            params['page'] += 1
    finally:
        session.close()
Example #3
0
def process_images():
    """Process every unfetched awwnime/danbooru image in a pool of workers.

    Selects ``(id, full_url, thumb_url, remote_url)`` for each image whose
    ``fetched`` flag is 0 and whose source is ``awwnime`` or ``danbooru``,
    and maps ``process_wrap`` over those rows using 10 worker processes.
    The pool is closed and joined before returning, including when a worker
    raises.
    """
    import multiprocessing
    import orm

    session = orm.Session()
    images = session.query(orm.Image.id, orm.Image.full_url,
                           orm.Image.thumb_url, orm.Image.remote_url).filter(
                               orm.Image.fetched == 0,
                               orm.Image.source_name.in_(
                                   (u'awwnime', u'danbooru')))

    # NOTE(review): the pool is deliberately created before the (lazy) query
    # is iterated, as in the original; presumably this keeps the workers from
    # inheriting an open database connection across fork - confirm.
    pool = multiprocessing.Pool(10)
    try:
        # process_wrap is defined elsewhere; presumably it unpacks one row
        # into the arguments of the per-image worker - confirm.
        pool.map(process_wrap, list(images))
    finally:
        # Shut the workers down instead of leaving them to interpreter exit.
        pool.close()
        pool.join()
Example #4
0
from flask import Flask, send_from_directory, request, jsonify
import orm
from datetime import datetime, timedelta
from time import time
import conf

# Flask application object; the views in this module are registered on it
# through ``@app.route``.
app = Flask(__name__)

# ORM session created once, at import time, and used by the views below.
# NOTE(review): a single module-level session appears to be shared by every
# request - confirm this is safe for the server's threading model.
session = orm.Session()


@app.route('/')
def index():
    """Serve the static ``index.html`` file at the site root."""
    landing_page = 'index.html'
    return app.send_static_file(landing_page)


@app.route('/items')
def page():
    """Handle ``/items``: parse the request arguments and build the image query.

    Query-string arguments read here:
        before:  Unix timestamp, converted to a ``datetime``; defaults to the
                 current time.
        limit:   integer; defaults to 40.
        keyword: comma-separated string, split into a list; defaults to ''.

    NOTE(review): the visible body ends right after ``query`` is built -
    ``before``, ``limit`` and ``keywords`` are not applied and nothing is
    returned. Presumably the rest of the function is missing from this
    excerpt; confirm against the full source.
    """
    # A non-numeric ``before`` or ``limit`` makes ``int()`` raise ValueError.
    before = datetime.fromtimestamp(int(request.args.get('before', time())))
    limit = int(request.args.get('limit', 40))
    # ``''.split(',')`` is ``['']``, so an absent ``keyword`` yields a list
    # holding one empty string rather than an empty list.
    keywords = request.args.get('keyword', '').split(',')

    # Select individual columns instead of whole Image entities.
    query = session.query(
        orm.Image.id,
        orm.Image.thumb_url,
        orm.Image.title,
        orm.Image.date,
        orm.Image.source_name,
        orm.Image.url,
        orm.Image.atags,
    )
Example #5
0
def _to_signed64(value):
    """Reinterpret an unsigned 64-bit integer as a signed 64-bit integer."""
    if value >= 2**63:
        return value - 2**64
    return value


def process_image(image_id, full_url, thumb_url, source_url):
    """Download one image, upload it with a WebP thumbnail, and store its hashes.

    Steps:
        1. Download ``source_url``. On a non-200 response the image row's
           ``fetched`` column is set to -1 and the function returns.
        2. Compute the average, perceptual and difference hashes and build a
           thumbnail bounded by 640x640, saved as WebP.
        3. Upload the original to the full bucket and the thumbnail to the
           thumb bucket; object names are the last path segment of
           ``full_url`` / ``thumb_url``.
        4. Add ``orm.Hash`` rows (``ahash``, ``phash``, ``dhash`` and, when it
           can be computed, ``bmbhash``), set ``fetched`` to 1 and record the
           size, then commit.

    Args:
        image_id: Id of the ``orm.Image`` row being processed.
        full_url: URL whose final path segment names the full-size object.
        thumb_url: URL whose final path segment names the thumbnail object.
        source_url: URL the image bytes are downloaded from.
    """
    import requests
    import orm
    import boto
    import gcs_oauth2_boto_plugin
    import tempfile
    import mimetypes
    import conf
    from PIL import Image as pimage
    from PIL import ImageFile
    ImageFile.LOAD_TRUNCATED_IMAGES = True
    import imagehash
    from hashtest import hash_image

    session = orm.Session()

    gcs_oauth2_boto_plugin.SetFallbackClientIdAndSecret(
        conf.client_id, conf.client_secret)

    fullbucket = boto.storage_uri(conf.fullbucket, 'gs').get_bucket()
    thumbbucket = boto.storage_uri(conf.thumbbucket, 'gs').get_bucket()

    # Fetch images
    print("%d: Starting" % image_id)
    response = requests.get(source_url, stream=True)
    if response.status_code != 200:
        session.query(orm.Image).filter(orm.Image.id == image_id).update(
            {'fetched': -1})
        session.commit()
        return

    # The context managers guarantee both temp files are closed (and thereby
    # removed) even when hashing, uploading or the commit raises.
    with tempfile.NamedTemporaryFile() as fulltemp, \
            tempfile.NamedTemporaryFile() as thumbtemp:
        downloaded = 0
        for block in response.iter_content(4096):
            fulltemp.write(block)
            downloaded += len(block)
        fulltemp.seek(0)

        himg = pimage.open(fulltemp)
        # Hashes are taken before thumbnail() shrinks the image in place.
        # Each hash prints as a hex string; it is stored as a signed 64-bit
        # integer.
        ahash = _to_signed64(int(str(imagehash.average_hash(himg)), base=16))
        phash = _to_signed64(int(str(imagehash.phash(himg)), base=16))
        dhash = _to_signed64(int(str(imagehash.dhash(himg)), base=16))

        # Save images, make thumb
        himg.thumbnail((640, 640))
        himg.convert("RGB").save(thumbtemp, format='WebP')

        del himg

        # Upload
        fulltemp.seek(0)
        thumbtemp.seek(0)

        fullkey = fullbucket.new_key(full_url.split('/')[-1])
        thumbkey = thumbbucket.new_key(thumb_url.split('/')[-1])

        # Servers may omit Content-Type; fall back to a guess from the file
        # name rather than failing with KeyError.
        content_type = (response.headers.get('content-type')
                        or mimetypes.guess_type(full_url)[0]
                        or 'application/octet-stream')
        meta = {
            'Cache-Control': 'public, max-age=3600',
            'Content-Type': content_type,
        }

        fullkey.set_contents_from_file(fulltemp, headers=meta)
        print("%d: Uploaded full" % image_id)

        meta['Content-Type'] = 'image/webp'
        thumbkey.set_contents_from_file(thumbtemp, headers=meta)
        print("%d: Uploaded thumb" % image_id)

        # bmbhash is optional: a failure is reported but must not prevent
        # the other hashes from being stored.
        try:
            bmbhash = hash_image(fulltemp.name)
            session.add(orm.Hash(name=u'bmbhash', value=bmbhash,
                                 image_id=image_id))
        except Exception as exc:
            print("%d: bmbhash skipped (%s)" % (image_id, exc))

        # Content-Length is absent on e.g. chunked responses; use the number
        # of bytes actually written instead of raising after the uploads.
        declared_length = response.headers.get('content-length')
        if declared_length is not None:
            size = int(declared_length)
        else:
            size = downloaded

        session.add(orm.Hash(name=u'ahash', value=ahash, image_id=image_id))
        session.add(orm.Hash(name=u'phash', value=phash, image_id=image_id))
        session.add(orm.Hash(name=u'dhash', value=dhash, image_id=image_id))
        session.query(orm.Image).filter(orm.Image.id == image_id).update({
            'fetched': 1,
            'size': size,
        })
        session.commit()
Example #6
0
    def item_passed(self, item, spider, output):
        """Persist a scraped item through the ORM.

        ``output`` is dispatched on its item class:

        * ``RentalItem`` becomes an ``orm.Rental``.
        * ``SaleItem`` and ``ListingItem`` both become an ``orm.Listing``.
        * ``PersonItem`` becomes an ``orm.Person`` attached to a
          ``TCAD_2010`` row, but only when it carries exactly one ``age``.
        * ``TCADParcelItem`` merges its improvements and segments and becomes
          a ``TCAD_2010`` keyed by the existing row's ``objectid``, but only
          when exactly one row matches its ``prop_id``.
        * Anything else raises ``orm.Fail``, which is caught and logged here.

        When an object was built, its ``last_crawl`` is set to the current
        time and it is merged and committed. Otherwise "Duplicate handled" is
        printed.

        ``item`` and ``spider`` are not used in this method.

        NOTE(review): only ``orm.Fail`` is caught; any other exception raised
        in the body propagates to the caller. The session created here is not
        closed in this method.
        """
        session = orm.Session()
        obj = None
        try:
            if (isinstance(output, items.RentalItem)):
                obj = orm.Rental(**output)
            elif (isinstance(output, items.SaleItem)):
                obj = orm.Listing(**output)
            elif (isinstance(output, items.ListingItem)):
                obj = orm.Listing(**output)
            elif (isinstance(output, items.PersonItem)):
                # Person items without exactly one 'age' value are skipped:
                # obj stays None and the method ends on "Duplicate handled".
                if 'age' in output.keys() and len(output['age']) == 1:
                    # Approximate birth year: now minus age * 365 days.
                    birth_year = (
                        datetime.now() -
                        timedelta(days=365 * int(output['age'][0]))).year
                    # 'age' and 'prop_ref' are removed so they are not passed
                    # to the Person constructor through **output below.
                    del output['age']
                    # NOTE(review): if no TCAD_2010 row exists for prop_ref,
                    # ``prop`` is presumably None and ``prop.person = obj``
                    # below raises AttributeError - confirm.
                    prop = session.query(orm.TCAD_2010).get(
                        int(output['prop_ref'][0]))
                    del output['prop_ref']

                    # Look for an already stored person with the same name,
                    # city, state and zipcode.
                    ref = session.query(orm.Person).filter(
                        orm.Person.first_name == output['first_name'][0]
                    ).filter(
                        orm.Person.last_name == output['last_name'][0]).filter(
                            orm.Person.city == output['city'][0]).filter(
                                orm.Person.state == output['state'][0]).filter(
                                    orm.Person.zipcode == output['zipcode']
                                    [0]).first()
                    if ref:
                        # Reuse the existing id; presumably so the merge at
                        # the end updates that row instead of inserting.
                        obj = orm.Person(id=ref.id,
                                         birth_year=birth_year,
                                         **output)
                    else:
                        obj = orm.Person(birth_year=birth_year, **output)

                    prop.person = obj
                    session.merge(prop)

            elif (isinstance(output, items.TCADParcelItem)):
                if 'prop_id' in output.keys():
                    # ``parcel`` is a query, not a row: count() and every
                    # first() call below each go through it again.
                    parcel = session.query(orm.TCAD_2010).filter(
                        orm.TCAD_2010.prop_id == int(output['prop_id'][0]))
                    if parcel.count() == 1:
                        try:
                            # Removed from output so **output below does not
                            # pass it to the TCAD_2010 constructor.
                            improvements = output['improvements']
                            del (output['improvements'])

                            for i in improvements:
                                imp = orm.TCADImprovement(
                                    parcel=parcel.first(), **i)
                                session.merge(imp)
                        except KeyError:
                            print "No Improvements Found"
                        except sqlalchemy.exc.IntegrityError:
                            print "Improvements already Processed"
                            session.rollback()

                        try:
                            segments = output['segments']
                            del (output['segments'])

                            for i in segments:
                                seg = orm.TCADSegment(**i)
                                session.merge(seg)
                        except KeyError:
                            print "No Segments Found"

                        # NOTE(review): unlike 'improvements' and 'segments',
                        # a missing 'historical_values' key is not guarded and
                        # raises KeyError out of this method.
                        historical_values = output['historical_values']
                        del (output['historical_values'])

                        obj = orm.TCAD_2010(objectid=parcel.first().objectid,
                                            **output)

                        # The history objects are only constructed with
                        # parcel=obj; presumably a relationship cascade saves
                        # them when obj is merged - confirm.
                        for i in historical_values:
                            orm.TCADValueHistory(parcel=obj, **i)
                    else:
                        print "duplicate / missing prop_id - not inserted"
                else:
                    print "prop_id not found"

            else:
                raise orm.Fail, 'unknown data type'
        except orm.Fail:
            log.msg("SQL handling failed")
        else:
            if obj:
                obj.last_crawl = datetime.now()

                session.merge(obj)
                session.commit()
            else:
                # Reached for every path that left obj as None, including the
                # "prop_id not found" and skipped-person cases above.
                print "Duplicate handled"