Example #1
0
def get_newly_added_index(SITE_MAP):
    """Return sitemap URLs that have no matching row in the product index.

    The sitemap is parsed with ``ET.parse``; every ``<loc>`` value of a
    ``<url>`` element that ends with ``.html`` is collected. Those URLs are
    compared against the ``product_url`` values stored for
    ``ProductIndexModel``.

    Args:
        SITE_MAP: Sitemap source, passed straight to ``ET.parse``.

    Returns:
        The URLs from the symmetric difference of both collections for which
        an ``EXISTS`` query finds no row with that ``product_url``. When an
        ``IntegrityError`` is caught it is printed and ``None`` is returned
        implicitly.
    """
    ns = '{http://www.sitemaps.org/schemas/sitemap/0.9}'
    document_root = ET.parse(SITE_MAP).getroot()
    locations = (
        entry.find(ns + 'loc').text
        for entry in document_root.findall(ns + 'url')
    )
    sitemap_urls = [loc for loc in locations if loc.endswith('.html')]

    engine = db_connect()
    session = get_session(engine)
    try:
        known_urls = [
            row.product_url
            for row in session.query(ProductIndexModel.product_url).all()
        ]
        candidates = set(sitemap_urls).symmetric_difference(known_urls)
        # Each candidate is re-checked against the database, so URLs that
        # are only present there drop out of the result.
        missing = [
            candidate for candidate in candidates
            if not session.query(
                exists().where(ProductIndexModel.product_url == candidate)
            ).scalar()
        ]

        print('database urls:', len(known_urls), 'sitemap urls:',
              len(sitemap_urls), 'results:', len(missing))
        return missing

    except IntegrityError as e:
        print(e)
    finally:
        # Always release the session and the engine's connections.
        session.close()
        engine.dispose()
def check_product_detail():
    """Print the decoded detail information of one hard-coded product.

    Looks up the ``ProductIndexModel`` row named
    ``'Vicino Table Large Molteni & C'``, decodes the JSON stored in
    ``product.product_detail_information`` and prints the result. If no such
    row exists a short message is printed instead. An ``IntegrityError`` is
    printed rather than propagated. The session and engine are always
    released.
    """
    engine = db_connect()
    session = get_session(engine)
    try:
        product_index = session.query(ProductIndexModel).filter(
            ProductIndexModel.product_name ==
            'Vicino Table Large Molteni & C').first()
        # ``first()`` yields None when nothing matches; without this guard
        # the attribute access below would fail with AttributeError.
        if product_index is None:
            print('product not found: Vicino Table Large Molteni & C')
            return
        product = json.loads(product_index.product.product_detail_information)

        print(product)
    except IntegrityError as e:
        print(e)

    finally:
        session.close()
        engine.dispose()
def fix_Dot_Dot_Dot():
    """Replace product names containing ``...`` with titles from the search API.

    For every ``ProductIndexModel`` row whose ``product_name`` contains
    ``...``, the name is sent as the query to the doofinder search endpoint
    and the title of the first result becomes the new name. Each row is
    committed individually.

    * A first-result title of ``'Nimrod Low Chair'`` is treated as "no search
      result" and the row is skipped.
    * If the commit fails with error code 1062 (duplicate entry) the row is
      deleted instead of renamed.
    * An ``AttributeError`` raised while processing a row rolls the session
      back and skips that row.

    The session and engine are released even when an unexpected exception
    escapes the loop. On normal completion a summary with the number of
    attempted renames is printed.
    """
    engine = db_connect()
    session = get_session(engine)
    header = {
        'method': 'GET',
        'authority': 'eu1-search.doofinder.com',
        'scheme': 'https',
        'origin': 'https://www.miliashop.com',
        'user-agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'accept': '*/*',
        'referer': 'https://www.miliashop.com/en/',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
    }
    url = 'https://eu1-search.doofinder.com/5/search'
    params = {
        'hashid': 'cf518a4fc64ed58ed14863348a5bae18',
        'transformer': 'basic',
        'rpp': '50',
        'query': '',
        'query_counter': '5',
        'page': '1'
    }
    fix_count = 0
    try:
        for product_index in session.query(ProductIndexModel).filter(
                ProductIndexModel.product_name.like('%...%')).all():
            print('before: ', product_index.product_name)
            try:
                params['query'] = product_index.product_name
                r = requests.get(url, headers=header, params=params)
                fix_name = r.json()['results'][0].get('title')
                if fix_name == 'Nimrod Low Chair':
                    print('ERROR: No search result!')
                    continue
                fix_count += 1
                product_index.product_name = fix_name
                print('after: ', product_index.product_name)
                session.commit()
            except (IntegrityError, AttributeError) as e:
                session.rollback()
                # Only the SQLAlchemy error carries ``orig``; a plain
                # AttributeError does not, so reading ``e.orig`` directly
                # would raise again from inside this handler.
                orig = getattr(e, 'orig', None)
                if orig is not None and orig.args[0] == 1062:  # Duplicate entry
                    print('Duplicated entry: ', product_index.product_name)
                    session.delete(product_index)
                    session.commit()
    finally:
        # Release the connection even if an uncaught error aborts the loop.
        session.close()
        engine.dispose()
    print('fix ... in database complete, total {} items.'.format(fix_count))
def set_start_urls():
    """Push one hard-coded product URL onto the spider's Redis start list.

    The URL is left-pushed onto the list named by
    ``rd_miliashopSpider.redis_key`` and the resulting list length is
    printed. A database engine and session are opened and closed as well,
    although nothing is read from or written to the database here. An
    ``IntegrityError`` is printed rather than propagated.
    """
    engine = db_connect()
    session = get_session(engine)
    redis_client = redis_connect(redis_create_pool())
    queue_key = rd_miliashopSpider.redis_key
    start_url = 'https://www.miliashop.com/en/sofas/14351-swingus-dedon-2-seater-sofa.html'
    try:
        redis_client.lpush(queue_key, start_url)
        # The reported number is the current length of the whole list.
        queue_length = redis_client.llen(queue_key)
        print('successful insert {} records into redis start_urls.'.format(
            queue_length))
    except IntegrityError as e:
        print(e)
    finally:
        redis_client.connection_pool.disconnect()
        session.close()
        engine.dispose()
def fix_Deco_in_DB():
    """Rewrite product names containing ``Dec;ò`` using ``fixDeco``.

    Every ``ProductIndexModel`` row whose ``product_name`` matches
    ``%Dec;ò%`` gets the value returned by ``fixDeco`` and is committed on
    its own. If that commit fails with error code 1062 (duplicate entry) the
    row's name is printed and the row is deleted. Other ``IntegrityError``
    codes are rolled back and ignored.

    The session and engine are released even when an unexpected exception
    escapes the loop. On normal completion a summary is printed.
    """
    fix_count = 0
    engine = db_connect()
    session = get_session(engine)
    try:
        for product_index in session.query(ProductIndexModel).filter(
                ProductIndexModel.product_name.like('%Dec;ò%')).all():
            product_index.product_name = fixDeco(product_index.product_name)
            try:
                session.commit()
            except IntegrityError as e:
                session.rollback()
                if e.orig.args[0] == 1062:  # Duplicate entry
                    print(product_index.product_name)
                    # NOTE(review): the counter only tracks rows removed as
                    # duplicates, not rows that were renamed - confirm this
                    # is what the summary is meant to report.
                    fix_count += 1
                    session.delete(product_index)
                    session.commit()
    finally:
        # Release the connection even if fixDeco or a commit raises.
        session.close()
        engine.dispose()
    print('fix deco in database complete, total {} items.'.format(fix_count))
 def __init__(self, *a, **kw):
     """Initialise the spider and open its database engine and session.

     All positional and keyword arguments are forwarded unchanged to the
     parent initialiser.
     """
     super(miliashopSpider, self).__init__(*a, **kw)
     db_engine = db_connect()
     self.engine = db_engine
     self.session = get_session(db_engine)
 def __init__(self):
     """Open a database engine and keep a session for it in ``self.sess``."""
     db_engine = db_connect()
     self.engine = db_engine
     self.sess = get_session(db_engine)
 def __init__(self):
     """Hook up the spider-closed handler and open the database session.

     ``self.spider_closed`` is connected to ``signals.spider_closed`` before
     the engine and session are created.
     """
     dispatcher.connect(self.spider_closed, signals.spider_closed)
     db_engine = db_connect()
     self.engine = db_engine
     self.session = get_session(db_engine)