def __init__(self, *args, **kwargs):
    """Build the animals view: a scrollable list above a row of two buttons.

    ``*args`` and ``**kwargs`` are accepted but not forwarded; the parent
    initialiser is always called with ``orientation="vertical"``.
    """
    super(Animals, self).__init__(orientation="vertical")

    # The running application is published through the module-level ``app``.
    global app
    app = App.get_running_app()

    # Upper part: the list, wrapped in a scroll view.
    list_scroller = ScrollView()
    self.list = MDList()
    self.database = Database(dbtype='sqlite', dbname='animals.db')
    self.rewrite_list()
    list_scroller.add_widget(self.list)
    self.add_widget(list_scroller)

    # Lower part: one flat button per "create" action, laid out side by side.
    actions_row = BoxLayout(orientation='horizontal', size_hint_y=0.1)
    button_specs = (
        ("Add new animal", self.on_create_animal),
        ("Add new animal type", self.on_create_type),
    )
    for caption, handler in button_specs:
        button = MDFlatButton()
        button.text = caption
        button.font_style = "Button"
        button.on_release = handler
        actions_row.add_widget(button)
    self.add_widget(actions_row)
def delete_user_alert(email, currency, rate_exchange):
    """Remove from ``all_alert`` what matches the given alert identity.

    The filter is an exact match on ``email``, ``currency`` and
    ``rate_exchange``.  Nothing is returned.
    """
    alert_filter = {
        "email": email,
        "currency": currency,
        "rate_exchange": rate_exchange,
    }
    Database.remove(collection="all_alert", query=alert_filter)
def set_bingo_board(self, bingo_board):
    """
    Store ``bingo_board`` as the restaurant user's future bingo board.

    ``bingo_board`` is modified in place: a string ``expiry_date`` (read as
    month/day/year) becomes a ``datetime`` at 23:59:59 of that day, and
    ``board`` / ``board_reward`` are passed through
    ``Database.replace_object_id``.

    When the stored future board has an empty name, a deep copy is also
    saved as the current ``bingo_board`` and the future board's expiry is
    pushed 90 days later.

    ``UpdateFailureException`` and ``KeyError`` are caught and reported with
    a printed message; nothing is returned.
    """
    try:
        # Turn a "month/day/year" string into a datetime at end of day.
        expiry = bingo_board["expiry_date"]
        if not isinstance(expiry, datetime):
            fields = list(map(int, expiry.split("/")))
            bingo_board["expiry_date"] = datetime(
                fields[2], fields[0], fields[1], 23, 59, 59)

        # Convert the ids to object ids.
        for key in ("board", "board_reward"):
            bingo_board[key] = Database.replace_object_id(bingo_board[key])

        boards = {"future_board": bingo_board}
        is_new_user = self.get_future_board()["name"] == ""
        if is_new_user:
            # New user: the board also becomes the current one.  Copy before
            # shifting the expiry so the current board keeps the given date.
            boards["bingo_board"] = copy.deepcopy(bingo_board)
            bingo_board["expiry_date"] += timedelta(days=90)

        user_filter = {"username": self.rpm.get_id()}
        self.rpm.db.update('restaurant_users', user_filter, {'$set': boards})
    except (UpdateFailureException, KeyError):
        print("There was an issue updating a bingo board.")
def initialize():
    """Initialise the database, seed the session keys and schedule alert checks.

    A new ``BackgroundScheduler`` is created and started on every call, with
    ``check_alert`` registered as a cron job at 16:30 on days 0-4
    (presumably Monday to Friday -- confirm against the scheduler's
    day-of-week numbering).
    """
    Database.initialize()

    # Re-assign each key so it is always present in the session
    # (``None`` when it was absent).
    for key in ('email', 'name'):
        session[key] = session.get(key)

    alert_scheduler = BackgroundScheduler()
    alert_scheduler.add_job(
        check_alert,
        "cron",
        day_of_week="0-4",
        hour="16",
        minute=30,
    )
    alert_scheduler.start()
def initialize():
    """Initialise the database, seed the session keys and start a scheduler.

    The scheduler is started without any job: both ``check_alert``
    registrations below are commented out.
    """
    Database.initialize()

    # Write each key back so it always exists in the session
    # (``None`` when it was absent).
    current_email = session.get('email')
    session['email'] = current_email
    current_name = session.get('name')
    session['name'] = current_name

    # Job schedule setup.
    job_scheduler = BackgroundScheduler()
    # Disabled jobs, kept for reference:
    # job_scheduler.add_job(check_alert, "interval", seconds=10)
    # job_scheduler.add_job(check_alert, "cron", day_of_week="0-4", hour="17", minute=30)
    job_scheduler.start()
def update_user_alert(email, currency, rate_exchange, price):
    """Set ``price`` on the ``all_alert`` entry matching the given identity.

    The entry is selected by exact match on ``email``, ``currency`` and
    ``rate_exchange``.  Nothing is returned.
    """
    selector = {
        "email": email,
        "currency": currency,
        "rate_exchange": rate_exchange,
    }
    change = {"$set": {"price": price}}
    Database.update(collection="all_alert", query=selector, data=change)
def inti_request():
    """Initialise the database, seed the session keys and start the mail job.

    A new ``BackgroundScheduler`` is created and started on every call.
    """
    Database.inti()

    # Make sure session['email'] (and 'name') are defined; otherwise a later
    # lookup fails with an error.  Restarting the browser also seems to
    # trigger that error.
    for field in ('email', 'name'):
        session[field] = session.get(field)

    # Monday to Friday, around the clock, run once every minute.
    mail_clock = BackgroundScheduler()
    mail_clock.add_job(
        Deliver.send_simple_message,
        'cron',
        day_of_week='0-4',
        hour="0-23",
        minute="0-59",
    )
    mail_clock.start()
def test_getFinanciers(self):
    """Check that ``getFinanciers`` filters by rate, term and warranty.

    Three financiers are inserted; only "Financiador Teste 1" has rate 70,
    term 72 and warranty "Terreno", so exactly that one must come back.
    """
    db = Database()
    fixtures = [
        Financier(cnpj="31.213.941/0001-37", name="Financiador Teste 1",
                  rate=70, term=72, warranty="Terreno"),
        Financier(cnpj="24.642.112/0001-04", name="Financiador Teste 2",
                  rate=70, term=48, warranty="Imóvel"),
        Financier(cnpj="19.576.165/0001-34", name="Financiador Teste 3",
                  rate=35, term=36, warranty="Imóvel"),
    ]
    fixture_cnpjs = (
        "31.213.941/0001-37",
        "24.642.112/0001-04",
        "19.576.165/0001-34",
    )

    try:
        for financier in fixtures:
            db.insertFinancier(financier)
        financiers = db.getFinanciers(rate=70, term=72, warranty="Terreno")
    finally:
        # Always remove the fixtures, even when an insert or the query
        # raises, so a failing run leaves no rows behind for the next one.
        for cnpj in fixture_cnpjs:
            db.database.financier.delete_one({'cnpj': cnpj})

    self.assertEqual(len(financiers), 1)
    self.assertEqual(financiers[0].name, "Financiador Teste 1")
    self.assertEqual(financiers[0].rate, 70)
    self.assertEqual(financiers[0].term, 72)
    self.assertEqual(financiers[0].warranty, "Terreno")
def test_getCompany(self):
    """Check that ``getCompany`` returns the company with the requested CNPJ.

    Three companies are inserted and the first one is looked up by its CNPJ.
    """
    db = Database()
    fixtures = [
        Company(cnpj="93.612.749/0001-70", name="Companhia Teste 1",
                rate=25, term=72, warranty="Terreno"),
        Company(cnpj="27.456.797/0001-92", name="Companhia Teste 2",
                rate=25, term=48, warranty="Imóvel"),
        Company(cnpj="26.805.322/0001-00", name="Companhia Teste 3",
                rate=35, term=36, warranty="Imóvel"),
    ]
    fixture_cnpjs = (
        "93.612.749/0001-70",
        "27.456.797/0001-92",
        "26.805.322/0001-00",
    )

    try:
        for company_fixture in fixtures:
            db.insertCompany(company_fixture)
        company = db.getCompany(cnpj="93.612.749/0001-70")
    finally:
        # Always remove the fixtures, even when an insert or the lookup
        # raises, so a failing run leaves no rows behind for the next one.
        for cnpj in fixture_cnpjs:
            db.database.company.delete_one({'cnpj': cnpj})

    self.assertEqual(company.name, "Companhia Teste 1")
    self.assertEqual(company.rate, 25)
    self.assertEqual(company.term, 72)
    self.assertEqual(company.warranty, "Terreno")
def __init__(self):
    """Open both databases and run the interactive prompt.

    Each line read from the ``> `` prompt is passed to ``handle_input``.
    The loop, and with it the constructor, ends when the line equals
    "exit" in any letter case.
    """
    self.main_database = Database('profit_and_debt.txt')
    self.secondary_database = Database('product_sales_info.txt')

    command = input('> ')
    while command.lower() != 'exit':
        self.handle_input(command)
        command = input('> ')
def __init__(self):
    """Set up the database, the Reddit client and its streams, then run ``main``."""
    # Start time in seconds since the Epoch.  Used later so the bot does not
    # reply to old comments every time it is executed.
    self.executed_timestamp = time.time()

    self.database = Database()

    # Authenticate with Reddit using the 'translate_bot' site configuration.
    self.reddit = praw.Reddit('translate_bot')

    # Comment stream over the 'all' subreddit.
    all_subreddits = self.reddit.subreddit('all')
    self.comment_stream = all_subreddits.stream.comments(pause_after=-1)

    # Stream of unread inbox messages.
    unread_source = self.reddit.inbox.unread
    self.inbox_stream = praw.models.util.stream_generator(
        unread_source, pause_after=-1)

    # Hand control to the main routine.
    self.main()
def create_alert(email, currency, rate_exchange, price):
    """Create an alert unless one with the same identity already exists.

    An existing alert is looked up in ``all_alert`` by ``email``,
    ``currency`` and ``rate_exchange``.

    Returns:
        ``False`` when such an alert was found (nothing is saved), ``True``
        after a new alert has been saved.
    """
    lookup = {
        "email": email,
        "currency": currency,
        "rate_exchange": rate_exchange,
    }
    if Database.find_one(collection="all_alert", query=lookup) is not None:
        return False

    new_alert = All_alert(email, currency, rate_exchange, price)
    new_alert.save_to_db()
    return True
def load_sensors(self):
    """Load sensors and MQTT topics from the database into this object.

    Every row from ``get_sensors()`` becomes a ``Sensor`` appended to
    ``self.sensors``; every row from ``get_topics()`` becomes an
    ``MQTTTopic`` appended to ``self.topics``.  All sensors created by one
    call share the same ``ts`` / ``log_ts`` timestamp.

    Any ``Exception`` is logged and swallowed (best effort).  Interpreter
    exit signals such as ``KeyboardInterrupt`` and ``SystemExit`` are no
    longer swallowed.
    """
    self.logg.log("load sensors")
    try:
        self.db = Database.instance()

        sensors = self.db.get_sensors()
        self.logg.log(sensors)
        t_create = time.time()
        if sensors is not None:
            for s in sensors:
                s1: Sensor = Sensor()
                s1.id = s["sensor_id"]
                s1.log_rate = s["log_rate"]
                s1.topic_name = s["topic_name"]
                s1.topic_code = s["topic_code"]
                s1.type = s["sensor_type_code"]
                s1.ts = t_create
                s1.log_ts = t_create
                self.sensors.append(s1)

        topics = self.db.get_topics()
        if topics is not None:
            for t in topics:
                t1: MQTTTopic = MQTTTopic(t)
                self.topics.append(t1)

        self.logg.log(self.topics)
        self.logg.log(self.sensors)
    except Exception:
        # Was a bare ``except:``, which also caught KeyboardInterrupt and
        # SystemExit and so could block shutdown.
        self.logg.log(Utils.format_exception(self.__class__.__name__))
def start_thread(mode, db_name, qid, task, info, tq_rank, log_dir=None):
    """Prepare the database, parser and task, then run one task.

    Args:
        mode: Run mode; 'greedybb' and 'greedyfirst' may load an AIG.
        db_name: Database name, also used to locate the ``.aig`` file.
        qid: Task id, used for the log file name.
        task: JSON text with a ``cqs`` mapping and an ``ans`` list; their
            ids are converted to ``int`` before the task is run.
        info: The AIG is loaded only when this equals 'range'.
        tq_rank: Passed through to ``run_task``.
        log_dir: When truthy, the task runs inside ``Logger`` writing to
            ``<log_dir>/<qid>.log``.

    Returns:
        Whatever ``run_task`` returns.
    """
    config = ConfigParser.RawConfigParser(allow_no_value=True)
    config.read('config.ini')

    db = Database(
        config.get('database', 'user'),
        config.get('database', 'pw'),
        config.get('database', 'host'),
        db_name,
        config.get('database', 'cache_dir'),
        timeout=config.get('database', 'timeout'),
        buffer_pool_size=config.get('database', 'buffer_pool_size'),
    )
    parser = SQLParser(db_name, config.get('parser', 'cache_dir'))

    # Only load the AIG when info includes range.
    aig = None
    if mode in ('greedybb', 'greedyfirst') and info == 'range':
        aig_path = os.path.join(config.get('aig', 'dir'), db_name + '.aig')
        aig = AIG(db, aig_path)

    # JSON object keys are strings; restore integer ids.
    raw = json.loads(task)
    task = {
        'cqs': {int(cqid): cq for cqid, cq in raw['cqs'].items()},
        'ans': [int(cqid) for cqid in raw['ans']],
    }

    if not log_dir:
        return run_task(mode, db, parser, qid, task, info, aig, tq_rank)

    log_path = os.path.join(log_dir, str(qid) + '.log')
    with Logger(log_path):
        return run_task(mode, db, parser, qid, task, info, aig, tq_rank)
def check_user(email, password):
    """Return whether ``password`` is valid for the user with ``email``.

    Returns:
        ``False`` when no user with that e-mail exists or the password check
        does not succeed, ``True`` otherwise.
    """
    user_data = Database.find_one(collection="users", query={"email": email})
    if user_data is None:
        return False

    # Fail closed: only a truthy check result authenticates.  The previous
    # ``is False`` comparison accepted any other falsy result (``None``,
    # ``0``, ...) as a successful login.
    return bool(User.check_hash_password(password, user_data["password"]))
def register_user(name, email, password):
    """Register a new user unless the e-mail is already taken.

    The password is stored as the result of ``User.hash_password``.

    Returns:
        ``False`` when a user with that e-mail already exists (nothing is
        saved), ``True`` after the new user has been saved.
    """
    existing_user = Database.find_one(collection="users", query={"email": email})
    email_taken = existing_user is not None
    if email_taken:
        return False

    hashed_password = User.hash_password(password)
    User(name, email, hashed_password).save_to_db()
    return True
def get_database(dataset, config, mode='train'):
    """Build a ``Database`` for ``dataset`` configured for the given mode.

    A shallow copy of ``config.DATA`` is extended with a ``ToTensor``
    transform and with ``scene_list`` taken from
    ``config.DATA.<mode>_scene_list``.

    Args:
        dataset: Passed through to ``Database``.
        config: Configuration object exposing a ``DATA`` section.
        mode: Prefix selecting which ``<mode>_scene_list`` attribute to use.

    Returns:
        The constructed ``Database``.

    Raises:
        AttributeError: if ``config.DATA`` has no ``<mode>_scene_list``.
    """
    # TODO: make this better
    database_config = copy(config.DATA)
    database_config.transform = transform.ToTensor()
    # Plain attribute lookup instead of eval() on a formatted string: same
    # result for attribute names, without executing arbitrary text.
    database_config.scene_list = getattr(
        config.DATA, '{}_scene_list'.format(mode))
    return Database(dataset, database_config)
def __init__(self, username, collection):
    """
    Initialize a profile from a username and its database collection.

    The profile id is the lower-cased username; the full name and hashed
    password start out empty.
    """
    self.db = Database.get_instance()
    self.id = username.lower()
    self.fullname = self.hashed_pw = ""
    self.database_collection = collection
def connect(self):
    """Read the external-API settings, open the database and sync sensors.

    ``check_create_sensors`` is called only when the database returns a
    sensor list (anything other than ``None``).
    """
    env = Constants.conf["ENV"]
    self.ext_apis = env["EXT_API"]
    self.logstart = env["EXT_API_LOG_INIT"]
    print(self.ext_apis)

    self.db = Database.instance()
    sensors: List[Sensor] = self.db.get_sensors()
    print(sensors)

    if sensors is None:
        return
    self.check_create_sensors(sensors)
def find_user_alert(email, rate_kind) -> dict:
    """
    Purpose: return all the currencies the user is following.

    The alerts are selected by exact match on ``email`` and ``rate_kind``.
    """
    criteria = {
        "email": email,
        "rate_kind": rate_kind,
    }
    return Database.find(Alert.collection_name, query=criteria)
def __init__(self, *a, **kw):
    """Set up the spider: arguments, XML writer and product lists.

    Product lists come from the database when the ``database`` argument is
    truthy, otherwise from ``get_lists_from_excel``.  ``total`` is the
    number of product ids in ``no_urls``.
    """
    super(ChomeSpider, self).__init__(*a, **kw)
    dispatcher.connect(self.spider_closed, signals.spider_closed)

    # Command-line arguments drive the rest of the setup.
    self.d = DatabaseTerminal(sys.argv, self.name).get_arguments()
    self.xml = CommonXml()
    self.exc = ZmagsException(5)

    if not self.d['database']:
        self.get_lists_from_excel()
    else:
        self.database = Database()
        self.database.connect()
        selection = self.database.select_products(
            self.d['catalog_id'], self.d['product_id'])
        self.products, self.no_urls = selection
        self.database.disconnect()

    self.add_properties(self.xml)
    self.images_store = "/" + settings['IMAGES_STORE']
    self.total = len(self.no_urls['product_ids'])
def update_user_email(old_email, new_email):
    """
    Change a user's e-mail address.

    The user is selected by ``old_email``; the result of
    ``Database.update_one`` is returned.
    """
    selector = {"email": old_email}
    change = {"$set": {"email": new_email}}
    return Database.update_one(User.collection_name, query=selector, data=change)
def __init__(self, *a, **kw):
    """Set up the spider: arguments, XML writer, product lists and start URLs.

    Product lists come from the database when the ``database`` argument is
    truthy, otherwise from ``get_lists_from_excel``.  The product URLs
    become the start URLs and ``total`` is their count.
    """
    super(GuitarCenterSpider, self).__init__(*a, **kw)
    dispatcher.connect(self.spider_closed, signals.spider_closed)

    argument_reader = DatabaseTerminal(sys.argv, self.name)
    self.d = argument_reader.get_arguments()
    self.xml = CommonXml()
    self.exc = ZmagsException(5)

    use_database = self.d['database']
    if use_database:
        self.database = Database()
        self.database.connect()
        loaded = self.database.select_products(
            self.d['catalog_id'], self.d['product_id'])
        self.products, self.no_urls = loaded
        self.database.disconnect()
    else:
        self.get_lists_from_excel()

    self.add_properties(self.xml)
    self.handle_not_provided()

    self.start_urls = self.products['urls']
    self.total = len(self.products['urls'])
def delete_user_alert(email, current, rate_kind):
    """
    Purpose: delete the monitoring alert for one currency.

    The alert is selected by exact match on ``email``, ``current`` and
    ``rate_kind``; the result of ``Database.delete`` is returned.
    """
    alert_filter = {
        "email": email,
        "current": current,
        "rate_kind": rate_kind,
    }
    return Database.delete(Alert.collection_name, query=alert_filter)
def create_alert(email, currency, rate_exchange, price):
    """Save a new alert when no alert with the same identity exists yet.

    The identity is the combination of ``email``, ``currency`` and
    ``rate_exchange`` in the ``all_alert`` collection.

    Returns:
        ``True`` when a new alert was saved, ``False`` when one already
        existed and nothing was saved.
    """
    identity = dict(email=email, currency=currency, rate_exchange=rate_exchange)
    existing = Database.find_one(collection="all_alert", query=identity)
    is_new = existing is None
    if is_new:
        All_alert(email, currency, rate_exchange, price).save_to_db()
    return is_new
def main():
    """Compute and report TQ confusion (TQC) per task, then print a summary.

    Command line:
        db      database name (positional).
        --qid   optional task id; when given, only that task is computed.

    Without ``--qid`` every task is processed: excluded ids are skipped,
    cached values are reused, and each newly computed value is written to
    the cache straight away.  The summary counts the non-excluded tasks with
    TQC <= 0.75 and those above.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('db')
    arg_parser.add_argument('--qid', type=int)
    args = arg_parser.parse_args()

    config = ConfigParser.RawConfigParser(allow_no_value=True)
    config.read('config.ini')

    db = Database(
        config.get('database', 'user'),
        config.get('database', 'pw'),
        config.get('database', 'host'),
        args.db,
        config.get('database', 'cache_dir'),
        timeout=config.get('database', 'timeout'),
        buffer_pool_size=config.get('database', 'buffer_pool_size'),
    )
    # Separate name: the argparse parser is no longer rebound to a SQLParser.
    sql_parser = SQLParser(args.db, config.get('parser', 'cache_dir'))

    tasks = load_tasks(config.get('main', 'data_dir'), args.db)
    tqcs = load_tqc_cache(config, args.db)

    # Load qids to exclude.
    excludes = find_excludes(args.db)

    # ``--qid`` defaults to None when absent.  Test for None explicitly so
    # ``--qid 0`` selects task 0 instead of running every task.
    if args.qid is not None:
        tqcs[args.qid] = tqc_for_task(db, sql_parser, args.qid, tasks[args.qid])
    else:
        for qid, task in tasks.items():
            print('QUERY {}'.format(qid))
            if qid in excludes:
                print('Skipping non-SPJ query.')
                print()
                continue

            if qid in tqcs:
                print('Loaded from cache.')
            else:
                tqcs[qid] = tqc_for_task(db, sql_parser, qid, task)
                # Persist after each computation so progress survives a crash.
                save_tqc_cache(config, args.db, tqcs)

            print('TQ Confusion: {}'.format(tqcs[qid]))
            print()

    included = [tqc for qid, tqc in tqcs.items() if qid not in excludes]
    easy = sum(1 for tqc in included if tqc <= 0.75)
    hard = len(included) - easy

    print('TQC <= 0.75: {}'.format(easy))
    print('TQC > 0.75: {}'.format(hard))
class TestStorage(unittest.TestCase):
    """Round-trip tests for ``storage.write`` / ``storage.read``."""

    def setUp(self):
        """Create fresh objects and empty target files in the working directory."""
        self.service = Service()
        self.database = Database()
        for filename in ("test.service", "test.db"):
            open(filename, "w+").close()

    def test_write_read_service(self):
        """A service written to a file reads back with the same fields."""
        original = self.service
        original.service_name = "Hello"
        original.username = "******"
        original.password = "******"
        storage.write("test", original, "test.service")

        service2 = Service()
        storage.read("test", service2, "test.service")

        for field in ("service_name", "username", "password"):
            self.assertEqual(getattr(service2, field),
                             getattr(self.service, field))

    def test_write_read_database(self):
        """A database written to a file reads back with its name and services."""
        for _ in range(2):
            self.database.add_service(Service())
        self.database.name = "Hey"
        storage.write("test", self.database, "test.db")

        database2 = Database()
        storage.read("test", database2, "test.db")

        self.assertEqual(database2.name, self.database.name)
        for index, expected in enumerate(self.database.services):
            restored = database2.services[index]
            self.assertEqual(restored.service_name, expected.service_name)
            self.assertEqual(restored.username, expected.username)
            self.assertEqual(restored.password, expected.password)

    def tearDown(self):
        """Delete the files created in ``setUp``."""
        for suffix in ("/test.service", "/test.db"):
            os.remove(os.getcwd() + suffix)
def update_user_alert(email, current, rate_kind, price):
    """
    Purpose: change the monitored price of a currency alert.

    The alert is selected by exact match on ``email``, ``current`` and
    ``rate_kind``; the result of ``Database.update_one`` is returned.
    """
    selector = {
        "email": email,
        "current": current,
        "rate_kind": rate_kind,
    }
    new_values = {"$set": {"price": price}}
    return Database.update_one(Alert.collection_name, query=selector, data=new_values)
def get_restaurant_name_by_id(object_id):
    """
    Look up a restaurant user by database id and return the restaurant's name.

    Returns "" (after printing a message) when the query fails, no user
    matches, the profile name is missing or the id is invalid.
    """
    try:
        matches = Database.get_instance().query(
            "restaurant_users", {"_id": ObjectId(object_id)})
        first_match = matches[0]
        return first_match["profile"]["name"]
    except (QueryFailureException, IndexError, KeyError, InvalidId):
        print("Something's wrong with the query.")
        return ""
class Stats:
    """Read-only statistics over the ``bin`` table."""

    def __init__(self):
        self.__db = Database()

    def get_bin_per_day(self, order, interval=30):
        """
        Get the number of bins created per day over the latest days.

        :param order: 'DESC' returns the newest day first; any other value
            keeps the query's ascending date order
        :param interval: number of days to look back from the current date
        :return: list of dicts with the bin count ("insertions") and the
            date ("day"); empty when no bin falls in the range
        """
        cursor = self.__db.get_cursor()
        logger.info(f"Getting stats for the latest {interval} days with {order} order")
        logger.debug("Retrieving bin number per day")
        # The interval is bound as a query parameter, not formatted in.
        query = """
            select count(*) as count, date(created) as date
            from `bin`
            where created >= date_sub(curdate(), interval %s day)
            group by date
            order by `date` ASC
        """
        cursor.execute(query, (interval,))
        result = list(cursor.fetchall())

        insertions_list = []
        if result:
            if order == 'DESC':
                logger.debug("Reversing the stats list")
                result = result[::-1]
            logger.debug(f"Found {len(result)} Bins in time range")
            insertions_list = [
                {"insertions": row[0], "day": row[1]} for row in result
            ]

        cursor.close()
        self.__db.done()
        return insertions_list

    def get_last_bin_timestamp(self):
        """
        Get the creation time of the bin with the highest id.

        :return: the ``created`` value converted with
            ``datetimeutil.ISO8601.from_datetime_obj``, or ``None`` when the
            ``bin`` table is empty
        """
        cursor = self.__db.get_cursor()
        logger.debug("Getting last insertion datetime")
        query = """
            SELECT `created`
            FROM `bin`
            ORDER BY id DESC
            LIMIT 1
        """
        cursor.execute(query)
        result = list(cursor.fetchall())

        # Default for an empty table; without it the return below raised
        # UnboundLocalError.
        lastInsertTime = None
        if result:
            logger.debug("Found latest insertion date")
            lastInsertTime = datetimeutil.ISO8601.from_datetime_obj(result[0][0])

        cursor.close()
        self.__db.done()
        return lastInsertTime
def __init__(self, *a, **kw):
    """Set up the spider: arguments, XML writer, product lists and start URLs.

    Product lists come from the database when the ``database`` argument is
    truthy, otherwise from ``get_lists_from_excel``.  ``total`` is the
    number of start URLs.
    """
    super(BurtonSpider, self).__init__(*a, **kw)
    dispatcher.connect(self.spider_closed, signals.spider_closed)

    self.d = DatabaseTerminal(sys.argv, self.name).get_arguments()
    self.xml = CommonXml()
    self.exc = ZmagsException(5, "Burton")

    if not self.d['database']:
        self.get_lists_from_excel()
    else:
        self.database = Database()
        self.database.connect()
        selection = self.database.select_products(
            self.d['catalog_id'], self.d['product_id'])
        self.products, self.no_urls = selection
        self.database.disconnect()

    self.handle_not_provided()
    burton.add_properties(self.xml)

    self.start_urls = self.products['urls']
    # NOTE(review): the product URLs assigned above are immediately replaced
    # by this single hard-coded URL -- looks like a debugging leftover;
    # confirm before relying on it.
    self.start_urls = [
        "http://www.dickssportinggoods.com/product/index.jsp?productId=13243074"
    ]
    self.images_store = "/" + settings['IMAGES_STORE']
    self.total = len(self.start_urls)
def __init__(self, *a, **kw):
    """Set up the spider: arguments, XML writer, product lists and start URLs.

    Product lists come from the database when the ``database`` argument is
    truthy, otherwise from ``get_lists_from_excel``.  The product URLs
    become the start URLs and ``total`` is their count.
    """
    super(GuitarCenterSpider, self).__init__(*a, **kw)
    dispatcher.connect(self.spider_closed, signals.spider_closed)

    self.d = DatabaseTerminal(sys.argv, self.name).get_arguments()
    self.xml = CommonXml()
    self.exc = ZmagsException(5)

    if not self.d["database"]:
        self.get_lists_from_excel()
    else:
        connection = Database()
        self.database = connection
        connection.connect()
        self.products, self.no_urls = connection.select_products(
            self.d["catalog_id"], self.d["product_id"])
        connection.disconnect()

    self.add_properties(self.xml)
    self.handle_not_provided()

    self.start_urls = self.products["urls"]
    self.total = len(self.products["urls"])
def __init__(self, *a, **kw):
    """Set up the spider: arguments, XML writer and product lists.

    Product lists come from the database when the ``database`` argument is
    truthy, otherwise from ``get_lists_from_excel``.  ``total`` is the
    number of product ids in ``no_urls``.
    """
    super(ChomeSpider, self).__init__(*a, **kw)
    dispatcher.connect(self.spider_closed, signals.spider_closed)

    argument_reader = DatabaseTerminal(sys.argv, self.name)
    self.d = argument_reader.get_arguments()
    self.xml = CommonXml()
    self.exc = ZmagsException(5)

    from_database = self.d['database']
    if from_database:
        connection = Database()
        self.database = connection
        connection.connect()
        self.products, self.no_urls = connection.select_products(
            self.d['catalog_id'], self.d['product_id'])
        connection.disconnect()
    else:
        self.get_lists_from_excel()

    self.add_properties(self.xml)
    self.images_store = "/" + settings['IMAGES_STORE']
    self.total = len(self.no_urls['product_ids'])
def __init__(self, *a, **kw):
    """Set up the spider: arguments, XML writer, product lists and start URLs.

    Product lists come from the database when the ``database`` argument is
    truthy, otherwise from ``get_lists_from_excel``.  The product URLs
    become the start URLs and ``total`` is their count.
    """
    super(SportmanSpider, self).__init__(*a, **kw)
    dispatcher.connect(self.spider_closed, signals.spider_closed)

    self.d = DatabaseTerminal(sys.argv, self.name).get_arguments()
    self.xml = CommonXml()
    self.exc = ZmagsException(5, "Sportmann")

    if not self.d["database"]:
        self.get_lists_from_excel()
    else:
        self.database = Database()
        self.database.connect()
        selection = self.database.select_products(
            self.d["catalog_id"], self.d["product_id"])
        self.products, self.no_urls = selection
        self.database.disconnect()

    self.add_properties(self.xml)
    self.start_urls = self.products["urls"]
    self.images_store = "/" + settings["IMAGES_STORE"]
    self.total = len(self.start_urls)
def __init__(self, *a, **kw):
    """Set up the spider: arguments, XML writer, product lists and start URLs.

    Product lists come from the database when the ``database`` argument is
    truthy, otherwise from ``get_lists_from_excel``.  ``total`` is the
    number of start URLs.
    """
    super(BurtonSpider, self).__init__(*a, **kw)
    dispatcher.connect(self.spider_closed, signals.spider_closed)

    argument_reader = DatabaseTerminal(sys.argv, self.name)
    self.d = argument_reader.get_arguments()
    self.xml = CommonXml()
    self.exc = ZmagsException(5, "Burton")

    from_database = self.d['database']
    if from_database:
        connection = Database()
        self.database = connection
        connection.connect()
        self.products, self.no_urls = connection.select_products(
            self.d['catalog_id'], self.d['product_id'])
        connection.disconnect()
    else:
        self.get_lists_from_excel()

    self.handle_not_provided()
    burton.add_properties(self.xml)

    self.start_urls = self.products['urls']
    # NOTE(review): the product URLs assigned above are immediately replaced
    # by this single hard-coded URL -- looks like a debugging leftover;
    # confirm before relying on it.
    self.start_urls = ["http://www.dickssportinggoods.com/product/index.jsp?productId=13243074"]
    self.images_store = "/" + settings['IMAGES_STORE']
    self.total = len(self.start_urls)
def __init__(self, *a, **kw):
    """Set up the spider: arguments, XML writer, product lists and start URLs.

    Product lists come from the database when the ``database`` argument is
    truthy, otherwise from ``get_lists_from_excel``.  The product URLs are
    passed through ``basic.cut_string_field`` with "&cat=" before they
    become the start URLs; ``total`` is their count.
    """
    super(LydiasSpider, self).__init__(*a, **kw)
    dispatcher.connect(self.spider_closed, signals.spider_closed)

    self.d = DatabaseTerminal(sys.argv, self.name).get_arguments()
    self.xml = VariantsXml()
    self.exc = ZmagsException(5)

    if not self.d['database']:
        self.get_lists_from_excel()
    else:
        self.database = Database()
        self.database.connect()
        selection = self.database.select_products(
            self.d['catalog_id'], self.d['product_id'])
        self.products, self.no_urls = selection
        self.database.disconnect()

    # Fix for a bug with the links they provide.
    provided_urls = self.products['urls']
    self.products['urls'] = basic.cut_string_field(provided_urls, "&cat=")
    self.handle_not_provided()

    self.start_urls = self.products['urls']
    self.images_store = "/" + settings['IMAGES_STORE']
    lydias.add_properties(self.xml)
    self.total = len(self.products['urls'])
def __init__(self, *a, **kw):
    """Set up the spider: arguments, users, run options and product lists.

    Product lists come from the database when the ``database`` argument is
    truthy (followed by ``change_url_list``), otherwise from
    ``get_lists_from_excel``.  ``total`` is the number of product URLs.
    """
    super(PartyliteSpider, self).__init__(*a, **kw)
    dispatcher.connect(self.spider_closed, signals.spider_closed)

    self.d = PartyliteTerminal(sys.argv, self.name).get_arguments()
    self.images_store = "/" + settings['IMAGES_STORE']
    self.users = party.get_users(settings, self.d)
    self.exc = ZmagsException(50)

    # Run options copied from the parsed arguments.
    options = self.d
    self.production = options['env']
    self.upload = options['upload']
    self.english = options['lang']
    self.file_name = options['file']

    if not self.d['database']:
        self.get_lists_from_excel()
    else:
        self.database = Database()
        self.database.connect()
        selection = self.database.select_products(
            self.d['catalog_id'], self.d['product_id'])
        self.products, self.no_urls = selection
        self.database.disconnect()
        self.change_url_list()

    self.xml = CommonXml()
    party.add_properties(self.xml)
    self.total = len(self.products['urls'])
class SportmanSpider(CrawlSpider): name = "sportman" allowed_domains = ["example.com"] start_urls = ["http://www.example.com"] counter = 0 def __init__(self, *a, **kw): super(SportmanSpider, self).__init__(*a, **kw) dispatcher.connect(self.spider_closed, signals.spider_closed) terminal = DatabaseTerminal(sys.argv, self.name) self.d = terminal.get_arguments() self.xml = CommonXml() self.exc = ZmagsException(5, "Sportmann") if self.d["database"]: self.database = Database() self.database.connect() self.products, self.no_urls = self.database.select_products(self.d["catalog_id"], self.d["product_id"]) self.database.disconnect() else: self.get_lists_from_excel() self.add_properties(self.xml) self.start_urls = self.products["urls"] self.images_store = "/" + settings["IMAGES_STORE"] self.total = len(self.start_urls) def parse(self, response): self.counter += 1 basic.print_status(self.counter, self.total) hxs = HtmlXPathSelector(response) item = SportmanItem() if "redirect_urls" in response.request.meta: cur_url = response.request.meta["redirect_urls"][0] else: cur_url = response.url index = self.products["urls"].index(cur_url) try: if "redirect_urls" in response.request.meta: item["product_id"] = [self.products["product_ids"][index]] item["name"] = [self.products["names"][index]] item["in_stock"] = ["NOT_AVAILABLE"] self.exc.code_handler(102, response.url) self.xml.create_xml(item) self.products["status"][index] = "no_avail" else: item["name"], item["short_desc"], item["description"], item["old_price"], item["custom_price"], item[ "product_id" ], item["sku"] = self.get_basic_info(hxs) item["in_stock"] = ["IN_STOCK"] viewstate, eventval, prevpage, hidden, view_page, even_page, pre_page, hidd_page = self.get_vars( response, hxs ) viewstate1 = viewstate[:2000] viewstate2 = viewstate[2000:4000] viewstate3 = viewstate[4000:6000] viewstate4 = viewstate[6000:8000] viewstate5 = viewstate[8000:10000] viewstate6 = viewstate[10000:] item["viewstate1"] = [basic.cdata(viewstate1)] 
item["viewstate2"] = [basic.cdata(viewstate2)] item["viewstate3"] = [basic.cdata(viewstate3)] item["viewstate4"] = [basic.cdata(viewstate4)] item["viewstate5"] = [basic.cdata(viewstate5)] item["viewstate6"] = [basic.cdata(viewstate6)] item["eventval"] = [basic.cdata(eventval)] item["size_options"] = self.get_variants(hxs, response) images_url = self.get_images(hxs) item["normal_image_url"] = self.get_server_path(images_url) self.xml.create_xml(item) item.clear() item["image_urls"] = self.get_images(hxs) self.products["status"][index] = "ran" except: self.exc.code_handler(100, response.url) self.products["status"][index] = "error" else: return item def get_basic_info(self, hxs): name = hxs.select('//div[@id="fragment-1"]/h2/text()').extract() short_desc = hxs.select('//div[@class="description2"]/text()').extract() description = hxs.select('//div[@id="fragment-1"]/div[@class="description"]').extract() description = sportman.delete_tags(re, description[0]) description = [basic.cdata(description)] old_price = hxs.select('//span[@class="oldprice"]/text()').extract() if old_price != []: old_price = " ".join(old_price) old_price = old_price.split(":") old_price = old_price[1].replace("Kr", "") old_price = [old_price.replace(" ", "")] else: old_price = old_price price = hxs.select('//span[@class="nowprice"]/text()').extract() if price != []: price = " ".join(price) price = price.split(":") price = price[1].replace("Kr", "") price = [price.replace(" ", "")] else: price = hxs.select('//span[@class="normalprice"]/text()').extract() price = " ".join(price) price = price.split(":") price = price[1].replace("Kr", "") price = [price.replace(" ", "")] id = hxs.select('//div[@class="articlenumber"]').extract() id = " ".join(id) id = id.replace(u"\xa0", "") id = basic.get_middle_text(id, "Art.nr.", "</div>") sku = id id = [id[0]] return name, short_desc, description, old_price, price, id, sku def get_vars(self, response, hxs): headers1 = { "User-Agent": "Mozilla/5.0 (Windows NT 5.1; 
rv:13.0) Gecko/20100101 Firefox/13.0.1", "Host": "www.sportmann.no", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "en-us,en;q=0.5", "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.7", "Connection": "keep-alive", "Referer": "/product.aspx?productid=613232", "Cookie": "ASP.NET_SessionId=lurvsvrn3jxsfd45cedmsv45; Besok=922884e3-e9cb-4b69-b8c8-215f3cc988a9; __utma=184084580.1353376623.1312483243.1312483243.1312483243.1; __utmb=184084580.9.10.1312483243; __utmc=184084580; __utmz=184084580.1312483243.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)", } page = hxs.select("//html").extract() page = " ".join(page) viewst = basic.get_middle_text(page, 'id="__VIEWSTATE" value="', '"') eventval = basic.get_middle_text(page, 'id="__EVENTVALIDATION" value="', '"') prevpage = [""] hidden_field = [""] r = requests.get(response.url, headers=headers1) page_one = r.content viewst_page = basic.get_middle_text(page_one, 'id="__VIEWSTATE" value="', '"') eventval_page = basic.get_middle_text(page_one, 'id="__EVENTVALIDATION" value="', '"') prevpage_page = basic.get_middle_text(page_one, 'id="__PREVIOUSPAGE" value="', '"') hidden_temp = page_one.split('id="__VIEWSTATE"') hidden_temp = hidden_temp[1].split('id="__PREVIOUSPAGE"') hidden_temp = hidden_temp[0].split("<script sr") val_x = len(hidden_temp) - 1 hidden_temp = basic.get_middle_text(hidden_temp[val_x], 'c="', '"') hidden_temp_val = hidden_temp[0] hidden_temp_val = hidden_temp_val.replace("amp;", "") hidden_url = "http://www.sportmann.no" + hidden_temp_val request_hidden = urllib2.Request(hidden_url) response_hidden = urllib2.urlopen(request_hidden) hidden_field_page = basic.get_middle_text( response_hidden.read(), "ctl00_ScriptManager1_HiddenField').value += '", "';" ) return ( viewst[0], eventval[0], prevpage[0], hidden_field[0], viewst_page[0], eventval_page[0], prevpage_page[0], hidden_field_page[0], ) def get_variants(self, hxs, response): page = hxs.select("//html").extract() 
page = " ".join(page) dict_one = {} test_one = [] temp = page.split('<div class="color">') temp = temp[1].split("</div>") temp = temp[0].split("<select name") viewstate, eventvalidation, previouspage, hiddenfield, view_page, even_page, pre_page, hidd_page = self.get_vars( response, hxs ) if len(temp) == 1: color = hxs.select('//div[@class="color"]/text()').extract() value = hxs.select('//input[@id="ctl00_ContentPlaceHolder1_Variant1Hidden"]/@value').extract() color[0] = color[0].replace(" ", "") color = basic.clean_string(color[0]) value = value[0] # color = basic.clean_string(color[0]) # color = color.replace(" ","") # # dict['color'] = color # dict['color_value'] = value[0] else: test_color = basic.get_middle_text(temp[1], "farge</option>", "</select>") color = basic.get_middle_text(test_color[0], '">', "</option>") value = basic.get_middle_text(test_color[0], 'value="', '">') for i in range(0, len(color)): color[i] = color[i].replace(" ", "") # # dict['color'] = color # dict['color_value'] = value size_temp = page.split('<div class="size">') size_temp = size_temp[1].split("</div>") size_temp = size_temp[0].split("<select name") if len(size_temp) == 1: size = hxs.select('//div[@class="size"]/text()').extract() size = basic.clean_string(size[0]) size = [size.replace(" ", "")] size_val = hxs.select('//input[@id="ctl00_ContentPlaceHolder1_Variant2Hidden"]/@value').extract() if size[0] == "": for i in range(len(value)): resp_page = self.get_data(response, hidd_page, view_page, pre_page, even_page, value[i]) a_page = resp_page.split('<div class="siz') a_page = a_page[1].split("</select>") if len(a_page) == 1: size = basic.get_middle_text(a_page[0], 'e">', '<input type="hidden"') size_val = basic.get_middle_text(a_page[0], 'value="', '"') size_val = size_val[0] size_val = [size_val] else: a_page = basic.get_middle_text(a_page[0], "se</option>", "</select>") size = basic.get_middle_text(a_page[0], '">', "</option>") size_val = basic.get_middle_text(a_page[0], 'value="', 
'">') dict_one["color"] = color[i] dict_one["color_value"] = value[i] dict_one["size_value"] = size_val for x in range(0, len(size)): size[x] = basic.clean_string(size[x]) size[x] = size[x].replace(" ", "") dict_one["size"] = size test_one.append(basic.cdata(json.dumps(dict_one))) else: dict_one["color"] = color dict_one["color_value"] = value dict_one["size"] = size dict_one["size_value"] = size_val test_one.append(basic.cdata(simplejson.dumps(dict_one))) else: test_size = basic.get_middle_text(size_temp[1], "se</option>", "</select>") size = basic.get_middle_text(test_size[0], '">', "</option>") size_val = basic.get_middle_text(test_size[0], 'value="', '">') for x in range(0, len(size)): size[x] = basic.clean_string(size[x]) size[x] = size[x].replace(" ", "") dict_one["color"] = color dict_one["color_value"] = value dict_one["size"] = size dict_one["size_value"] = size_val test_one.append(basic.cdata(json.dumps(dict_one))) return test_one def get_server_path(self, url): images_array = [] for i in range(0, len(url)): url[i] = basic.clean_string(url[i]) images_array.append(self.images_store + "/full/" + hashlib.sha1(url[i]).hexdigest() + ".jpg") return images_array def get_images(self, hxs): page = hxs.select("//html").extract() page = " ".join(page) images = [] temp = page.split('class="gallery_demo_unstyled"') temp = temp[1].split('<div class="right_container">') temp = basic.get_middle_text(temp[0], 'src="', '"') for i in range(0, len(temp)): image_url = "http://www.sportmann.no" + temp[i] images.append(image_url) return images def get_data(self, response, hidden, viewstate, previouspage, eventvalidation, colorvalue): headers = { "User-Agent": "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0", "Host": "www.sportmann.no", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "en-us,en;q=0.5", "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.7", "Connection": "keep-alive", "Referer": 
"http://www.sportmann.no/product.aspx?productid=613232", "Cookie": "", } eventvalidation = urllib.urlencode({"__EVENTVALIDATION": eventvalidation}) viewstate = urllib.urlencode({"__VIEWSTATE": viewstate}) previouspage = urllib.urlencode({"__PREVIOUSPAGE": previouspage}) hidden = urllib.urlencode({"ctl00_ScriptManager1_HiddenField": hidden}) data = ( "ctl00%24ScriptManager1=ctl00%24ContentPlaceHolder1%24dropdownPanel%7Cctl00%24ContentPlaceHolder1%24ddlVariant&" + hidden + "%3B%3BAjaxControlToolkit%2C%20Version%3D3.0.20820.16598%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D28f01b0e84b6d53e%3Aen-US%3A707835dd-fa4b-41d1-89e7-6df5d518ffb5%3Ae2e86ef9%3A1df13a87%3A8ccd9c1b%3A9ea3f0e2%3A9e8e87e9%3A4c9865be%3Aba594826%3A757f92c2%3Ac7c04611%3Acd120801%3Ac4c00916%3A3858419b%3A96741c43%3A38ec41c0%3B%3BAjaxControlToolkit%2C%20Version%3D3.0.20820.16598%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D28f01b0e84b6d53e%3Aen-US%3A707835dd-fa4b-41d1-89e7-6df5d518ffb5%3Ae2e86ef9%3A1df13a87%3A8ccd9c1b%3A9ea3f0e2%3A9e8e87e9%3A4c9865be%3Aba594826%3A757f92c2%3Ac7c04611%3Acd120801%3Ac4c00916%3A3858419b%3A96741c43%3A38ec41c0%3B%3BAjaxControlToolkit%2C%20Version%3D3.0.20820.16598%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D28f01b0e84b6d53e%3Aen-US%3A707835dd-fa4b-41d1-89e7-6df5d518ffb5%3Ae2e86ef9%3A1df13a87%3A8ccd9c1b%3A9ea3f0e2%3A9e8e87e9%3A4c9865be%3Aba594826%3A757f92c2%3Ac7c04611%3Acd120801%3Ac4c00916%3A3858419b%3A96741c43%3A38ec41c0&__EVENTTARGET=ctl00%24ContentPlaceHolder1%24ddlVariant&__EVENTARGUMENT=&__LASTFOCUS=&" + viewstate + "&" + previouspage + "&" + eventvalidation + "&ctl00%24ProductSearch%24txtProdSearch=&ctl00%24ProductSearch%24TextBoxWatermarkProdSearch_ClientState=&ctl00%24ContentPlaceHolder1%24ddlVariant=" + colorvalue + 
"&ctl00%24ContentPlaceHolder1%24Variant1Hidden=&ctl00%24ContentPlaceHolder1%24Variant2Hidden=&ctl00%24ContentPlaceHolder1%24tbAmount=1&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtFriendsName=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceFriendsName_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtFriendsEmail=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceFriendsEmail_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtYourName=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceYourName_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtYourEmail=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceYourEmail_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtComment=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceComment_ClientState=&__ASYNCPOST=true&" ) # r = requests.get(response.url, h) req = urllib2.Request(response.url, data, headers) resp_page = urllib2.urlopen(req).read() return resp_page def spider_closed(self, spider): """Handles spider_closed signal from end of scraping. Handles usual end operations for scraper like writing xml, exporting to database and sending appropriate mail message.""" msg = "Ran: {0}".format(datetime.now()) if self.counter < self.total: msg += "\nScraper didn't go through all products, please report" msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total) # filename for writing xml if self.d["database"]: try: self.database.connect() filename = self.database.get_name(self.d["catalog_id"]) self.database.update_db(self.products) self.database.disconnect() msg += "\nRan from interface.\n" except: msg += "\nUpdating database failed, please report." 
else: msg += "\nRan from console.\n" filename = self.d["file"] self.xml.write_xml(self.name, filename) msg += self.exc.create_message(self.counter) if self.d["upload"]: exp = CommonExport() try: exp.xml_to_db(self.name, filename, "1ccd39a5-af4e-47cc-aebe-e0dede5b14d8") msg += "\n\nExport to database successful" except StandardError: msg += "\n\nExport to database failed" else: msg += "\n\nUpload to database not selected" from modules.mail import Mail mail = Mail() try: mail.send_mail(msg, "Sportmann: {0}".format(filename)) if self.d["email"]: mail.send_mail(msg, "Sportmann: {0}".format(filename), self.d["email"]) except: msg += "\nSending mail failed." if self.d["database"]: path = "logs/{0}".format(self.name) if not os.path.exists(path): os.makedirs(path) with open("{0}/{1}".format(path, filename), "w") as f: f.write(msg) def get_lists_from_excel(self): xls = DictExcel(basic.get_excel_path(self.name, self.d["file"])) self.products = dict() try: self.products["urls"] = xls.read_excel_collumn_for_urls(3, 15) self.products["product_ids"] = xls.read_excel_collumn_for_ids(1, 15) self.products["names"] = xls.read_excel_collumn(2, 15) except IOError as e: msg = "I/O error {0}: {1}".format(e.errno, e.strerror) msg += "\nError occurred for given file: {0}".format(self.d["file"]) self.exc.code_handler(103, msg=msg) except StandardError: msg = "Error reading excel file" msg += "\nError occurred for given file: {0}".format(self.d["file"]) self.exc.code_handler(103, msg=msg) self.products = xls.delete_duplicates_dict(self.products) self.products, self.no_urls = xls.separate_no_urls(self.products) self.products = xls._add_none_status(self.products) self.no_urls = xls._add_none_status(self.no_urls) def add_properties(self, xml): xml.add_property("short_desc", "Short Description", "text") xml.add_property("old_price", "Old Price", "text") xml.add_property("custom_price", "New Price", "text") xml.add_property("color_value", "Color Value", "text") xml.add_property("in_stock", "In 
Stock", "text") xml.add_property("size_val", "Size Value", "text_list") xml.add_property("sku", "Sku", "text") xml.add_property("size_options", "Size_options", "text_list") xml.add_property("viewstate1", "Viewstate1", "text_list") xml.add_property("viewstate2", "Viewstate2", "text_list") xml.add_property("viewstate3", "Viewstate3", "text_list") xml.add_property("viewstate4", "Viewstate4", "text_list") xml.add_property("viewstate5", "Viewstate5", "text_list") xml.add_property("viewstate6", "Viewstate6", "text_list") xml.add_property("eventval", "Eventval", "text_list") xml.add_property("hidden", "Hidden Field", "text_list") xml.add_property("prevpage", "Previous Page", "text_list") xml.add_property("recommended_product", "Recommended Product", "text_list")
def save_to_db(self):
    """Insert the result of ``self.json()`` into the "users" collection."""
    document = self.json()
    Database.insert(collection="users", data=document)
import json import time from threading import Thread from modules.api import API from modules.database import Database from modules.config import * import os.path import sys database = Database(db_host, db_user, db_pass, db_name, db_autocommit) database.database_connection() class PlaySound: def __init__(self, user, cost): self.user = user self.api = API(1) self.cost = cost def playsound(self, sound): if(database.db_get_user_points_int(self.user) > self.cost): self.add_sound_to_queue(sound, self.get_filepath()) database.db_minus_points_user(self.user, self.cost) # self.start_timer() PlaySoundTimerRun() return "{} just spent {} points on an audio clip!".format(self.user, self.cost) else: return "{}, you don't have enough points FailFish".format(self.user) def get_filepath(self): basepath = os.path.dirname(__file__)
def find_user_data(email):
    """Return what ``Database.find_one`` yields for ``email`` in "users"."""
    user_query = {"email": email}
    found = Database.find_one(collection="users", query=user_query)
    return found
def update_user_email(old_email, email):
    """Set the email of the "users" entry matching ``old_email`` to ``email``."""
    selector = {"email": old_email}
    change = {"$set": {"email": email}}
    Database.update(collection="users", query=selector, data=change)
def save_to_db(self):
    """Insert the result of ``self.json()`` into the "all_alert" collection."""
    document = self.json()
    Database.insert(collection="all_alert", data=document)
class GuitarCenterSpider(CrawlSpider):
    """Spider scraping the product pages listed in the database or an excel file.

    The list of products comes either from ``Database.select_products`` or
    from an excel sheet, depending on the ``database`` command line argument.
    Every scraped product is written through ``self.xml`` and its outcome is
    recorded in ``self.products["status"]``.
    """

    name = "guitar_center"
    allowed_domains = ["musiciansfriend.com"]
    start_urls = ["http://www.musiciansfriend.com"]
    counter = 0

    def __init__(self, *a, **kw):
        """Read the arguments, load the product lists and prepare the xml."""
        super(GuitarCenterSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5)
        if self.d["database"]:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d["catalog_id"], self.d["product_id"])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        self.handle_not_provided()
        self.start_urls = self.products["urls"]
        self.total = len(self.products["urls"])

    def parse(self, response):
        """Scrape one product page and return the filled item.

        The product is looked up by the url that was originally requested
        (the first entry of ``redirect_urls`` when the request was
        redirected). On a ``StandardError`` the product status is set to
        "error" and nothing is written to the xml for it.
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = GuitarCenterItem()
        if "redirect_urls" in response.request.meta:
            cur_url = response.request.meta["redirect_urls"][0]
        else:
            cur_url = response.url
        index = self.products["urls"].index(cur_url)
        try:
            item["product_id"] = [self.products["product_ids"][index]]
            item["name"], item["brand"] = self.get_basic_info(hxs)
            (item["heading"], item["details"], item["specs"],
             item["call_to_action"]) = self.get_description(hxs)
            (item["brand_image"], item["brand_image_promo"],
             brand_images) = self.get_description_images(hxs)
            item["old_price"], item["discount"], item["price"] = self.get_prices(hxs)
            item["image_json"], img = self.get_images(hxs)
            item["serial"] = self.get_serials(hxs)
            item["warranty"] = self.gold_coverage(hxs)
            item["in_stock"] = self.get_available(hxs)
            item["product_ref"], item["add_to_cart_id"] = self.get_add_to_cart(hxs)
            # Without an add to cart id the product cannot be bought.
            if not item["add_to_cart_id"]:
                item["in_stock"] = ["NOT_AVAILABLE"]
            item["shipping"] = self.get_shipping(hxs)
            item["colors"] = self.get_colors(hxs)
            self.products["status"][index] = "ran"
        except StandardError:
            self.products["status"][index] = "error"
            self.exc.code_handler(100, response.url)
        else:
            self.xml.create_xml(item)
            item["image_urls"] = img + brand_images
        return item

    def handle_not_provided(self):
        """Write every product that has no url to the xml as NOT_AVAILABLE."""
        names = self.no_urls["names"]
        # enumerate keeps id and name aligned even when an id occurs twice
        # and avoids a list.index() scan per product.
        for index, product_id in enumerate(self.no_urls["product_ids"]):
            item = GuitarCenterItem()
            item["product_id"] = [product_id]
            item["name"] = [names[index]]
            item["in_stock"] = ["NOT_AVAILABLE"]
            self.xml.create_xml(item)

    def get_basic_info(self, hxs):
        """Return the product name (non-breaking spaces removed) and brand."""
        name = hxs.select('//h1[@class="fn"]/text()').extract()
        name = basic.clean_string("".join(name))
        brand = hxs.select('//span[@class="brand"]/text()').extract()
        return [name.replace(u"\xa0", "")], brand

    def get_description_images(self, hxs):
        """Return brand image path, brand promo image path and their source urls."""
        brand_image = hxs.select('//a[@class="brandImage"]/img/@src').extract()
        brand_image_promo = hxs.select('//div[@class="brandPromoLogo"]/img/@src').extract()
        images = brand_image + brand_image_promo
        if brand_image:
            brand_image = [self.get_server_path(brand_image[0])]
        if brand_image_promo:
            brand_image_promo = [self.get_server_path(brand_image_promo[0])]
        return brand_image, brand_image_promo, images

    def get_description(self, hxs):
        """Return heading, details, specs and call to action as cdata fields."""
        heading = hxs.select('//div[@id="description"]/p').extract()
        details = hxs.select('//p[@class="description"]').extract()
        specs = hxs.select('//div[@class="specs"]/ul').extract()
        last = hxs.select('//div[@class="callToAction"]/p/text()').extract()
        return (basic.cdata_field(heading), basic.cdata_field(details),
                basic.cdata_field(specs), basic.cdata_field(last))

    def get_prices(self, hxs):
        """Return cleaned ``(old_price, discount, price)`` lists.

        A list is empty when the page offers no value for it. The new price
        is the one reported as ``discount``.
        """
        tag = hxs.select('//dl[@class="lineItemList"]/dt/text()').extract()
        value = hxs.select('//dl[@class="lineItemList"]/dd/text()').extract()
        old_price = []
        discount = []
        if len(tag) > 1:
            old_price = [basic.clean_string(value[0])]
        try:
            discount = [basic.clean_string(value[-1])]
        except IndexError:
            print("This product has no price.")
        # extract() returns a (possibly empty) list, so no IndexError can
        # happen here and no handler is needed.
        price = hxs.select('//span[@class="topAlignedPrice"]/text()').extract()
        if not old_price and not discount and not price:
            price = hxs.select('//dl[@class="inlineList"]/dd/text()').extract()
        return self.clean_price(old_price), self.clean_price(discount), self.clean_price(price)

    def get_images(self, hxs):
        """Return ``(images_list, img)`` for the product thumbnails.

        ``images_list`` holds one cdata wrapped json per thumbnail with the
        image path and the serial of the product the image refers to;
        ``img`` holds the thumbnail urls as found on the page.
        """
        images = hxs.select('//ul[@id="prodDetailThumbs"]/li/a/@href').extract()
        tags = hxs.select('//ul[@id="prodDetailThumbs"]/li/@class').extract()
        images_list = []
        img = []
        for i, image in enumerate(images):
            d = {"image_url": self.get_server_path(image)}
            img.append(image)
            # For tags without the prefix replace() is a no-op, so a single
            # call covers both cases.
            d["product_serial"] = tags[i].replace("site1sku", "")
            images_list.append(basic.cdata(simplejson.dumps(d)))
        return images_list, img

    def get_serials(self, hxs):
        """Return the serial information found on the page.

        Currently a list of cdata wrapped jsons; can be changed to return
        dicts if subproducts are ever built from them.
        """
        serials = hxs.select('//var[@class="hidden styleInfo"]/text()').extract()
        return [basic.cdata(simplejson.dumps(simplejson.loads(serial)))
                for serial in serials]

    def get_server_path(self, url):
        """Return the image path that is stored for ``url``.

        The absolute url from the site is kept as it is. To point at the
        stored copy instead, return
        ``IMAGES_STORE + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"``.
        """
        return url

    def gold_coverage(self, hxs):
        """Return the gold coverage (additional warranty) options as cdata jsons."""
        ids = hxs.select('//div[@class="goldCoverage"]/input[@type="checkbox"]/@value').extract()
        labels = hxs.select('//div[@class="goldCoverage"]/label/text()').extract()
        new = []
        for i, coverage_id in enumerate(ids):
            d = {"id": coverage_id, "name": labels[i]}
            new.append(basic.cdata(simplejson.dumps(d)))
        return new

    def get_available(self, hxs):
        """Return the availability of the product as a one element list."""
        p = hxs.select('//var[@class="hidden availability"]/text()').extract()
        if p:
            if p[0] == "in_stock":
                p = [p[0].upper()]
        else:
            # Products with color options carry the stock status per option
            # and have no status of their own on the page, so IN_STOCK is
            # used for the product itself.
            p = ["IN_STOCK"]
        return p

    def get_add_to_cart(self, hxs):
        """Return ``([product reference], [add to cart id])``.

        Both lists are empty when the page has no add to list link.
        """
        links = hxs.select('//span[@class="magicLink addToList"]/@data-rel').extract()
        if not links:
            print("Product not available")
            return [], []
        parts = links[0].split("|")
        return [parts[0]], [parts[1]]

    def get_shipping(self, hxs):
        """Return the shipping information texts."""
        return hxs.select('//div[@id="targeter_pdpShipping"]/span/text()').extract()

    def get_colors(self, hxs):
        """Return cdata wrapped jsons with all the data about color options."""
        colors = hxs.select('//var[@class="styleInfo"]/text()').extract()
        return [basic.cdata(simplejson.dumps(simplejson.loads(color)))
                for color in colors]

    def clean_price(self, price):
        """Strip everything except digits and dots from every price string."""
        return [re.sub("[^0-9.]", "", i) for i in price]

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml,
        exporting to database and sending appropriate mail message."""
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml; falls back to the spider name so the
        # xml, mail and log are still produced when the database lookup
        # below fails (it used to be left undefined in that case).
        filename = self.name
        if self.d["database"]:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d["catalog_id"])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except Exception:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d["file"]
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d["upload"]:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename, "4a9f5955-9b8e-4e13-84ef-95f937dbc00d")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "GuitarCenter: {0}".format(filename))
            if self.d["email"]:
                mail.send_mail(msg, "GuitarCenter: {0}".format(filename), self.d["email"])
        except Exception:
            msg += "\nSending mail failed."
        if self.d["database"]:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), "w") as f:
                f.write(msg)

    def add_properties(self, xml):
        """Register the custom properties of this scraper on ``xml``."""
        xml.add_property("old_price", "Old Price", "decimal")
        xml.add_property("image_json", "Image Json", "text_list")
        xml.add_property("discount", "Discount", "decimal")
        xml.add_property("product_ref", "Product Ref.", "text")
        xml.add_property("in_stock", "In Stock", "text")
        xml.add_property("serial", "Serial", "text_list")
        xml.add_property("colors", "Colors", "text_list")
        xml.add_property("add_to_cart_id", "Add To Cart ID", "text")
        xml.add_property("shipping", "Shipping", "text")
        xml.add_property("warranty", "Warranty", "text_list")
        xml.add_property("heading", "Heading", "text")
        xml.add_property("details", "Details", "text")
        xml.add_property("specs", "Specs", "text")
        xml.add_property("call_to_action", "Call To Action", "text")
        xml.add_property("brand_image", "Brand Image", "text")
        xml.add_property("brand_image_promo", "Brand Image Promo", "text")

    def get_lists_from_excel(self):
        """Fill ``self.products`` and ``self.no_urls`` from the excel file.

        Read errors are reported through ``self.exc.code_handler`` with
        code 103.
        """
        xls = DictExcel(basic.get_excel_path(self.name, self.d["file"]))
        self.products = dict()
        try:
            self.products["urls"] = xls.read_excel_collumn_for_urls(3, 15)
            self.products["product_ids"] = xls.read_excel_collumn_for_ids(1, 15)
            self.products["names"] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d["file"])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d["file"])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)
def find_user_alert(email, rate_exchange):
    """Return what ``Database.find`` yields in "all_alert" for the given
    email and rate exchange."""
    alert_query = {"email": email, "rate_exchange": rate_exchange}
    found = Database.find(collection="all_alert", query=alert_query)
    return found
def delete_user_alert(email, currency, rate_exchange):
    """Remove from "all_alert" what matches email, currency and rate exchange."""
    alert_query = {
        "email": email,
        "currency": currency,
        "rate_exchange": rate_exchange,
    }
    Database.remove(collection="all_alert", query=alert_query)
def update_user_alert(email, currency, rate_exchange, price):
    """Set ``price`` on the "all_alert" entry matching email, currency and
    rate exchange."""
    alert_query = {
        "email": email,
        "currency": currency,
        "rate_exchange": rate_exchange,
    }
    change = {"$set": {"price": price}}
    Database.update(collection="all_alert", query=alert_query, data=change)
class PartyliteSpider(CrawlSpider):
    """Spider scraping partylite product pages for the selected language.

    The list of products comes either from ``Database.select_products`` or
    from an excel sheet, depending on the ``database`` command line argument.
    """

    name = "partylite"
    allowed_domains = ["partylite.biz"]
    start_urls = ["http://www.zmags.com"]
    counter = 0

    def __init__(self, *a, **kw):
        """Read the arguments, load the product lists and prepare the xml."""
        super(PartyliteSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = PartyliteTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.images_store = "/" + settings['IMAGES_STORE']
        self.users = party.get_users(settings, self.d)
        self.exc = ZmagsException(50)
        self.production = self.d['env']
        self.upload = self.d['upload']
        self.english = self.d['lang']
        self.file_name = self.d['file']
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
            self.change_url_list()
        else:
            self.get_lists_from_excel()
        self.xml = CommonXml()
        party.add_properties(self.xml)
        self.total = len(self.products['urls'])

    def parse(self, response):
        """Yield one request per product url for the selected language.

        For the canadian languages the us user in the url is replaced by the
        matching canadian one and the language is passed along in the
        request meta.
        """
        for url in self.products['urls']:
            if self.d['lang'] == 'us':
                request = Request(url, callback=self.parse_can, dont_filter=True)
                yield request
            elif self.d['lang'] == 'english':
                c_url = url.replace(self.users['us'], self.users['canada_en'])
                request = Request(c_url, callback=self.parse_can, dont_filter=True)
                request.meta['language'] = "eng"
                yield request
            elif self.d['lang'] == 'french':
                c_url = url.replace(self.users['us'], self.users['canada_fr'])
                request = Request(c_url, callback=self.parse_can, dont_filter=True)
                request.meta['language'] = "fr"
                yield request

    def change_url_list(self):
        """Point the urls at qa when not in production and fill in the us user."""
        urls = self.products['urls']
        for i, url in enumerate(urls):
            if not self.production:
                url = url.replace('www', 'qa')
            urls[i] = url.replace('XXXXX', self.users['us'])

    def get_in_stock(self, hxs):
        """Gets in stock information about product."""
        stock = hxs.select('//div[@id="availability_container"]').extract()
        if not stock:
            return ["IN_STOCK"]
        return ["NOT_IN_STOCK"]

    def get_basic_info(self, hxs):
        """Getting basic info about products (name, shown with)."""
        name = hxs.select('//div[@id="product_name"]/text()').extract()
        if name:
            name = basic.cdata_field(name)
        shown_with = hxs.select('//div[@id="shown_with_container"]').extract()
        if shown_with:
            shown_with = [basic.cdata(shown_with[0])]
        return name, shown_with

    def get_description(self, hxs):
        """Return the item description without tags, wrapped in cdata."""
        description = hxs.select('//div[@id="item_description"]').extract()
        description = basic.cdata(basic.remove_tags(description[0]))
        # U+2044 (fraction slash) is replaced by a plain slash.
        return [description.replace(u"\u2044", "/")]

    def get_price(self, hxs):
        """Getting product prices.

        Gets regular and discount price if there is one."""
        price = hxs.select('//span[@id="divUnitPrice"]/text()').extract()
        if not price:
            price = hxs.select('//div[@id="product_price"]/span[1]/text()').extract()
        if not price:
            price = hxs.select('//div[@id="product_price"]/text()').extract()
        discount = hxs.select('//div[@id="product_price"]/span[@class="pc-salePrice"]/text()').extract()
        price = basic.clean_string(price[0])
        price = re.sub(" +", " ", price)
        price = price.replace("Price:", "")
        price = price.replace("Prix:", "")
        price = basic.cdata(price.strip())
        if discount:
            discount = basic.cdata_field(discount)
        return [price], discount

    def get_add_to_cart_id(self, page):
        """Gets add to cart id from the javascript on the page."""
        tmp = basic.get_middle_text(page, "if(isOrderStarted){", "}else")[0]
        tmp = basic.get_middle_text(tmp, "addItemToCart(", ",")
        return tmp

    def create_subproducts(self, page):
        """Gets information about colors from javascript.

        Returns field of dicts with information about colors.
        Those are really color variants for product."""
        try:
            tmp = page.split("var largeImages = new Array();")[1]
        except IndexError:
            print("This product has no images")
            return []
        tmp = tmp.split("colorDropdownArray")[0]
        images = basic.get_middle_text(tmp, "ProductGroupProduct(", ");")
        image_names = self.get_image_names(page)
        color_products = []
        for im in images:
            product = {}
            attributes = im.split("',")
            product['normal_image_url'] = "http://qa.partylite.biz/imaging/resize?fileName=/productcatalog/production"
            product['normal_image_url'] += self.custom_clean_string(attributes[26], True)
            product['description'] = basic.cdata(self.custom_clean_string(attributes[27]))
            product['color_id'] = self.custom_clean_string(attributes[7], True)
            product['swatch_color'] = basic.cdata(self.custom_clean_string(attributes[9]).replace(" ", ""))
            product['name'] = basic.cdata(image_names[product['color_id']])
            product['add_to_cart_id'] = self.custom_clean_string(attributes[0], True).replace(" ", "")
            product['price'] = self.custom_clean_string(attributes[10], True)
            color_products.append(product)
        return color_products

    def custom_clean_string(self, string, spaces=False):
        """Custom function for cleaning strings.

        Replaces new line, return and tab signs and single quotes. Multiple
        spaces are replaced with only one, or all spaces are removed when
        ``spaces`` is true."""
        string = string.replace("\r", "")
        string = string.replace("\n", "")
        string = string.replace("\t", "")
        if not spaces:
            string = re.sub(' +', ' ', string)
        else:
            string = re.sub(' ', '', string)
        string = string.replace("'", "")
        return string

    def get_image_names(self, page):
        """Gets color names for color swatches."""
        temp = page.split("new DropDownInfo")
        names = {}
        # The part before the first "new DropDownInfo" holds no swatch.
        for part in temp[1:]:
            key = basic.get_middle_text(part, "('", "'")[0]
            names[key] = basic.get_middle_text(part, "'", "')")[2]
        return names

    def get_recommended(self, hxs):
        """Gets recommended product information.

        Returns information about recommended products as cdata jsons."""
        rec = hxs.select('//div[@id="right_column_container"]/div')
        new = []
        for i, r in enumerate(rec):
            d = {}
            #to do: see how to get full href(different accounts)
            if not i:
                d['link'] = r.select('div/a/@href').extract()[0]
                d['image'] = "http://www.partylite.biz/imaging/resize"
                d['image'] += r.select('div/a/img/@src').extract()[0]
                d['name'] = r.select('div/a/text()').extract()[0]
                new.append(basic.cdata(simplejson.dumps(d)))
        return new

    def get_reviews(self, page):
        """Gets average product rating.

        Returns string like 4.6 of 5 reviews."""
        review_id = self.get_review_id(page)
        url = "http://partylite.ugc.bazaarvoice.com/8504-en_us/" + review_id + "/reviews.djs?format=embeddedhtml"
        url = url.replace(" ", "")
        # Close the connection explicitly instead of leaving it to the
        # garbage collector.
        response = urllib2.urlopen(url)
        try:
            page = response.read()
        finally:
            response.close()
        page = basic.get_middle_text(page, '<div class=\\"BVRRRatingNormalImage\\">', '<\\/div>')
        if page:
            rating = basic.get_middle_text(page[0], 'alt=\\"', '\\"')[0]
            return [rating]
        return []

    def get_more_images(self, page):
        """Gets field of images."""
        try:
            script = basic.get_middle_text(page, "var moreImages", "var numberOfImages")[0]
        except IndexError:
            print("This product has no images.")
            return []
        r = basic.get_middle_text(script, "moreImages[", "';")
        if self.production:
            host = "http://www.partylite.biz"
        else:
            host = "http://qa.partylite.biz"
        # return cdata here if needed to go with absolute links
        return [host + entry.split("= '")[1] for entry in r]

    def get_absolute(self, relatives):
        """Creates absolute path for images.

        [DEPRECATED] Please check if there is a need for this function again.
        As it stands it prints its argument and terminates the process, so
        the code after ``os._exit`` never runs.
        If needed dimensions of images got from the client server can be
        changed here."""
        new = []
        print(relatives)
        os._exit(0)
        for i in range(0, len(relatives)):
            #add width, height here for different dimensions
            #don't change the url in here from qa to www it's meant to be qa always
            new.append("http://www.partylite.biz/imaging/resize?fileName=/productcatalog/production" + relatives[i])
        return new

    def get_review_id(self, page):
        """Gets review id that is used in javascript for reviews."""
        return basic.get_middle_text(page, 'productId: "', '"')[0]

    def write_subproducts(self, id, list, xml):
        """Writes child products to xml.

        Receives id, list and xml attributes, id is master product id,
        list is list of child products and xml is Xml instance"""
        for i, subproduct in enumerate(list):
            item = PartyliteItem()
            item['master_product_id'] = id
            item['product_id'] = [id[0] + "_" + str(i)]
            item['in_stock'] = ["IN_STOCK"]
            for k, v in subproduct.items():
                item[k] = [v]
            xml.create_xml(item)
        return 1

    def parse_can(self, response):
        """Parse function for scraping canadian sites.

        There is meta information send in request in this function about
        language. A redirected request is written to the xml as
        NOT_AVAILABLE; otherwise the product page is scraped."""
        self.counter += 1
        basic.print_status(self.counter, self.total)
        item = PartyliteItem()
        hxs = HtmlXPathSelector(response)
        image_urls = []
        if 'redirect_urls' in response.request.meta:
            redirect_urls = response.request.meta['redirect_urls']
            product_id = self.get_id(redirect_urls[0])[0]
            item['product_id'] = [product_id]
            self.exc.code_handler(102, redirect_urls)
            if 'language' in response.request.meta:
                item['product_id'] = [product_id + "_can" + "_" + response.meta['language']]
            try:
                index = self.products['product_ids'].index(product_id)
                item['name'] = [basic.cdata(item['product_id'][0] + self.products['names'][index])]
                self.products['status'][index] = 'no_avail'
            except (KeyError, ValueError, IndexError):
                # list.index raises ValueError for an unknown id, which the
                # former "except KeyError" never caught.
                print("This %s id is not in list" % (item['product_id'][0]))
            item['in_stock'] = ['NOT_AVAILABLE']
            item['product_id'] = self.remove_spaces(item['product_id'])
            self.xml.create_xml(item)
        else:
            index = self.products['product_ids'].index(self.get_id(response.url)[0])
            try:
                item['product_id'] = self.get_id(response.url)
                item['name'], item['shown_with'] = self.get_basic_info(hxs)
                item['description'] = self.get_description(hxs)
                if 'language' in response.meta:
                    item['product_id'] = [item['product_id'][0] + "_can" + "_" + response.meta['language']]
                response.meta['item'] = item
                page = " ".join(hxs.select('//html').extract())
                image_urls = self.get_more_images(page)
                item['normal_image_url'] = self.get_server_path_field(image_urls)
                item['in_stock'] = self.get_in_stock(hxs)
                color_products = self.create_subproducts(page)
                if color_products:
                    # self.xml, not the undefined bare name xml that made
                    # every product with color variants end up as "error".
                    self.write_subproducts(item['product_id'], color_products, self.xml)
                else:
                    item['add_to_cart_id'] = self.get_add_to_cart_id(page)
                    item['custom_price'], item['custom_discount'] = self.get_price(hxs)
                self.products['status'][index] = "ran"
            except StandardError:
                basic.print_error()
                self.products['status'][index] = "error"
                self.exc.code_handler(100, response.url)
            else:
                item['product_id'] = self.remove_spaces(item['product_id'])
                self.xml.create_xml(item)
        if image_urls:
            item['image_urls'] = image_urls
        return item

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml,
        exporting to database and sending appropriate mail message."""
        msg = party.get_settings_message(self.d)
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml; falls back to the spider name so the
        # xml, mail and log are still produced when the database lookup
        # below fails (it used to be left undefined in that case).
        filename = self.name
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except Exception:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        logname = filename
        filename = "{0}_{1}".format(filename, self.d['lang'])
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        from modules.mail import Mail
        from modules.export_to_db import CommonExport
        exp = CommonExport()
        if self.upload:
            try:
                if self.d['lang'] == 'us':
                    exp.xml_to_db(self.name, filename, "55892247-1b92-4ff9-a8a3-33cc976f9341")
                else:
                    exp.xml_to_db(self.name, filename, "9cb6c676-c14f-403b-b94f-b981184e1de0")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        mail = Mail()
        try:
            mail.send_mail(msg, "Partylite: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "Partylite: {0}".format(filename), self.d['email'])
        except Exception:
            msg += "\nSending mail failed."
        if self.d['database']:
            path = 'logs/{0}'.format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, logname), 'w') as f:
                f.write(msg)

    def get_id(self, url):
        """Gets id from product url."""
        return [url.split("&sku=")[1]]

    def get_server_path(self, url):
        """Gets server path for image url."""
        url = url.split("partylite.biz")[1]
        return self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"

    def get_server_path_field(self, urls):
        """Getting server path for field of image urls."""
        return [self.get_server_path(url) for url in urls]

    def remove_spaces(self, field):
        """Return ``field`` with the spaces removed from every entry."""
        return [i.replace(' ', '') for i in field]

    def get_lists_from_excel(self):
        """Fill ``self.products`` and ``self.no_urls`` from the excel file.

        Read errors are reported through ``self.exc.code_handler`` with
        code 103.
        """
        excel_path = "xls/{0}/{1}.xls".format(self.name, self.d['file'])
        xls = PartyliteExcel(path=excel_path, user=self.users['us'], production=self.production)
        self.products = dict()
        try:
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            # self.exc, not the undefined bare name exc.
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)
def _alert_triggered(target_price, in_rate, out_rate):
    """Tell whether an alert's target price is met by the quoted rates.

    The alert fires when the target is at or above the ``*_in`` rate, or
    otherwise at or below the ``*_out`` rate.  A rate quoted as ``"-"`` is
    unavailable and is skipped.  The target is only converted to ``float``
    when there is a rate to compare it with.
    """
    if in_rate != "-" and float(target_price) >= float(in_rate):
        return True
    return out_rate != "-" and float(target_price) <= float(out_rate)


def check_alert():
    """Mail every user the currencies whose alert conditions are currently met.

    For each user e-mail found in the ``users`` collection, the user's alerts
    are read from ``all_alert`` and compared against the rates returned by
    ``Money.search_data()``.  A "cash" alert uses ``price[0]`` with the
    ``cash_in``/``cash_out`` rates, a "sign" alert uses ``price[1]`` with the
    ``sign_in``/``sign_out`` rates; alerts of any other type are ignored.

    Each alert is evaluated only against the rates of its own type (the
    previous ladder of ``if``/``elif`` also compared it with the other type's
    ``*_out`` rate and never fell back to ``*_out`` when ``*_in`` was "-").
    No mail is sent to a user for whom nothing matched.
    """
    # rate type -> (index into alert["price"], "in" attribute, "out" attribute)
    rate_fields = {
        "cash": (0, "cash_in", "cash_out"),
        "sign": (1, "sign_in", "sign_out"),
    }

    moneydict, position = Money.search_data()
    all_user = [user["email"] for user in Database.find_all(collection="users")]

    for user in all_user:
        print(user)
        message = []
        user_all_alert = Database.find(collection="all_alert", query={"email": user})
        for user_alert in user_all_alert:
            fields = rate_fields.get(user_alert["rate_exchange"])
            if fields is None:
                continue
            price_index, in_attr, out_attr = fields
            currency = user_alert["currency"]
            quote = moneydict[position[currency]]
            hit = _alert_triggered(
                user_alert["price"][price_index],
                getattr(quote, in_attr),
                getattr(quote, out_attr),
            )
            # keep each currency once, in the order it was first matched
            if hit and currency not in message:
                message.append(currency)

        print(user, ":", message)
        if not message:
            # nothing matched: do not send a notification with an empty list
            continue

        # SECURITY(review): the Mailgun API key is hard-coded below; it should
        # be loaded from configuration/environment and the exposed key rotated.
        requests.post(
            "https://api.mailgun.net/v3/sandboxcf0f0204481f4e5db32ca491987d150f.mailgun.org/messages",
            auth=("api", "b7288f39ae1c25d533325c5181e0eada-4a62b8e8-809b3b07"),
            data={"from": "Mailgun Sandbox <*****@*****.**>",
                  "to": user,
                  "subject": "外幣通知",
                  "text": "目前符合調的外幣為:{},請盡快至關網查看!".format(str(message).strip("[]"))},
            # without a timeout a stalled connection would block the scheduler job forever
            timeout=10,
        )
class BurtonSpider(CrawlSpider):
    """Spider that scrapes Burton product pages into the XML product feed.

    The product list (urls, ids, names) is loaded either from the database or
    from an excel sheet, depending on the arguments parsed by
    ``DatabaseTerminal``.  Every product is appended to ``self.xml``; when the
    spider closes, the XML is written, optionally exported, and a report is
    mailed.
    """

    name = "burton"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    counter = 0

    def __init__(self, *a, **kw):
        """Load the product list and set up the XML writer and error tracker."""
        super(BurtonSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5, "Burton")
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.handle_not_provided()
        burton.add_properties(self.xml)
        # Crawl the loaded product urls; parse() looks every response up in
        # this same list, so it must not be replaced by an unrelated url.
        self.start_urls = self.products['urls']
        self.images_store = "/" + settings['IMAGES_STORE']
        self.total = len(self.start_urls)

    def parse(self, response):
        """Scrape one product page and add it to the XML feed.

        A redirected request is recorded as NOT_AVAILABLE.  Returns the item
        on success and ``None`` when scraping the page failed (the failure is
        reported through ``self.exc`` and the product status set to "error").
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = BurtonItem()
        page = hxs.extract()
        redirected = 'redirect_urls' in response.request.meta
        if redirected:
            # the product list holds the url that was requested originally
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        index = self.products['urls'].index(cur_url)
        try:
            if redirected:
                item['product_id'] = [self.products['product_ids'][index]]
                item['name'] = [self.products['names'][index]]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.exc.code_handler(102, response.url)
                self.xml.create_xml(item)
                self.products["status"][index] = "no_avail"
            else:
                item['product_id'], item['name'] = self.get_basic_info(hxs)
                item['description'], item['features'] = self.get_description(hxs)
                item['variants'], thumb_urls, color_names = self.get_variants(page)
                item['all_sizes'] = self.get_all_sizes(page)
                item['color_json'], image_urls = self.get_colors(page, color_names)
                item['price'], item['old_price'] = self.get_prices(hxs)
                item['in_stock'] = ['IN_STOCK']
                item['product_link'] = [basic.cdata(response.url)]
                self.xml.create_xml(item)
                # image urls are attached after the XML is built
                item['image_urls'] = image_urls + thumb_urls
                self.products["status"][index] = "ran"
        except Exception:
            self.exc.code_handler(100, response.url)
            self.products["status"][index] = "error"
        else:
            return item

    def handle_not_provided(self):
        """Write a NOT_AVAILABLE entry for every product that has no url."""
        item = BurtonItem()
        for product_id, product_name in zip(self.no_urls['product_ids'],
                                            self.no_urls['names']):
            item['product_id'] = [product_id]
            item['name'] = [product_name]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def get_basic_info(self, hxs):
        """Return ``(product_id, name)`` as extracted lists from the page."""
        name = hxs.select('//h1[@class="productHeading"]/text()').extract()
        product_id = hxs.select('//input[@name="productId"]/@value').extract()
        return product_id, name

    def get_server_path(self, url):
        """Return the image-store path (sha1 of the url) for an image url."""
        return self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"

    def get_prices(self, hxs):
        """Return ``(price, old_price)``; ``old_price`` is empty when absent."""
        price = hxs.select('//div[@class="op"]/text()').extract()
        price = [basic.get_price(price[0])]
        old_price = hxs.select('//span[@class="lp"]/text()').extract()
        if old_price:
            old_price = [basic.get_price(old_price[0])]
        return price, old_price

    def get_description(self, hxs):
        """Return ``(description, features)`` wrapped in CDATA."""
        info_texts = hxs.select('//div[@id="FieldsetProductInfo"]/text()').extract()
        description = info_texts[3]
        features = hxs.select('//div[@id="FieldsetProductInfo"]/ul').extract()
        if features:
            # only the first list is kept, capped at 2000 characters
            features = [features[0][:2000]]
        return [basic.cdata(description)], basic.cdata_field(features)

    def get_variants(self, page):
        """Get jsons for colors with all available sizes.

        Returns ``(sizes, image_urls, color_names)`` where ``sizes`` holds one
        CDATA-wrapped json per color, ``image_urls`` the original swatch urls
        and ``color_names`` the color descriptions.
        """
        script = basic.get_middle_text(
            page, 'var skuSizeColorObj = new Array();', '</script>')[0]
        sizes = []
        image_urls = []
        color_names = []
        # the text before the first 'skuSizeColorObj' holds no color entry
        for chunk in script.split('skuSizeColorObj')[1:]:
            temp = basic.get_middle_text(chunk, '= ', ';')
            variant = simplejson.loads(burton.replace_for_json(temp[0]))
            image_urls.append(variant['swatchURL'])
            color_names.append(variant['ColorDesc'])
            # the json stores the image-store path instead of the site url
            variant['swatchURL'] = self.get_server_path(variant['swatchURL'])
            sizes.append(basic.cdata(simplejson.dumps(variant)))
        return sizes, image_urls, color_names

    def get_all_sizes(self, page):
        """Return a one-element list with the json of all sizes on the page."""
        script = basic.get_middle_text(
            page, 'var distsizeobj=new Array();', 'var indexcolor=0;')[0]
        all_sizes = basic.get_middle_text(script, ']="', '";')
        return [basic.cdata(simplejson.dumps(all_sizes))]

    def get_colors(self, page, color_names):
        """Get color information with images from javascript on the page.

        Returns ``(colors_json, image_urls)``: one CDATA-wrapped json per
        color (color name plus image-store path of its image) and the list of
        original image urls that can be used for download later.
        """
        script = basic.get_middle_text(
            page, 'var imageMap_0 = new Array();', '</script>')[0]
        colors = basic.get_middle_text(script, '] = ', ';')
        image_urls = []
        colors_json = []
        for i, color_name in enumerate(color_names):
            color = simplejson.loads(burton.replace_color_json(colors[i]))
            color['cname'] = color_name
            color.pop('reg')
            image_urls.append(color['enh'])
            color['enh'] = self.get_server_path(color['enh'])
            colors_json.append(basic.cdata(simplejson.dumps(color)))
        return colors_json, image_urls

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message.
        """
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d['database']:
            # fallback so the xml/report can still be written when the
            # database lookup below fails
            filename = self.name
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except Exception:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d['upload']:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename, "4ea95a81-90fb-49e2-837e-acf5ab58f574")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "Burton: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "Burton: {0}".format(filename), self.d['email'])
        except Exception:
            msg += "\nSending mail failed."
        if self.d['database']:
            # keep a copy of the report on disk
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Fill ``self.products`` / ``self.no_urls`` from the excel sheet."""
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products["urls"] = xls.read_excel_collumn_for_urls(3, 15)
            self.products["product_ids"] = xls.read_excel_collumn_for_ids(1, 15)
            self.products["names"] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)
class LydiasSpider(CrawlSpider):
    """Spider that scrapes Lydias product pages into the variants XML feed.

    The product list is loaded from the database or from an excel sheet,
    depending on the arguments parsed by ``DatabaseTerminal``.  For every
    product a master entry plus one child entry per color is written to
    ``self.xml``.
    """

    name = "lydias"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    counter = 0

    def __init__(self, *a, **kw):
        """Load the product list and set up the XML writer and error tracker."""
        super(LydiasSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = VariantsXml()
        self.exc = ZmagsException(5)
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        # fix for bug with links they provide
        self.products['urls'] = basic.cut_string_field(self.products['urls'], "&cat=")
        self.handle_not_provided()
        self.start_urls = self.products['urls']
        self.images_store = "/" + settings['IMAGES_STORE']
        lydias.add_properties(self.xml)
        self.total = len(self.products['urls'])

    def parse(self, response):
        """Scrape one product page, write it to the XML and return the item.

        A page containing the ``searchfor`` block is treated as a missing
        product and recorded as NOT_AVAILABLE.  Any scraping error is reported
        through ``self.exc`` and the product status is set to "error".
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = LydiasItem()
        if 'redirect_urls' in response.request.meta:
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        index = self.products['urls'].index(cur_url)
        product_id = self.products['product_ids'][index]
        try:
            available = hxs.select('//div[@id="searchfor"]/text()').extract()
            if not available:
                item['product_id'] = [product_id]
                (item['name'], item['price'], item['old_price'],
                 item['description']) = self.get_basic_info(hxs)
                item['rating'], item['custom_rating'] = self.get_rating(hxs)
                chart = self.absolute_path(self.get_size_image(hxs))
                item['sizes_chart_image_url'] = self.get_server_path(chart)
                color_urls, color_names, product_image, color_codes = \
                    self.get_image_swatches(hxs)
                color_urls = self.absolute_path(color_urls)
                item['color_image_url'] = self.make_colors_json(
                    color_urls, color_names, color_codes)
                item['in_stock'] = ["IN_STOCK"]
                item['embroidery'] = self.get_embroidery(hxs)
                default_images = self.absolute_path(self.get_extra_images(hxs))
                item['default_image_url'] = self.get_server_path(default_images)
                self.xml.create_xml(item)
                product_image = self.absolute_path(product_image)
                self.create_subproducts(product_id, color_names, product_image,
                                        color_codes, hxs)
                # image urls are attached after the XML is built
                item['image_urls'] = product_image + color_urls + chart + default_images
                self.products['status'][index] = "ran"
            else:
                self.exc.code_handler(102, response.url)
                item['product_id'] = [product_id]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.products['status'][index] = "not_avail"
                self.xml.create_xml(item)
        except Exception:
            self.products['status'][index] = "error"
            self.exc.code_handler(100, response.url)
        return item

    def get_embroidery(self, hxs):
        """Return ``["True"]`` or ``["False"]`` depending on the page script."""
        page = hxs.select('//html').extract()[0]
        if "document.getElementById('logocolor').disabled = true;" in page:
            return ["True"]
        return ["False"]

    def make_colors_json(self, color_urls, color_names, color_codes):
        """Return one CDATA-wrapped json (url, name, short code) per color."""
        jsons = []
        for i, color_url in enumerate(color_urls):
            color = {
                'color_url': self.get_server_path_single(color_url),
                'color_name': color_names[i],
                'color_short': color_codes[i],
            }
            jsons.append(basic.cdata(simplejson.dumps(color)))
        return jsons

    def get_server_path_single(self, url):
        """Return the image-store path (sha1 of the url) for one image url."""
        return self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"

    def get_server_path(self, urls):
        """Return the image-store paths for a list of image urls."""
        return [self.get_server_path_single(url) for url in urls]

    def get_basic_info(self, hxs):
        """Return ``(name, price, old_price, [description])`` for the product."""
        name = hxs.select('//div[@id="proddetail"]/h1/text()').extract()
        price = hxs.select(
            '//div[@id="proddetail"]/div[@class="yourprice bigprice"]/text()').extract()
        description = basic.cdata(hxs.select('//div[@id="details"]').extract()[0])
        description = basic.clean_string(description)
        old_price = hxs.select('//span[@class="yourprice_product"]/text()').extract()
        if not price:
            price = hxs.select('//span[@id="PriceDisplay"]/text()').extract()
        if old_price:
            old_price = [re.sub('[^0-9.]', '', old_price[0])]
        # keep only digits and the decimal point
        price = [re.sub('[^0-9.]', '', price[0])]
        return name, price, old_price, [description]

    def get_rating(self, hxs):
        """Return ``(rating, sentence)``, e.g. for "Rating 5 out of 6 votes"."""
        temp = hxs.select('//div[@id="Customerssay"]/p[2]/text()').extract()
        if temp:
            rating = basic.get_middle_text(temp[0].replace(" ", ""), "Rating:", "out")
            return rating, temp
        return [], temp

    def get_reviews(self, hxs):
        """Return a list of review jsons, or an empty list without reviews."""
        reviews = hxs.select('//div[@class="prodReview"]')
        if not reviews:
            return []
        first = reviews[0]
        title = first.select('p[@class="review_title"]/text()').extract()
        text = first.select('p[@class="review_text"]/text()').extract()
        author = first.select('p[@class="review_author"]/text()').extract()
        location = first.select('p[@class="review_location"]/text()').extract()
        return self.make_reviews_json(title, text, author, location)

    def make_reviews_json(self, title, text, author, location):
        """Return one CDATA-wrapped json per review.

        Currently not in use, because there are no reviews in the DPW design.
        The json is built with ``simplejson`` so quotes inside a review are
        escaped correctly.
        """
        jsons = []
        for i in range(len(title)):
            review = {
                "title": title[i],
                "text": text[i],
                "author": author[i],
                "location": location[i],
            }
            jsons.append(basic.cdata(simplejson.dumps(review)))
        return jsons

    def get_size_image(self, hxs):
        """Return the list of size chart image urls found on the page."""
        return hxs.select('//div[@class="TabbedPanelsContent cells"]/img/@src').extract()

    def get_image_swatches(self, hxs):
        """Return ``(swatch urls, color names, product images, color codes)``."""
        color_images = []
        color_names = []
        products_image = []
        color_codes = []
        for color in hxs.select('//div[@class="lolite"]'):
            color_images.append(color.select('a/img/@src').extract()[0])
            color_names.append(color.select('a/img/@alt').extract()[0])
            # if zoom image needed, this is the place to get it
            products_image.append(color.select('a/@rev').extract()[0])
            onclick = color.select('a/@onclick').extract()[0]
            # the color code is the second argument of the onclick call
            color_codes.append(onclick.split(",")[1].replace("'", ""))
        return color_images, color_names, products_image, color_codes

    def get_extra_images(self, hxs):
        """Return additional image urls, or an empty list when there are none."""
        additional_images = hxs.select('//div[@id="AddImg"]/script/text()').extract()
        if not additional_images:
            return []
        temp = basic.get_middle_text(additional_images[0], '"', '"')
        return temp[0].split(",")

    def get_product_id(self, hxs):
        """Return the product id found in the page script."""
        temp = hxs.select('//div[@id="wrap"]/script/text()').extract()
        return basic.get_middle_text(temp[0], 'productid","', '"')[0]

    def _read_size_rows(self, url):
        """Download a size-options response and split it into rows of fields."""
        response = urllib2.urlopen(url)
        try:
            page = response.read()
        finally:
            # always release the connection, also when reading fails
            response.close()
        page = page.replace("'", "").replace("[", ",").replace(",,", "")
        chunks = page.split("]")
        # the last two chunks are skipped, as before; presumably trailing
        # non-row data -- TODO confirm against a live response
        return [chunk.split(",") for chunk in chunks[:max(len(chunks) - 2, 0)]]

    def get_sizes(self, id, hxs):
        """Return a list of size jsons read from the size-options url.

        One id from the page is 115NB, if needed here to hardcode for testing.
        Currently not in use.
        """
        showmode = hxs.select('//input[@name="showmode"]/@value').extract()[0]
        itemmode = hxs.select('//input[@name="itemmode"]/@value').extract()[0]
        salemode = hxs.select('//input[@name="salemode"]/@value').extract()[0]
        url = "http://www.lydiasuniforms.com/ajaxed/product-showoptions.asp?sku=%s&opt1=AV&opt2=-1&type2=l1type" % (id)
        url += "&type3=&showmode=%s&itemmode=%s&salemode=%s&rnum=429" % (showmode, itemmode, salemode)
        jsons = []
        print("reading page...")
        rows = self._read_size_rows(url)
        print("page read")
        for row in rows:
            size = {
                "size_short": row[0],
                "size_full": row[1],
                "some_number": row[2],
                "some_id": row[3],
            }
            jsons.append(basic.cdata(simplejson.dumps(size)))
        return jsons

    def create_subproducts(self, id, color_names, product_image, color_codes, hxs):
        """Write one child product per color (or a single NO_COLOR child).

        Can be implemented the usual way, one product for every combination of
        size and color, if needed.  Always returns 0.
        """
        item = LydiasItem()
        if len(color_names) == 0:
            # no colors for this product: one child, sizes looked up with an
            # empty color code
            item['master_product_id'] = [id]
            item['product_id'] = [id + "_" + "0"]
            item['color'] = ["NO_COLOR"]
            item['custom_size'] = self.create_sizes_subproducts(id, id + "_" + "0", "", hxs)
            self.xml.create_xml(item)
        else:
            # one child per color, each with the sizes available for that color
            for i, color_name in enumerate(color_names):
                print("name :" + color_name + " code:" + color_codes[i])
                child_id = id + "_" + str(i)
                item['master_product_id'] = [id]
                item['product_id'] = [child_id]
                item['color'] = [color_name]
                item['color_short'] = [color_codes[i]]
                item['normal_image_url'] = self.get_server_path([product_image[i]])
                item['in_stock'] = ["IN_STOCK"]
                item['custom_size'] = self.create_sizes_subproducts(
                    id, child_id, color_codes[i], hxs)
                self.xml.create_xml(item)
                item.clear()
        return 0

    def create_sizes_subproducts(self, main_id, id, color_code, hxs):
        """Return the list of size jsons for one color of a product.

        With a color code the sizes are read from the size-options url;
        without one they are taken from the page itself.
        """
        print(color_code)
        jsons = []
        if color_code != "":
            showmode = hxs.select('//input[@name="showmode"]/@value').extract()[0]
            itemmode = hxs.select('//input[@name="itemmode"]/@value').extract()[0]
            salemode = hxs.select('//input[@name="salemode"]/@value').extract()[0]
            url = "http://www.lydiasuniforms.com/ajaxed/product-showoptions.asp?sku=%s&opt1=%s&opt2=-1&type2=l1type&" \
                  "type3=&showmode=%s&itemmode=%s&salemode=%s&rnum=193" % (
                      main_id, color_code, showmode, itemmode, salemode)
            for row in self._read_size_rows(url):
                size = {
                    'size_short': row[0],
                    'price_url': self.get_size_price(str(main_id), str(color_code), row[0]),
                    'size': row[1],
                }
                jsons.append(basic.cdata(simplejson.dumps(size)))
            return jsons
        # when the color is not provided the sizes are listed on the page
        for size_short in hxs.select('//div[@class="not_size"]/text()').extract():
            size = {
                'size_short': size_short,
                'price_url': self.get_size_price(str(main_id), "", size_short),
            }
            jsons.append(basic.cdata(simplejson.dumps(size)))
        return jsons

    def get_size_price(self, id, color, size):
        """Return the url where the price for a size/color combination is.

        Parsing that url for the actual price is possible, but would
        drastically increase scraping time.
        """
        if color != "":
            url = "http://www.lydiasuniforms.com/ajaxed/product-showprice.asp?sku=%s %s %s&qty=1&itemmode=" \
                  "0&showmode=1&rnum=388" % (str(id), str(color), size)
        else:
            url = "http://www.lydiasuniforms.com/ajaxed/product-showprice.asp?sku=%s %s&qty=1&itemmode=" \
                  "0&showmode=1&rnum=259" % (id, size)
        return url.replace(" ", "%20")

    def absolute_path(self, urls):
        """Prefix every relative url from the page with the site address."""
        return ["http://www.lydiasuniforms.com" + url for url in urls]

    def get_emb(self, hxs):
        """Write the embroidery options to the XML and return their image urls.

        Was used only once, because embroidery is the same for all products.
        The first option of every select is skipped.
        """
        urls = []

        colors = []
        lettering_colors = hxs.select('//select[@id="threadcolor"]/option/@value').extract()
        for value in lettering_colors[1:]:
            url = "http://www.lydiasuniforms.com/images/lydias/threadcolor_"
            url += value.lower().replace(' ', '_') + ".gif"
            entry = {'type': "lettering colors", 'name': value,
                     'url': self.get_server_path_single(url)}
            urls.append(url)
            colors.append(basic.cdata(simplejson.dumps(entry)))

        letterings = []
        lettering = hxs.select('//select[@id="lettering"]/option/@value').extract()
        for value in lettering[1:]:
            url = "http://www.lydiasuniforms.com/images/lydias/lettering_"
            url += value.lower().replace(' ', '_') + ".gif"
            entry = {'type': "lettering", 'name': value,
                     'url': self.get_server_path_single(url)}
            urls.append(url)
            letterings.append(basic.cdata(simplejson.dumps(entry)))

        log = []
        logo = hxs.select('//select[@id="logoname"]/option/@value').extract()
        for value in logo[1:]:
            url = "http://www.lydiasuniforms.com/images/logos/"
            url += value.lower() + ".jpg"
            entry = {'type': "logo", 'name': value,
                     'url': self.get_server_path_single(url)}
            urls.append(url)
            log.append(basic.cdata(simplejson.dumps(entry)))

        item = LydiasItem()
        item['color'] = colors
        item['lettering'] = letterings
        item['log'] = log
        # NOTE(review): uses the spider's own xml writer with the same
        # (name, filename) call shape as spider_closed -- confirm signature.
        self.xml.create_xml(item)
        self.xml.write_xml(self.name, "emb")
        return urls

    def handle_not_provided(self):
        """Write a NOT_AVAILABLE entry for every product that has no url."""
        item = LydiasItem()
        for product_id, product_name in zip(self.no_urls['product_ids'],
                                            self.no_urls['names']):
            item['product_id'] = [product_id]
            item['name'] = [product_name]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml, updating
        the database and sending appropriate mail message.
        """
        msg = ""
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d['database']:
            # fallback so the xml/report can still be written when the
            # database lookup below fails
            filename = self.name
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except Exception:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        # NOTE: the CommonExport upload step is disabled for this spider.
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "Lydias: {0}".format(filename))
        except Exception:
            msg += "\nSending mail failed."
        if self.d['database']:
            # keep a copy of the report on disk
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Fill ``self.products`` / ``self.no_urls`` from the excel sheet."""
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        else:
            # post-process only a completely read sheet
            self.products = xls.delete_duplicates_dict(self.products)
            self.products, self.no_urls = xls.separate_no_urls(self.products)
            self.products = xls._add_none_status(self.products)
            self.no_urls = xls._add_none_status(self.no_urls)
class ChomeSpider(CrawlSpider):
    """Spider that builds the Celebrating Home XML feed from the client feed.

    Unlike the page-scraping spiders, the product data is read from one XML
    feed file; only the ids listed in ``self.no_urls['product_ids']`` are
    exported.
    """

    name = "chome"
    allowed_domains = ["zmags.com"]
    start_urls = ["http://www.zmags.com/"]
    counter = 0

    def __init__(self, *a, **kw):
        """Load the id list and set up the XML writer and error tracker."""
        super(ChomeSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5)
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        self.images_store = "/" + settings['IMAGES_STORE']
        self.total = len(self.no_urls['product_ids'])
        # number of listed ids found in the feed; defined here so that
        # spider_closed works even when the feed was never parsed
        self.number = 0

    def parse(self, response):
        """Process the whole feed and return an item carrying the image urls."""
        self.counter += 1
        item = ChomeItem()
        print("IDs in excel feed: {0}".format(self.total))
        item['image_urls'] = self.parse_whole_xml()
        return item

    def _build_feed_item(self, elem, p):
        """Translate one feed ``properties`` element into ``(item, urls)``.

        ``p`` is the namespace prefix of the child tags.  ``urls`` are the
        absolute image urls of the product, in document order.
        """
        xml_item = ChomeItem()
        urls = []
        main_paths = []
        alternate_paths = []
        alternate_tags = [p + "Alternate%sImagePath_Large" % i for i in range(1, 4)]
        for x in elem:
            if x.tag == p + "Id":
                xml_item['product_id'] = [x.text]
            elif x.tag == p + "EngLongDesc" and x.text is not None:
                xml_item['description_english'] = [self.escape(basic.cdata(x.text))]
            elif x.tag == p + "RetailPrice":
                # the last two characters of the price are dropped
                xml_item['custom_price'] = [x.text[:-2]]
            elif x.tag == p + "SpnLongDesc" and x.text is not None:
                xml_item['description_spanish'] = [self.escape(basic.cdata(x.text))]
            elif x.tag == p + "PartNumber":
                xml_item['add_to_cart_id'] = [x.text]
            elif x.tag == p + "MaxQty":
                xml_item['max_qty'] = [x.text]
            elif x.tag == p + "TimeType":
                xml_item['time_type'] = [x.text]
            elif x.tag == p + "SpnName" and x.text is not None:
                xml_item['name_spanish'] = [x.text]
            elif x.tag == p + "EngName":
                xml_item['name_english'] = [x.text]
            elif x.tag == p + "ImagePath_Large" and x.text is not None:
                absolute = self.get_absolute(x.text)
                urls.append(absolute)
                main_paths = [self.get_server_path(absolute)]
            elif x.tag == p + "IsActive":
                # element text is a string, so it has to be compared as one;
                # presumably the feed sends "0" or "false" -- TODO confirm
                inactive = (x.text or "").strip().lower() in ("0", "false")
                xml_item['in_stock'] = ["NOT_IN_STOCK"] if inactive else ['IN_STOCK']
            elif x.tag in alternate_tags and x.text is not None:
                absolute = self.get_absolute(x.text)
                urls.append(absolute)
                alternate_paths.append(self.get_server_path(absolute))
        # main image first, alternates after it, whatever their feed order
        paths = main_paths + alternate_paths
        if paths:
            xml_item['normal_image_url'] = paths
        return xml_item, urls

    def parse_whole_xml(self):
        """Export every listed product found in the feed; return image urls.

        The feed is downloaded first when the ``download`` option is set,
        otherwise an existing ``client_feed.xml`` is required (the process
        exits with status 2 when it is missing).  Ids that do not occur in the
        feed get the status ``'not_found'``.
        """
        xml_dir = "xml/{0}".format(self.name)
        feed_path = 'xml/{0}/client_feed.xml'.format(self.name)
        file_url = "https://svc.celebratinghome.com/ZMags.svc/ProductInfo1"
        downloader = Downloader()
        if self.d['download']:
            downloader.get_file(xml_dir, file_url, "client_feed")
        elif not os.path.exists(feed_path):
            basic.warning("Feed file doesn't exist please de-select no download option")
            os._exit(2)

        properties_tag = "{http://schemas.microsoft.com/ado/2007/08/dataservices/metadata}properties"
        p = "{http://schemas.microsoft.com/ado/2007/08/dataservices}"
        ids = self.no_urls['product_ids']
        statuses = self.no_urls['status']
        self.number = 0
        urls_all = []
        for event, elem in iterparse(feed_path):
            if elem.tag != properties_tag:
                continue
            for r in elem:
                if r.tag == p + "Id" and r.text in ids:
                    statuses[ids.index(r.text)] = 'ran'
                    self.number += 1
                    # a fresh item per product, so optional fields of one
                    # product never leak into the next one
                    xml_item, urls = self._build_feed_item(elem, p)
                    self.xml.create_xml(xml_item)
                    urls_all += urls
            # the element is fully processed; drop its children to keep
            # memory flat on a large feed
            elem.clear()

        for i, status in enumerate(statuses):
            if status != 'ran':
                statuses[i] = 'not_found'
        return urls_all

    def get_server_path(self, url):
        """Return the image-store path (sha1 of the url) for an image url."""
        return self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"

    def get_absolute(self, url):
        """Prefix a relative feed url with the site address."""
        return "http://www.celebratinghome.com/" + url

    def escape(self, string):
        """Unescape HTML entities twice (the text is passed through twice)."""
        parser = HTMLParser.HTMLParser()
        return parser.unescape(parser.unescape(string))

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml, updating
        the database and sending appropriate mail message.
        """
        msg = "Ran: {0}\n".format(datetime.now())
        missing = self.total - self.number
        if missing:
            msg += "{0} id(s) from id list weren't found in feed".format(missing)
            basic.warning(msg)
        else:
            msg += "All ids found in feed."
            basic.green(msg)
        # filename for writing xml
        if self.d['database']:
            # fallback so the xml/report can still be written when the
            # database lookup below fails
            filename = self.name
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.no_urls)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except Exception:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        # NOTE: the CommonExport upload step is disabled for this spider.
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "CelebratingHome: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "CelebratingHome: {0}".format(filename), self.d['email'])
        except Exception:
            msg += "\nSending mail failed."
        if self.d['database']:
            # keep a copy of the report on disk
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Fill ``self.products`` / ``self.no_urls`` from the excel sheet."""
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)

    def add_properties(self, xml):
        """Register the custom feed properties on the XML writer."""
        properties = (
            ("description_english", "Description English"),
            ("description_spanish", "Description Spanish"),
            ("add_to_cart_id", "Add To Cart ID"),
            ("max_qty", "Max Quantity"),
            ("time_type", "Time Type"),
            ("name_english", "Name English"),
            ("name_spanish", "Name Spanish"),
            ("in_stock", "In Stock"),
            ("custom_price", "Custom Price"),
        )
        for key, label in properties:
            xml.add_property(key, label, "text")
# Flat script: runs Database.initialize() and then a single
# User.update_user_email() call at import/execution time.
from modules.database import Database
from modules.user import User

# Runs before the User call below.
Database.initialize()
# NOTE(review): presumably the arguments are (current e-mail, new e-mail) --
# confirm the order against User.update_user_email. Both literals are
# redacted placeholders in this copy of the file.
User.update_user_email("*****@*****.**", "*****@*****.**")
class KennethSpider(CrawlSpider):
    """Spider that scrapes product pages and writes them to a variants XML.

    The list of products comes either from the database (when the run
    arguments in ``self.d`` have ``database`` set) or from an excel file.
    Every product, colour and size is written through ``self.xml``; errors
    are counted through ``self.exc``; a summary is mailed when the spider
    closes.
    """

    name = "kenneth"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    # Number of responses handed to parse() so far.
    counter = 0

    def __init__(self, *a, **kw):
        """Read the run arguments and load the products to scrape."""
        super(KennethSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.images_store = "/" + settings['IMAGES_STORE'] + "/"
        self.d = terminal.get_arguments()
        self.xml = VariantsXml()
        self.exc = ZmagsException(5)
        print(self.d)
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            try:
                self.products, self.no_urls = self.database.select_products(
                    self.d['catalog_id'], self.d['product_id'])
            finally:
                # Release the connection even when the select fails.
                self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        self.no_url_products(self.no_urls)
        self.start_urls = self.products['urls']
        self.total = len(self.start_urls)

    def parse(self, response):
        """Scrape one product page and return the populated item.

        Any error raised while scraping is recorded through
        ``self.exc.code_handler(100, url)`` and, when the URL is one of the
        listed product URLs, the product status is set to ``"error"``.
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = KennethItem()
        cur_url = response.url
        # Main try for the script: the general except below runs if anything
        # goes wrong, so the failing url ends up in the report mail.
        try:
            # A noResultsContent div means the product does not exist on the
            # site; otherwise continue scraping the page.
            unavailable = hxs.select('//div[@id="noResultsContent"]').extract()
            if not unavailable:
                index = self.products['urls'].index(cur_url)
                cur_id = self.get_product_id(cur_url)
                product_id = self.products['product_ids'][index]
                page = " ".join(
                    hxs.select('//div[@id="mainContent"]').extract())
                item['name'], item['description'] = self.get_basic_info(hxs)
                price, new_p, old_p = self.get_prices(hxs)
                if new_p:
                    item['new_price'] = new_p
                    item['old_price'] = old_p
                else:
                    item['price'] = price
                item['description'] = [
                    basic.clean_string(item['description'][0])]
                urls = self.get_color_image(hxs)
                item['color_image_urls'] = self.get_image_server_path(
                    urls, product_id)
                self.export(item['color_image_urls'], [product_id],
                            "swatchImage")
                jsons, images = self.we_also_recommend(cur_id, product_id)
                item['product_page'] = [cur_url]
                item['product_id'] = [product_id]
                item['add_to_cart_id'] = [cur_id]
                item['recommended_product'] = jsons
                item['in_stock'] = ["IN_STOCK"]
                self.products['status'][index] = "ran"
                images_or_404 = self.get_colors(hxs, page, product_id)
                if images_or_404 == 404:
                    item['in_stock'] = ["NOT_AVAILABLE"]
                self.xml.create_xml(item)
                item['image_urls'] = []
                if images_or_404 != 404:
                    item['image_urls'] += images_or_404
                item['image_urls'] += urls
                item['image_urls'] += images
                #self.export(item['image_urls'])
                #item['image_urls'] = [] #uncomment for downloading images
            else:
                # Part for handling products that are not available.
                cur_id = self.get_product_id(cur_url)
                cur_url = "http://www.kennethcole.com/product/index.jsp?"
                cur_url += "productId=" + str(cur_id)
                index = self.products['urls'].index(cur_url)
                self.products['status'][index] = "no_avail"
                listed_id = self.products['product_ids'][index]
                item['product_id'] = [listed_id]
                if listed_id:
                    item['name'] = [self.products['names'][index]]
                else:
                    item['name'] = ["not available"]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.xml.create_xml(item)
                self.exc.code_handler(102, cur_url)
        except Exception:
            # Part for catching errors and keeping track of how many there
            # were and on which urls they happened.
            print("Error occured scraping this product")
            try:
                index = self.products['urls'].index(cur_url)
            except ValueError:
                # The url is not one of the listed ones, so there is no
                # status slot to flag; still report the error below.
                pass
            else:
                self.products['status'][index] = "error"
            self.exc.code_handler(100, cur_url)
        return item

    def no_url_products(self, no_url):
        """Write every product that has no url to the xml as not available.

        ``no_url`` is indexed with ``'product_ids'`` and ``'names'``; the two
        sequences are paired position by position.
        """
        item = KennethItem()
        for product_id, product_name in zip(no_url['product_ids'],
                                            no_url['names']):
            item['product_id'] = [product_id]
            item['name'] = [product_name]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def get_basic_info(self, hxs):
        """Return ``(name, [description])`` scraped from the page.

        ``name`` is the extracted list of h1 texts; the description is the
        first productDescription div passed through ``basic.cdata``.
        """
        name = hxs.select('//div[@id="productInfoTop"]/h1/text()').extract()
        raw_description = hxs.select(
            '//div[@id="productDescription"]').extract()[0]
        return name, [basic.cdata(raw_description)]

    def get_prices(self, hxs):
        """Return ``([price], new_price, old_price)`` from the page.

        ``price`` is the first h2 text with everything except digits, dots
        and commas removed; the new and old prices are the raw extracted
        lists (empty when the page has no sale price).
        """
        price = hxs.select(
            '//div[@id="productInfoTop"]/h2/text()').extract()[0]
        new_p = hxs.select('//h2[@class="sale-now"]/text()').extract()
        old_p = hxs.select('//span[@class="productGrey"]/text()').extract()
        price = re.sub('[^0-9.,]', '', price)
        return [price], new_p, old_p

    def get_color_image(self, hxs):
        """Return the image urls found in the productInfoR2W div."""
        return hxs.select('//div[@id="productInfoR2W"]/img/@src').extract()

    def get_colors(self, hxs, page, main_id):
        """Write one xml entry per colour found in the page javascript.

        Also triggers creation of the per-size subproducts through
        ``get_sizes``. Returns the list of image urls of all colours, or
        ``404`` when the page has no ``displays[0]`` script section.
        """
        item = KennethItem()
        try:
            tmp = page.split('displays[0]')[1]
        except IndexError:
            print("This product is not available")
            return 404
        script = tmp.split('</script>')[0]
        displays = script.split("};")
        ids = []
        images = []
        sizes_script = self.get_sizes_part_page(page)
        color_internal_code = {}
        # The chunk after the last "};" is not a colour record, so skip it.
        for x, display in enumerate(displays[:-1]):
            color_id = basic.get_middle_text(display, 'colorId: "', '"')
            ids.append(color_id[0])
            images_in = []
            for i in range(1, display.count("Reg") + 1):
                key = "vw" + str(i)
                image = basic.get_middle_text(display, key + 'Reg: "', '"')
                if not image:
                    # Same key written without the space after the colon.
                    image = basic.get_middle_text(display, key + 'Reg:"', '"')
                if image and image[0] != "null":
                    images_in.append(image[0])
            if not images_in:
                images_in = hxs.select(
                    '//input[@name="productImage"]/@value').extract()
            item['product_id'] = [str(main_id) + "_" + str(x)]
            item['color_option_id'] = color_id
            item['master_product_id'] = [main_id]
            item['normal_image_url'] = self.get_image_server_path(
                images_in, main_id)
            item['thumb_image_url'] = self.get_image_server_path_thumb(
                images_in, main_id)
            item['in_stock'] = ["NOT_IN_STOCK"]
            item['color'] = self.get_color_name(sizes_script, color_id[0])
            color_internal_code[color_id[0]] = str(x)
            self.xml.create_xml(item)
            images += images_in
            self.export(item['normal_image_url'], item['product_id'],
                        "productImage")
        self.get_sizes(sizes_script, ids, main_id, color_internal_code)
        return images

    def get_sizes(self, page, ids, main_id, color_internal_code):
        """Collect the size options per colour and write them to the xml.

        Information is stored in dicts of the form ``{colour id: [values]}``.
        Returns ``(main_id, colors_name, sizes, skus, inStocks, prices)``;
        ``colors_name`` is always an empty dict.
        """
        options = page.split("};")
        skus = {}
        colors_name = {}
        inStocks = {}
        sizes = {}
        prices = {}
        # The chunk after the last "};" is not an option record, so skip it.
        for option in options[:-1]:
            option_color = basic.get_middle_text(option, 'cId: "', '"')
            for color_id in ids:
                if option_color[0] != color_id:
                    continue
                sku = basic.get_middle_text(option, 'sku: ', ',s')
                sku = re.sub("[^0-9]", "", sku[0])
                skus = self.add_to_dict(skus, color_id, sku)
                size = basic.get_middle_text(option, 'sDesc: "', '"')
                sizes = self.add_to_dict(sizes, color_id, size[0])
                price = basic.get_middle_text(option, 'price: "', '"')
                price = self.clean_price(price[0])
                prices = self.add_to_dict(prices, color_id, price[0])
                available = basic.get_middle_text(option, 'avail: "', '"')
                inStocks = self.add_to_dict(inStocks, color_id, available[0])
        self.create_subproducts_xml(main_id, color_internal_code, colors_name,
                                    sizes, skus, inStocks, prices)
        return main_id, colors_name, sizes, skus, inStocks, prices

    def create_subproducts_xml(self, main_id, color_internal_code,
                               colors_name, sizes, skus, inStocks, prices):
        """Write one xml subproduct for every size of every colour.

        All dict arguments are keyed by colour id; ``colors_name`` is
        accepted for interface compatibility but not used.
        """
        for color_id, color_sizes in sizes.items():
            item = KennethItem()
            # str() keeps the master id identical to the colour product id
            # built in get_colors, whatever type main_id has.
            m_id = str(main_id) + "_" + color_internal_code[color_id]
            for i, size in enumerate(color_sizes):
                item['size'] = [size]
                item['size_option_id'] = [skus[color_id][i]]
                item['master_product_id'] = [m_id]
                item['product_id'] = [m_id + "_" + str(i)]
                stock = inStocks[color_id][i]
                if stock == "NOT_AVAILABLE":
                    item['in_stock'] = ["NOT_IN_STOCK"]
                elif stock == "ADVANCED_SALE_LIMITED":
                    item['in_stock'] = ["IN_STOCK"]
                else:
                    item['in_stock'] = [stock]
                item['price'] = [prices[color_id][i]]
                #item['color'] = colors_name[k]
                self.xml.create_xml(item)

    def add_to_dict(self, dict, index, value):
        """Append ``value`` to the list at ``dict[index]`` and return dict.

        The list is created when the key is not present yet.
        """
        dict.setdefault(index, []).append(value)
        return dict

    def we_also_recommend(self, id, main_id):
        """Fetch the "we also recommend" products for product ``id``.

        Returns ``(jsons, images)``: the json strings built by ``make_json``
        and the list of recommended image urls as found in the response.
        """
        # NOTE(review): the literal "3" after "%3D" in the "ur" parameter is
        # prepended to the id -- looks unintended, confirm before changing.
        url = "http://www.res-x.com/ws/r2/Resonance.aspx?appid=kennethcole01&t"
        url += "k=154212870918247&ss=525178103419747&sg=1&pg=897706724574618&b"
        url += "x=true&vr=2.67&sc=product_rr&ev=product&ei=" + id + "&cu=&ct=k"
        url += "ennethcolec01&no=3&cb=r1eh&clk=&cv1=" + id + "&cv23=63&ur=http%"
        url += "3A//www.kennethcole.com/product/index.jsp%3FproductId%3D3" + id
        url += "&plk=&rf="
        import urllib2
        response = urllib2.urlopen(url)
        try:
            page = response.read()
        finally:
            response.close()
        images = []
        ids = []
        names = []
        prices = []
        urls = []
        # Parse the recommendation boxes; the text before the first
        # "certonaRecBoxes" marker is not a box.
        for box in page.split("certonaRecBoxes")[1:]:
            rec_id = [basic.get_middle_text(box, "d=", '\\"')[0]]
            image = basic.get_middle_text(box, 'src=\\"', '\\"')[0]
            name = basic.get_middle_text(box, 'alt=\\"', '\\"')
            price = basic.get_middle_text(box, '<br>', '</a>')
            rec_url = "http://www.kennethcole.com/product/index.jsp?productId="
            rec_url += rec_id[0]
            urls.append(rec_url)
            ids.append(rec_id)
            names.append(name)
            prices.append(price)
            images.append(image)
        jsons = self.make_json(ids, names, prices,
                               self.get_image_server_path(images, main_id),
                               urls)
        return jsons, images

    def get_product_id(self, url):
        """Return the text between the first and second ``=`` of ``url``."""
        return url.split("=")[1]

    def make_json(self, ids, names, prices, images, urls):
        """Return one cdata-wrapped json string per recommended product.

        ``ids``, ``names`` and ``prices`` are lists of lists whose first
        element is used; ``images`` and ``urls`` are flat lists.
        """
        # NOTE(review): values are concatenated without json escaping, so a
        # double quote inside a name yields invalid json -- the exact output
        # format is kept because its consumers are not visible here.
        jsons = []
        for i, rec_id in enumerate(ids):
            entry = "{" + ' "id" : "' + str(rec_id[0]) + '", '
            entry += '"name" : "' + str(names[i][0]) + '", '
            # insert function for storing the right image path
            entry += '"image_url" : "' + str(images[i]) + '", '
            entry += '"product_url" : "' + urls[i] + '", '
            entry += '"price" : "' + str(prices[i][0]) + '" } '
            jsons.append(basic.cdata(entry))
        return jsons

    def get_sizes_part_page(self, page):
        """Return the javascript that follows the availDates declaration.

        The result runs up to the next closing script tag. Raises IndexError
        when the page has no ``availDates = new Array();`` marker.
        """
        tmp = page.split("availDates = new Array();")[1]
        return tmp.split("</script>")[0]

    def get_color_name(self, script, id):
        """Return ``[name]`` of the colour with the given id.

        The name is the value of the last ``cDesc: "`` that appears before
        the first occurrence of ``id`` in ``script``.
        """
        before_id = script.split(id)[0]
        after_desc = before_id.split('cDesc: "')[-1]
        return [after_desc.split('"')[0]]

    def export(self, images, id, tags):
        """Upload images to the product database via rest (disabled).

        Nothing is uploaded unless ``override`` below is set to 0.
        """
        # NOTE(review): the upload path expects a module-level images_number
        # and a get_image_name method, neither of which is visible in this
        # class -- confirm both exist before enabling uploads.
        global images_number
        # set override to 0 for uploading images or else to skip uploading
        override = 1
        if override != 0:
            return
        import MultipartPostHandler
        import urllib2
        # NOTE(review): the api key is hard-coded in the url.
        url = 'http://api.admin.zmags.com/productImage/import?key=5ef90922-283b-4412-a1c8-3e70bc28b9d3'
        opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
        for i, image in enumerate(images):
            image_name = self.get_image_name(image)
            path = "images/kenneth_images/small/" + str(image_name)
            # Close the image file as soon as the request has been sent.
            with open(path, 'rb') as image_file:
                params = {'file': image_file, 'product_id': id[0],
                          'index': str(i + 1), 'tags': tags}
                # token not working
                code = opener.open(url, params).getcode()
            if code != 202:
                print("Achtung")
            images_number += 1
            print(images_number)
            print("Image uploaded to product " + id[0])

    def _image_server_paths(self, urls, id, size):
        """Return ``images_store + id + "/<size>/" + file name`` per url."""
        prefix = self.images_store + id + "/" + size + "/"
        return [prefix + url.split("/")[-1] for url in urls]

    def get_image_server_path(self, urls, id):
        """Return the full-size image paths on our server for ``urls``."""
        return self._image_server_paths(urls, id, "full")

    def get_image_server_path_thumb(self, urls, id):
        """Return the small (thumbnail) image paths on our server."""
        return self._image_server_paths(urls, id, "small")

    def clean_price(self, price):
        """Return ``[price]`` stripped of all but digits, dots and commas."""
        return [re.sub('[^0-9.,]', '', price)]

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message. When the database
        update fails the xml is written under the spider name, so the failure
        report can still be mailed.
        """
        msg = ""
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped {0} product out of {1}\n\n".format(
            self.counter, self.total)
        # filename for writing xml
        if self.d['database']:
            # Fallback used when the catalog name cannot be read from the
            # database; without it the code below has no filename at all.
            filename = self.name
            try:
                self.database.connect()
                try:
                    filename = self.database.get_name(self.d['catalog_id'])
                    self.database.update_db(self.products)
                finally:
                    self.database.disconnect()
                msg += "\nRan from interface.\n"
            except Exception:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d['upload']:
            exp = CommonExport()
            #try:
            exp.xml_to_db(self.name, filename,
                          "29eac9ea-8c57-4d22-baf4-3f1471dc3ab6")
            msg += "\n\nExport to database successful"
            #except StandardError:
                #msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        from modules.mail import Mail
        mail = Mail()
        subject = "KennethCole: {0}".format(filename)
        try:
            mail.send_mail(msg, subject)
            if self.d['email']:
                mail.send_mail(msg, subject, self.d['email'])
        except Exception:
            msg += "\nSending mail failed."
        if self.d['database']:
            # Keep a copy of the report next to the other logs of this spider.
            path = 'logs/{0}'.format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Load ``self.products`` and ``self.no_urls`` from the excel file.

        Read errors are reported through ``self.exc.code_handler(103, ...)``.
        """
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['urls'] = xls.read_excel_collumn_for_urls(2, 2)
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(0, 2)
            self.products['names'] = xls.read_excel_collumn(1, 2)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(
                self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(
                self.d['file'])
            self.exc.code_handler(103, msg=msg)
        # NOTE(review): if code_handler returns after a read error, the steps
        # below run on a partially filled dict -- confirm it aborts the run.
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)

    def add_properties(self, xml):
        """Register the custom properties this spider writes on ``xml``."""
        xml.add_property("add_to_cart_id", "Add To Cart Id", "text")
        xml.add_property("product_page", "Product page", "text")
        xml.add_property("color_image_urls", "Color Image URLs", "text_list")
        xml.add_property("color_option_id", "Color Option ID", "text")
        xml.add_property("recommended_product", "Recommended Product",
                         "text_list")
        xml.add_property("size_option_id", "Size Option ID", "text")
        xml.add_property("in_stock", "In Stock", "text")
        xml.add_property("old_price", "Old Price", "text")
        xml.add_property("new_price", "New Price", "text")