def k_shop(window_arrow):
    """Crawl the K SHOP TV schedule API and yield one ProductInfo per sub-item.

    window_arrow: arrow-like date for the crawl window. Currently unused by
        the API call (the original assigned start/end from it only in
        commented-out code); kept for interface parity with the other shops.

    Yields a ProductInfo for every entry of every slot's ``subGoodsList``.
    Network errors from requests.get (timeout, connection) propagate.
    """
    print(' K SHOP')
    url = 'http://apitest.skymall.co.kr/apiv4/tv/today-goods-list?'
    # Query parameters observed to return the EPG goods list for channel TV07.
    params = {
        'mediaCode': 'TV07',
        'uk': 'TT131127012',
        'ck': '20170417145200',
        'nolog': 'EPG',
    }
    response = requests.get(url, params=params, timeout=7.0)
    response_json = response.json()
    goods_list = response_json['data']['goodsList']
    # Removed: three dead `product_list = []` assignments (the list was never
    # read; `goods['subGoodsList']` is iterated directly) and the per-field
    # debug prints left over from development.
    for goods in goods_list:
        start_time = goods['startTime']
        end_time = goods['endTime']
        for item in goods['subGoodsList']:
            goods_code = item['goodsCode']
            image_url = get_imageUrl(item['goodsUrl'])
            yield ProductInfo(
                name=item['goodsName'],
                start_time=start_time,
                end_time=end_time,
                category='',
                shop_code='7',
                ch_no='20',
                shop_prod_id=goods_code,
                product_id='003711' + goods_code,
                detail_product_url="http://www.kshop.co.kr/goods/" + goods_code,
                image=image_url,
                price=str(item['saleDcAmt']),
            )
def parse_table(rows, window_arrow):
    """Walk schedule-table rows and yield a ProductInfo per goods cell.

    A row's time ('td.dateTime' > 'span.time') carries over to following rows
    until a new non-empty time appears, so one time span may cover several
    products. ``window_arrow`` is accepted for interface parity but unused.
    """
    current_time = ''
    category = ''
    for row in rows:
        sell_span = row.find('span', {'class': 'sell'})
        price = get_text_from_child(sell_span) if sell_span else ''

        time_cell = row.find('td', {'class': 'dateTime'})
        if time_cell:
            # Keep the previous time when this row's time span is empty.
            current_time = get_text_from_child(
                time_cell.find('span', {'class': 'time'})) or current_time

        goods_cell = row.find('td', {'class': 'goods'})
        if not goods_cell:
            continue

        anchor = goods_cell.find('div', {'class': 'text'}).find('a')
        product_url = anchor['href']
        prod_id = extract_value_from_url_key(product_url, 'goods_code')
        yield ProductInfo(
            name=get_text_from_child(anchor),
            start_time=current_time.split(' ~ ')[0],
            end_time=current_time.split(' ~ ')[1],
            category=category,
            price=price,
            image=get_image_url_by_prod_id(prod_id),
            product_id=prod_id,
            detail_product_url=product_url,
        )
def crawling_page(window_arrow, is_yesterday=False):
    """Scrape one day's CJ Mall TV schedule and return a list of ProductInfo.

    window_arrow: arrow-like date; formatted YYYYMMDD for the week page URL
        and YYYY/MM/DD to locate the day column in the table header.
    is_yesterday: when True keep only rows with hour <= 5 (the tail of the
        previous broadcast day); when False skip the 00-05h rows instead.

    Returns [] when the requested date is not found in the header row (the
    original counter loop would then index past the end of each row and
    raise IndexError).
    """
    start_date = window_arrow.format('YYYYMMDD')
    url = "http://www.cjmall.com/etv/broad/schedule_list_week_iframe.jsp?start_date=" + start_date
    page_source = build_soup(url)
    table_source = page_source.findAll('tr')
    table_date = window_arrow.format('YYYY/MM/DD')
    week_days = table_source[0].findAll('td')

    # Locate the column for the requested day; bail out when missing.
    index_for_the_day = next(
        (i for i, week_day in enumerate(week_days) if table_date in str(week_day)),
        None,
    )
    if index_for_the_day is None:
        return []

    product_list = []
    for column in table_source[1:]:
        hour = int(column.findAll('th')[0].text)
        # Rows for hours 00-05 belong to the previous broadcast day.
        if not is_yesterday and 0 <= hour <= 5:
            continue
        if is_yesterday and hour > 5:
            continue
        item = column.findAll('td')[index_for_the_day]
        for category, the_time, parsed_name in parsing_item(item):
            product_list.append(
                ProductInfo(
                    name=parsed_name,
                    start_time=the_time,
                    category=category,
                ))
    return product_list
def parse_table(rows):
    """Parse schedule rows (Hyundai Home Shopping, ch_no '10') into ProductInfo.

    The time/category header cell ('th') carries over to subsequent rows until
    a new header appears. Every row yields a ProductInfo, with unresolved
    fields left as ''.

    NOTE(review): assumes the_time looks like 'HH:MM ~ HH:MM'; a row seen
    before any header would make the end_time split raise IndexError.
    """
    the_time = ''
    category = ''
    for row in rows:
        product_link = ''
        image_url = ''
        product_name = ''
        product_id = ''
        price = ''

        time_item = row.find('th')
        if time_item:
            the_time = get_text_from_child(time_item)
            category = get_text_from_child(time_item.find('span'))
        # Removed: `category = row.get('th', {}).get('span', '') or category`
        # -- Tag.get() reads tag *attributes*, so that expression always
        # evaluated to '' and the line was a no-op.

        # Drill down through optional wrappers; each step falls back to the
        # previous node when the wrapper is absent.
        product_item = row.find('td')
        product_item = product_item.find('div', {'class': 'layerUp'}) or product_item
        price_item = product_item.find('dl', {'class': 'price'})
        product_item = product_item.find('dl', {'class': 'pdtTxts'}) or product_item
        product_item = product_item.find('dd', {'class': 'txt'}) or product_item

        product_link_item = product_item.find('a')
        if product_link_item:
            product_raw_link = product_link_item['href']
            product_name = get_text_from_child(product_link_item)
            product_id = extract_value_from_url_key(product_raw_link, 'slitmCd')
            if product_id:
                product_link = get_detail_prod_url_by_prod_id(product_id)
                # NOTE(review): helper name has a typo ('bu' for 'by') but is
                # kept as-is -- it must match its definition elsewhere.
                image_url = get_image_url_bu_prod_id(product_id)

        if price_item:
            price_item = price_item.find('span', {'class': 'txtStrong'})
            if price_item:
                price = get_text_from_child(price_item)

        yield ProductInfo(
            name=product_name,
            start_time=the_time.split(' ~ ')[0],
            end_time=the_time.split(' ~ ')[1],
            category=category,
            shop_code='7',
            ch_no='10',
            shop_prod_id=product_id,
            product_id='001811' + product_id,
            detail_product_url=product_link,
            image=image_url,
            price=price,
        )
def gs_shop(window_arrow):
    """Scrape the GS Shop TV schedule page for the given day.

    window_arrow: arrow-like date; formatted YYYYMMDD into the schedule URL.

    Returns a list of ProductInfo for rows that have a category, a product
    name, and a resolvable product id; other rows are skipped.
    """
    print('GS MALL')
    url = 'http://with.gsshop.com/tv/tvScheduleMain.gs?lseq=397357&selectDate={0}'.format(
        window_arrow.format('YYYYMMDD'))
    soup = build_soup(url)

    product_list = []
    for table in soup.findAll('table'):
        # Times carry over between rows until a new time cell appears.
        the_time = ''
        for row in table.findAll('tr'):
            the_time = parsing_td_time(
                row.find_all('td', {'class': 'time'})) or the_time
            category, product = parsing_td_desc(
                row.find_all('td', {'class': 'desc'}))
            price = parsing_td_price(row.find_all('td', {'class': 'price'}))
            the_id = parsing_td_pic(row.find_all('td', {'class': 'pic'}))

            # Skip incomplete rows before doing any URL/image lookups (the
            # original guarded the lookups with `if the_id else ''` and only
            # skipped id-less rows afterwards -- dead work).
            if not category or not product or not the_id:
                continue

            product_list.append(
                ProductInfo(
                    name=product,
                    start_time=the_time.split('-')[0],
                    end_time=the_time.split('-')[1],
                    shop_code='7',
                    ch_no='8',
                    category=category,
                    price=price,
                    image=get_image_url_by_prod_id(the_id),
                    product_id='000711' + the_id,
                    shop_prod_id=the_id,
                    detail_product_url=get_product_detail_url_by_prod_id(the_id),
                ))
    return product_list
def lotte_home_shopping(window_arrow):
    """Drive a Firefox session over the Lotte iMall TV schedule page and
    yield a ProductInfo per item box for the day matching window_arrow.

    window_arrow: arrow-like date; its 'MM.DD' form is matched against the
        day-tab labels on the page. Yields nothing when that day tab is absent.

    Fix: the browser is now closed in a finally block -- the original leaked
    a Firefox process on every call, including the early-return path. As a
    generator, the page is only scraped when iterated, and the driver is quit
    when the generator is exhausted or closed.
    """
    print('LOTTE')
    url = "http://www.lotteimall.com/main/viewMain.lotte?dpml_no=6&tab=3&tlog=19000_2"
    driver = webdriver.Firefox()
    try:
        driver.get(url)
        date_format = window_arrow.format('MM.DD')
        wait_for_condition(driver, By.XPATH, "//span[@class='rn_day']", 5)

        # Find and activate the tab for the requested day.
        the_day_element = None
        for element in driver.find_elements_by_xpath("//span[@class='rn_day']"):
            if element.text == date_format:
                the_day_element = element
                break
        if not the_day_element:
            return
        the_day_element.click()

        # Expand earlier broadcasts when the "show previous" link is present.
        if wait_for_condition(driver, By.LINK_TEXT, "이전 방송상품 보기", 2):
            prev_see_item = driver.find_element_by_link_text("이전 방송상품 보기")
            try:
                if prev_see_item.is_displayed():
                    driver.find_element_by_link_text("이전 방송상품 보기").click()
            except StaleElementReferenceException:
                # The link can go stale right after the day click; best-effort.
                pass

        html = driver.execute_script(
            "return document.getElementsByTagName('html')[0].innerHTML")
        soup = build_soup_from_page(html)

        item_list = soup.find('div', {'class': 'rn_tsitem_list'}).find_all(
            'div', {'class': 'rn_tsitem_box'})
        for item in item_list:
            prod_id = ''
            title = ''
            price = ''

            time_item = item.find(
                'div', {'class': 'rn_tsitem_caption'}).find('span')
            the_time = get_text_from_child(time_item)

            # NOTE(review): this scans the whole *page*, not just `item`, so
            # every iteration keeps the last matching image's id -- looks like
            # it should be item.find_all(...). Preserved as-is; confirm
            # against live markup before changing.
            view_list = soup.find_all('div', {'class': 'rn_tsitem_view'})
            for view_item in view_list:
                image = view_item.find('img')
                if image:
                    prod_id = image['src'].split('/')[-1].split('_')[0]

            info_item = item.find('div', {'class': 'rn_tsitem_info'})
            title_item = info_item.find('a')
            if title_item:
                title = get_text_from_child(title_item)
            price_info_item = info_item.find(
                'div', {'class': 'rn_tsitem_priceinfo'})
            if price_info_item:
                price_item = price_info_item.find(
                    'span', {'class': 'rn_tsitem_price'})
                if price_item:
                    price = get_text_from_child(price_item)

            yield ProductInfo(
                name=title,
                category='',
                start_time=the_time.split(' ~ ')[0],
                end_time=the_time.split(' ~ ')[1],
                detail_product_url=get_prod_url_by_prod_id(prod_id),
                price=price,
                product_id=prod_id,
                image=get_image_url_by_prod_id(prod_id),
            )
    finally:
        driver.quit()