Code example #1
File: crawl_k_shop.py  Project: leechae0/crawler
def k_shop(window_arrow):
    print(' K SHOP')

    url = 'http://apitest.skymall.co.kr/apiv4/tv/today-goods-list?'
    params = {
        'mediaCode': 'TV07',
        'uk': 'TT131127012',
        'ck': '20170417145200',
        'nolog': 'EPG'
    }
    response = requests.get(
        url,
        params=params,
        timeout=7.0,
    )

    response_json = response.json()

    #print(response_json)

    goodList = response_json['data']['goodsList']

    for goods in goodList:
        startTime = goods['startTime']
        endTime = goods['endTime']

        # startTime = window_arrow
        # endTime = window_arrow
        product_list = goods['subGoodsList']
        for item in product_list:
            goodUrl = item['goodsUrl']
            goodsCode = item['goodsCode']
            goodsName = item['goodsName']
            saleDcAmt = item['saleDcAmt']

            imageUrl = get_imageUrl(goodUrl)

            print("startTime    : " + startTime)
            print("endTime      : " + endTime)
            print("goodName     : " + goodsName)
            print("goodCode     : " + goodsCode)
            print("imageUrl     : " + imageUrl)
            print("saleDcAmt    : " + str(saleDcAmt))
            print("detailURl    : " + "http://www.kshop.co.kr/goods/" +
                  goodsCode)
            yield ProductInfo(
                name=goodsName,
                start_time=startTime,
                end_time=endTime,
                category='',
                shop_code='7',
                ch_no='20',
                shop_prod_id=goodsCode,
                product_id='003711' + goodsCode,
                detail_product_url="http://www.kshop.co.kr/goods/" + goodsCode,
                image=imageUrl,
                price=str(saleDcAmt),
            )
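The ProductInfo container yielded by these crawlers is defined elsewhere in the project. A minimal sketch of what it might look like, inferred from the keyword arguments used across the examples (the field names come straight from the calls; modelling it as a dataclass with empty-string defaults is an assumption, since some crawlers omit shop_code, ch_no, and so on):

from dataclasses import dataclass

@dataclass
class ProductInfo:
    # Hypothetical reconstruction; the project's real class may differ.
    name: str = ''
    start_time: str = ''
    end_time: str = ''
    category: str = ''
    shop_code: str = ''
    ch_no: str = ''
    shop_prod_id: str = ''
    product_id: str = ''
    detail_product_url: str = ''
    image: str = ''
    price: str = ''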
Code example #2
def parse_table(rows, window_arrow):
    the_time = ''
    category = ''
    for row in rows:
        price = ''
        price_item = row.find('span', {'class': 'sell'})
        if price_item:
            price = get_text_from_child(price_item)

        td_time_item = row.find('td', {'class': 'dateTime'})
        if td_time_item:
            span_time_item = td_time_item.find('span', {'class': 'time'})
            the_time = get_text_from_child(span_time_item) or the_time

        td_goods_item = row.find('td', {'class': 'goods'})
        if td_goods_item:
            product_item = td_goods_item.find('div', {
                'class': 'text'
            }).find('a')
            product_url = product_item['href']
            prod_id = extract_value_from_url_key(product_url, 'goods_code')
            product_name = get_text_from_child(product_item)
            image = get_image_url_by_prod_id(prod_id)
            yield ProductInfo(
                name=product_name,
                start_time=the_time.split(' ~ ')[0],
                end_time=the_time.split(' ~ ')[1],
                category=category,
                price=price,
                image=image,
                product_id=prod_id,
                detail_product_url=product_url,
            )
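Several of the examples lean on a get_text_from_child() helper that is not shown here. A plausible minimal version, assuming it simply returns the stripped text of a BeautifulSoup tag and tolerates a missing tag (the project's real helper may differ):

def get_text_from_child(tag):
    # Assumed behaviour: '' for a missing tag, otherwise the tag's stripped text.
    if tag is None:
        return ''
    return tag.get_text(strip=True)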
Code example #3
def crawling_page(window_arrow, is_yesterday=False):
    start_date = window_arrow.format('YYYYMMDD')
    url = "http://www.cjmall.com/etv/broad/schedule_list_week_iframe.jsp?start_date=" + start_date
    page_source = build_soup(url)
    table_source = page_source.findAll('tr')
    table_date = window_arrow.format('YYYY/MM/DD')
    week_days = table_source[0].findAll('td')
    # Locate which column of the weekly schedule table matches the requested date.
    index_for_the_day = 0
    for week_day in week_days:
        if table_date in str(week_day):
            break
        index_for_the_day += 1

    product_list = []
    for column in table_source[1:]:
        hour = int(column.findAll('th')[0].text)
        # Early-morning rows (00-05h) are collected only when is_yesterday is
        # set; all other hours only on the regular pass.
        if not is_yesterday and 0 <= hour <= 5:
            continue
        if is_yesterday and hour > 5:
            continue

        item_for_week_days = column.findAll('td')
        item = item_for_week_days[index_for_the_day]
        for category, the_time, parsed_name in parsing_item(item):
            product_list.append(
                ProductInfo(
                    name=parsed_name,
                    start_time=the_time,
                    category=category,
                ))
    return product_list
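window_arrow is formatted with format('YYYYMMDD') and format('YYYY/MM/DD'), which matches the Arrow library's API. A hypothetical call site under that assumption (the real caller is not shown in the source):

import arrow

# Regular pass: today's schedule, skipping the 00-05h rows.
products = crawling_page(arrow.now())
# Early-morning pass: only the 00-05h rows are kept when is_yesterday is set.
early_products = crawling_page(arrow.now(), is_yesterday=True)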
Code example #4
def parse_table(rows):
    the_time = ''
    category = ''
    for row in rows:
        product_link = ''
        image_url = ''
        product_name = ''
        product_id = ''
        price = ''
        time_item = row.find('th')
        if time_item:
            the_time = get_text_from_child(time_item)
            category = get_text_from_child(time_item.find('span'))
        product_item = row.find('td')
        product_item = product_item.find('div',
                                         {'class': 'layerUp'}) or product_item
        price_item = product_item.find('dl', {'class': 'price'})
        product_item = product_item.find('dl',
                                         {'class': 'pdtTxts'}) or product_item
        product_item = product_item.find('dd',
                                         {'class': 'txt'}) or product_item
        product_link_item = product_item.find('a')
        if product_link_item:
            product_raw_link = product_link_item['href']
            product_name = get_text_from_child(product_link_item)
            product_id = extract_value_from_url_key(product_raw_link,
                                                    'slitmCd')
            if product_id:
                product_link = get_detail_prod_url_by_prod_id(product_id)
                image_url = get_image_url_by_prod_id(product_id)

        if price_item:
            price_item = price_item.find('span', {'class': 'txtStrong'})
            if price_item:
                price = get_text_from_child(price_item)

        # yield the_time.split(' ~ ')[0], category, ''
        yield ProductInfo(
            name=product_name,
            start_time=the_time.split(' ~ ')[0],
            end_time=the_time.split(' ~ ')[1],
            category=category,
            shop_code='7',
            ch_no='10',
            shop_prod_id=product_id,
            product_id='001811' + product_id,
            detail_product_url=product_link,
            image=image_url,
            price=price,
        )
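extract_value_from_url_key() pulls a single query-string parameter (goods_code in example #2, slitmCd here) out of a product URL. A minimal standard-library sketch under that assumption; the project's real helper may differ:

from urllib.parse import urlparse, parse_qs

def extract_value_from_url_key(url, key):
    # Return the first value of the given query parameter, or '' if absent.
    values = parse_qs(urlparse(url).query).get(key, [])
    return values[0] if values else ''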
Code example #5
def gs_shop(window_arrow):
    print('GS MALL')
    url = 'http://with.gsshop.com/tv/tvScheduleMain.gs?lseq=397357&selectDate={0}'.format(
        window_arrow.format('YYYYMMDD'))
    soup = build_soup(url)
    tables = soup.findAll('table')
    product_list = []
    for table in tables:
        rows = table.findAll('tr')
        the_time = ''
        for row in rows:
            column_times = row.find_all('td', {'class': 'time'})
            column_descs = row.find_all('td', {'class': 'desc'})
            column_prices = row.find_all('td', {'class': 'price'})
            column_pics = row.find_all('td', {'class': 'pic'})
            the_time = parsing_td_time(column_times) or the_time
            category, product = parsing_td_desc(column_descs)
            price = parsing_td_price(column_prices)
            the_id = parsing_td_pic(column_pics)
            if not category or not product:
                continue
            if not the_id:
                continue
            image_url = get_image_url_by_prod_id(the_id)
            prod_detail_url = get_product_detail_url_by_prod_id(the_id)

            product_list.append(
                ProductInfo(
                    name=product,
                    start_time=the_time.split('-')[0],
                    end_time=the_time.split('-')[1],
                    shop_code='7',
                    ch_no='8',
                    category=category,
                    price=price,
                    image=image_url,
                    product_id='000711' + the_id,
                    shop_prod_id=the_id,
                    detail_product_url=prod_detail_url,
                ))
    return product_list
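build_soup() is another project helper that is not shown. A plausible minimal version, assuming it simply fetches the URL and parses the HTML with BeautifulSoup (the real helper may add headers, retries, or encoding handling):

import requests
from bs4 import BeautifulSoup

def build_soup(url):
    # Assumed behaviour: GET the page and return a parsed soup.
    response = requests.get(url, timeout=7.0)
    return BeautifulSoup(response.text, 'html.parser')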
Code example #6
def lotte_home_shopping(window_arrow):
    print('LOTTE')
    url = "http://www.lotteimall.com/main/viewMain.lotte?dpml_no=6&tab=3&tlog=19000_2"
    driver = webdriver.Firefox()
    driver.get(url)
    # soup = build_soup(driver.page_source)

    date_format = window_arrow.format('MM.DD')
    wait_for_condition(driver, By.XPATH, "//span[@class='rn_day']", 5)
    the_day_element = None
    for element in driver.find_elements_by_xpath("//span[@class='rn_day']"):
        if element.text == date_format:
            the_day_element = element
            break
    if not the_day_element:
        return
    the_day_element.click()

    # "이전 방송상품 보기" is the "view previous broadcast products" link.
    if wait_for_condition(driver, By.LINK_TEXT, "이전 방송상품 보기", 2):
        prev_see_item = driver.find_element_by_link_text("이전 방송상품 보기")
        try:
            if prev_see_item.is_displayed():
                driver.find_element_by_link_text("이전 방송상품 보기").click()
        except StaleElementReferenceException:
            pass

    html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
    soup = build_soup_from_page(html)

    item_list = soup.find('div', {'class': 'rn_tsitem_list'}).find_all('div', {'class': 'rn_tsitem_box'})
    for item in item_list:
        prod_id = ''
        the_time = ''
        title = ''
        price = ''
        prod_detail_url = ''
        image_url = ''

        time_item = item.find('div', {'class': 'rn_tsitem_caption'}).find('span')
        the_time = get_text_from_child(time_item)

        # Look within this item (not the whole page) for its image and derive
        # the product id from the image file name.
        view_list = item.find_all('div', {'class': 'rn_tsitem_view'})
        for view_item in view_list:
            image = view_item.find('img')
            if image:
                prod_id = image['src'].split('/')[-1].split('_')[0]

        info_item = item.find('div', {'class': 'rn_tsitem_info'})
        title_item = info_item.find('a')
        if title_item:
            title = get_text_from_child(title_item)
        price_info_item = info_item.find('div', {'class': 'rn_tsitem_priceinfo'})
        if price_info_item:
            price_item = price_info_item.find('span', {'class': 'rn_tsitem_price'})
            if price_item:
                price = get_text_from_child(price_item)

        prod_detail_url = get_prod_url_by_prod_id(prod_id)
        image_url = get_image_url_by_prod_id(prod_id)

        yield ProductInfo(
            name=title,
            category='',
            start_time=the_time.split(' ~ ')[0],
            end_time=the_time.split(' ~ ')[1],
            detail_product_url=prod_detail_url,
            price=price,
            product_id=prod_id,
            image=image_url,
        )
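wait_for_condition() reads like a thin wrapper around Selenium's explicit waits, given that its return value is used as a boolean above. A plausible sketch under that assumption (the real helper may wait on a different condition):

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_condition(driver, by, value, timeout):
    # Wait until at least one matching element is present; return a boolean
    # instead of raising on timeout.
    try:
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((by, value)))
        return True
    except TimeoutException:
        return False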