Example #1
def get_all_page_urls(pageKeyDic, page_urls, all_page_number):
    url_0 = page_urls[0]
    url = page_urls[1]
    url_pageKeyDic = pageKeyDic
    # print url
    previous_attrs_value_dict = {}
    all_url_list = []
    for i in range(0, all_page_number + 1):
        current_url = url
        for key, value in url_pageKeyDic.items():
            if isinstance(value[1], dict) is True:
                value0_list = list(value[0])
                value0_list_len = len(value0_list)
                ch = " "
                for ch_index in range(0, value0_list_len):
                    if (value0_list[ch_index].isdigit() is False):
                        ch = value0_list[ch_index]
                        value0_list[ch_index] = " "

                value0_list_splited = "".join(value0_list).split(" ")
                value0_list_len = len(value0_list_splited)
                for index in range(0, value0_list_len):
                    if (value[1].has_key(index)):
                        # print "???"
                        if (i == 0):
                            previous_attrs_value_dict[
                                index] = value0_list_splited[index]

                        value0_list_splited[index] = str(
                            int(previous_attrs_value_dict[index]) +
                            int(value[1][index]))

                        previous_attrs_value_dict[index] = value0_list_splited[
                            index]
                        # print previous_attrs_value_dict[index]

                res_value = ch.join(value0_list_splited)
                current_url = current_url.replace(("%s=%s") % (key, value[0]),
                                                  ("%s=%s") % (key, res_value))

            else:
                if (i == 0):
                    previous_attrs_value_dict[key] = int(value[0])
                current_url = current_url.replace(
                    ("%s=%s") % (key, value[0]), ("%s=%s") %
                    (key, int(value[1]) + previous_attrs_value_dict[key]))
                previous_attrs_value_dict[key] = int(
                    value[1]) + previous_attrs_value_dict[key]

        if (get_url_domain(url) not in current_url):
            current_url = url_sifter(get_partial_url(url_0), current_url)

        all_url_list.append(current_url)
    return all_url_list
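
A minimal standalone sketch of the idea behind get_all_page_urls: step a numeric query parameter to produce successive page URLs. The parameter name ("page") and the step size are assumptions for illustration only.

# Sketch: generate page URLs by stepping a numeric query parameter.
# The parameter name ("page") and the step of 1 are assumptions.
try:
    from urllib.parse import urlsplit, urlunsplit, parse_qs, urlencode  # Python 3
except ImportError:
    from urlparse import urlsplit, urlunsplit, parse_qs                 # Python 2
    from urllib import urlencode

def step_page_param(url, param="page", step=1, count=3):
    parts = urlsplit(url)
    query = parse_qs(parts.query)
    start = int(query.get(param, ["1"])[0])
    urls = []
    for i in range(1, count + 1):
        query[param] = [str(start + i * step)]
        urls.append(urlunsplit((parts.scheme, parts.netloc, parts.path,
                                urlencode(query, doseq=True), parts.fragment)))
    return urls

# step_page_param("http://example.com/list?cat=5&page=1")
# -> URLs with page=2, page=3, page=4
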
def get_nav_in_url(soup, url, parser_method):
    allCategory_page_url = get_allCategory_from_Key(soup=soup)

    # Method 1: get the all-categories page
    if (allCategory_page_url != None
            and "javascript" not in allCategory_page_url):
        # log_util.error("All-categories page: " + allCategory_page_url)

        allCategory_page_url = url_sifter(url, allCategory_page_url)
        # print ("All-categories page: " + allCategory_page_url)
        # print ("Parser method: %d,%d" % (parser_method, 1))
        # if(method == 2):
        #     next_soup = get_soup_by_selenium_with_sleep(allCategory_page_url)
        # else:next_soup = get_soup_by_request(url)

        a_url_list = category_page_parser(allCategory_page_url, url,
                                          parser_method)
        return 1, a_url_list
    else:
        nav = get_nav_by_class_nav(soup, url)
        if nav == None:
            nav = get_nav_by_tag_ul(soup, url)
            # print nav
            way_number = 3
        else:
            way_number = 2

        if nav == None:
            return -1, None
        else:
            # print ("Parser method: %d,%d" % (parser_method, way_number))
            return way_number, get_aTag_url_integration(nav, url)
def get_aTag_url_integration(original_data, domain):
    tmp_soup = get_soup_by_html_source(str(original_data))
    a_list = tmp_soup.find_all("a")
    # print (a_list)
    a_url_res = []
    for tag in a_list:
        a_url_res.append([tag.text, url_sifter(domain, tag.get("href"))])

    # print (a_url_res)
    return a_url_res
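
get_aTag_url_integration boils down to collecting (link text, absolute href) pairs from every <a> tag. A minimal sketch, with urljoin standing in for the project's url_sifter helper:

# Sketch: collect (link text, absolute URL) pairs from anchor tags.
# urljoin stands in for url_sifter, which is defined elsewhere in this project.
from bs4 import BeautifulSoup
try:
    from urllib.parse import urljoin  # Python 3
except ImportError:
    from urlparse import urljoin      # Python 2

def collect_anchor_urls(html, base_url):
    soup = BeautifulSoup(html, "lxml")
    pairs = []
    for tag in soup.find_all("a"):
        href = tag.get("href")
        if href:  # skip anchors without an href
            pairs.append([tag.text.strip(), urljoin(base_url, href)])
    return pairs

# collect_anchor_urls('<a href="/shop/1">Shop</a>', "http://example.com")
# -> [['Shop', 'http://example.com/shop/1']]
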
Example #4
def analysis_by_tag(goods_list_tag, url):
    detail_url_set = set()
    for each_tag in goods_list_tag.contents:

        if (each_tag.name != None):

            current_url_list = []
            for inner_tag in each_tag.descendants:
                if (inner_tag.name != None and inner_tag.name == 'a'):
                    try:
                        detail_url = url_sifter(url=inner_tag['href'], parent_url=url)
                        if ('javascript' not in detail_url and 'list' not in detail_url and 'search' not in detail_url
                            and detail_url not in current_url_list and ' ' not in detail_url and 'cart' not in detail_url):
                            current_url_list_len = len(current_url_list)

                            check_flag = True
                            for i in range(0, current_url_list_len):
                                if (detail_url in current_url_list[i]):
                                    current_url_list[i] = detail_url
                                    check_flag = False
                                    break
                                elif (current_url_list[i] in detail_url):
                                    check_flag = False
                                    break
                            if (check_flag == True):
                                current_url_list.append(detail_url)
                    except:
                        pass
            detail_url_set = detail_url_set | (set(current_url_list))

    res_detail_urls_list = urls_clustering(list(detail_url_set))

    # pprint.pprint(res_detail_urls_list)

    res_max_len = -1
    res_max_list = []
    for i in res_detail_urls_list:
        i_len = len(i)
        if (res_max_len <= i_len):
            res_max_len = i_len
            res_max_list = i

    # debug
    pprint.pprint(res_max_list)
    # urls_clustering(res_max_list)
    # print len(res_max_list)

    return res_max_list
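
analysis_by_tag relies on urls_clustering (defined elsewhere) and then keeps the largest cluster. A rough sketch of that idea, assuming clustering simply groups URLs whose paths share the same shape once numeric IDs are normalized:

# Sketch: group URLs by a normalized path pattern and keep the largest group.
# This only approximates urls_clustering; the real helper lives elsewhere.
import re
from collections import defaultdict

def cluster_and_pick_largest(urls):
    groups = defaultdict(list)
    for u in urls:
        groups[re.sub(r"\d+", "N", u)].append(u)  # normalize numeric IDs
    return max(groups.values(), key=len) if groups else []

# cluster_and_pick_largest([
#     "http://example.com/item/101.html",
#     "http://example.com/item/102.html",
#     "http://example.com/help/about.html",
# ])
# -> the two /item/ URLs
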
def get_categoryList_method_in_index_url(url):
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, 'lxml')

    allCategory_page_url = get_allCategory_from_Key(soup=soup)
    # method = 1
    if (allCategory_page_url != None):
        """
        At this point we actually move on to the next page; normally a callback is written here.
        """
        allCategory_page_url = url_sifter(url, allCategory_page_url)
        # print (allCategory_page_url)
        url_list = category_page_parser(allCategory_page_url)
        # print (len(url_list))

    else:
        pass
def deep_search_get_searchUrl_and_keyword_in_soup(soup, url):
    res_url = None
    res_key = None
    res_method = ""
    for a in soup.find_all('a'):
        try:
            next_url = a.get('href')
            # and quote(a.text) in next_url
            http_code_key = quote(a.text.encode('utf-8'))
            original_key = a.text
            if (next_url != None and 'javascript' not in next_url
                    and http_code_key != None and http_code_key != ''
                    and original_key != None and original_key != ''):
                if ('search' in next_url):

                    if (original_key in next_url):
                        res_url = next_url
                        res_key = original_key
                        res_method = "ORIGINALKEY"
                        break
                    if (http_code_key in next_url):
                        res_url = next_url
                        res_key = http_code_key
                        res_method = "HTTPENCODEKEY"
                        break

                    re_str = '(%[\w\d]{2,4}\d*)+'
                    # print next_url
                    if (re.search(re_str, next_url)):
                        res_key = re.search(re_str, next_url).group()
                        res_url = next_url
                        res_method = "REGULARHTTP"

                        break
        except:
            pass

    # print res_url
    return [url_sifter(parent_url=url, url=res_url), res_key, res_method]
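
The ORIGINALKEY / HTTPENCODEKEY branches above test whether an anchor's text appears in its href either raw or percent-encoded. A condensed sketch of just that check:

# Sketch: classify a search link by whether its anchor text appears in the
# href raw (ORIGINALKEY) or percent-encoded (HTTPENCODEKEY).
try:
    from urllib.parse import quote  # Python 3
except ImportError:
    from urllib import quote        # Python 2

def classify_search_link(text, href):
    if 'search' not in href:
        return None
    if text in href:
        return "ORIGINALKEY"
    if quote(text.encode('utf-8')) in href:
        return "HTTPENCODEKEY"
    return None

# classify_search_link(u"手机", "/search?q=%E6%89%8B%E6%9C%BA") -> "HTTPENCODEKEY"
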
Example #7
def get_pageUrls_and_all_pageNumber(url):
    driver = get_webdriver()
    attemps = 0

    ATTEMPS_TIMES = 3  # retry up to 3 times on failure
    page_url_list = []
    all_page_numer = -1
    while (attemps < ATTEMPS_TIMES):
        driver.get(url)
        time.sleep(3)
        # print(driver.page_source)
        soup = get_soup_by_html_source(driver.page_source)
        is_find_page3_url = False
        # We are on the first page: find page 2 in the pagination, then use it to locate the URL of page 3
        element_2_list = soup.find_all("a", text="2")
        number_to_url_dic = {}
        for elem in element_2_list:
            find_parent_times = 0
            while (find_parent_times < 4 and is_find_page3_url is False):
                # descendants_list = []
                if (find_parent_times == 0):
                    elem_parent = elem.parent
                    descendants_list = elem_parent.contents
                elif (find_parent_times == 1):
                    elem_parent = elem.parent
                    descendants_list = elem_parent.descendants
                else:
                    elem_ancestor = elem
                    for up_times in range(0, find_parent_times):
                        elem_ancestor = elem_ancestor.parent

                    descendants_list = elem_ancestor.descendants

                is_find_number = False
                for descendant in descendants_list:
                    if (descendant.name != None and descendant.name == 'a'):
                        if descendant.text == '3':
                            number_to_url_dic['2'] = elem.get("href")
                            number_to_url_dic['3'] = descendant.get("href")
                            number_to_url_dic['attrs_dic2'] = elem.attrs
                            number_to_url_dic['attrs_dic3'] = descendant.attrs
                            is_find_page3_url = True
                            # print (elem.get("href"))

                        if (descendant.name != None
                                and descendant.name == 'a'):
                            if descendant.text == '3':
                                is_find_number = True
                            elif (is_find_number
                                  and descendant.text.isdigit()):

                                # print descendant.text
                                all_page_numer = max(int(descendant.text),
                                                     all_page_numer)
                        if (is_find_number and descendant.name != None):
                            allpage_text = descendant.parent.parent.parent.text
                            try:
                                tmp_number = int(
                                    re.search(
                                        "\d+",
                                        re.search(
                                            u"\d+\s*页",
                                            allpage_text).group()).group())
                                all_page_numer = max(tmp_number,
                                                     all_page_numer)
                            except:
                                pass
                                # print tmp_number

                find_parent_times += 1

            if is_find_page3_url and all_page_numer != -1:
                break
        # try:
        url_2 = number_to_url_dic['2']
        url_3 = number_to_url_dic['3']
        """
        Handle fake URLs: some hrefs are just "#" or "javascript:...". In that case the
        webdriver has to navigate dynamically so we can read current_url.
        """
        if (url_2.lower() == url_3.lower()):
            url_2 = get_url_by_attrs_dic(driver,
                                         number_to_url_dic["attrs_dic2"])
            url_3 = get_url_by_attrs_dic(driver,
                                         number_to_url_dic["attrs_dic3"])

            print "debug:%s" % url_2
            if (url_2.lower() == url_3.lower()):
                return None

            page_url_list = [
                url, url_sifter(url, url_2),
                url_sifter(url, url_3)
            ]
            break
        else:
            page_url_list = [
                url, url_sifter(url, url_2),
                url_sifter(url, url_3)
            ]
            break

    driver.close()

    return all_page_numer, page_url_list
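
One part of the function above extracts the total page count from pagination text such as u"共 100 页". That step in isolation, with hypothetical input:

# Sketch: pull the total page number out of pagination text like u"… 共 100 页"
# ("100 pages in total"); returns -1 when no such text is found.
import re

def extract_total_pages(pagination_text):
    match = re.search(u"\\d+\\s*页", pagination_text)
    if match:
        return int(re.search(r"\d+", match.group()).group())
    return -1

# extract_total_pages(u"1 2 3 ... 下一页 共 100 页") -> 100
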
Example #8
def get_next_urlList_by_firstpage_url(url):
    driver = webdriver.PhantomJS()
    # driver = get_webdriver()
    attemps = 0

    ATTEMPS_TIMES = 3  # retry up to 3 times on failure
    FAILUED_STRING = "FAILUED_STRING"
    page_url_list = []
    while (attemps < ATTEMPS_TIMES):
        driver.get(url)
        time.sleep(3)
        print driver.current_url
        # print(driver.page_source)
        soup = get_soup_by_html_source(driver.page_source)
        is_find_page3_url = False
        # We are on the first page: find page 2 in the pagination, then use it to locate the URL of page 3
        element_2_list = soup.find_all("a", text="2")
        number_to_url_dic = {}
        for elem in element_2_list:
            find_parent_times = 0
            while (find_parent_times < 4 and is_find_page3_url is False):
                # descendants_list = []
                if (find_parent_times == 0):
                    elem_parent = elem.parent
                    descendants_list = elem_parent.contents
                elif (find_parent_times == 1):
                    elem_parent = elem.parent
                    descendants_list = elem_parent.descendants
                else:
                    elem_ancestor = elem
                    for up_times in range(0, find_parent_times):
                        elem_ancestor = elem_ancestor.parent

                    descendants_list = elem_ancestor.descendants

                for descendant in descendants_list:
                    if (descendant.name != None and descendant.name == 'a'):
                        if descendant.text == '3':
                            number_to_url_dic['2'] = elem.get("href")
                            number_to_url_dic['3'] = descendant.get("href")
                            number_to_url_dic['attrs_dic2'] = elem.attrs
                            number_to_url_dic['attrs_dic3'] = descendant.attrs
                            is_find_page3_url = True
                            # print (elem.get("href"))
                            print("-----------------------------")

                find_parent_times += 1

            if is_find_page3_url:
                break
        next_url_is_fake = False
        # try:
        url_2 = number_to_url_dic['2']
        url_3 = number_to_url_dic['3']
        """
        Handle fake URLs: some hrefs are just "#" or "javascript:...". In that case the
        webdriver has to navigate dynamically so we can read current_url.
        """
        if (url_2.lower() == url_3.lower()):
            url_2 = get_url_by_attrs_dic(driver,
                                         number_to_url_dic["attrs_dic2"])
            url_3 = get_url_by_attrs_dic(driver,
                                         number_to_url_dic["attrs_dic3"])

            print "debug:%s" % url_2

            # Parsing problem; this URL can be skipped
            if (url_2.lower() == url_3.lower()):
                return None

            page_url_list = [
                url, url_sifter(url, url_2),
                url_sifter(url, url_3)
            ]
            break
        else:
            page_url_list = [
                url, url_sifter(url, url_2),
                url_sifter(url, url_3)
            ]
            break
    driver.close()

    return page_url_list
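
Both pagination examples above special-case "fake" hrefs (e.g. "#" or "javascript:void(0)"), which force the crawler to click the element via the webdriver and read driver.current_url instead. A small sketch of that check:

# Sketch: detect hrefs that do not point to a real page; such links require a
# dynamic click through the webdriver rather than a plain HTTP request.
def is_fake_href(href):
    if not href:
        return True
    href = href.strip().lower()
    return href == "#" or href.startswith("javascript")

# is_fake_href("javascript:void(0)") -> True
# is_fake_href("/list?page=2")       -> False
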
Example #9
def category_page_parser(url, domain, parser_method):
    if parser_method == 1:
        soup = get_soup_by_request(url)
        # soup = BeautifulSoup(resq.text,"lxml")
    else:
        soup = get_soup_by_selenium_with_sleep(url)

    # print soup.prettify()

    tagPath_to_appearCount = {}
    tagPath_to_allTagInPath = {}

    max_appear_tag_path = ""
    max_appear_tag_number = 0

    for current_tag in soup.find_all("a"):

        # get 'tag-path' such as html/body/div/div/ul/li/a
        tag_path = get_tag_path(current_tag)

        # Has 'tag-path' been appeared
        if (tag_path in tagPath_to_appearCount):
            tagPath_to_appearCount[tag_path] += 1
            tagPath_to_allTagInPath[tag_path].append(current_tag)
        else:
            tagPath_to_appearCount[tag_path] = 1
            tagPath_to_allTagInPath[tag_path] = []
            tagPath_to_allTagInPath[tag_path].append(current_tag)

    sorted_tag_path_list = sorted(tagPath_to_appearCount.items(),
                                  key=lambda d: d[1],
                                  reverse=True)
    # for item in sort:
    #     print  "%s %s" % (sorted_tag_path_list[0], sorted_tag_path_list[1])

    # all_category = tagPath_to_allTagInPath[sorted_tag_path_list[0][0]]
    # category_res_list = []
    # category_name_set = set()

    # for tag in all_category:
    #     # if(category_name_set)
    #     # parent_deep =  1
    #     #
    #     # while(parent_deep <=3 and tag.text != None and len(tag.text)!=0):
    #     #
    #     #
    #     #
    #     #     parent_deep+=1
    #
    #
    #     print "-----------one menu----------------"
    #
    #     parent_tag = tag.parent
    #     # print parent_tag.text
    #
    #
    #
    #     parent_tag = parent_tag.parent
    #     # print parent_tag.text
    #     #
    #     parent_tag = parent_tag.parent
    #     print parent_tag.text
    #     print "-----------one menu----------------"
    #     # while parent_tag != None and parent_tag.name != None:
    #

    # parent_threshold_num = sorted_tag_path_list[int(len(sorted_tag_path_list)/3)][1]
    # category_menu_1_list = []
    #
    # print parent_threshold_num
    for key, value in tagPath_to_appearCount.items():
        # print key, ':', value
        if (max_appear_tag_number < value):
            max_appear_tag_number = value
            max_appear_tag_path = key

    all_category_tag_list = tagPath_to_allTagInPath[max_appear_tag_path]
    print(len(all_category_tag_list))

    a_url_list = []
    for tag in all_category_tag_list:
        # print tag.text
        a_url_list.append([tag.text, url_sifter(domain, tag.get("href"))])

    return a_url_list
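
The heart of category_page_parser is a vote over ancestor tag paths: the path shared by the most <a> tags is assumed to hold the category menu. A compact sketch using collections.Counter, with a simple stand-in for the project's get_tag_path helper:

# Sketch: count ancestor paths of <a> tags and return the most common one.
# tag_path is a simplified stand-in for get_tag_path.
from collections import Counter
from bs4 import BeautifulSoup

def tag_path(tag):
    names = [p.name for p in tag.parents if p.name != "[document]"]
    return "/".join(reversed(names)) + "/a"

def most_common_anchor_path(html):
    soup = BeautifulSoup(html, "lxml")
    counts = Counter(tag_path(a) for a in soup.find_all("a"))
    return counts.most_common(1)[0] if counts else None

# most_common_anchor_path("<ul><li><a>A</a></li><li><a>B</a></li></ul>")
# -> ('html/body/ul/li/a', 2)
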
Example #10
def analysis_by_tag_return_goods_message(goods_list_tag, url):

    # print goods_list_tag.name
    # print goods_list_tag['class']
    pic_size_regular = r'\d{2,}x\d{2,}'

    res_goods_list = []
    for each_tag in goods_list_tag.contents:
        res_pic_url = ''
        res_price = ''
        res_detail_url = ''
        res_title = ''
        max_title_len = -1
        max_pic_size = -1
        res_goods_dict = {}
        if (each_tag.name != None):

            for inner_tag in each_tag.descendants:

                """
                The goods list page itself carries some information; for now it is not
                scraped here (it is scraped on the goods detail page).

                The code below extracts the goods information.
                """
                if(inner_tag.name!=None and is_single_tag(inner_tag)):
                    # print inner_tag
                    is_in_some_attri = False

                    tag_text =  inner_tag.text.replace('\n',"")


                    #url
                    if(res_detail_url == ''):
                        try:
                            detail_url = url_sifter(url=inner_tag['href'], parent_url=url)
                            if ('javascript' not in detail_url and 'list' not in detail_url and 'search' not in detail_url
                                and detail_url and ' ' not in detail_url and 'cart' not in detail_url):

                                res_detail_url = detail_url
                                is_in_some_attri = True
                        except:
                            pass


                    # price
                    regular_str = '\d+\.+\d+'
                    re_res = re.search(regular_str,tag_text)
                    if(re_res and res_price== ''):
                        res_price =  re_res.group()


                    # look for product images
                    if(inner_tag.name == 'img'):
                        try:
                            pic_url = inner_tag['src']
                            if ('jpg' in pic_url or 'png' in pic_url or 'jpeg' in pic_url):
                                if(res_pic_url == ''):
                                    res_pic_url = pic_url
                                else:
                                    re_res = re.search(pic_size_regular, pic_url).group()
                                    re_res_splited = re_res.split('x')
                                    pic_size = max(int(re_res_splited[0]), int(re_res_splited[1]))

                                    if (pic_size > max_pic_size):
                                        max_pic_size = pic_size
                                        res_pic_url = pic_url
                                is_in_some_attri = True
                        except:
                            pass

                    tag_style = inner_tag.get('style')
                    if(tag_style):
                        regular_str = r'url\w*\(\S+\)'
                        re_res = re.search(regular_str,str(tag_style))
                        if(re_res):
                            pic_url = re_res.group().split('(')[1].split(')')[0]
                            if('jpg' in pic_url or 'png' in pic_url or 'jpeg' in pic_url):
                                if (res_pic_url == ''):
                                    res_pic_url = pic_url
                                else:
                                    size_match = re.search(pic_size_regular, pic_url)
                                    # skip URLs whose name carries no WxH size hint
                                    if (size_match):
                                        re_res_splited = size_match.group().split('x')
                                        pic_size = max(int(re_res_splited[0]), int(re_res_splited[1]))

                                        if (pic_size > max_pic_size):
                                            max_pic_size = pic_size
                                            res_pic_url = pic_url
                                is_in_some_attri = True

                    if(is_in_some_attri == False and inner_tag.name!=None):
                        tag_text = inner_tag.text.replace('\n', "").replace(' ','')
                        # print tag_text
                        if(len(tag_text) > max_title_len):
                            max_title_len = len(tag_text)
                            res_title = tag_text

                            # print res_title


            # print "-----------------------one goods-----------------------"
            res_goods_dict['title'] = res_title
            res_goods_dict['price'] = res_price
            res_goods_dict['pic_url'] = res_pic_url
            res_goods_dict['detail_url'] = res_detail_url

            res_goods_list.append(res_goods_dict)


    return res_goods_list
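
When several candidate images are found for one goods entry, the code above keeps the one whose filename advertises the largest WxH dimensions. That comparison on its own:

# Sketch: among candidate image URLs, prefer the one with the largest WxH size
# embedded in its name (e.g. ..._800x800.jpg); URLs without a size count as 0.
import re

PIC_SIZE_REGULAR = r'\d{2,}x\d{2,}'

def pick_largest_image(pic_urls):
    best_url, best_size = '', -1
    for pic_url in pic_urls:
        match = re.search(PIC_SIZE_REGULAR, pic_url)
        size = max(int(n) for n in match.group().split('x')) if match else 0
        if size > best_size:
            best_size, best_url = size, pic_url
    return best_url

# pick_largest_image(["a_100x100.jpg", "a_800x800.jpg"]) -> "a_800x800.jpg"
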
Example #11
def analysis_json_data(url, soup):
    rank_dic = {}

    def get_json_path(container, json_path):
        # if(rank_dic.has_key(json_path)): rank_dic[json_path] += 1
        # else: rank_dic[json_path] = 1

        if (isinstance(container, dict)):
            # print container
            for key, value in container.items():
                # print ("%s : %s")%(key,value)
                get_json_path(value, json_path + "/" + key)

        elif (isinstance(container, list)):
            # print container
            if (rank_dic.has_key(json_path)):
                rank_dic[json_path] += 1
            else:
                rank_dic[json_path] = 1
            # print json_path
            # return json_path
            for next_container in container:
                # print next_container
                get_json_path(next_container, json_path + "/a_list")

                # else:
                # print container

    # soup = get_soup_by_request(url)
    shop_json = ""
    maxlen = -1
    # print soup.prettify()
    for y in soup.find_all("script"):
        # print str(y)
        for x in re.findall("\{.*\}", str(y)):
            tmp_len = len(x)
            if (tmp_len > maxlen):
                maxlen = tmp_len
                shop_json = x
    # print "json--%s" % shop_json
    json_praser = json.loads(shop_json)
    get_json_path(json_praser, "")

    second_dic = {}
    max_path_str = ""
    max_path_len = -1
    for key, value in rank_dic.items():
        if value > 20 and "a_list" in key:
            # print "(%s,%s)"%(key,value)
            tmp_str = (key).split('a_list')[0]
            if second_dic.has_key(tmp_str):
                second_dic[tmp_str] += 1
            else:
                second_dic[tmp_str] = 1
            if (second_dic[tmp_str] > max_path_len):
                max_path_len = second_dic[tmp_str]
                max_path_str = tmp_str

    # print max_path_str
    # print len(max_path_str.split('/'))

    def not_empty(s):
        return s and s.strip()

    json_key_list = list(filter(not_empty, max_path_str.split('/')))

    json_key_index = 0
    res_dic = json_praser
    while json_key_index < len(json_key_list):
        res_dic = res_dic[json_key_list[json_key_index]]

        json_key_index += 1

    # detail_urls_list = []
    # pic_urls_list = []
    #

    res_goods_dic_list = []
    for li in res_dic:
        res_goods_dic = {}
        if(isinstance(li,dict) is not True):continue
        for key, value in li.items():
            # print "%s:%s"%(key,value)
            # print("=====================================")
            # print(type(key))
            # print(str(key))
            if(key is None or value is None): continue

            if ("price" in key and res_goods_dic.has_key('price') is False):
                res_goods_dic['price'] = value
                # print value
            elif ("title" in key and res_goods_dic.has_key('title') is False):
                res_goods_dic['title'] = re.sub('(?is)<.*?>', '', value)
                # print value
            elif ("detail" in key and res_goods_dic.has_key('detail_url') is False):
                # print value, key
                res_goods_dic['detail_url'] = url_sifter(url, value)
                # detail_urls_list.append(url_sifter(url, value))
            # elif("comment" in key):
            #     print value
            elif ((("img" in key) or ('pic' in key) or (".jpg" in str(value)) or ('.png' in str(value)))
                  and res_goods_dic.has_key('pic_url') is False):
                res_goods_dic['pic_url'] = pic_url_sifter(url,value)
                # print value
                # pic_urls_list.append(url_sifter(url, value))
        res_goods_dic_list.append(res_goods_dic)
        # print "-------------------"
    return res_goods_dic_list
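
The nested get_json_path walk above ranks JSON paths by how often they end in a list, then treats the most frequent long path as the goods array. The path-counting step on its own, as a self-contained sketch:

# Sketch: walk decoded JSON and count how often each path reaches a list;
# the dominant path is a good guess for the embedded goods array.
from collections import Counter

def count_list_paths(node, path="", counter=None):
    counter = Counter() if counter is None else counter
    if isinstance(node, dict):
        for key, value in node.items():
            count_list_paths(value, path + "/" + key, counter)
    elif isinstance(node, list):
        counter[path] += 1
        for item in node:
            count_list_paths(item, path + "/a_list", counter)
    return counter

# count_list_paths({"data": {"goods": [{"sku": [1]}, {"sku": [2]}]}})
# -> Counter({'/data/goods/a_list/sku': 2, '/data/goods': 1})
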