예제 #1
0
def __getUrlsInPage(url, url_xpath, next_xpath, timeout=30):
    """Download ``url`` and return its XPath matches plus a derived next-page URL.

    The page is requested with a desktop Chrome User-Agent, passed through
    ``remove_control_characters`` and parsed with ``etree.HTML``.

    Args:
        url: Address of the page to download.
        url_xpath: XPath expression evaluated against the parsed page.
        next_xpath: Unused here; the next page is derived from ``url`` itself.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``
            argument. Pass ``None`` to wait without limit, which is what
            this function did before the parameter existed.

    Returns:
        A ``(result, nextpage)`` tuple. ``result`` is whatever
        ``html.xpath(url_xpath)`` returns. ``nextpage`` is ``url`` with a
        ``p`` page parameter appended or advanced by one, or ``None`` when
        the page number could not be worked out from ``url``.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    }

    # NOTE: verify=False switches off TLS certificate verification.
    # The timeout keeps an unresponsive server from blocking the caller
    # indefinitely.
    response = requests.get(url, headers=headers, allow_redirects=True,
                            verify=False, timeout=timeout)
    page = remove_control_characters(response.text)

    html = etree.HTML(page)
    result = html.xpath(url_xpath)

    try:
        params = urlparse.urlparse(url).query.split('&')
        if len(params) == 1:
            # A single query parameter is treated as "first page": ask for
            # page 2 next.
            nextpage = url + "&p=2"
        else:
            # The page number is read from the SECOND query parameter, and
            # everything after the first '&' in the URL is replaced.
            following = int(params[1].split('=')[1]) + 1
            nextpage = url.split('&')[0] + "&p=" + str(following)
    except Exception:
        # Best effort: any parsing problem simply means "no next page".
        nextpage = None

    return result, nextpage
예제 #2
0
    def __getUrlsInPage(self, url, wantedXpath, nextUrlXpath, timeout=30):
        """Download ``url`` and return the wanted matches and the next-page link.

        The page is requested with a desktop Chrome User-Agent, passed
        through ``remove_control_characters`` and parsed with ``etree.HTML``.

        Args:
            url: Address of the page to download.
            wantedXpath: XPath expression whose matches are returned.
            nextUrlXpath: XPath expression locating the next-page link.
            timeout: Seconds handed to ``requests.get`` as its ``timeout``
                argument. Pass ``None`` to wait without limit, which is
                what this method did before the parameter existed.

        Returns:
            A ``(result, nextpage)`` tuple. ``result`` is whatever
            ``page.xpath(wantedXpath)`` returns. ``nextpage`` is the first
            match of ``nextUrlXpath``, or ``None`` when evaluating it fails
            or it matches nothing.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        }

        # NOTE: verify=False switches off TLS certificate verification.
        # The timeout keeps an unresponsive server from blocking the caller
        # indefinitely.
        response = requests.get(url, headers=headers, allow_redirects=True,
                                verify=False, timeout=timeout)
        html = remove_control_characters(response.text)

        page = etree.HTML(html)
        result = page.xpath(wantedXpath)

        try:
            nextpage = page.xpath(nextUrlXpath)[0]
        except Exception:
            # Best effort: no usable match means there is no next page.
            nextpage = None

        return result, nextpage
예제 #3
0
def __getUrlsInPage(url,url_xpath,next_xpath):
    """Render ``url`` through Chrome and pull out 'ip:' values and the next link.

    The page source comes from ``getpagewithchrome`` (driven by
    ``CHROME_DRIVER_PATH``), is passed through ``remove_control_characters``
    and is parsed with ``etree.HTML``.

    Returns a ``(result, nextpage)`` tuple. ``result`` holds, for every match
    of ``url_xpath`` that contains ``'ip:'``, the text following the last
    ``'ip:'``; matches without that marker are dropped. ``nextpage`` is the
    first match of ``next_xpath``, or ``None`` when evaluating it fails or
    it matches nothing.
    """
    source = remove_control_characters(
        getpagewithchrome(url, CHROME_DRIVER_PATH)
    )
    tree = etree.HTML(source)

    matches = tree.xpath(url_xpath)

    try:
        next_links = tree.xpath(next_xpath)
        nextpage = next_links[0]
    except Exception:
        # Best effort: no usable match means there is no next page.
        nextpage = None

    # Keep only entries carrying the 'ip:' marker, reduced to what follows it.
    result = [entry.split('ip:')[-1] for entry in matches if 'ip:' in entry]

    return result, nextpage