コード例 #1
0
    def findsource(self):
        """Look for a navigation container in the fetched page and record the outcome.

        ``self.gethtml()`` is indexed as ``[0]`` (markup) and ``[1]`` (request
        URL). The page is searched for a ``div`` with class ``breadcrumbs``
        and, failing that, one with class ``subpage_menu``. On the first hit
        the URL and processed text are saved and the result is set to True;
        when neither is present a failure is logged and the result is False.
        """
        fetched = self.gethtml()
        soup = BeautifulSoup(fetched[0], "html.parser")

        # The URL that was requested
        requesturl = fetched[1]

        for css_class in ('breadcrumbs', 'subpage_menu'):
            container = soup.find("div", {'class': css_class})
            if not container:
                continue

            # NOTE(review): replacing '>' with '>' is a no-op as written; the
            # search pattern may originally have been something else - confirm.
            path_text = fetch_util.replace_some_string(
                container.get_text(), '>', '>')

            # Persist the path for the successful request
            self.saveoutputlog(requesturl + '\t' + path_text)

            self.setshowlog(requesturl + ' succeed')
            self.setresult(True)
            return

        self.setshowlog(requesturl + ' failed')
        self.setresult(False)
コード例 #2
0
ファイル: main.py プロジェクト: lancong/Spiders
 def gettag(self):
     """Return the text of the ``side-tags clearfix`` div with newlines turned into '+'.

     Returns None when ``self.info`` is falsy or the div is not found.
     """
     if not self.info:
         return None
     tag_block = self.info.find('div', {'class': 'side-tags clearfix'})
     if not tag_block:
         return None
     joined = fetch_util.replace_some_string(tag_block.get_text(), "\n", '+')
     return fetch_util.remove_last_char(joined, '+')
コード例 #3
0
ファイル: main.py プロジェクト: lancong/Spiders
 def getcategory(self):
     """Return the text of the ``dd.tag-box`` element with newlines turned into '+'.

     Returns None when ``self.info`` is falsy or the element is not found.
     """
     if not self.info:
         return None
     tag_box = self.info.find('dd', {'class': 'tag-box'})
     if not tag_box:
         return None
     joined = fetch_util.replace_some_string(tag_box.get_text(), "\n", '+')
     return fetch_util.remove_last_char(joined, '+')
コード例 #4
0
ファイル: jd_parser.py プロジェクト: lancong/Spiders
def parser_type_3(html, source):
    """Build a result string from ``source`` text plus the ``sku-name`` div text.

    The text of ``source`` is passed through ``fetch_util.replace_some_string``
    with ``'\\n'`` and ``''``. When ``html`` contains a ``div`` with class
    ``sku-name`` whose text is non-empty, ``'>'`` and that text are appended.
    """
    result = fetch_util.replace_some_string(source.get_text(), '\n', '')
    name_node = html.find("div", {'class': 'sku-name'})
    name_text = name_node.get_text() if name_node else ''
    if name_text:
        result += '>' + name_text
    return result
コード例 #5
0
ファイル: main.py プロジェクト: lancong/Spiders
 def getcategory(self):
     """Return the text of the ``div.crumb`` element with newlines processed.

     When the element is not found, an empty string is passed on instead.
     The text goes through ``fetch_util.replace_some_string`` with ``'\\n'``
     and ``''`` and that call's result is returned.
     """
     crumb = self.html.find('div', {'class': 'crumb'})
     # An explicit None check replaces the previous bare ``except:``, which
     # also swallowed unrelated errors (including KeyboardInterrupt).
     category = crumb.get_text() if crumb is not None else ''
     return fetch_util.replace_some_string(category, '\n', '')
コード例 #6
0
ファイル: jd_parser.py プロジェクト: lancong/Spiders
def parser_type_2(html, source):
    """Return the item name found in ``div#name > h1`` of ``source``.

    Args:
        html: Unused here; kept so the signature matches the other parsers.
        source: Parsed element searched for a ``div`` with id ``name``.

    Returns:
        The first child of the ``h1`` passed through
        ``fetch_util.replace_some_string`` with ``'>>'`` and ``'>'``, or ``''``
        when the div, the ``h1``, or its first child is missing or empty.
    """
    name_div = source.find("div", {'id': 'name'})
    if not name_div:
        return ''

    heading = name_div.find('h1')
    # A missing <h1> or one with no children previously raised
    # AttributeError / IndexError; treat both as "no name found".
    if heading is None or not heading.contents:
        return ''

    item_name = heading.contents[0]
    if not item_name:
        return ''

    # Rewrite the separator in the extracted name
    return fetch_util.replace_some_string(item_name, '>>', '>')
コード例 #7
0
ファイル: jd_parser.py プロジェクト: lancong/Spiders
def parser_type_1(html, source):
    """Build a result string from ``source`` text plus the ``div#name > h1`` item name.

    Args:
        html: Parsed page searched for a ``div`` with id ``name``.
        source: Element whose text forms the first part of the result.

    Returns:
        The processed text of ``source``; when an item name is found in the
        ``h1``, ``'>'`` and that name are appended.
    """
    # NOTE(review): replacing '>' with '>' is a no-op as written; the search
    # pattern may originally have been something else - confirm.
    result = fetch_util.replace_some_string(source.get_text(), '>', '>')

    name_div = html.find("div", {'id': 'name'})
    if not name_div:
        return result

    heading = name_div.find('h1')
    # A missing <h1> or one with no children previously raised
    # AttributeError / IndexError; return the path without a name instead.
    if heading is None or not heading.contents:
        return result

    item_name = heading.contents[0]
    if item_name:
        result += '>' + item_name
    return result
コード例 #8
0
ファイル: jumeiurlstartwithh.py プロジェクト: lancong/Spiders
    def findsource(self):
        """Extract a path or title from the fetched page and record the outcome.

        ``self.gethtml()`` is indexed as ``[0]`` (markup) and ``[1]`` (request
        URL). A ``div`` with class ``subpage_menu`` is tried first and its
        processed text is used as the result. Otherwise the stripped text of
        up to the first two ``td`` cells inside a ``div`` with class
        ``content_text`` is concatenated into a title. If a result is
        obtained it is saved and the result flag is set to True; otherwise a
        failure is logged and the flag is set to False.
        """
        contentall = self.gethtml()
        content = contentall[0]
        html = BeautifulSoup(content, "html.parser")

        # The URL that was requested
        requesturl = contentall[1]

        currentresult = None
        sources = html.find("div", {'class': 'subpage_menu'})
        if sources:
            # NOTE(review): replacing '>' with '>' is a no-op as written; the
            # search pattern may originally have been something else - confirm.
            currentresult = requesturl + '\t' + fetch_util.replace_some_string(
                sources.get_text(), '>', '>')
        else:
            sources = html.find("div", {'class': 'content_text'})
            if sources:
                # Slice instead of indexing tds[0] and tds[1] directly: the old
                # ``len(tds) >= 1`` check let a single-cell table through and
                # then raised IndexError on tds[1].
                cells = sources.findAll('td')[:2]
                title = ''.join(cell.get_text().strip() for cell in cells)
                if title:
                    currentresult = requesturl + '\t' + title

        if currentresult is None:
            self.setshowlog(requesturl + ' failed')
            self.setresult(False)
            return

        # Persist the path for the successful request
        self.saveoutputlog(currentresult)

        self.setshowlog(requesturl + ' succeed')
        self.setresult(True)
コード例 #9
0
    def match_type2(self, requesturl, sources):
        """Save the item name from ``div#name > h1`` together with the price.

        Args:
            requesturl: URL of the page; passed to ``get_price`` and used in
                the log message.
            sources: Parsed element searched for a ``div`` with id ``name``.

        Nothing is saved or logged when the div, the ``h1``, or its first
        child is missing or empty.
        """
        nametag = sources.find("div", {'id': 'name'})
        if not nametag:
            return

        h1itemname = nametag.find('h1')
        # A missing <h1> or one with no children previously raised
        # AttributeError / IndexError; treat both as "no match".
        if h1itemname is None or not h1itemname.contents:
            return

        itemname = h1itemname.contents[0]
        if not itemname:
            return

        # Rewrite the separator in the extracted name
        currentresult = fetch_util.replace_some_string(itemname, '>>', '>')

        price = get_price(requesturl)
        # Persist the path for the successful request
        self.save_result(currentresult + price)
        self.set_print_log(requesturl + ' succeed')
        self.set_result(True)
コード例 #10
0
    def match_type3(self, html, requesturl, sources):
        """Save the URL, the processed ``sources`` text, the ``sku-name`` text and the price.

        The saved line is ``requesturl``, a tab, and the text of ``sources``
        passed through ``fetch_util.replace_some_string`` with ``'\\n'`` and
        ``''``. If ``html`` has a ``div`` with class ``sku-name`` and non-empty
        text, ``'>'`` and that text are appended. The value of
        ``get_price(requesturl)`` is concatenated last.
        """
        path_text = fetch_util.replace_some_string(sources.get_text(), '\n', '')
        currentresult = requesturl + '\t' + path_text

        sku_node = html.find("div", {'class': 'sku-name'})
        sku_text = sku_node.get_text() if sku_node else ''
        if sku_text:
            currentresult += '>' + sku_text

        price = get_price(requesturl)

        # Persist the path for the successful request
        self.save_result(currentresult + price)
        self.set_print_log(requesturl + ' succeed')
        self.set_result(True)
コード例 #11
0
    def match_type1(self, html, requesturl, sources):
        """Save the URL, the ``sources`` text, the ``div#name > h1`` item name and the price.

        Args:
            html: Parsed page searched for a ``div`` with id ``name``.
            requesturl: URL of the page; prefixed to the saved line and passed
                to ``get_price``.
            sources: Element whose text forms the path part of the result.

        The item name is appended after ``'>'`` only when the div, the ``h1``
        and a non-empty first child are all present; the result is saved and
        marked successful either way.
        """
        # NOTE(review): replacing '>' with '>' is a no-op as written; the
        # search pattern may originally have been something else - confirm.
        currentresult = requesturl + '\t' + fetch_util.replace_some_string(
            sources.get_text(), '>', '>')

        itemnamehtml = html.find("div", {'id': 'name'})
        if itemnamehtml:
            h1itemname = itemnamehtml.find('h1')
            # A missing <h1> or one with no children previously raised
            # AttributeError / IndexError; skip the name in that case.
            if h1itemname is not None and h1itemname.contents:
                itemname = h1itemname.contents[0]
                if itemname:
                    currentresult += '>' + itemname

        # Persist the path for the successful request
        price = get_price(requesturl)
        self.save_result(currentresult + price)
        fetch_util.print_log(currentresult + price)
        self.set_print_log(requesturl + ' succeed')
        self.set_result(True)