Beispiel #1
0
    def insertData2DB(self, myContent):
        """Format up to ten numbered entries from ``myContent`` and yield one item.

        ``myContent`` is iterated as a flat sequence of text fragments. A
        fragment for which ``Utils.matchTitle`` is truthy is treated as an
        entry number, the fragment right after it as that entry's question,
        and everything else as body text. The formatted transcript is printed.

        Yields:
            A single ``Consumers12315Item`` whose ``number`` and ``question``
            hold the values of the last entry processed and whose ``answer``
            holds the last non-empty body fragment seen (not the full text).
            NOTE(review): one item for the whole page looks unintended; confirm
            whether one item per entry was meant.
        """
        from Consumers12315.items import Consumers12315Item

        max_entries = 10
        block_break = "\r\n\n\t"
        line_break = "\r\n"
        banner = "^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
        rule = "______________________"

        item = Consumers12315Item()
        parts = []
        last_text = ""
        entries_seen = 0
        expect_question = False

        for line in myContent:
            if expect_question:
                # The fragment following an entry number is its question.
                parts.append(banner + line_break)
                parts.append("当前的问题是:" + line + line_break)
                parts.append(banner + block_break)
                item['question'] = line
                expect_question = False
                continue

            if Utils.matchTitle(line):
                entries_seen += 1
                if entries_seen > max_entries:
                    break
                parts.append(rule + line_break)
                parts.append(line + "---------------" + line_break)
                parts.append(rule + line_break)
                item['number'] = line
                expect_question = True
                continue

            # Body text. Empty fragments carry nothing and have no last
            # character to inspect, so skip them instead of raising IndexError.
            if not line:
                continue
            parts.append(line)
            last_text = line
            if Utils.isEndChar(line[-1]):
                parts.append(block_break)

        print("".join(parts))
        item['answer'] = last_text
        yield item
Beispiel #2
0
    def parse(self, response):
        """Render the page in the driver and print every entry's id, name and URL.

        The anchors under ``ul#zl_ul`` are read from the driver's page source;
        each ``onclick`` value is reduced to the text between ``('`` and ``')``
        via ``Utils.sliptStr`` and printed with the anchor text at the same
        position.
        """
        self.driver.get(response.url)
        page = Selector(text=self.driver.page_source)

        names = page.xpath('//ul[@id="zl_ul"]/li/a/text()').extract()
        ids = page.xpath('//ul[@id="zl_ul"]/li/a/@onclick').extract()

        # Dump the raw extraction results first.
        print("我需要的名称:" + str(names))
        print("我需要的原始ID:" + str(ids))

        open_marker = "('"
        close_marker = "')"
        url_prefix = "http://www.12315.cn/knowledge/knowledgeView?zlcode="

        for position, raw_id in enumerate(ids):
            # Names and ids are paired purely by position in the two lists.
            label = names[position]
            entry_id = Utils.sliptStr(raw_id, open_marker, close_marker)
            print("处理后的ID:" + entry_id + ", 名称是:\t\t\t" + label +
                  ", 对应的访问地址是:" +
                  url_prefix +
                  entry_id)
    def getSmallTitle(self, selector):
        """Print the styled span texts and count the question titles among them.

        All text fragments of the 16pt black spans inside ``WordSection1`` are
        printed first. Then the fragments are scanned again: after a fragment
        for which ``Utils.matchTitle`` is truthy, the next non-blank,
        non-matching fragment is taken as a question title. Each title and the
        final count are printed; nothing is returned.
        """
        fragments = selector.xpath(
            '//div[@class="WordSection1"]/p[@class="MsoNormal"]/span[@style="font-size:16.0pt;font-family:仿宋_GB2312;color:black"]/text()').extract()
        for fragment in fragments:
            print("___ " + fragment)
        print("下面只获取标题:")

        questions = []
        awaiting_question = False
        for fragment in fragments:
            # Evaluate the matcher once per fragment; a match (re)arms the
            # search for the question text that follows it.
            if Utils.matchTitle(fragment):
                awaiting_question = True
                continue

            # Blank fragments between the number and its question are skipped
            # without disarming the search.
            if awaiting_question and fragment.strip():
                print("_A_ " + fragment)
                questions.append(fragment)
                awaiting_question = False
        print("当前的总共有:" + str(len(questions)))
Beispiel #4
0
    def singleText(self, content1, i, isTitle, myContent, space):
        """Print the first numbered entry of ``myContent`` and its body text.

        Args:
            content1: Text accumulated so far; body fragments are appended to it.
            i: Number of entry titles already seen; scanning stops once a
                title pushes this above 1.
            isTitle: True when the next fragment is the question belonging to
                an entry number that was already consumed.
            myContent: Iterable of text fragments in document order.
            space: Separator appended after a fragment whose last character
                satisfies ``Utils.isEndChar``.

        Entry numbers and questions are printed as they are encountered; the
        accumulated body text is printed once at the end. Nothing is returned.
        """
        banner = "^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
        rule = "______________________"

        for line in myContent:
            if isTitle:
                # The fragment following an entry number is its question.
                print(banner)
                print("当前的问题是:" + line)
                print(banner)
                isTitle = False
                continue

            if Utils.matchTitle(line):
                i += 1
                if i > 1:
                    break
                print(rule)
                print(line + "---------------")
                print(rule)
                isTitle = True
                continue

            # Body text. Empty fragments have no last character to inspect,
            # so skip them instead of raising IndexError.
            if not line:
                continue
            content1 += line
            if Utils.isEndChar(line[-1]):
                content1 += space
        print(content1)
Beispiel #5
0
    def oldGetContent(self, selector):
        """Print a formatted transcript of the numbered entries on the page.

        Text fragments are read from every span inside ``WordSection1``. A
        fragment for which ``Utils.matchTitle`` is truthy is parsed as
        ``<number>.``; it starts a new entry only when its number is not lower
        than the highest entry number seen so far, otherwise it is kept as
        ordinary body text. The fragment after an accepted number is printed
        as that entry's question. Nothing is returned.

        Raises:
            ValueError: if a matched fragment does not start with an integer
                before its first ``.``.
        """
        fragments = selector.xpath(
            '//div[@class="WordSection1"]/p[@class="MsoNormal"]/span//text()'
        ).extract()

        block_break = "\r\n\n\t"
        line_break = "\r\n"
        banner = "^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
        rule = "______________________"

        parts = []
        highest_number = 0
        expect_question = False

        for line in fragments:
            if expect_question:
                # The fragment following an accepted number is its question.
                parts.append(banner + line_break)
                parts.append("当前的问题是:" + line + line_break)
                parts.append(banner + block_break)
                expect_question = False
                continue

            if Utils.matchTitle(line):
                number = int(line.split(".")[0])

                # A number lower than the current entry is treated as part of
                # the body (e.g. a numbered list inside an answer).
                if number < highest_number:
                    parts.append(line)
                    continue

                highest_number = number
                print("currentNumber:")
                print(highest_number)

                parts.append(rule + line_break)
                parts.append(line + "---------------" + line_break)
                parts.append(rule + line_break)
                expect_question = True
                continue

            # Body text. Empty fragments have no last character to inspect,
            # so skip them instead of raising IndexError.
            if not line:
                continue
            parts.append(line)
            if Utils.isEndChar(line[-1]):
                parts.append(block_break)

        print("".join(parts))
Beispiel #6
0
# Sample markup: one numbered title, its question and three answer paragraphs,
# used to try out the paragraph-splitting logic outside the spider.
body = '''
    <span lang="EN-US" style="font-size:16.0pt;font-family:仿宋_GB2312;color:black">1.</span>
<span style="font-size:16.0pt;font-family:仿宋_GB2312;color:black">当心掉进健身预付卡消费陷阱</span>
<span style="font-size:16.0pt;font-family:仿宋_GB2312">预付款消费是指消费者预先向经营者交付一定额度的消费金额,按照事先的消费约定,以整存零取的方式接受商家的服务,一般会获得商家承诺的额外优惠。例如,一次性接受商家服务消费<span lang="EN-US">20</span>元,如果一次存入<span lang="EN-US">100</span>元,商家可以发给一个消费卡,并且可以享受<span lang="EN-US">6</span>至<span lang="EN-US">7</span>次的优惠服务,降低了正常消费金额,颇受消费者青睐。经营者一次性预收消费者现金相当于有了固定客源,因此预付款消费在很多服务行业颇为盛行。</span>
<span style="font-size:16.0pt;font-family:仿宋_GB2312">近年来,由于经营者的经营方式方法及诚信经营理念不同,消费者的消费理念存在差异,以及贪图便宜心理的存在,预付款消费纠纷成为消费者投诉的热点。例如,某健身俱乐部办理会员消费卡涉及消费者<span lang="EN-US">100</span>多人,现在是人走楼空,中途停止营业,有的消费者已存消费卡金额达<span lang="EN-US">3000</span>余元,但找不到当事人,致使维权之路至今未果。</span>
<span style="font-size:16.0pt;font-family:仿宋_GB2312">消费者进行预付款消费应注意以下事项:第一,要多做比较。消费者应尽量选择规模大、信誉好、经营状况良好的企业,不轻信广告和商家的口头承诺,不受促销的诱惑。第二,要签订合同。消费者与经营者应当签订书面合同,应载明价格、服务标准、优惠条件、使用商品品牌、有效期限、有效次数、使用权限、使用地点、续费升级、遗失补办等事项,并应明确预付款消费卡的功能、使用范围、有效期限、退卡(款)条件、违约责任等事项。对以格式合同、通知、声明、店堂告示做出对消费者不公平、不合理的规定,“最终解释权归本经营者所有”等提示,免除其损害消费者合法权益、加重消费者责任或排除消费者权利的内容,要坚决说不。第三,要适度消费。在办理预付款消费卡时,首先要弄清自己是否真的长期需要此类服务,应根据自身实际需要购买、充值预付款消费卡,每次充值金额不宜过多,谨慎选择预付额度过高、服务周期过长的预付款消费。要按照自己的实际需求量来购买预付款消费卡,不要贪便宜一下子大量购买,以避免承担过多风险。第四,要慎重进行预付款消费。因为预付款消费目前还没有专门的法律规定,现在购物、金融、洗浴、洗车、游泳、健身、美容、餐饮、娱乐等行业出现预付款消费纠纷后,只能使用旁法为消费者维权,为此要提醒消费者慎重选择预付款消费,并要注意妥善保管好相关的服务章程、协议和票据等消费凭证。第五,要及时维权。消费者发现经营者的经营行为有异常时,要及时向有关部门咨询或举报,避免办卡容易退卡难现象发生。一旦经营者需要变更经营地址、注销或整体转让经营资质,或商品质量、商品价格、服务质量、服务价格发生变化时,经营者应提前告知消费者,协商是退费还是继续履行合同承诺,若协商不成消费者可请求所辖地消费者协会调解,也可向有关行政职能部门申诉。对经营者未尽告知义务又无法履行合同承诺的侵权行为,消费者可向所在地行政职能部门提出申诉或向人民法院提起诉讼,以维护自身的合法权益。</span>

'''

# Every text node below the top-level spans, nested spans included. Nested
# spans split a sentence into several fragments, for example:
# ['并且可以享受', '6', '至', '7', '次的优惠服务']
content = Selector(text=body).xpath('//body/span//text()').extract()

space = "\r\n\n\t"
content1 = ""

for line in content:
    if Utils.matchTitle(line):
        # Fragments matched by Utils.matchTitle are replaced by a break.
        content1 += space
    else:
        content1 += line
        endChar = line[-1]
        # Break after a fragment whose last character satisfies
        # Utils.isEndChar.
        if Utils.isEndChar(endChar):
            content1 += space

print(content1)
    def parse(self, response):
        """Print a formatted transcript of the first ten entries on the page.

        Text fragments are read from every span inside ``WordSection1`` of the
        Scrapy response. A fragment for which ``Utils.matchTitle`` is truthy
        is treated as an entry number, the fragment right after it as that
        entry's question, and everything else as body text. Nothing is
        returned or yielded.
        """
        # NOTE(review): the driver still loads the URL although the text below
        # is taken from the Scrapy response, not from the driver's page
        # source; confirm whether this request is still needed.
        self.driver.get(response.url)
        selector = Selector(response)

        myContent = selector.xpath(
            '//div[@class="WordSection1"]/p[@class="MsoNormal"]/span//text()'
        ).extract()

        max_entries = 10
        block_break = "\r\n\n\t"
        line_break = "\r\n"
        banner = "^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
        rule = "______________________"

        parts = []
        entries_seen = 0
        expect_question = False

        for line in myContent:
            if expect_question:
                # The fragment following an entry number is its question.
                parts.append(banner + line_break)
                parts.append("当前的问题是:" + line + line_break)
                parts.append(banner + block_break)
                expect_question = False
                continue

            if Utils.matchTitle(line):
                entries_seen += 1
                if entries_seen > max_entries:
                    break
                parts.append(rule + line_break)
                parts.append(line + "---------------" + line_break)
                parts.append(rule + line_break)
                expect_question = True
                continue

            # Body text. Empty fragments have no last character to inspect,
            # so skip them instead of raising IndexError.
            if not line:
                continue
            parts.append(line)
            if Utils.isEndChar(line[-1]):
                parts.append(block_break)

        print("".join(parts))