def parse(self, response):
    item = V_qqItem()
    urls = []
    # Navigation-bar links that are site-relative get the host prepended.
    url_list = response.xpath(
        '//*[@id="mod_main_nav"]/div/div/a/@href').extract()
    for url in url_list:
        if url.startswith('/'):
            urls.append(f"http://v.qq.com{url}")
    # Content-row links are appended as-is.
    url_list2 = response.xpath(
        '//*[@class="mod_row_box"]/div[2]/div/div/a/@href').extract()
    urls = urls + url_list2
    item['urls'] = urls
    for i in range(len(item['urls'])):
        try:
            res = requests.get(item['urls'][i], headers=HEADERS)
            if res.ok:
                res.encoding = 'utf-8'
                html = res.text
                ex = Extractor(threshold=3)
                content = ex.filter_tags(html)
                data = clean_content(ex.getText(content))
                with open(f"E:/c++/毕业设计开发日志/06.文本数据集/娱乐/视频/{i}.txt",
                          'w',
                          encoding="utf-8") as txtfile:
                    txtfile.write(data)
                print(f"Page {i} crawled")
                time.sleep(2)
        except Exception as e:
            print(f"Error on article {i}, link {item['urls'][i]}, reason: {e}")
    print(f"Finished crawling {len(item['urls'])} video pages in total")
def search_web_content(self):
    # Fetch the URL typed into the line edit, strip tags and newlines, and show
    # the extracted text in the text box.
    url = self.lineEdit.text()
    print(url)
    ex = Extractor(threshold=30)
    html = ex.getHtml(url)
    content = ex.filter_tags(html)
    rlt = re.sub("\n", "", content)
    print(rlt)
    self.textEdit.setText(rlt)
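
A side note on the relative-link handling in parse above: urllib.parse.urljoin from the standard library performs the same normalization and also copes with scheme-relative hrefs. A minimal sketch, independent of Scrapy; absolutize and the sample hrefs are illustrative, not part of the spider:

from urllib.parse import urljoin

def absolutize(hrefs, base="http://v.qq.com"):
    """Resolve relative hrefs against the site root; absolute URLs pass through unchanged."""
    return [urljoin(base, href) for href in hrefs]

# Hypothetical hrefs for illustration:
# absolutize(["/channel/movie", "//v.qq.com/x/cover/abc.html", "http://v.qq.com/tv/"])
# -> ['http://v.qq.com/channel/movie', 'http://v.qq.com/x/cover/abc.html', 'http://v.qq.com/tv/']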
Example #3
def parse_url():
    # Read hosts from a browser-history dump, extract and clean the text of each
    # page, and write one cleaned page per line to content.txt.
    urls = []
    data = []
    length_all = 0
    length_ok = 0
    with open(r"C:\Users\叫乌鸦的少年怪\Desktop\历史记录文件.txt", 'r',
              encoding='utf-8') as files:
        urls += ["http://" + url[:-1] for url in files.readlines()]

    length_all = len(urls)
    j = 0
    for url in urls:
        try:
            ex = Extractor(threshold=30)
            html = ex.getHtml(url)
            content = ex.getText(ex.filter_tags(html))
            content = clean_content(content)
            length_ok += 1
            if content != "This   page   has   no   content   to   extract ":
                j += 1
                # Collapse the extracted text onto a single line before keeping it.
                data.append("".join(content.splitlines()))
        except ConnectionError as err:
            print(err)
        #     print("request failed")
        #     # TODO: record the host
        except Exception as e:
            print(e)
            continue
    with open(r"C:\Users\叫乌鸦的少年怪\Desktop\content.txt", 'w+',
              encoding='utf-8') as rlt_txt:
        for single_data in data:
            rlt_txt.write(single_data + '\n')

    print(f"成功访问的{length_ok}")
    print(f"一共{length_all}")
    print(f'百分比{length_ok / length_all}')
Example #4
def parse_urls(url_list: list):
    # Run the Extractor + clean_content pipeline over a list of URLs to test the
    # data-cleaning module.
    j = 0
    for i in range(len(url_list)):
        try:
            extractor = Extractor(threshold=30)
            html = extractor.getHtml(url_list[i])
            content = extractor.filter_tags(html)
            data = clean_content(extractor.getText(content))
            if data != "This   page   has   no   content   to   extract ":
                j += 1
                # Note: 'w+' truncates the same test file on every iteration, so
                # only the last article remains on disk.
                with open('E:/c++/毕业设计开发日志/06.文本数据集/数据清洗模块测试.txt',
                          'w+',
                          encoding='utf-8') as txtfile:
                    txtfile.write(data)
                print(f"Article {i + 1} processed")
        except Exception as e:
            print(e)

    print(f"共获取到{i}篇文章")
    print(f"成功处理{j}篇文章")
Example #5
import json

from requests import get

from Settings import HEADERS
from data_cleaning.Extractor import Extractor
from data_cleaning.content_clean import clean_content

if __name__ == '__main__':
    # Query Douyu's recommendation API a few times, collect live-room links, and
    # save the cleaned text of each room page.
    urls = []
    for i in range(5):
        response = get(
            "https://www.douyu.com/japi/weblist/apinc/rec/list?uid=8b6321ddbef037034b351cab00081501&num=20",
            headers=HEADERS)
        data_json = json.loads(response.text)
        data_url = data_json['data']
        for data in data_url:
            urls.append(f"https://douyu.com/{data['roomId']}")
    print(f"Collected {len(urls)} live-room links in total")
    # Note: a single try wraps the whole loop, so one failed room aborts the rest.
    try:
        for i in range(len(urls)):
            ex = Extractor(threshold=20)
            html = get(urls[i], headers=HEADERS).text
            content = ex.filter_tags(html)
            data = clean_content(ex.getText(content))
            with open(f'E:/c++/毕业设计开发日志/06.文本数据集/娱乐/直播/{i}.txt',
                      'w',
                      encoding='utf-8') as txtfile:
                txtfile.write(data)
            print(f"第{i}个直播间处理完毕")
        print(f'共{i}个直播间处理完毕')
    except Exception as e:
        print(e)
from data_cleaning.Extractor import Extractor
from data_cleaning.content_clean import clean_content

# Quick end-to-end check of Extractor + clean_content on a single CSDN blog post.
cx = Extractor(threshold=90)
html = cx.getHtml(
    "https://blog.csdn.net/Winterto1990/article/details/51220307")
content = cx.filter_tags(html)
# print(content)
s = cx.getText(content)

data = clean_content(s)
print(data)

# TODO: add proxy support to the crawler;
#       add text output;
#       build a text corpus!!
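
The first TODO above asks for proxy support. requests accepts a proxies mapping per call, so the fetches used in these snippets could go through a thin wrapper like the sketch below; the proxy address is a placeholder and fetch is a hypothetical helper, not part of the existing code:

import requests

# Placeholder proxy; substitute a real HTTP/HTTPS proxy address.
PROXIES = {
    "http": "http://127.0.0.1:8080",
    "https": "http://127.0.0.1:8080",
}

def fetch(url, headers=None, proxies=PROXIES, timeout=10):
    """GET a page through the configured proxy and return its text, or None on failure."""
    try:
        res = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
        res.encoding = 'utf-8'
        return res.text if res.ok else None
    except requests.RequestException as e:
        print(e)
        return None
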
from data_cleaning.Extractor import Extractor
from data_cleaning.content_clean import clean_content
from time import sleep

with open("C:/Users/叫乌鸦的少年怪/Desktop/words.txt", 'r', encoding='utf-8') as txt:
    words = txt.read()
word_list = words.split('\n')

for i in range(len(word_list)):
    # The keyword goes into the wd= parameter unescaped; see the quoting sketch
    # after this script.
    url = f"https://www.baidu.com/s?rsv_idx=1&wd={word_list[i]}&fenlei=256&ie=utf-8"
    ex = Extractor(threshold=40)
    html = ex.getHtml(url)
    content = ex.filter_tags(html)
    data = clean_content(ex.getText(content))
    with open(f"E:/c++/毕业设计开发日志/06.文本数据集/搜索引擎/百度/{i}.txt",
              'w',
              encoding='utf-8') as txtfile:
        txtfile.write(data)
    print(f'Baidu result page {i} crawled')
    sleep(8)
print(f'{len(word_list)} Baidu result pages crawled in total')
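
A caveat on the Baidu loop above: the keyword is interpolated into the wd= parameter without escaping. Percent-encoding it with the standard library is safer; a minimal sketch, where build_baidu_url is a hypothetical helper:

from urllib.parse import quote

def build_baidu_url(keyword: str) -> str:
    """Return the same search URL as above with the keyword percent-encoded as UTF-8."""
    return f"https://www.baidu.com/s?rsv_idx=1&wd={quote(keyword)}&fenlei=256&ie=utf-8"

# build_baidu_url("毕业设计")
# -> 'https://www.baidu.com/s?rsv_idx=1&wd=%E6%AF%95%E4%B8%9A%E8%AE%BE%E8%AE%A1&fenlei=256&ie=utf-8'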