import datetime
import json
import random
import re
import time

import pandas as pd
import requests as rq
from bs4 import BeautifulSoup

import crawler_tool  # project-local helpers: url_retry, url_retry_json, clean_html


def chinatime_GET_NEWS_time_threading(decide_time_begin, decide_time_end, q):
    Title = []
    Publish_time = []
    Section = []
    Body = []
    Source = []

    b_time = datetime.datetime.strptime(decide_time_begin, "%Y%m%d%H%M")
    e_time = datetime.datetime.strptime(decide_time_end, "%Y%m%d%H%M")
    #print(b_time, "\n", e_time)
    loop_flag = False

    for page in range(1, 11):
        chinatime_home = "https://www.chinatimes.com/money/total?page=" + str(page) + "&chdtv"

        try:
            print("start collecting ChinaTime page..%s" % page)
            r = crawler_tool.url_retry(chinatime_home)
            soup = BeautifulSoup(r, "lxml")
            time.sleep(5)

            for i in range(len(soup.select("h3.title a"))):
                chinatime_financial_url = "https://www.chinatimes.com" + soup.select("h3.title a")[i]["href"]

                try:
                    r2 = crawler_tool.url_retry(chinatime_financial_url)
                    soup2 = BeautifulSoup(r2, "lxml")
                    r_time = datetime.datetime.strptime(soup2.find("meta", attrs={"name": "pubdate"})["content"], "%Y-%m-%dT%H:%M:%S+08:00")

                    if r_time > b_time:
                        continue

                    if r_time < e_time:
                        print("Web Crawler has collected ChinaTime data from {b_time} to {e_time}".format(b_time= b_time, e_time= e_time))
                        loop_flag = True
                        break

                    else:
                        Publish_time.append(r_time)
                        Title.append(re.sub(r"\s{1,}","",soup2.find("h1").string))
                        Section.append(soup2.find("meta", attrs={"name": "section"})["content"])
                        Source.append(soup2.find("meta", attrs={"name": "source"})["content"])
                        body = soup2.select("div.article-body p")
                        Body.append(crawler_tool.clean_html("".join(str(x) for x in body)))
                        time.sleep(random.uniform(0, 2))
                        print("ChinaTime:", r_time)

                except rq.exceptions.RequestException as e:
                    print("in", e)

        except rq.exceptions.RequestException as e:
            print("home", e)

        if loop_flag:
            break

    df = pd.DataFrame({"Title": Title, "Time": Publish_time, "Section": Section,  "Source": Source, "Body": Body}).sort_values(by=["Time"])
    file_name = "D:/User/Desktop/corpus/news/chinatime/" + decide_time_begin + "_" + decide_time_end + "_chinatime.csv"
    df.to_csv(file_name, encoding="utf-8")
    q.put(df)
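

# Usage sketch (illustrative, not part of the original module): each
# *_GET_NEWS_time_threading function takes a begin/end timestamp in
# "%Y%m%d%H%M" form plus a queue, and q.put()s the scraped DataFrame when it
# finishes, so it can be run on a worker thread. The helper below only
# assumes the standard-library "queue" and "threading" modules.
def _example_run_chinatime(decide_time_begin="202104200830",
                           decide_time_end="202104210830"):
    import queue
    import threading

    q = queue.Queue()
    t = threading.Thread(target=chinatime_GET_NEWS_time_threading,
                         args=(decide_time_begin, decide_time_end, q))
    t.start()
    t.join()
    return q.get()  # the DataFrame assembled by the crawler
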
def setn_GET_NEWS_time_threading(decide_time_begin, decide_time_end, q):
    Title = []
    Publish_time = []
    Section = []
    Body = []
    Source = []

    b_time = datetime.datetime.strptime(decide_time_begin, "%Y%m%d%H%M")
    e_time = datetime.datetime.strptime(decide_time_end, "%Y%m%d%H%M")
    #print(b_time, "\n", e_time)

    loop_flag = False

    for page in range(1, 19):

        print("start collecting Setn page {page}".format(page=page))
        home_url = "https://www.setn.com/ViewAll.aspx?PageGroupID=2&p=" + str(
            page)
        time.sleep(5)

        try:
            r = crawler_tool.url_retry(home_url)
            soup = BeautifulSoup(r, "lxml")

            for i in range(len(soup.select("h3.view-li-title a"))):
                content_url = ("https://www.setn.com/" +
                               soup.select("h3.view-li-title a")[i]["href"])
                r2 = crawler_tool.url_retry(content_url)
                soup2 = BeautifulSoup(r2, "lxml")

                r_time = datetime.datetime.strptime(
                    soup2.find("meta", attrs={"name": "pubdate"})["content"],
                    "%Y-%m-%dT%H:%M:%S")

                if r_time > b_time:
                    continue

                elif r_time < e_time:
                    loop_flag = True
                    print(
                        "Web Crawler has collected Setn data from {b_time} to {e_time}"
                        .format(b_time=b_time, e_time=e_time))
                    break

                else:
                    Section.append(
                        soup2.find("meta", attrs={"property": "og:title"
                                                  })["content"].split("|")[1])
                    Title.append(
                        re.sub(
                            r"\s{1,}", "",
                            soup2.find("meta",
                                       attrs={"property": "og:title"
                                              })["content"].split("|")[0]))
                    Source.append(
                        soup2.find("meta", attrs={"property": "og:title"
                                                  })["content"].split("|")[2])
                    Publish_time.append(r_time)
                    Body.append(
                        crawler_tool.clean_html("".join(
                            str(x) for x in soup2.select("div#Content1 p"))))
                    print("Setn:", r_time)
                    time.sleep(random.uniform(0, 2))

        except rq.exceptions.RequestException as e2:
            print("home", e2)

        if loop_flag:
            break

    df = pd.DataFrame({
        "Title": Title,
        "Time": Publish_time,
        "Section": Section,
        "Source": Source,
        "Body": Body
    })
    file_name = "D:/User/Desktop/corpus/news/setn/" + decide_time_begin + "_" + decide_time_end + "_setn.csv"
    df.to_csv(file_name, encoding="utf-8")
    q.put(df)
def moneyDJ_GET_NEWS_time(decide_time_begin, decide_time_end):
    title = []
    publish_time = []
    body = []
    section = []
    source = []

    loop_flag = False

    b_time = datetime.datetime.strptime(decide_time_begin, "%Y%m%d%H%M")
    e_time = datetime.datetime.strptime(decide_time_end, "%Y%m%d%H%M")

    for page in range(1, 50):
        home_url = "https://www.moneydj.com/KMDJ/News/NewsRealList.aspx?index1=" + str(
            page) + "&a=MB06"
        r = crawler_tool.url_retry(home_url)
        soup = BeautifulSoup(r, "lxml")
        print("start collecting moneyDJ page..%s" % page)
        for i in range(
                len(
                    soup.find("table", attrs={
                        "class": "forumgrid"
                    }).find_all("a"))):
            url = "https://www.moneydj.com" + soup.find(
                "table", attrs={
                    "class": "forumgrid"
                }).find_all("a")[i]["href"]
            r2 = crawler_tool.url_retry(url)
            soup2 = BeautifulSoup(r2, "lxml")
            maindata = soup2.select(
                "article#MainContent_Contents_mainArticle")[0]

            r_time = datetime.datetime.strptime(
                soup2.find('span', attrs={
                    'id': 'MainContent_Contents_lbDate'
                }).text, "%Y/%m/%d %H:%M")

            if r_time > b_time:
                continue

            elif r_time < e_time:
                print(
                    "Web Crawler has collected moneyDJ  from {b_time} to {e_time}"
                    .format(b_time=b_time, e_time=e_time))
                loop_flag = True
                break

            else:
                title_temp = re.sub(r"\s{1,}", "",
                                    soup2.select("h1 span")[0].string)
                body_temp = re.sub(r"\s{1,}", "",
                                   crawler_tool.clean_html(str(maindata)))

                # If the cleaned body is mostly digits (price tables) or
                # mostly markup/ASCII leftovers, fall back to using the title
                # as the body text instead of keeping the junk.
                if len(re.sub(r"[0-9.]", "",
                              body_temp)) / len(body_temp) < 0.5:
                    title.append(title_temp)
                    body.append(title_temp)
                elif len(title_temp) + 100 > len(
                        re.sub(r"[a-zA-Z0-9/,=?:;.{}()#%'&-]", "", body_temp)):
                    title.append(title_temp)
                    body.append(title_temp)
                else:
                    title.append(title_temp)
                    body.append(body_temp)

                publish_time.append(r_time)
                section.append("台股")
                source.append("moneyDJ")
                print("moneyDJ:", r_time)
                time.sleep(random.uniform(0.5, 1.5))

        if loop_flag:
            break
    df = pd.DataFrame({
        "Title": title,
        "Time": publish_time,
        "Section": section,
        "Source": source,
        "Body": body
    })
    file_name = "D:/User/Desktop/corpus/news/temporarily/" + decide_time_begin + "_" + decide_time_end + "_moneyDJ.csv"
    df.to_csv(file_name, encoding="utf-8")
def moneyudn_GET_NEWS_time(decide_time_begin, decide_time_end):
    begin_time = datetime.datetime.today()
    title = []
    publish_time = []
    section = []
    body = []
    source = []

    b_time = datetime.datetime.strptime(decide_time_begin, "%Y%m%d%H%M")
    e_time = datetime.datetime.strptime(decide_time_end, "%Y%m%d%H%M")

    loop_flag = False

    for page in range(1, 100):
        print("start collecting page {page}".format(page=page))
        home_url = "https://money.udn.com/rank/newest/1001/0/" + str(page)
        time.sleep(3)
        try:
            r = crawler_tool.url_retry(home_url)
            soup = BeautifulSoup(r, "lxml")

            for i in range(len(soup.select("td a"))):
                url2 = soup.select("td a")[i]["href"]

                html_page = crawler_tool.url_retry(url2)
                soup2 = BeautifulSoup(html_page, "lxml")
                r_time = datetime.datetime.strptime(
                    soup2.find("meta", attrs={"name": "date"})["content"],
                    "%Y/%m/%d %H:%M:%S")
                if r_time > b_time:
                    continue

                elif r_time < e_time:
                    loop_flag = True
                    print(
                        "Web Crawler has collected money_udn data from {b_time} to {e_time}"
                        .format(b_time=b_time, e_time=e_time))
                    break

                else:
                    sub_section = soup2.select("div#nav a")[-1].string
                    if sub_section == "品味" or sub_section == "會員專區" or sub_section == "兩岸":
                        time.sleep(random.randint(1, 5))
                        continue
                    else:
                        print(r_time)
                        body.append(
                            crawler_tool.clean_html("".join(
                                str(x)
                                for x in soup2.select("div#article_body p "))))
                        publish_time.append(r_time)  # time
                        title.append(
                            re.sub(
                                r"\s{1, }", "",
                                soup2.find("meta",
                                           attrs={"property": "og:title"
                                                  })["content"].split("|")[0]))
                        section.append(sub_section)
                        source.append(soup2.select("div#nav a")[0].string)

                        time.sleep(0.2)
        except rq.exceptions.RequestException as e2:
            print("home", e2)

        if loop_flag:
            break

    df = pd.DataFrame({
        "Title": title,
        "Time": publish_time,
        "Section": section,
        "Source": source,
        "Body": body
    }).sort_values(by=["Time"])
    file_name = "D:/User/Desktop/corpus/news/temporarily/" + decide_time_begin + "_" + decide_time_end + "_moneyudn.csv"
    df.to_csv(file_name, encoding="utf-8")
    print("processing time:", datetime.datetime.today() - begin_time)
    return df
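
# Usage sketch (illustrative): unlike the other non-threading crawlers here,
# moneyudn_GET_NEWS_time returns the DataFrame in addition to writing the CSV:
#
#     df = moneyudn_GET_NEWS_time("202104200830", "202104210830")
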
def anue_GET_NEWS_time(decide_time_begin, decide_time_end):
    # Time elapsed since 2021-01-27 23:59:59 local time (epoch 1611763199).
    dt = datetime.datetime.today() - datetime.datetime.fromtimestamp(
        1611763199)
    # dta: roughly the end of the current day as a Unix timestamp;
    # dtb: start of the 11-day window ending at dta.
    dta = (dt.days + 1) * 86400 + 1611763199
    dtb = str(dta - 11 * 86400 + 1)
    dta = str(dta)

    title = []
    publish_time = []
    section = []
    body = []
    source = []

    b_time = datetime.datetime.strptime(decide_time_begin, "%Y%m%d%H%M")
    e_time = datetime.datetime.strptime(decide_time_end, "%Y%m%d%H%M")

    loop_flag = False

    for page in range(1, 30):
        print("start collecting Anue page {page}".format(page=page))
        home_url = "https://api.cnyes.com/media/api/v1/newslist/category/headline?limit=30&startAt="+dtb+"&endAt="+dta+"&page="\
                   + str(page)
        r = crawler_tool.url_retry_json(home_url)
        time.sleep(5)

        for i in range(len(r["items"]["data"])):

            content_url = "https://news.cnyes.com/news/id/" + str(
                r["items"]["data"][i]["newsId"])
            r2 = crawler_tool.url_retry(content_url)
            soup = BeautifulSoup(r2, "lxml")
            try:
                r_time = datetime.datetime.strptime(
                    soup.find("time").string, "%Y/%m/%d %H:%M")

                if r_time > b_time:
                    continue

                elif r_time < e_time:
                    loop_flag = True
                    print(
                        "---Web Crawler has collected Anue data from {b_time} to {e_time}---"
                        .format(b_time=b_time, e_time=e_time))
                    break

                else:
                    section.append(
                        soup.find("meta", attrs={
                            "property": "og:title"
                        })["content"].split("|")[-1].split("-")[-1])
                    title.append(
                        re.sub(
                            r"\s{1,}", "",
                            soup.find("meta",
                                      attrs={"property": "og:title"
                                             })["content"].split("|")[0]))
                    source.append(
                        soup.find("meta", attrs={
                            "property": "og:title"
                        })["content"].split("|")[-1].split("-")[0])
                    publish_time.append(r_time)
                    body.append(
                        crawler_tool.clean_html("".join(
                            str(x) for x in soup.select("div._2E8y p"))))
                    print("Anue:", r_time)
                    time.sleep(random.uniform(0, 1.5))
            except Exception:
                # skip items whose publish-time element is missing or malformed
                pass
        if loop_flag:
            break

    df = pd.DataFrame({
        "Title": title,
        "Time": publish_time,
        "Section": section,
        "Source": source,
        "Body": body
    })
    file_name = "D:/User/Desktop/corpus/news/temporarily/" + decide_time_begin + "_" + decide_time_end + "_Anue.csv"
    df.to_csv(file_name, encoding="utf-8")
def ctee_GET_NEWS_time_threading(decide_time_begin, decide_time_end, q):

    Title = []
    Publish_time = []
    Section = []
    Body = []
    Source = []

    b_time = datetime.datetime.strptime(decide_time_begin, "%Y%m%d%H%M")
    e_time = datetime.datetime.strptime(decide_time_end, "%Y%m%d%H%M")

    loop_flag = False

    for page in range(1, 11):

        print("start ctee collecting page {page}".format(page=page))
        home_url = "https://m.ctee.com.tw/livenews/all/page/" + str(page)
        time.sleep(5)

        try:
            r = crawler_tool.url_retry(home_url)
            soup = BeautifulSoup(r, "lxml")

            for i in range(len(soup.select("p.now-title "))):
                content_url = soup.select("p.now-title ")[i].find_all(
                    "a")[-1]["href"]
                section = soup.select("p.now-title ")[i].find("span").string
                r_time = datetime.datetime.strptime(
                    str(b_time.year) + "/" + crawler_tool.clean_html(
                        str(soup.select("p.now-title ")[i].find_all("a")
                            [1]).split("|")[-1]), "%Y/ %m/%d %H:%M ")

                if r_time > b_time:
                    continue
                elif r_time < e_time:
                    loop_flag = True
                    print(
                        "collected ctee news from {b_time} to {e_time}".format(
                            b_time=b_time, e_time=e_time))
                    break
                else:
                    r2 = crawler_tool.url_retry(content_url)
                    soup2 = BeautifulSoup(r2, "lxml")

                    if section == "生活" or section == "政治":
                        time.sleep(random.uniform(0, 1.5))
                        continue

                    else:
                        Title.append(soup2.select("span.post-title")[0].string)
                        Section.append(section)
                        Source.append("工商時報")
                        Publish_time.append(r_time)
                        Body.append(
                            crawler_tool.clean_html("".join(
                                str(x)
                                for x in soup2.select("div.entry-content p"))))
                        print("ctee:", r_time)
                        time.sleep(random.uniform(0, 1.5))

        except rq.exceptions.RequestException as e2:
            print("home", e2)

        if loop_flag:
            break

    df = pd.DataFrame({
        "Title": Title,
        "Time": Publish_time,
        "Section": Section,
        "Source": Source,
        "Body": Body
    }).sort_values(by=["Time"])
    file_name = "D:/User/Desktop/corpus/news/ctee/" + decide_time_begin + "_" + decide_time_end + "_ctee.csv"
    df.to_csv(file_name, encoding="utf-8")
    q.put(df)
def cna_GET_NEWS_time(decide_time_begin, decide_time_end):

    title = []
    publish_time = []
    body = []
    section = []
    source = []

    b_time = datetime.datetime.strptime(decide_time_begin, "%Y%m%d%H%M")
    e_time = datetime.datetime.strptime(decide_time_end, "%Y%m%d%H%M")

    for category in ["aie", "asc"]:

        loop_flag = False

        for pageidx in range(1, 6):
            resp = rq.post(
                "https://www.cna.com.tw/cna2018api/api/WNewsList", {
                    "action": "0",
                    "category": category,
                    "pageidx": pageidx,
                    "pagesize": "20"
                })

            j = json.loads(resp.content)
            for i in range(len(j['ResultData']["Items"])):
                r_time = datetime.datetime.strptime(
                    j['ResultData']["Items"][i]["CreateTime"],
                    "%Y/%m/%d %H:%M")

                if r_time > b_time:
                    continue

                elif r_time < e_time:
                    loop_flag = True
                    print(
                        "Web Crawler has collected 中央通訊社  from {b_time} to {e_time}"
                        .format(b_time=b_time, e_time=e_time))
                    break

                else:
                    url = j['ResultData']["Items"][i]["PageUrl"]
                    section.append(j['ResultData']["Items"][i]["ClassName"])
                    title.append(
                        re.sub(r"\s{1, }", "",
                               j['ResultData']["Items"][i]["HeadLine"]))
                    publish_time.append(r_time)
                    soup = BeautifulSoup(rq.get(url).text, "lxml")
                    source.append("中央通訊社")
                    body.append("".join(
                        crawler_tool.clean_html(str(x))
                        for x in soup.select("div.paragraph p")))
                    print("中央通訊社:", category, r_time)
                    time.sleep(random.uniform(0.5, 1.5))
            if loop_flag:
                break

    df = pd.DataFrame({
        "Title": title,
        "Time": publish_time,
        "Section": section,
        "Source": source,
        "Body": body
    })
    file_name = "D:/User/Desktop/corpus/news/temporarily/" + decide_time_begin + "_" + decide_time_end + "_cna.csv"
    df.to_csv(file_name, encoding="utf-8")
def rti_GET_NEWS_time_threading(decide_time_begin, decide_time_end, q):
    title = []
    publish_time = []
    section = []
    body = []
    source = []

    b_time = datetime.datetime.strptime(decide_time_begin, "%Y%m%d%H%M")
    e_time = datetime.datetime.strptime(decide_time_end, "%Y%m%d%H%M")
    #print(b_time, "\n", e_time)

    loop_flag = False

    for page in range(1, 100):
        print("start collecting Rti page {page}".format(page=page))
        home_url = "https://www.rti.org.tw/news/list/categoryId/2/page/" + str(
            page)
        r = crawler_tool.url_retry(home_url)
        soup = BeautifulSoup(r, "lxml")

        time.sleep(5)

        for i in range(len(soup.select("div.main_wrapper ul a"))):
            content_url = "https://www.rti.org.tw" + soup.select(
                "div.main_wrapper ul a")[i]["href"]
            r2 = crawler_tool.url_retry(content_url)
            soup2 = BeautifulSoup(r2, "lxml")
            r_time = datetime.datetime.strptime(
                re.sub("[^0-9]", "",
                       soup2.find("li", attrs={
                           "class": "date"
                       }).string), "%Y%m%d%H%M")

            if r_time > b_time:
                continue

            elif r_time < e_time:
                loop_flag = True
                print(
                    "Web Crawler has collected Rti data from {b_time} to {e_time}"
                    .format(b_time=b_time, e_time=e_time))
                break

            else:
                section.append("財經")
                title.append(
                    re.sub(r"\s{1,}", "",
                           soup2.find("title").string.split("-")[0]))
                source.append(soup2.find("title").string.split("-")[-1])
                publish_time.append(r_time)
                body.append(
                    crawler_tool.clean_html("".join(
                        str(x) for x in soup2.select("article p"))))
                print("Rti:", r_time)
                time.sleep(random.uniform(0, 2))

        if loop_flag:
            break

    df = pd.DataFrame({
        "Title": title,
        "Time": publish_time,
        "Section": section,
        "Source": source,
        "Body": body
    })
    file_name = "D:/User/Desktop/corpus/news/rti/" + decide_time_begin + "_" + decide_time_end + "_rti.csv"
    df.to_csv(file_name, encoding="utf-8")
    q.put(df)
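

# --- Assumed helper interface -------------------------------------------
# "crawler_tool" is a project-local module that is not included in this
# listing. Judging from how it is called above, it exposes url_retry and
# url_retry_json (HTTP GET with retries, returning the response text or the
# parsed JSON) and clean_html (strip tags from an HTML fragment). The
# functions below are only a compatible sketch of that interface, not the
# original implementation; they reuse the imports at the top of this file.

def url_retry(url, retries=3, timeout=10):
    # GET with simple exponential backoff; returns the response body as text.
    for attempt in range(retries):
        try:
            resp = rq.get(url, timeout=timeout)
            resp.raise_for_status()
            return resp.text
        except rq.exceptions.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(2 ** attempt)


def url_retry_json(url, retries=3, timeout=10):
    # Same retry logic, but parse the response body as JSON.
    return json.loads(url_retry(url, retries=retries, timeout=timeout))


def clean_html(fragment):
    # Drop tags from an HTML fragment and collapse the remaining whitespace.
    return re.sub(r"\s+", " ", re.sub(r"<[^>]+>", "", fragment)).strip()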