Example 1
def chinatime_GET_NEWS_time_threading(decide_time_begin, decide_time_end, q):
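    """Crawl the ChinaTimes money section and keep articles whose publish time
    lies between decide_time_end and decide_time_begin (both "%Y%m%d%H%M";
    begin is the newer bound because the listing runs newest-first).
    The result is written to CSV and also put on queue q."""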
    Title = []
    Publish_time = []
    Section = []
    Body = []
    Source = []

    b_time = datetime.datetime.strptime(decide_time_begin, "%Y%m%d%H%M")
    e_time = datetime.datetime.strptime(decide_time_end, "%Y%m%d%H%M")
    #print(b_time, "\n", e_time)
    loop_flag = False

    for page in range(1, 11):
        chinatime_home = "https://www.chinatimes.com/money/total?page=" + str(page) + "&chdtv"

        try:
            print("start collecting ChinaTime page..%s" % page)
            r = crawler_tool.url_retry(chinatime_home)
            soup = BeautifulSoup(r, "lxml")
            time.sleep(5)

            for i in range(len(soup.select("h3.title a"))):
                chinatime_financial_url = "https://www.chinatimes.com" + soup.select("h3.title a")[i]["href"]

                try:
                    r2 = crawler_tool.url_retry(chinatime_financial_url)
                    soup2 = BeautifulSoup(r2, "lxml")
                    r_time = datetime.datetime.strptime(soup2.find("meta", attrs={"name": "pubdate"})["content"], "%Y-%m-%dT%H:%M:%S+08:00")

                    if r_time > b_time:
                        continue

                    if r_time < e_time:
                        print("Web Crawler has collected ChinaTime data from {b_time} to {e_time}".format(b_time= b_time, e_time= e_time))
                        loop_flag = True
                        break

                    else:
                        Publish_time.append(r_time)
                        Title.append(re.sub(r"\s{1,}", "", soup2.find("h1").string))
                        Section.append(soup2.find("meta", attrs={"name": "section"})["content"])
                        Source.append(soup2.find("meta", attrs={"name": "source"})["content"])
                        body = soup2.select("div.article-body p")
                        Body.append(crawler_tool.clean_html("".join(str(x) for x in body)))
                        time.sleep(random.uniform(0, 2))
                        print("ChinaTime:", r_time)

                except rq.exceptions.RequestException as e:
                    print("in", e)

        except rq.exceptions.RequestException as e:
            print("home", e)

        if loop_flag:
            break

    df = pd.DataFrame({"Title": Title, "Time": Publish_time, "Section": Section,  "Source": Source, "Body": Body}).sort_values(by=["Time"])
    file_name = "D:/User/Desktop/corpus/news/chinatime/" + decide_time_begin + "_" + decide_time_end + "_chinatime.csv"
    df.to_csv(file_name, encoding="utf-8")
    q.put(df)
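
Each of these snippets assumes the same module-level imports plus a small project-specific crawler_tool module, none of which are shown. A minimal sketch of what crawler_tool.py could look like follows; the retry count, timeout and cleaning rule are assumptions, not the original implementation.

# Imports the crawler functions rely on (the functions themselves would also
# need "import crawler_tool").
import datetime
import json
import random
import re
import time

import pandas as pd
import requests as rq
from bs4 import BeautifulSoup

# crawler_tool.py -- assumed interface, not the original implementation
def url_retry(url, retries=3, timeout=10):
    """GET url with a few retries and return the response body as text."""
    for attempt in range(retries):
        try:
            resp = rq.get(url, timeout=timeout)
            resp.raise_for_status()
            return resp.text
        except rq.exceptions.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(2)

def url_retry_json(url, retries=3, timeout=10):
    """Like url_retry, but parse the body as JSON."""
    return json.loads(url_retry(url, retries, timeout))

def clean_html(fragment):
    """Remove HTML tags from a markup fragment, keeping the text in between."""
    return re.sub(r"<[^>]+>", "", fragment)
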
Example 2
def tvbs_GET_NEWS_time_threading(decide_time_begin, decide_time_end, q):
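    """Crawl the TVBS money channel for articles published between
    decide_time_end and decide_time_begin ("%Y%m%d%H%M", begin being the
    newer bound), save them to CSV and put the DataFrame on queue q."""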
    title = []
    publish_time = []
    body = []
    section = []
    source = []

    b_time = datetime.datetime.strptime(decide_time_begin, "%Y%m%d%H%M")
    e_time = datetime.datetime.strptime(decide_time_end, "%Y%m%d%H%M")

    r = crawler_tool.url_retry("https://news.tvbs.com.tw/money")
    soup = BeautifulSoup(r, "lxml")

    for i in range(len(soup.select("div.content_center_contxt_box_news a"))):
        url = "https://news.tvbs.com.tw" + soup.select(
            "div.content_center_contxt_box_news a")[i]["href"]
        r2 = crawler_tool.url_retry(url)
        soup2 = BeautifulSoup(r2, "lxml")
        try:
            j = json.loads(
                soup2.find("script", attrs={
                    "type": "application/ld+json"
                }).string)
            r_time = datetime.datetime.strptime(j["datePublished"],
                                                "%Y/%m/%d %H:%M")

            if r_time > b_time:
                continue

            elif r_time < e_time:
                print(
                    "Web Crawler has collected TVBS  from {b_time} to {e_time}"
                    .format(b_time=b_time, e_time=e_time))
                break

            else:
                title.append(
                    re.sub(r"\s{1, }", "", j["headline"].split("│")[0]))
                publish_time.append(r_time)
                body.append(j["articleBody"])
                section.append(j["articleSection"])
                source.append("TVBS新聞網")
                print("TVBS新聞網:", r_time)

                time.sleep(random.uniform(0.5, 1.5))
        except json.decoder.JSONDecodeError:
            print("json.decoder.JSONDecodeError")
    df = pd.DataFrame({
        "Title": title,
        "Time": publish_time,
        "Section": section,
        "Source": source,
        "Body": body
    })
    file_name = "D:/User/Desktop/corpus/news/tvbs/" + decide_time_begin + "_" + decide_time_end + "_tvbs.csv"
    df.to_csv(file_name, encoding="utf-8")
    q.put(df)
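
The *_threading variants take a Queue so several sources can be crawled in parallel and each DataFrame collected once its worker finishes. A minimal usage sketch (the begin/end timestamps are placeholders and the thread handling is not part of the original snippets):

import queue
import threading

import pandas as pd

result_q = queue.Queue()
begin, end = "202101281200", "202101271200"  # placeholders, "%Y%m%d%H%M", newer bound first

workers = [
    threading.Thread(target=chinatime_GET_NEWS_time_threading, args=(begin, end, result_q)),
    threading.Thread(target=tvbs_GET_NEWS_time_threading, args=(begin, end, result_q)),
]
for w in workers:
    w.start()
for w in workers:
    w.join()

# Each crawler puts exactly one DataFrame on the queue; merge them into one corpus.
frames = [result_q.get() for _ in workers]
corpus = pd.concat(frames, ignore_index=True).sort_values(by=["Time"])
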
Example 3
def anue_GET_NEWS_time(decide_time_begin, decide_time_end):
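    """Crawl Anue (cnyes.com) headline news published between decide_time_end
    and decide_time_begin ("%Y%m%d%H%M", begin being the newer bound) and
    save the result to CSV. Non-threaded variant: nothing is returned."""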
    dt = datetime.datetime.today() - datetime.datetime.fromtimestamp(
        1611763199)  # days elapsed since 1/27 (1611763199 = 2021-01-27 23:59:59, UTC+8)
    dta = (dt.days + 1) * 86400 + 1611763199
    dtb = str(dta - 11 * 86400 + 1)
    dta = str(dta)

    title = []
    publish_time = []
    section = []
    body = []
    source = []

    b_time = datetime.datetime.strptime(decide_time_begin, "%Y%m%d%H%M")
    e_time = datetime.datetime.strptime(decide_time_end, "%Y%m%d%H%M")

    loop_flag = False

    for page in range(1, 30):
        print("start collecting Anue page {page}".format(page=page))
        home_url = "https://api.cnyes.com/media/api/v1/newslist/category/headline?limit=30&startAt="+dtb+"&endAt="+dta+"&page="\
                   + str(page)
        r = crawler_tool.url_retry_json(home_url)
        time.sleep(5)

        for i in range(len(r["items"]["data"])):

            content_url = "https://news.cnyes.com/news/id/" + str(
                r["items"]["data"][i]["newsId"])
            r2 = crawler_tool.url_retry(content_url)
            soup = BeautifulSoup(r2, "lxml")
            try:
                r_time = datetime.datetime.strptime(
                    soup.find("time").string, "%Y/%m/%d %H:%M")

                if r_time > b_time:
                    continue

                elif r_time < e_time:
                    loop_flag = True
                    print(
                        "---Web Crawler has collected Anue data from {b_time} to {e_time}---"
                        .format(b_time=b_time, e_time=e_time))
                    break

                else:
                    section.append(
                        soup.find("meta", attrs={
                            "property": "og:title"
                        })["content"].split("|")[-1].split("-")[-1])
                    title.append(
                        re.sub(
                            r"\s{1,}", "",
                            soup.find("meta",
                                      attrs={"property": "og:title"
                                             })["content"].split("|")[0]))
                    source.append(
                        soup.find("meta", attrs={
                            "property": "og:title"
                        })["content"].split("|")[-1].split("-")[0])
                    publish_time.append(r_time)
                    body.append(
                        crawler_tool.clean_html("".join(
                            str(x) for x in soup.select("div._2E8y p"))))
                    print("Anue:", r_time)
                    time.sleep(random.uniform(0, 1.5))
            except Exception:
                # skip articles whose metadata is missing or whose time cannot be parsed
                pass
        if loop_flag:
            break

    df = pd.DataFrame({
        "Title": title,
        "Time": publish_time,
        "Section": section,
        "Source": source,
        "Body": body
    })
    file_name = "D:/User/Desktop/corpus/news/temporarily/" + decide_time_begin + "_" + decide_time_end + "_Anue.csv"
    df.to_csv(file_name, encoding="utf-8")
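
The startAt/endAt arithmetic at the top of anue_GET_NEWS_time is written in raw epoch seconds: 1611763199 is 23:59:59 on 2021-01-27 in UTC+8, the anchor is pushed forward to roughly the end of the current day, and the start is set eleven days earlier. A more explicit restatement of the same window (assuming, like the original, that the machine's local time zone is UTC+8):

import datetime

ANCHOR = 1611763199  # 2021-01-27 23:59:59 local time (UTC+8)

# whole days elapsed between the anchor and now
days_since_anchor = (datetime.datetime.today()
                     - datetime.datetime.fromtimestamp(ANCHOR)).days

end_at = ANCHOR + (days_since_anchor + 1) * 86400   # ~23:59:59 at the end of today
start_at = end_at - 11 * 86400 + 1                  # 00:00:00 ten days ago: an 11-day window

print(datetime.datetime.fromtimestamp(start_at),
      datetime.datetime.fromtimestamp(end_at))
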
Example 4
def ctee_GET_NEWS_time_threading(decide_time_begin, decide_time_end, q):
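    """Crawl the ctee (工商時報) live-news list for articles published between
    decide_time_end and decide_time_begin ("%Y%m%d%H%M", begin being the
    newer bound), skip the 生活 (lifestyle) and 政治 (politics) sections,
    save the result to CSV and put the DataFrame on queue q."""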

    Title = []
    Publish_time = []
    Section = []
    Body = []
    Source = []

    b_time = datetime.datetime.strptime(decide_time_begin, "%Y%m%d%H%M")
    e_time = datetime.datetime.strptime(decide_time_end, "%Y%m%d%H%M")

    loop_flag = False

    for page in range(1, 11):

        print("start ctee collecting page {page}".format(page=page))
        home_url = "https://m.ctee.com.tw/livenews/all/page/" + str(page)
        time.sleep(5)

        try:
            r = crawler_tool.url_retry(home_url)
            soup = BeautifulSoup(r, "lxml")

            for i in range(len(soup.select("p.now-title "))):
                content_url = soup.select("p.now-title ")[i].find_all(
                    "a")[-1]["href"]
                section = soup.select("p.now-title ")[i].find("span").string
                r_time = datetime.datetime.strptime(
                    str(b_time.year) + "/" + crawler_tool.clean_html(
                        str(soup.select("p.now-title ")[i].find_all("a")
                            [1]).split("|")[-1]), "%Y/ %m/%d %H:%M ")

                if r_time > b_time:
                    continue
                elif r_time < e_time:
                    loop_flag = True
                    print(
                        "collected ctee news from {b_time} to {e_time}".format(
                            b_time=b_time, e_time=e_time))
                    break
                else:
                    r2 = crawler_tool.url_retry(content_url)
                    soup2 = BeautifulSoup(r2, "lxml")

                    if section == "生活" or section == "政治":
                        time.sleep(random.uniform(0, 1.5))
                        continue

                    else:
                        Title.append(soup2.select("span.post-title")[0].string)
                        Section.append(section)
                        Source.append("工商時報")
                        Publish_time.append(r_time)
                        Body.append(
                            crawler_tool.clean_html("".join(
                                str(x)
                                for x in soup2.select("div.entry-content p"))))
                        print("ctee:", r_time)
                        time.sleep(random.uniform(0, 1.5))

        except rq.exceptions.RequestException as e2:
            print("home", e2)

        if loop_flag:
            break

    df = pd.DataFrame({
        "Title": Title,
        "Time": Publish_time,
        "Section": Section,
        "Source": Source,
        "Body": Body
    }).sort_values(by=["Time"])
    file_name = "D:/User/Desktop/corpus/news/ctee/" + decide_time_begin + "_" + decide_time_end + "_ctee.csv"
    df.to_csv(file_name, encoding="utf-8")
    q.put(df)
Example 5
def rti_GET_NEWS_time_threading(decide_time_begin, decide_time_end, q):
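    """Crawl Rti (Radio Taiwan International) news from categoryId 2 (tagged
    財經, finance) published between decide_time_end and decide_time_begin
    ("%Y%m%d%H%M", begin being the newer bound), save the result to CSV and
    put the DataFrame on queue q."""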
    title = []
    publish_time = []
    section = []
    body = []
    source = []

    b_time = datetime.datetime.strptime(decide_time_begin, "%Y%m%d%H%M")
    e_time = datetime.datetime.strptime(decide_time_end, "%Y%m%d%H%M")
    #print(b_time, "\n", e_time)

    loop_flag = False

    for page in range(1, 100):
        print("start collecting Rti page {page}".format(page=page))
        home_url = "https://www.rti.org.tw/news/list/categoryId/2/page/" + str(
            page)
        r = crawler_tool.url_retry(home_url)
        soup = BeautifulSoup(r, "lxml")

        time.sleep(5)

        for i in range(len(soup.select("div.main_wrapper ul a"))):
            content_url = "https://www.rti.org.tw" + soup.select(
                "div.main_wrapper ul a")[i]["href"]
            r2 = crawler_tool.url_retry(content_url)
            soup2 = BeautifulSoup(r2, "lxml")
            r_time = datetime.datetime.strptime(
                re.sub("[^0-9]", "",
                       soup2.find("li", attrs={
                           "class": "date"
                       }).string), "%Y%m%d%H%M")

            if r_time > b_time:
                continue

            elif r_time < e_time:
                loop_flag = True
                print(
                    "Web Crawler has collected Rti data from {b_time} to {e_time}"
                    .format(b_time=b_time, e_time=e_time))
                break

            else:
                section.append("財經")
                title.append(
                    re.sub(r"\s{1,}", "",
                           soup2.find("title").string.split("-")[0]))
                source.append(soup2.find("title").string.split("-")[-1])
                publish_time.append(r_time)
                body.append(
                    crawler_tool.clean_html("".join(
                        str(x) for x in soup2.select("article p"))))
                print("Rti:", r_time)
                time.sleep(random.uniform(0, 2))

        if loop_flag:
            break

    df = pd.DataFrame({
        "Title": title,
        "Time": publish_time,
        "Section": section,
        "Source": source,
        "Body": body
    })
    file_name = "D:/User/Desktop/corpus/news/rti/" + decide_time_begin + "_" + decide_time_end + "_rti.csv"
    df.to_csv(file_name, encoding="utf-8")
    q.put(df)
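
Each function persists its DataFrame with df.to_csv(file_name, encoding="utf-8"), which also writes the default integer index as an unnamed first column. A small sketch for reading one of the files back (the path and timestamps below are placeholders following the pattern used above):

import pandas as pd

df = pd.read_csv(
    "D:/User/Desktop/corpus/news/rti/202101281200_202101271200_rti.csv",  # placeholder file name
    index_col=0,              # the unnamed column written by to_csv
    parse_dates=["Time"],     # restore the publish time as datetime64
    encoding="utf-8",
)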