def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("http://www.mkgp.gov.si/si/medijsko_sredisce/sporocila_za_javnost/page/" + str(stran)), "html.parser")
    all_links = soup.find("div", {"id": "mainContainer2"}).find_all("a")
    for links in all_links:
        if links.get("href") is None:
            continue
        if re.match(r"http://www.mkgp.gov.si/si/medijsko_sredisce/novica+", links.get("href")):
            soup = BeautifulSoup(simple_get(links.get("href")), "html.parser")
            naslov = soup.find("div", {"class": "article"}).find("h2").text
            datum = soup.find("time").text.split()
            datum = uniformDateStr("".join((datum[0], datum[1], datum[2])))
            vse = soup.find("div", {"class": "article"}).find_all("p")
            vsebina = ""
            for obj in vse:
                vsebina += str(obj.text).strip() + "\n"
            link = links.get("href")
            hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
            date_downloaded = todayDateStr  # date when the article was downloaded
            if sqlBase.getByHash(hashStr) is None:
                description = vsebina
                entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
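# All of the scrapers in this file lean on the same small helper set
# (simple_get, uniformDateStr, makeHash, dbExecutor) defined elsewhere in the
# repository. A minimal, hypothetical sketch of the first three, inferred only
# from how they are called here (names, defaults, and hash choice are assumptions):
import datetime
import hashlib

import requests


def simple_get(url):
    # fetch a page and return its html text, or None when the request fails
    try:
        resp = requests.get(url, timeout=30)
        return resp.text if resp.status_code == 200 else None
    except requests.RequestException:
        return None


def uniformDateStr(dateStr, dateFormat="%d.%m.%Y"):
    # normalize a scraped date string into the uniform "%Y-%m-%d" format
    return datetime.datetime.strptime(dateStr.strip(), dateFormat).strftime("%Y-%m-%d")


def makeHash(title, dateStr=""):
    # article identity used for deduplication (HASH_VREDNOST)
    return hashlib.sha1((title + dateStr).encode("utf-8")).hexdigest()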
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("http://raz.um.si/novice-in-dogodki/Strani/default.aspx#"), "html.parser")
    all_links = soup.find_all("a")
    for links in all_links:
        if links.get("href") is None:
            continue
        if re.match(r"/novice-in-dogodki/novica/Strani/+", links.get("href")):
            soup = BeautifulSoup(simple_get(parent_link + links.get("href")), "html.parser")
            naslov = soup.find("div", {"class": "naslov_novice"}).text.strip()
            datum = soup.find("div", {"class": "year"}).text.strip().split()
            vsebina = soup.find("div", {"class": "teloNovice"}).text.strip()
            datum = uniformDateStr(datum[1])
            link = links.get("href")
            hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
            date_downloaded = todayDateStr  # date when the article was downloaded
            if sqlBase.getByHash(hashStr) is None:
                description = vsebina
                entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
def get_text(stran, SOURCE_ID, startID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get(startID), "html.parser")
    all_links = soup.find_all("a")
    for links in all_links:
        if links.get("href") is None:
            continue
        if (re.match(r"http://www.zgs.si/aktualno/sporocila_za_javnost/news_article/+", links.get("href"))
                or re.match(r"http://www.zgs.si/aktualno/novice/news_article/+", links.get("href"))):
            soup = BeautifulSoup(simple_get(links.get("href")), "html.parser").find("div", {"class": "news news-single"})
            naslov = soup.find("h3").text
            datum = soup.find("time").text.split()
            datum = uniformDateStr(datum[0])
            vse = soup.find_all("p")
            vsebina = ""
            for obj in vse:
                vsebina += str(obj.text) + "\n"
            link = links.get("href")
            hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
            date_downloaded = todayDateStr  # date when the article was downloaded
            if sqlBase.getByHash(hashStr) is None:
                # get article description/content
                description = vsebina
                # (date_created, caption, contents, date, hash, url, source)
                entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(
        simple_get("https://www.cerknica.si/GetPosts?page=" + str(stran) +
                   "&keywords=&region_id=&municipality_id=31&category_id=8&subcategory_id=&highlighted=False&datum_objave="),
        "html.parser")
    all_links = soup.find_all("a")
    tmp = 0  # cycle 0..3 so only one of the repeated anchors per post is followed
    for links in all_links:
        if links.get("href") is None:
            continue
        if re.match(r"/objava/+", links.get("href")) and tmp == 0:
            try:
                soup = BeautifulSoup(simple_get(parent_link + links.get("href")), "html.parser")
                naslov = soup.find("h1", {"id": "main_title"}).text.strip()
                short_info = soup.find("div", {"id": "short_info"})
                if short_info is None or short_info == '':
                    datum = todayDateStr
                else:
                    try:
                        datum = uniformDateStr(short_info.text.strip())
                    except ValueError:
                        datum = todayDateStr
                vse = soup.find("div", {"class": "tab-content"}).find_all("p")
                vsebina = ""
                for obj in vse:
                    vsebina += str(obj.text).strip() + "\n"
                link = parent_link + links.get("href")
                hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
                date_downloaded = todayDateStr  # date when the article was downloaded
                if sqlBase.getByHash(hashStr) is None:
                    description = vsebina
                    entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                    sqlBase.insertOne(entry)  # insert the article in the database
                    print("Inserted successfully")
            except TypeError as e:
                print(e)
        if tmp < 3:
            tmp += 1
        else:
            tmp = 0
        print(tmp)
def main():
    articlesChecked = 0  # number of checked articles
    articlesDownloaded = 0  # number of downloaded articles
    # optionally set headers for the http request
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    with requests.Session() as s:
        # retry every http/https request up to MAX_HTTP_RETRIES times before raising an error
        s.mount("http://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.mount("https://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.headers.update(HEADERS)  # set headers of the session
        resp = s.get(POSTS_URL)
        soup = bs.BeautifulSoup(resp.text, "html.parser")
        articlesList = list()
        articlesList = getCurrentNews(s, soup, articlesList)
        articlesList = getArchivedNews(s, soup, articlesList)
        for article in articlesList:
            articlesChecked += 1
            try:
                title = article[0]
                link = article[1]
                shortDate = article[2]
                hashStr = makeHash(title, shortDate)
                # if the article is not yet saved in the database, add it
                if sqlBase.getByHash(hashStr) is None:
                    # get article description/content
                    description, dateStr = getArticleDescrAndDate(s, link)
                    date_created = uniformDateStr(dateStr, "%d.%m.%Y")
                    date_downloaded = todayDateStr  # date when the article was downloaded
                    # (date_created, caption, contents, date, hash, url, source)
                    entry = (date_created, title, description, date_downloaded, hashStr, link, SOURCE_ID)
                    sqlBase.insertOne(entry, True)  # insert the article in the database
                    articlesDownloaded += 1
                if articlesChecked % 5 == 0:
                    logger.info("Checked: {} articles. Downloaded: {} new articles.".format(articlesChecked, articlesDownloaded))
                if not firstRunBool and articlesChecked >= NUMBER_ARTICLES_TO_CHECK:
                    break
            except Exception:
                logger.error("Url on which the error occurred: {}".format(resp.url))
                logger.exception("")
                sys.exit()
    logger.info("Downloaded {} new articles.".format(articlesDownloaded))
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("https://ptujinfo.com/lokalno?page=0%2C0%2C" + str(stran)), "html.parser")
    all_links = soup.find("div", {"class": "view__content"}).find_all("a")
    tmp = 0  # cycle 0..2 so only one of the repeated anchors per article is followed
    for links in all_links:
        if links.get("href") is None:
            continue
        if re.match(r"/novica/lokalno+", links.get("href")) and tmp == 2:
            soup = BeautifulSoup(simple_get(parent_link + links.get("href")), "html.parser")
            naslov = soup.find("div", {"class": "before-main__left"}).find("h1").text
            datum = soup.find("div", {"class": "before-main__left"}).find("span", {"class": "date"}).text.split()
            datum = uniformDateStr("".join((datum[0], meseci[datum[1].lower() + ","], datum[2])))
            podnaslov = soup.find("div", {"class": "field field--name-field-podnaslov"})
            podnaslov = podnaslov.text.strip() if podnaslov is not None else ""
            besedilo = soup.find("div", {"class": "field field--name-field-besedilo"})
            besedilo = besedilo.text.strip() if besedilo is not None else ""
            vsebina = podnaslov + "\n" + besedilo
            link = links.get("href")
            hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
            date_downloaded = todayDateStr  # date when the article was downloaded
            if sqlBase.getByHash(hashStr) is None:
                entry = (datum, naslov, vsebina, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
        if tmp < 2:
            tmp += 1
        else:
            tmp = 0
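# Several scrapers above and below index into a module-level meseci dictionary
# to turn month names into numbers before date parsing. The real table lives
# elsewhere in the repository; a plausible sketch consistent with the lookups
# here (values chosen so the joined string parses as "%d.%m.%Y"). Some call
# sites append a trailing comma to the scraped month name, and the English-
# language sources suggest the real table likely also carries English names.
meseci = {
    "januar": "1.", "februar": "2.", "marec": "3.", "april": "4.",
    "maj": "5.", "junij": "6.", "julij": "7.", "avgust": "8.",
    "september": "9.", "oktober": "10.", "november": "11.", "december": "12.",
}
meseci.update({k + ",": v for k, v in meseci.items()})  # comma-suffixed variants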
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("https://innorenew.eu/news/#"), "html.parser")
    all_links = soup.find("div").find_all("a")
    tmp = 0  # cycle 0..2 so only one of the repeated anchors per article is followed
    for links in all_links:
        if links.get("href") is None:
            continue
        if re.match(r"https://innorenew.eu/[0-9][0-9][0-9][0-9/]+", links.get("href")) and tmp == 0:
            soup = BeautifulSoup(simple_get(links.get("href")), "html.parser")
            n_soup = soup.find("div", {"class": "col-md-7 col-md-push-5"})
            naslov = n_soup.find("h2").text.strip()
            d_soup = soup.find("aside", {"class": "col-md-3 col-md-pull-7"})
            soup = soup.find("article")
            dat = d_soup.find_all("p")
            datum = dat[1].text.split()
            datum = uniformDateStr("".join((datum[2], meseci[str(datum[1]).lower() + ","], datum[3])), "%d,%m.%Y")
            vse = soup.find_all("p")
            vsebina = ""
            tmp_bool = True  # skip the first paragraph of the article
            for obj in vse:
                if tmp_bool:
                    tmp_bool = False
                    continue
                vsebina += str(obj.text).strip() + "\n"
            link = links.get("href")
            hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
            date_downloaded = todayDateStr  # date when the article was downloaded
            if sqlBase.getByHash(hashStr) is None:
                # get article description/content
                description = vsebina
                # (date_created, caption, contents, date, hash, url, source)
                entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
        if tmp < 2:
            tmp += 1
        else:
            tmp = 0
def get_first_text(SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("https://www.vestnik.si/aktualno"), "html.parser")
    news = soup.find_all("div", {"class": "row w-clearfix"})
    for n in news:
        all_links = n.find_all("a")
        tmp = 0  # cycle 0..1 so only one of the paired anchors per article is followed
        for links in all_links:
            if links.get("href") is None:
                continue
            if re.match(r".*-[0-9]+", links.get("href")) and tmp == 0:
                try:
                    soup = BeautifulSoup(simple_get(parent_link + links.get("href")), "html.parser")
                    naslov = soup.find("div", {"class": "main-column"}).find("h1").text.strip()
                    datum = soup.find_all("div", {"class": "avtordatum"})[1].text
                    datum = uniformDateStr(datum)
                    vse = soup.find("div", {"class": "main-col w-clearfix"}).find_all("p")
                    vsebina = ""
                    for obj in vse:
                        vsebina += str(obj.text).strip() + "\n"
                    link = parent_link + links.get("href")
                    hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
                    date_downloaded = todayDateStr  # date when the article was downloaded
                    if sqlBase.getByHash(hashStr) is None:
                        description = vsebina
                        entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                        sqlBase.insertOne(entry)  # insert the article in the database
                        print("Inserted successfully")
                except TypeError as e:
                    print(e)
            if tmp < 1:
                tmp += 1
            else:
                tmp = 0
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    tmp = 0  # cycle 0..2 so only one of the repeated anchors per article is followed
    soup = BeautifulSoup(simple_get("https://www.tednik.si/tednik?limit=9&start=" + str(stran)), "html.parser")
    all_links = soup.find("div", {"class": "t3-content"}).find_all("a")
    for links in all_links:
        if links.get("href") is None:
            continue
        if re.match(r"/tednik/+", links.get("href")) and tmp == 0:
            soup = BeautifulSoup(simple_get(parent_link + links.get("href")), "html.parser")
            naslov = soup.find("h1", {"class": "article-title"}).text.strip()
            datum = soup.find("div", {"class": "article-main"}).find("time").text.strip().split()
            vse = soup.find("div", {"class": "article-content-main"}).find_all("p")
            vsebina = ""
            for obj in vse:
                vsebina += str(obj.text).strip() + "\n"
            datum = uniformDateStr("".join((datum[1] + ".", meseci[datum[2].lower()], datum[3])))
            link = links.get("href")
            hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
            date_downloaded = todayDateStr  # date when the article was downloaded
            if sqlBase.getByHash(hashStr) is None:
                description = vsebina
                entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
        if tmp < 2:
            tmp += 1
        else:
            tmp = 0
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("https://www.pzs.si/novice.php?stran=" + str(stran) + "&limit=20&iskanec=&tip=1"), "html.parser")
    all_links = soup.find_all("table", {"class": "round7"})[4].find_all("a")
    tmp = 0  # cycle 0..1 so only one of the paired anchors per article is followed
    for links in all_links:
        if links.get("href") is None:
            continue
        if re.match(r"novice\.php\?pid=[0-9]+", links.get("href")) and tmp == 0:
            soup = BeautifulSoup(simple_get(parent_link + links.get("href")), "html.parser")
            soup = soup.find_all("table", {"class": "round7"})[4].find("div", {"class": "poravnava"})
            naslov = soup.find("h2").text
            datum = soup.find("b").text.split(",")[1].split()
            datum = uniformDateStr("".join((datum[0], meseci[datum[1] + ","], datum[2])))
            vse = soup.find("table").find_all("tr")
            tm = 0
            vsebina = ""
            for t in vse:
                if t.find("td", {"class": "poravnava"}):
                    vsebina += vse[tm - 1].text.strip() + "\n"
                    vsebina += t.find("td", {"class": "poravnava"}).text.strip()
                tm += 1
            link = links.get("href")
            hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
            date_downloaded = todayDateStr  # date when the article was downloaded
            if sqlBase.getByHash(hashStr) is None:
                # get article description/content
                description = vsebina
                # (date_created, caption, contents, date, hash, url, source)
                entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
        if tmp < 1:
            tmp += 1
        else:
            tmp = 0
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("https://www.notranjski-park.si/novice-dogodki?cur_page=" + str(stran)), "html.parser")
    all_links = soup.find_all("a")
    for links in all_links:
        if links.get("href") is None:
            continue
        if re.match(r"https://www.notranjski-park.si/novice-dogodki/+", links.get("href")):
            soup = BeautifulSoup(simple_get(links.get("href")), "html.parser")
            naslov = soup.find("h1").text
            datum = soup.find("li", {"class": "date"}).text.split()
            if len(datum) == 0:
                datum = "2018-01-01"  # fallback when the page carries no date
            elif len(datum) == 3 and datum[1] == "ob":
                datum = uniformDateStr(datum[0])
            else:
                datum = uniformDateStr("".join((datum[0], meseci[datum[1]], datum[2])))
            vse = soup.find("div", {"class": "container small"}).find_all("p")
            vsebina = ""
            for obj in vse:
                vsebina += str(obj.text).strip() + "\n"
            link = links.get("href")
            hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
            date_downloaded = todayDateStr  # date when the article was downloaded
            if sqlBase.getByHash(hashStr) is None:
                # get article description/content
                description = vsebina
                # (date_created, caption, contents, date, hash, url, source)
                entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    all_links = ""
    try:
        soup = BeautifulSoup(simple_get("https://www.vestnik.si/aktualno?page=" + str(stran)), "html.parser")
        all_links = soup.find("div", {"class": "content-container"}).find_all("a")
    except TypeError as e:
        print(e)
    for links in all_links:
        try:
            soup = BeautifulSoup(simple_get(parent_link + links.get("href")), "html.parser")
            try:
                naslov = soup.find("div", {"class": "main-column"}).find("h1").text.strip()
                datum = soup.find_all("div", {"class": "avtordatum"})[1].text
                datum = uniformDateStr(datum)
                vse = soup.find("div", {"class": "main-col w-clearfix"}).find_all("p")
                vsebina = ""
                for obj in vse:
                    vsebina += str(obj.text).strip() + "\n"
                link = parent_link + links.get("href")
                hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
                date_downloaded = todayDateStr  # date when the article was downloaded
            except AttributeError as e:
                print(e)
                continue  # skip this article if parsing failed, otherwise the names below are unbound
            if sqlBase.getByHash(hashStr) is None:
                description = vsebina
                entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
        except TypeError as e:
            print(e)
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("http://notranjskoprimorske.si/kategorije/vse-novice/page/" + str(stran)), "html.parser")
    all_links = soup.find_all("a")
    tmp = 0  # cycle 0..1 so only one of the paired anchors per article is followed
    for links in all_links:
        if links.get("href") is None:
            continue
        if re.match(r"http://notranjskoprimorske.si/[0-9][0-9][0-9][0-9]/+", links.get("href")) and tmp == 0:
            soup = BeautifulSoup(simple_get(links.get("href")), "html.parser")
            naslov = soup.find("h1", {"class": "entry-title"}).text
            datum = soup.find("span", {"class": "td-post-date"}).find("time").text.split()
            datum = uniformDateStr("".join((datum[1], meseci[datum[2]], datum[3])))
            vse = soup.find("div", {"class": "td-post-content"}).find_all("p")
            vsebina = ""
            for obj in vse:
                vsebina += str(obj.text).strip() + "\n"
            link = links.get("href")
            hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
            date_downloaded = todayDateStr  # date when the article was downloaded
            if sqlBase.getByHash(hashStr) is None:
                description = vsebina
                entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
        if tmp < 1:
            tmp += 1
        else:
            tmp = 0
def get_text(stran, SOURCE_ID, leto):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("https://www.pomurec.com/go/203/arhiv/" + str(leto) + "/" + str(stran)), "html.parser")
    all_links = soup.find("div", {"id": "vsebina"}).find_all("a")
    tmp = 0  # cycle 0..3 so only one of the repeated anchors per article is followed
    for links in all_links:
        print(links.get("href"))
        if links.get("href") is None:
            continue
        if re.match(r"/vsebina/+", links.get("href")) and tmp == 0:
            soup = BeautifulSoup(simple_get(parent_link + links.get("href")), "html.parser")
            naslov = soup.find("div", {"id": "vsebina"}).find("h3").text.strip()
            datum = soup.find("div", {"id": "vsebina"}).find("p", {"class": "datum"}).text.strip().split(",")
            if len(datum) == 1:
                if datum[0] == '':
                    # no date on the page; keep the tmp cycle consistent and skip
                    if tmp < 3:
                        tmp += 1
                    else:
                        tmp = 0
                    continue
                datum = datum[0].split()
                datum = uniformDateStr(datum[0])
            elif len(datum) == 2:
                datum = datum[1].split()
                datum = uniformDateStr(datum[0])
            vse = soup.find("div", {"id": "vsebina"}).find("ul", {"id": "clanek"}).find_all("p")
            vsebina = ""
            for obj in vse:
                vsebina += str(obj.text).strip() + "\n"
            link = links.get("href")
            hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
            date_downloaded = todayDateStr  # date when the article was downloaded
            if sqlBase.getByHash(hashStr) is None:
                description = vsebina
                entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
        if tmp < 3:
            tmp += 1
        else:
            tmp = 0
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("http://www.danube-region.eu/communication/news?start=" + str(stran)), "html.parser")
    all_links = soup.find("main", {"id": "content"}).find_all("a")
    for links in all_links:
        if links.get("href") is None:
            continue
        if re.match(r"/communication/news/+", links.get("href")):
            soup = BeautifulSoup(simple_get(parent_link + links.get("href")), "html.parser")
            naslov = soup.find("div", {"class": "page-header"}).find("h2").text.strip().split("\n")[0]
            datum = soup.find("dd", {"class": "published"}).text.split(":")[1].split()
            datum = uniformDateStr("".join((datum[0] + ".", meseci[datum[1] + ","], datum[2])))
            vse = soup.find("div", {"class": "item-page"}).find_all("p")
            vsebina = ""
            for p in vse:
                vsebina += p.text.strip()
            link = links.get("href")
            hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
            date_downloaded = todayDateStr  # date when the article was downloaded
            if sqlBase.getByHash(hashStr) is None:
                # get article description/content
                description = vsebina
                # (date_created, caption, contents, date, hash, url, source)
                entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("http://www.bistra.si/aktualne-novice/novice?start=" + str(stran)), "html.parser")
    all_links = soup.find_all("a")
    for links in all_links:
        if links.get("href") is None:
            continue
        if re.match(r"/aktualne-novice/novice/+", links.get("href")):
            soup = BeautifulSoup(simple_get(parent_link + links.get("href")), "html.parser")
            content = str(soup.find("div", {"class": "articleBody"}).text)
            title = str(soup.find("div", {"class": "entry-header"}).text)
            datestr = str(soup.find("dd", {"class": "create"}).text)
            title = re.sub(r'\t+', '', title)
            title = re.sub(r'\n+', '', title)
            datestr = re.sub(r'\t+', '', datestr)
            datestr = re.sub(r'\n+', '', datestr)
            datestr = datestr.split()
            datestr = ''.join([datestr[1], meseci[datestr[2]], datestr[3]])
            datestr = uniformDateStr(datestr, "%d.%m.%Y")
            link = parent_link + links.get("href")
            hashStr = makeHash(title, datestr)  # creates article hash from title and dateStr (HASH_VREDNOST)
            date_downloaded = todayDateStr  # date when the article was downloaded
            if sqlBase.getByHash(hashStr) is None:
                # get article description/content
                description = content
                entry = (datestr, title, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("https://www.rtvslo.si/lokalne-novice/ljubljana/arhiv/?&page=" + str(stran)), "html.parser")
    all_links = soup.find_all("a", {"class": "title"})
    prev_link = ""
    for links in all_links:
        if prev_link == links:  # skip consecutive duplicate anchors
            continue
        prev_link = links
        if links.get("href") is None:
            continue
        if not re.match(r"http://+", links.get("href")):
            print(parent_link + links.get("href"))
            soup = BeautifulSoup(simple_get(parent_link + links.get("href")), "html.parser")
            tmp = soup.find("div", {"id": "newsbody"})
            if tmp is None:
                continue
            naslov = str(tmp.find("h1").text)
            vse = tmp.find_all("p")
            vsebina = ""
            for obj in vse:
                vsebina += str(obj.text) + "\n"
            dat = str(tmp.find("div", {"class": "info"}).text).split()
            datum = uniformDateStr("".join((dat[0], meseci[dat[1]], dat[2])))
            link = parent_link + links.get("href")
            hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
            date_downloaded = todayDateStr  # date when the article was downloaded
            if sqlBase.getByHash(hashStr) is None:
                # get article description/content
                description = vsebina
                # (date_created, caption, contents, date, hash, url, source)
                entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("https://www.izvoznookno.si/aktualno?p=" + str(stran) + "&t1=1&c=&s=&pn=&t4=4&q="), "html.parser")
    all_links = soup.find_all("a")
    for links in all_links:
        if links.get("href") is None:
            continue
        if re.match(r"/Aktualno/+", links.get("href")):
            soup = BeautifulSoup(simple_get(parent_link + links.get("href")), "html.parser")
            naslov = soup.find("article", {"class": "col-md-12"}).find("h1").text
            datum = soup.find("article", {"class": "col-md-12"}).find("date").text.split()
            datum = uniformDateStr("".join((datum[0], meseci[datum[1]], datum[2])))
            vsebina = soup.find("div", {"class": "user-html"}).text
            link = parent_link + links.get("href")
            hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
            date_downloaded = todayDateStr  # date when the article was downloaded
            if sqlBase.getByHash(hashStr) is None:
                description = vsebina
                entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("http://p-tech.si/category/novice/page/" + str(stran)), "html.parser")
    all_links = soup.find_all("div")
    for links in all_links:
        if links.get("id") is None:
            continue
        if re.match(r"post+", links.get("id")):
            naslov = links.find("h2").text
            datum = links.find("span", {"class": "published"}).text.split()
            datum = uniformDateStr("".join((datum[0], meseci[datum[1]], datum[2])))
            hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
            if sqlBase.getByHash(hashStr) is None:
                vsebina = links.find("div", {"class": "page-content"}).text.strip()
                link = links.find("a").get("href")
                date_downloaded = todayDateStr  # date when the article was downloaded
                # get article description/content
                description = vsebina
                # (date_created, caption, contents, date, hash, url, source)
                entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                sqlBase.insertOne(entry)  # insert the article in the database
                print("Inserted successfully")
def main():
    articlesChecked = 0  # number of checked articles
    articlesDownloaded = 0  # number of downloaded articles
    # optionally set headers for the http request
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    with requests.Session() as s:
        # retry every http/https request up to MAX_HTTP_RETRIES times before raising an error
        s.mount("http://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.mount("https://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.headers.update(HEADERS)  # set headers of the session
        pages = [BASE_URL + "/sl/informacija.asp?id_meta_type=54&type_informacij=0",
                 BASE_URL + "/sl/informacija.asp?id_meta_type=68&type_informacij=0"]
        resp = s.get(pages[0])
        soup = bs.BeautifulSoup(resp.text, "html.parser")
        allNewsLinksHtml = soup.find_all("span", class_="vec")
        resp = s.get(pages[1])
        soup = bs.BeautifulSoup(resp.text, "html.parser")
        allNewsLinksHtml2 = soup.find_all("span", class_="vec")
        # interleave the two lists into one:
        # list1: [1,2,3,4]
        # list2: [5,6,7,8]
        # output list: [5,1,6,2,7,3,8,4]
        for num, pg in enumerate(allNewsLinksHtml2):
            allNewsLinksHtml.insert(num * 2, pg)
        for newsLink in allNewsLinksHtml:
            try:
                articlesChecked += 1
                link = BASE_URL + "/sl/" + newsLink.find("a")["href"]
                resp = s.get(link)
                soup = bs.BeautifulSoup(resp.text, "html.parser")
                subPage = soup.find("div", class_="Vsebina")
                title = subPage.find("h2").text
                dateStr = parseDate(subPage.find_all("p"))
                hashStr = makeHash(title, dateStr)
                description = subPage.text
                date_created = uniformDateStr(dateStr, "%d.%m.%Y")  # date when the article was published on the page
                date_downloaded = todayDateStr  # date when the article was downloaded
                # if the article is not yet saved in the database, add it
                if sqlBase.getByHash(hashStr) is None:
                    # (date_created, caption, contents, date, hash, url, source)
                    entry = (date_created, title, description, date_downloaded, hashStr, link, SOURCE_ID)
                    sqlBase.insertOne(entry, True)  # insert the article in the database
                    articlesDownloaded += 1
                if articlesChecked % 5 == 0:
                    logger.info("Checked: {} articles. Downloaded: {} new articles.".format(articlesChecked, articlesDownloaded))
                if not firstRunBool and articlesChecked >= NUMBER_ARTICLES_TO_CHECK:
                    break
            except Exception:
                logger.error("Url on which the error occurred: {}".format(resp.url))
                logger.exception("")
                sys.exit()
    logger.info("Downloaded {} new articles.".format(articlesDownloaded))
def main():
    pagesChecked = 0  # number of checked pages
    articlesChecked = 0  # number of checked articles
    articlesDownloaded = 0  # number of downloaded articles
    # optionally set headers for the http request
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    # creates a session
    with requests.Session() as s:
        # retry every http/https request up to MAX_HTTP_RETRIES times before raising an error
        s.mount("http://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.mount("https://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.headers.update(HEADERS)  # set headers of the session
        resp = s.get(BASE_URL + "/mediji/Novice/ArticlePage/1")
        soup = bs.BeautifulSoup(resp.text, "html.parser")
        logger.info("Checking page 1")
        lastPageNum = int(soup.find("div", class_="pager").find_all("a")[-2].text)
        for pageNum in range(2, lastPageNum + 1, 1):
            try:
                pagesChecked += 1
                # find all ~15 articles on the current page
                articles = soup.find_all("div", class_="article")
                for article in articles:
                    articlesChecked += 1
                    title = article.find("h1", class_="heading").find("a").text  # article title
                    link = article.find("h1", class_="heading").find("a")["href"]  # article http link
                    dateStr = article.find("div", class_="metadata")  # article date (DATUM_VNOSA)
                    [x.extract() for x in dateStr.find_all('span', class_="pull-right")]
                    date_created = parseDate(dateStr.text)
                    hashStr = makeHash(title, date_created)  # creates article hash from title and dateStr (HASH_VREDNOST)
                    date_downloaded = todayDateStr  # date when the article was downloaded
                    # if the article is not yet saved in the database, add it
                    if sqlBase.getByHash(hashStr) is None:
                        # get article description/content
                        description = getArticleDescr(s, link)
                        # (date_created, caption, contents, date, hash, url, source)
                        entry = (date_created, title, description, date_downloaded, hashStr, link, SOURCE_ID)
                        sqlBase.insertOne(entry, True)  # insert the article in the database
                        articlesDownloaded += 1
                    if articlesChecked % 5 == 0:
                        logger.info("Checked: {} articles. Downloaded: {} new articles.".format(articlesChecked, articlesDownloaded))
                if not firstRunBool and pagesChecked >= NUM_PAGES_TO_CHECK:
                    break
                # find next page
                resp = s.get(BASE_URL + "/mediji/Novice/ArticlePage/" + str(pageNum))
                soup = bs.BeautifulSoup(resp.text, "html.parser")
                logger.info("Checking page: {}".format(pageNum))
            except Exception:
                logger.error("Url on which the error occurred: {}".format(resp.url))
                logger.exception("")
                sys.exit()
    logger.info("Downloaded {} new articles.".format(articlesDownloaded))
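# The main() variants in this file call getArticleDescr(s, link) to pull an
# article's body text. Its definition is not part of this section; a minimal,
# hypothetical sketch of the assumed behavior (the real selector is
# page-specific, so this falls back to the whole page text):
def getArticleDescr(session, link):
    resp = session.get(link)
    soup = bs.BeautifulSoup(resp.text, "html.parser")  # same bs alias as above
    return soup.get_text(separator="\n").strip()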
def main():
    articlesChecked = 0  # number of checked articles
    articlesDownloaded = 0  # number of downloaded articles
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    yearInt = datetime.datetime.now().year
    # optionally set headers for the http request
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    with requests.Session() as s:
        # retry every http/https request up to MAX_HTTP_RETRIES times before raising an error
        s.mount("http://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.mount("https://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.headers.update(HEADERS)  # set headers of the session
        for subPage in ["/obvestila-za-obcane/", "/sporocila-za-javnost/"]:
            logger.info("First checking subpage: {}".format(subPage))
            maxYearToCheck = MAX_YEAR - 1
            if not firstRunBool:
                maxYearToCheck = yearInt - 1
            for yearNum in range(yearInt, maxYearToCheck, -1):
                logger.info("Checking year: {}".format(yearNum))
                pagelink = BASE_URL + subPage + str(yearNum)
                try:
                    resp = s.get(pagelink)
                    soup = bs.BeautifulSoup(resp.text, "html.parser")
                    monthListings = soup.find_all("div", class_="newsArchive-group")
                    for monthListing in monthListings:
                        articleListings = monthListing.find_all("div", class_="news-listing-item")
                        for article in articleListings:
                            articlesChecked += 1
                            title = article.find("h2").text
                            link = BASE_URL + "/" + str(article.find("h2").find("a")["href"])
                            dateStr = article.find("div", class_="news-listing-item-date").text
                            hashStr = makeHash(title, dateStr)
                            date_created = uniformDateStr(dateStr, "%d. %m. %Y")  # date when the article was published on the page
                            date_downloaded = todayDateStr  # date when the article was downloaded
                            # if the article is not yet saved in the database, add it
                            if sqlBase.getByHash(hashStr) is None:
                                # get article description/content
                                description = getArticleDescr(s, link)
                                # (date_created, caption, contents, date, hash, url, source)
                                entry = (date_created, title, description, date_downloaded, hashStr, link, SOURCE_ID)
                                sqlBase.insertOne(entry, True)  # insert the article in the database
                                articlesDownloaded += 1
                            if articlesChecked % 5 == 0:
                                logger.info("Checked: {} articles. Downloaded: {} new articles.".format(articlesChecked, articlesDownloaded))
                except Exception:
                    logger.error("Url on which the error occurred: {}".format(resp.url))
                    logger.exception("")
                    sys.exit()
    logger.info("Downloaded {} new articles.".format(articlesDownloaded))
def main():
    pagesChecked = 0  # number of checked pages
    articlesChecked = 0  # number of checked articles
    articlesDownloaded = 0  # number of downloaded articles
    # optionally set headers for the http request
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    # creates a session
    with requests.Session() as s:
        pageStart = 0  # set at which page (article) to start
        # retry every http/https request up to MAX_HTTP_RETRIES times before raising an error
        s.mount("http://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.mount("https://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.headers.update(HEADERS)  # set headers of the session
        # send a get request to the page (use s.post(...) if a post request is needed)
        resp = s.get(BASE_URL + "/si/novice/")
        # parse the html text of the http response
        soup = bs.BeautifulSoup(resp.text, "html.parser")
        # find the "next page" button link - to import all the news recursively
        try:
            nextPageLink = BASE_URL + "/" + soup.find("div", class_="pager").find_all("a")[-1]["href"]
        except KeyError:
            nextPageLink = None
            logger.exception("")
        while nextPageLink is not None:
            try:
                pagesChecked += 1
                # find all ~15 articles on the current page
                articles = soup.find_all("div", class_="newswrapp")
                for article in articles:
                    articlesChecked += 1
                    textPart = article.find("div", class_="newstext")
                    title = textPart.find("h2").find("a").text  # article title
                    link = BASE_URL + textPart.find("h2").find("a")["href"]  # article http link
                    date_created = parseDate(textPart.find("div", class_="date").text)  # article date (DATUM_VNOSA)
                    hashStr = makeHash(title, date_created)  # creates article hash from title and dateStr (HASH_VREDNOST)
                    date_downloaded = todayDateStr  # date when the article was downloaded
                    # if the article is not yet saved in the database, add it
                    if sqlBase.getByHash(hashStr) is None:
                        # get article description/content
                        description = getArticleDescr(s, link)
                        # (date_created, caption, contents, date, hash, url, source)
                        entry = (date_created, title, description, date_downloaded, hashStr, link, SOURCE_ID)
                        sqlBase.insertOne(entry, True)  # insert the article in the database
                        articlesDownloaded += 1
                    if articlesChecked % 5 == 0:
                        logger.info("Checked: {} articles. Downloaded: {} new articles.".format(articlesChecked, articlesDownloaded))
                # find next page
                resp = s.get(nextPageLink)  # load the next page
                soup = bs.BeautifulSoup(resp.text, "html.parser")  # add the html text to the soup
                try:
                    # select the "next page" button http link
                    nextPageLink = BASE_URL + "/" + soup.find("div", class_="pager").find_all("a")[-1]["href"]
                except AttributeError:
                    nextPageLink = None
                    logger.exception("")
                except KeyError:
                    nextPageLink = None
                    logger.exception("")
                if not firstRunBool and pagesChecked >= NUM_PAGES_TO_CHECK:
                    break
            except Exception:
                logger.error("Url on which the error occurred: {}".format(resp.url))
                logger.exception("")
                sys.exit()
        resp = s.get(BASE_URL + "/si/dogodki/")
        # parse the html text of the http response
        soup = bs.BeautifulSoup(resp.text, "html.parser")
        pagesChecked += 1
        # find all ~15 articles on the current page
        articles = soup.find_all("div", class_="newswrapp")
        for article in articles:
            try:
                articlesChecked += 1
                textPart = article.find("div", class_="newstext")
                title = textPart.find("h2").find("a").text  # article title
                link = BASE_URL + textPart.find("h2").find("a")["href"]  # article http link
                date_created = ""  # events carry no date (DATUM_VNOSA)
                hashStr = makeHash(title, date_created)  # creates article hash from title and dateStr (HASH_VREDNOST)
                date_created = None
                date_downloaded = todayDateStr  # date when the article was downloaded
                # if the article is not yet saved in the database, add it
                if sqlBase.getByHash(hashStr) is None:
                    # get article description/content
                    description = getArticleDescr(s, link)
                    # (date_created, caption, contents, date, hash, url, source)
                    entry = (date_created, title, description, date_downloaded, hashStr, link, SOURCE_ID)
                    sqlBase.insertOne(entry, True)  # insert the article in the database
                    articlesDownloaded += 1
                if articlesChecked % 5 == 0:
                    logger.info("Checked: {} articles. Downloaded: {} new articles.".format(articlesChecked, articlesDownloaded))
                if not firstRunBool and articlesChecked >= NUM_ARTICLES_TO_CHECK:
                    break
            except Exception:
                logger.exception("")
    logger.info("Downloaded {} new articles.".format(articlesDownloaded))
def main():
    pagesChecked = 0  # number of checked pages
    articlesChecked = 0  # number of checked articles
    articlesDownloaded = 0  # number of downloaded articles
    # optionally set headers for the http request
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    # creates a session
    with requests.Session() as s:
        # retry every http/https request up to MAX_HTTP_RETRIES times before raising an error
        s.mount("http://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.mount("https://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.headers.update(HEADERS)  # set headers of the session
        resp = s.get(SUB_PAGES_URL + "1")
        soup = bs.BeautifulSoup(resp.text, "html.parser")
        # find the "next page" button link - to import all the news recursively
        nextPageLink = soup.find("div", class_="pagination clearfix").find("div", class_="alignleft").find("a")
        pageStart = 1  # set at which page (article) to start
        while nextPageLink is not None:
            try:
                pagesChecked += 1
                # find all ~15 articles on the current page
                articles = soup.find_all("article")
                for article in articles:
                    articlesChecked += 1
                    title = article.find("h2", class_="entry-title").text  # article title
                    link = article.find("h2", class_="entry-title").find("a")["href"]  # article http link
                    dateStr = ""
                    hashStr = makeHash(title, dateStr)  # creates article hash from title and dateStr (HASH_VREDNOST)
                    date_created = None  # date when the article was published on the page
                    date_downloaded = todayDateStr  # date when the article was downloaded
                    # if the article is not yet saved in the database, add it
                    if sqlBase.getByHash(hashStr) is None:
                        # get article description/content
                        description = getArticleDescr(s, link)
                        # (date_created, caption, contents, date, hash, url, source)
                        entry = (date_created, title, description, date_downloaded, hashStr, link, SOURCE_ID)
                        sqlBase.insertOne(entry, True)  # insert the article in the database
                        articlesDownloaded += 1
                    if articlesChecked % 5 == 0:
                        logger.info("Checked: {} articles. Downloaded: {} new articles.".format(articlesChecked, articlesDownloaded))
                # find next page
                try:
                    # select the "next page" button http link
                    nextPageLink = soup.find("div", class_="pagination clearfix").find("div", class_="alignleft").find("a")["href"]
                    logger.debug("Checking page: {}".format(nextPageLink))
                    resp = s.get(nextPageLink)  # load the next page
                    soup = bs.BeautifulSoup(resp.text, "html.parser")  # add the html text to the soup
                except Exception:
                    logger.exception("Can not find next page")
                if not firstRunBool and pagesChecked >= NUM_PAGES_TO_CHECK - 1:
                    break
            except Exception:
                logger.error("Url on which the error occurred: {}".format(resp.url))
                logger.exception("")
                sys.exit()
    logger.info("Downloaded {} new articles.".format(articlesDownloaded))
def main():
    pagesChecked = 0  # number of checked pages
    articlesChecked = 0  # number of checked articles
    articlesDownloaded = 0  # number of downloaded articles
    # optionally set headers for the http request
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    # creates a session
    with requests.Session() as s:
        # retry every http/https request up to MAX_HTTP_RETRIES times before raising an error
        s.mount("http://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.mount("https://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.headers.update(HEADERS)  # set headers of the session
        archiveLinks = getCalArchiveLinks(s)
        if firstRunBool:
            logger.debug("Number of found days to check: {}".format(len(archiveLinks)))
        # iterate over the archived days
        for archiveDay, archiveLink in enumerate(archiveLinks):
            try:
                archiveLinkPage = archiveLink[0]
                logger.info("Downloading page: {}".format(archiveLinkPage))
                resp = s.get(archiveLinkPage)
                soup = bs.BeautifulSoup(resp.text, "html.parser")
                pagesChecked += 1
                articles = soup.find("div", class_="row news").find_all("div", class_="col-sm-4")
                logger.debug("Number of found articles: {}".format(len(articles)))
                for article in articles:
                    articlesChecked += 1
                    title = article.find("span", class_="h2").text  # article title
                    link = BASE_URL + article.find("a")["href"]  # article http link
                    date_created = archiveLink[1]  # article date (DATUM_VNOSA)
                    hashStr = makeHash(title, date_created)  # creates article hash from title and dateStr (HASH_VREDNOST)
                    date_created = uniformDateStr(date_created, "%Y-%m-%d")
                    date_downloaded = todayDateStr  # date when the article was downloaded
                    # if the article is not yet saved in the database, add it
                    if sqlBase.getByHash(hashStr) is None:
                        # get article description/content
                        description = getArticleDescr(s, link)
                        # (date_created, caption, contents, date, hash, url, source)
                        entry = (date_created, title, description, date_downloaded, hashStr, link, SOURCE_ID)
                        sqlBase.insertOne(entry, True)  # insert the article in the database
                        articlesDownloaded += 1
                    if articlesChecked % 5 == 0:
                        logger.info("Checked: {} articles. Downloaded: {} new articles.".format(articlesChecked, articlesDownloaded))
                if not firstRunBool and archiveDay >= NUM_DAYS_TO_CHECK:
                    break
            except Exception:
                logger.error("Url on which the error occurred: {}".format(resp.url))
                logger.exception("")
                sys.exit()
    logger.info("Downloaded {} new articles.".format(articlesDownloaded))
def get_text(stran, SOURCE_ID):
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    soup = BeautifulSoup(simple_get("http://pomurske-novice.si/rubrika/pomurje/page/" + str(stran)), "html.parser")
    all_links = soup.find("div", {"class": "td-ss-main-content"}).find_all("a")
    tmp = 0  # cycle 0..2 so only one of the repeated anchors per article is followed
    for links in all_links:
        if links.get("href") is None:
            continue
        if (re.match(r"http://pomurske-novice.si/+", links.get("href")) and tmp == 0
                and not re.match(r"http://pomurske-novice.si/rubrika/+", links.get("href"))):
            print(links.get("href"))
            try:
                soup = BeautifulSoup(simple_get(links.get("href")), "html.parser")
                naslov = soup.find("h1", {"class": "entry-title"}).text.strip()
                if soup.find("time") is None:
                    datum = todayDateStr
                else:
                    try:
                        datum = soup.find("time").text.strip().split()
                        datum = uniformDateStr("".join((datum[1], meseci[datum[0] + ","], datum[2])))
                    except ValueError:
                        datum = todayDateStr
                vse = soup.find("div", {"class": "td-post-content"}).find_all("p")
                vsebina = ""
                for obj in vse:
                    vsebina += str(obj.text).strip() + "\n"
                link = links.get("href")
                hashStr = makeHash(naslov, datum)  # creates article hash from title and dateStr (HASH_VREDNOST)
                date_downloaded = todayDateStr  # date when the article was downloaded
                if sqlBase.getByHash(hashStr) is None:
                    description = vsebina
                    entry = (datum, naslov, description, date_downloaded, hashStr, link, SOURCE_ID)
                    sqlBase.insertOne(entry)  # insert the article in the database
                    print("Inserted successfully")
            except TypeError as e:
                print(e)
        if tmp < 2:
            tmp += 1
        else:
            tmp = 0
        print(tmp)
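# How the paginated get_text scrapers are driven is not shown in this section;
# a hedged usage sketch, with SOURCE_ID and the page range as illustrative
# assumptions:
if __name__ == "__main__":
    SOURCE_ID = 1  # assumed source identifier for the database
    for stran in range(1, 11):  # assumed: walk the first ten listing pages
        get_text(stran, SOURCE_ID)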
def main():
    articlesChecked = 0  # number of checked articles
    articlesDownloaded = 0  # number of downloaded articles
    # optionally set headers for the http request
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    with requests.Session() as s:
        # retry every http/https request up to MAX_HTTP_RETRIES times before raising an error
        s.mount("http://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.mount("https://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.headers.update(HEADERS)  # set headers of the session
        resp = s.get(POSTS_URL)
        soup = bs.BeautifulSoup(resp.text, "html.parser")
        articlesTitle = soup.find("div", class_="main", id="ResizeBody").find_all("div", class_="blogPostTitle")
        articlesDate = soup.find("div", class_="main", id="ResizeBody").find_all("div", class_="blogPostDate")
        if len(articlesTitle) != len(articlesDate):
            logger.error("Page is different: number of title and date <div> tags is different.")
            sys.exit()
        for num in range(len(articlesTitle)):
            articlesChecked += 1
            try:
                title = articlesTitle[num].find("h2").text
                link = BASE_URL + articlesTitle[num].find("h2").find("a")["href"][2:]
                logger.debug("TITLE: {}\nLINK: {}".format(title.encode("utf-8"), link))
                dateStr = articlesDate[num].text.split(" |")[0]
                hashStr = makeHash(title, dateStr)
                date_created = uniformDateStr(dateStr, "%d.%m.%y")  # date when the article was published on the page
                logger.debug("DATE: {}".format(date_created))
                date_downloaded = todayDateStr  # date when the article was downloaded
                # if the article is not yet saved in the database, add it
                if sqlBase.getByHash(hashStr) is None:
                    # get article description/content
                    description = getArticleDescr(s, link)
                    # (date_created, caption, contents, date, hash, url, source)
                    entry = (date_created, title, description, date_downloaded, hashStr, link, SOURCE_ID)
                    sqlBase.insertOne(entry, True)  # insert the article in the database
                    articlesDownloaded += 1
                if articlesChecked % 5 == 0:
                    logger.info("Checked: {} articles. Downloaded: {} new articles.".format(articlesChecked, articlesDownloaded))
                if not firstRunBool and articlesChecked >= NUMBER_ARTICLES_TO_CHECK:
                    break
            except Exception:
                logger.error("Url on which the error occurred: {}".format(resp.url))
                logger.exception("")
                sys.exit()
    logger.info("Downloaded {} new articles.".format(articlesDownloaded))
def main():
    articlesChecked = 0  # number of checked articles
    articlesDownloaded = 0  # number of downloaded articles
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    yearInt = datetime.datetime.now().year
    # optionally set headers for the http request
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    with requests.Session() as s:
        # retry every http/https request up to MAX_HTTP_RETRIES times before raising an error
        s.mount("http://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.mount("https://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.headers.update(HEADERS)  # set headers of the session
        maxYearToCheck = MAX_YEAR - 1
        if not firstRunBool:
            maxYearToCheck = yearInt - 1
        for yearNum in range(yearInt, maxYearToCheck, -1):
            logger.info("Checking year: {}".format(yearNum))
            yearPageLink = POSTS_URL + str(yearNum)
            try:
                resp = s.get(yearPageLink)
                soup = bs.BeautifulSoup(resp.text, "html.parser")
                articleDates = soup.find("div", id="contentCenterSubjectBlock").find_all("span", class_="contentCenterSubjectBlockDate")
                articleLinks = soup.find("div", id="contentCenterSubjectBlock").find_all("a")
                if len(articleDates) != len(articleLinks):
                    logger.error("Page is different: can not find the same number of dates and links.")
                    sys.exit(1)
                for num in range(len(articleDates)):
                    articlesChecked += 1
                    title = articleLinks[num].text
                    link = BASE_URL + "/" + articleLinks[num]["href"]
                    dateStr = articleDates[num].text.strip(" ").strip("\n").strip("\r")
                    hashStr = makeHash(title, dateStr)
                    logger.debug("TITLE: {}".format(title.encode("utf-8")))
                    logger.debug("LINK: {}".format(link))
                    logger.debug("DATE: {}".format(dateStr))
                    date_created = uniformDateStr(dateStr, "%d.%m.%Y")  # date when the article was published on the page
                    date_downloaded = todayDateStr  # date when the article was downloaded
                    # if the article is not yet saved in the database, add it
                    if sqlBase.getByHash(hashStr) is None:
                        # get article description/content
                        description = getArticleDescr(s, link)
                        # (date_created, caption, contents, date, hash, url, source)
                        entry = (date_created, title, description, date_downloaded, hashStr, link, SOURCE_ID)
                        sqlBase.insertOne(entry, True)  # insert the article in the database
                        articlesDownloaded += 1
                    if articlesChecked % 5 == 0:
                        logger.info("Checked: {} articles. Downloaded: {} new articles.".format(articlesChecked, articlesDownloaded))
            except Exception:
                logger.error("Url on which the error occurred: {}".format(resp.url))
                logger.exception("")
                sys.exit()
    logger.info("Downloaded {} new articles.".format(articlesDownloaded))
def main():
    articlesChecked = 0  # number of checked articles
    articlesDownloaded = 0  # number of downloaded articles
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    # optionally set headers for the http request
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    with requests.Session() as s:
        # retry every http/https request up to MAX_HTTP_RETRIES times before raising an error
        s.mount("http://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.mount("https://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.headers.update(HEADERS)  # set headers of the session
        for url in URLS_TO_CHECK:
            logger.info("Checking page: {}".format(url))
            resp = s.get(url)
            soup = bs.BeautifulSoup(resp.text, "html.parser")
            articles = soup.find("div", class_="freetext").find_all("li")
            for article in articles:
                articlesChecked += 1
                try:
                    link = BASE_URL + article.find("a")["href"]
                    title = article.text.strip()
                    hashStr = makeHash(title)
                    date_downloaded = todayDateStr  # date when the article was downloaded
                    date_created = None
                    # if the article is not yet saved in the database, add it
                    if sqlBase.getByHash(hashStr) is None:
                        # get article description/content
                        description = getArticleDescr(s, link)
                        # (date_created, caption, contents, date, hash, url, source)
                        entry = (date_created, title, description, date_downloaded, hashStr, link, SOURCE_ID)
                        sqlBase.insertOne(entry, True)  # insert the article in the database
                        articlesDownloaded += 1
                    if articlesChecked % 5 == 0:
                        logger.info("Checked: {} articles. Downloaded: {} new articles.".format(articlesChecked, articlesDownloaded))
                except Exception:
                    logger.error("Url on which the error occurred: {}".format(resp.url))
                    logger.exception("")
                    sys.exit()
    logger.info("Downloaded {} new articles.".format(articlesDownloaded))
def main():
    articlesChecked = 0  # number of checked articles
    articlesDownloaded = 0  # number of downloaded articles
    pageNum = 0
    pagesChecked = 0
    sqlBase = dbExecutor()  # creates a sql database handler class
    todayDateStr = datetime.datetime.now().strftime("%Y-%m-%d")  # today's date in the uniform format
    yearInt = datetime.datetime.now().year
    # optionally set headers for the http request
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    with requests.Session() as s:
        # retry every http/https request up to MAX_HTTP_RETRIES times before raising an error
        s.mount("http://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.mount("https://", requests.adapters.HTTPAdapter(max_retries=MAX_HTTP_RETRIES))
        s.headers.update(HEADERS)  # set headers of the session
        for yearNum in range(yearInt, MAX_YEAR - 1, -1):
            nextPageLink = ""
            pagelink = BASE_URL + "/" + str(yearNum)
            # the last page's "next" button points at javascript:void(0);
            while "javascript:void(0);" not in nextPageLink:
                try:
                    if not firstRunBool and pagesChecked >= NUM_PAGES_TO_CHECK:
                        break
                    pageNum += 1
                    pagesChecked += 1
                    resp = s.get(pagelink)
                    soup = bs.BeautifulSoup(resp.text, "html.parser")
                    articles = soup.find_all("article", class_="post")
                    for article in articles:
                        articlesChecked += 1
                        title = article.find("h2", class_="entry-title").text
                        link = article.find("h2", class_="entry-title").find("a")["href"]
                        dateStr = article.find("time", class_="entry-date").text
                        hashStr = makeHash(title, dateStr)
                        date_created = uniformDateStr(dateStr, "%d.%m.%Y")  # date when the article was published on the page
                        date_downloaded = todayDateStr  # date when the article was downloaded
                        # if the article is not yet saved in the database, add it
                        if sqlBase.getByHash(hashStr) is None:
                            # get article description/content
                            description = getArticleDescr(s, link)
                            # (date_created, caption, contents, date, hash, url, source)
                            entry = (date_created, title, description, date_downloaded, hashStr, link, SOURCE_ID)
                            sqlBase.insertOne(entry, True)  # insert the article in the database
                            articlesDownloaded += 1
                        if articlesChecked % 5 == 0:
                            logger.info("Checked: {} articles. Downloaded: {} new articles.".format(articlesChecked, articlesDownloaded))
                    nextPageLink = soup.find("div", class_="pagination loop-pagination").find_all("a")[1]["href"]
                    pagelink = nextPageLink
                except Exception:
                    logger.error("Url on which the error occurred: {}".format(resp.url))
                    logger.exception("")
                    sys.exit()
    logger.info("Downloaded {} new articles.".format(articlesDownloaded))