Example #1
def main():
    num_pages_to_check = 1
    num_new_articles = 0
    articles_checked = 0

    with requests.Session() as session:
        session.headers.update(headers)

        articles = get_articles_on_pages(num_pages_to_check, session)
        articles_checked = len(articles)

        new_articles_tuples = []
        for x in articles:
            title = get_title(x)
            hash_str = make_hash(title, base_url)  # the date is not on the front page, so the hash is built from base_url instead

            if is_article_new(hash_str):
                link = get_link(x)
                r = session.get(link, timeout=8)
                soup = bs(r.text, 'html.parser')
                date = get_date(soup)
                content = get_content(soup)
                print(link + '\n')
                new_tup = (str(datetime.date.today()), title, content, format_date(date), hash_str, link, SOURCE)
                new_articles_tuples.append(new_tup)
                num_new_articles += 1

        #add new articles to database
        dbExecutor.insertMany(new_articles_tuples)
        print(num_new_articles, 'new articles found,', articles_checked,'articles checked')
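Across these examples the make_hash / is_article_new pair is what keeps the scraper idempotent: each listing entry is reduced to a stable digest, and only unseen digests are fetched in full. A minimal sketch of what those helpers could look like, assuming hashlib and a getByHash lookup on the database layer (the MD5 choice and the dbExecutor.getByHash name are assumptions, not shown in these snippets):

import hashlib

def make_hash(title, date):
    # Stable digest of title + date (Example #1 passes base_url instead,
    # because its listing page carries no date).
    return hashlib.md5((title + date).encode('utf-8')).hexdigest()

def is_article_new(hash_str):
    # An article counts as new when its hash is not in the database yet;
    # getByHash is assumed to return a falsy value on a miss.
    return not dbExecutor.getByHash(hash_str)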
Example #2
def main():

    print('=========================')
    print(sys.argv[0])
    print('=========================')
    
    num_new_articles = 0

    with requests.Session() as session:
        session.headers.update(headers)
        articles = getArticlesOnPage(num_articles_to_check, session)


        articles_tuples = []

        print('\tgathering article info ...')
        for x in tqdm(articles):
            title = getTitle(x)
            date = getDate(x)
            hash_str = makeHash(title, date)

            if is_article_new(hash_str):
                link = getLink(x)
                content = getContent(link, session)
                tup = (str(datetime.date.today()), title, content, formatDate(date), hash_str, link, SOURCE)
                articles_tuples.append(tup)
                num_new_articles += 1

    dbExecutor.insertMany(articles_tuples)
    print(num_new_articles, 'new articles found,', len(articles), 'articles checked,', num_errors, 'errors\n')
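getContent(link, session) is the other helper the requests-based examples lean on. A rough sketch of one way it might work, reusing the session.get(..., timeout=8) and html.parser conventions visible above; the 'article p' selector is a placeholder, since every source needs its own:

from bs4 import BeautifulSoup

def getContent(link, session):
    # Fetch the article page and join its paragraph text into a single string.
    r = session.get(link, timeout=8)
    soup = BeautifulSoup(r.text, 'html.parser')
    return ' '.join(p.get_text(' ', strip=True) for p in soup.select('article p'))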
Example #3
def main():

    num_new_articles = 0
    num_pages_to_check = 2

    with requests.Session() as session:

        articles = getArticlesOn_n_pages(num_pages_to_check)

        titles = []
        dates = []
        links = []
        hashes = []

        for x in articles:
            title = getTitle(x)
            date = getDate(x)
            hash_str = makeHash(title, date)

            if isArticleNew(hash_str):
                titles.append(title)
                dates.append(date)
                hashes.append(hash_str)
                links.append(getLink(x))
                num_new_articles += 1

        list_new = []
        for i in range(num_new_articles):
            content = getContent(links[i], session)
            tup = (str(datetime.date.today()), titles[i], content,
                   formatDate(dates[i]), hashes[i], links[i], base_url)
            list_new.append(tup)

        dbExecutor.insertMany(list_new)
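Example #3 collects titles, dates, hashes and links in four parallel lists and then re-joins them by index. A zip-based second pass keeps those fields aligned without manual indexing; this is only an equivalent sketch using the same helper names as the example:

        list_new = []
        for title, date, hash_str, link in zip(titles, dates, hashes, links):
            content = getContent(link, session)
            list_new.append((str(datetime.date.today()), title, content,
                             formatDate(date), hash_str, link, base_url))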
Example #4
def main():
    num_pages_to_check = 2
    num_new_articles = 0
    articles_checked = 0

    with requests.Session() as session:
        session.headers.update(headers)

        articles = get_articles_on_pages(num_pages_to_check, session)
        articles_checked = len(articles)

        new_articles_tuples = []
        for x in articles:
            title = get_title(x)
            date = get_date(x)
            hash_str = make_hash(title, date)

            if is_article_new(hash_str):
                link = get_link(x)
                r = session.get(link, timeout=8)
                soup = bs(r.text, 'html.parser')
                content = get_content(soup)
                print(link + '\n')
                new_tup = (str(datetime.date.today()), title, content, date,
                           hash_str, link, base_url)
                new_articles_tuples.append(new_tup)
                num_new_articles += 1

        #add new articles to database
        dbExecutor.insertMany(new_articles_tuples)
        print(num_new_articles, 'new articles found,', articles_checked,
              'articles checked')
Example #5
def main():
    driver = initDriver()
    html = fullyLoadPage(my_url, driver)
    i = 0
    while i < MAX_HTTP_RETRIES and html is NOT_FOUND:
        html = fullyLoadPage(my_url, driver)
        i += 1

    page_soup = soup(html, "html.parser")
    # go through every article
    try:
        clanki = page_soup.findAll("div", class_="tl-entry-flex")
        clanki = filterAds(clanki)
        novice = []
        count = 0
        for clanek in clanki:
            date = getDate(clanek)
            title = getTitle(clanek)
            hash = makeHash(title, date)
            if db.getByHash(hash):
                break
            content = getContent(clanek)
            source = getSource(clanek)
            count += 1
            data = (str(datetime.date.today()), title, content, date, hash, my_url, source)
            novice.append(data)
        if len(novice) > 0:
            db.insertMany(novice)
            print("Found " + str(count) + " new articles")
        else:
            print('No new articles found')
    except Exception:
        print("Error while processing articles")
    finally:
        driver.close()
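The load-and-retry loop around fullyLoadPage / loadPage is repeated in most of the Selenium examples below. It could be factored into a single helper; a sketch under the same NOT_FOUND / MAX_HTTP_RETRIES convention (the helper name here is made up):

def load_with_retries(loader, url, driver, *args):
    # Keep calling the page loader until it returns real HTML or the retry
    # budget is used up.
    html = loader(url, driver, *args)
    attempts = 0
    while html is NOT_FOUND and attempts < MAX_HTTP_RETRIES:
        html = loader(url, driver, *args)
        attempts += 1
    return html

Example #9 could then call load_with_retries(loadPage, my_url, driver, x) once per page instead of repeating the loop inline.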
Example #6
def main():
    num_pages_to_check = 2
    num_new_articles = 0

    with requests.Session() as session:
        session.headers.update(headers)
        articles = getArticlesOn_n_pages(num_pages_to_check, session)
        articles_checked = len(articles)

        dates = []
        titles = []
        hashes = []
        links = []

        for x in articles:
            title = getTitle(x)
            date = getDate(x)
            hash = makeHash(title, date)

            if isArticleNew(hash):
                titles.append(title)
                dates.append(date)
                hashes.append(hash)
                links.append(getLink(x))
                num_new_articles += 1

        list_of_tuples = []
        for i in range(len(links)):
            content = getContent(links[i], session)
            tup = (str(datetime.date.today()), titles[i], content, dates[i], hashes[i], links[i], base_url)
            list_of_tuples.append(tup)

        dbExecutor.insertMany(list_of_tuples)

    print(num_new_articles, 'new articles found,', num_pages_to_check,'pages checked -', articles_checked, 'articles checked')
Example #7
def main():
    num_articles_to_check = 20
    num_new_articles = 0

    with requests.Session() as session:
        session.headers.update(headers)
        articles = getArticlesOn_n_pages(num_articles_to_check, session)

        new_articles_tuples = []

        for x in articles:
            title = getTitle(x)
            date = getDate(x)
            hash_str = makeHash(title, date)

            if isArticleNew(hash_str):
                link = getLink(x)
                content = getContent(link, session)
                if not content:
                    content = x.find('p').text
                num_new_articles += 1
                new_articles_tuples.append((str(datetime.date.today()), title, content, formatDate(date), hash_str, link, base_url))
    
    dbExecutor.insertMany(new_articles_tuples)
    print(num_new_articles, 'new articles found,', num_articles_to_check, 'articles checked')
Example #8
def main():
    driver = initDriver()
    clanki = getClanki(driver)
    if clanki != NOT_FOUND:
        db.insertMany(clanki)
    else:
        print('No new articles found')
    driver.close()
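initDriver() is used by every Selenium example but never shown in this section. A minimal sketch, assuming headless Chrome; the actual project may configure a different browser or options:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def initDriver():
    # Headless Chrome is an assumption here; any Selenium WebDriver would do.
    options = Options()
    options.add_argument('--headless')
    return webdriver.Chrome(options=options)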
Example #9
def main():
    driver = initDriver()
    html = loadPage(my_url, driver, 1)
    i = 0
    # if the articles did not load successfully, retry up to 10 times
    while i < MAX_HTTP_RETRIES and html is NOT_FOUND:
        html = loadPage(my_url, driver, 1)
        i += 1

    NOVICE = []
    STEVILO_VSEH_STRANI = getSteviloVsehStrani(my_url,driver)
    '''
        Currently this goes through all the articles; if it reaches an already existing one, it stops.

        For testing it is best to replace STEVILO_VSEH_STRANI with some small number,
        so that it does not load every article, because there are a huge number of them.
    '''
    for x in range(1,3):
        i = 0
        html = loadPage(my_url, driver, x)
        while i < MAX_HTTP_RETRIES and html is NOT_FOUND:
            html = loadPage(my_url, driver, x)
            i += 1
        page_soup = soup(html, "html.parser")
        clanki = page_soup.find("ul", class_="articles").findAll("li", class_="item bigger")
        count = 0
        # print("PAGE "+str(x)+"**************************")
        done = False
        for clanek in clanki:
            title = getTitle(clanek)
            content = getContent(clanek)
            date = getDate(clanek)
            source = getSource(clanek)
            hash = makeHash(title, date)
            if content is NOT_FOUND and title is NOT_FOUND:
                continue
            if db.getByHash(hash):
                done = True
                break
            else:
                data = (str(datetime.date.today()), title, content, date, hash, my_url, source)
                NOVICE.append(data)
                # print("Datum: "+str(date))
                # print("Naslov: "+str(title))
                # print("Vsebina: "+str(content))
                # print("Source: "+str(source))
                # print("Hash: "+str(hash))
                # print("-------------------------------------------------------")
                count += 1
        if done:
            break
    db.insertMany(NOVICE)
    # print(count)
    # print("STEVILO_VSEH_STRANI: "+str(STEVILO_VSEH_STRANI))
    driver.close()
Example #10
def main():
    driver = initDriver()
    html = fullyLoadPage(my_url, driver)
    i = 0
    while i < MAX_HTTP_RETRIES and html is NOT_FOUND:
        html = fullyLoadPage(my_url, driver)
        i += 1
    page_soup = soup(html, "html.parser")
    clanki = getClanki(page_soup, driver)
    db.insertMany(clanki)
    driver.close()
Example #11
def main():
    driver = initDriver()
    html = fullyLoadPage(my_url, driver)
    page_soup = soup(html, "html.parser")
    clanki = page_soup.findAll("article")
    NOVICE = []
    for clanek in clanki:
        novica = getClanek(driver, clanek)
        if novica is not NOT_FOUND:
            NOVICE.append(novica)
    db.insertMany(NOVICE)
    driver.close()
Example #12
def main():
    driver = initDriver()
    html = loadPage(driver)
    i = 0
    while i < MAX_HTTP_RETRIES and html is NOT_FOUND:
        html = loadPage(driver)
        i += 1
    page_soup = soup(html, "html.parser")
    clanki = getClanki(page_soup)
    if clanki != NOT_FOUND:
        db.insertMany(clanki)
    else:
        print('No new articles found')
    driver.close()
Example #13
def main():
    print('=========================')
    print(sys.argv[0])
    print('=========================')

    num_new_articles = 0
    num_pages_to_check = 3
    driver = initDriver()

    articles = getArticlesOn_n_pages(num_pages_to_check, driver)
    driver.quit()

    titles = []
    dates = []
    links = []
    hashes = []

    print('\tgathering article info ...')
    for x in tqdm(articles):
        title = getTitle(x)
        date = getDate(x)
        hash_str = makeHash(title, date)

        if is_article_new(hash_str):
            titles.append(title)
            dates.append(date)
            hashes.append(hash_str)
            links.append(getLink(x))
            num_new_articles += 1

    new_articles_tuples = []

    driver = initDriver()
    print('\tgathering article content ...')
    for i in tqdm(range(num_new_articles)):
        content = getContent(links[i], driver)
        new_articles_tuples.append(
            (str(datetime.date.today()), titles[i], content,
             formatDate(dates[i]), hashes[i], links[i], SOURCE))
        # time.sleep(2)

    driver.quit()

    dbExecutor.insertMany(new_articles_tuples)

    print(num_new_articles, 'new articles found,', len(articles),
          'articles checked,', num_errors, 'errors found\n')
Example #14
def main():
    driver = initDriver()
    html = fullyLoadPage(my_url, driver)
    i = 0
    while i < MAX_HTTP_RETRIES and html is NOT_FOUND:
        html = fullyLoadPage(my_url, driver)
        i += 1
    page_soup = soup(html, "html.parser")
    clanki = page_soup.findAll("div", class_="card_article")
    links = getLinks(clanki)
    NOVICE = []
    for link in links:
        novica = getClanek(driver, link)
        if novica is not NOT_FOUND:
            NOVICE.append(novica)
    db.insertMany(NOVICE)
    driver.close()
Example #15
def main():
    driver = initDriver()
    html = loadFirstPage(my_url, driver)
    i = 0
    while i < MAX_HTTP_RETRIES and html is NOT_FOUND:
        html = loadFirstPage(my_url, driver)
        i += 1
    page_soup = soup(html, "html.parser")
    noviClanki = getNoviClanki(page_soup)
    stariClanki = getStariClanki(driver)
    if noviClanki != NOT_FOUND and stariClanki != NOT_FOUND:
        vsiClanki = noviClanki + stariClanki
        db.insertMany(vsiClanki)
    else:
        print('No new articles found')

    driver.close()
Example #16
def main():

    num_new_articles = 0
    num_pages_to_check = 3
    driver = initDriver()

    articles = getArticlesOn_n_pages(num_pages_to_check, driver)
    driver.quit()

    titles = []
    dates = []
    links = []
    hashes = []

    for x in articles:
        title = getTitle(x)
        date = getDate(x)
        hash_str = makeHash(title, date)

        if isArticleNew(hash_str):
            titles.append(title)
            dates.append(date)
            hashes.append(hash_str)
            links.append(getLink(x))
            num_new_articles += 1

    new_articles_tuples = []

    driver = initDriver()
    for i in range(num_new_articles):
        content = getContent(links[i], driver)
        new_articles_tuples.append(
            (str(datetime.date.today()), titles[i], content,
             formatDate(dates[i]), hashes[i], links[i], base_url))
        # time.sleep(2)

    driver.quit()

    dbExecutor.insertMany(new_articles_tuples)

    print(num_new_articles, 'new articles found', num_pages_to_check,
          'pages checked')
Example #17
def main():
    num_pages_to_check = 1
    num_new_articles = 0

    with requests.Session() as session:
        session.headers.update(headers)
        articles = getArticlesOn_n_pages(num_pages_to_check, session)
        articles_checked = len(articles)

        titles = []
        hashes = []
        links = []

        for x in articles:
            title = getTitle(x)

            hash_str = makeHash(title)

            if is_article_new(hash_str):
                titles.append(title)
                hashes.append(hash_str)
                links.append(getLink(x))
                num_new_articles += 1

        list_of_tuples = []
        for i in range(len(links)):
            print(links[i])
            r = session.get(links[i], timeout=10)
            soup = BeautifulSoup(r.text, 'html.parser')

            content = getContent(soup)
            date = getDate(soup)

            tup = (str(datetime.date.today()), titles[i], content,
                   formatDate(date), hashes[i], links[i], base_url)
            list_of_tuples.append(tup)

        dbExecutor.insertMany(list_of_tuples)

    print(num_new_articles, 'new articles found,', num_pages_to_check,
          'pages checked -', articles_checked, 'articles checked')
Example #18
def main():
    num_pages_to_check = 3
    num_new_articles = 0

    with requests.Session() as session:
        session.headers.update(headers)
        articles = getArticlesOnPage(num_pages_to_check, session)

        dates = []
        titles = []
        hashes = []
        links = []

        for x in articles:
            title = getTitle(x)
            date = getDate(x)
            hash = makeHash(title, date)

            if isArticleNew(hash):
                titles.append(title)
                dates.append(date)
                hashes.append(hash)
                links.append(getLink(x))
                num_new_articles += 1

        new_articles_tuples = []
        for i in range(len(links)):
            # fix this here so it gets inserted into the database
            content = ' '.join(getContent(links[i], session).split())
            tup = (str(datetime.date.today()), titles[i], content,
                   formatDate(dates[i]), hashes[i], links[i], base_url)
            new_articles_tuples.append(tup)

        dbExecutor.insertMany(new_articles_tuples)

    print(num_new_articles, 'new articles found,', num_pages_to_check,
          'pages checked')
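formatDate normalizes the scraped date string before it is written to the database. A sketch assuming a d.m.Y input such as '3.2.2021'; the real input format depends on the source being scraped:

import datetime

def formatDate(date):
    # Normalise e.g. '3.2.2021' to ISO 'YYYY-MM-DD'; the input format is an
    # assumption, each source has its own.
    return datetime.datetime.strptime(date.strip(), '%d.%m.%Y').date().isoformat()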
Example #19
def main():
    num_pages_to_check = 1
    num_new_articles = 0
    articles_checked = 0

    with requests.Session() as session:
        session.headers.update(headers)

        if firstRunBool:
            maxPageNum = getMaxPageNum(session)
            print("Checking {} pages".format(maxPageNum))
            num_pages_to_check = maxPageNum

        articles = get_articles_on_pages(num_pages_to_check, session)
        articles_checked = len(articles)

        new_articles_tuples = []
        for x in articles:
            title = get_title(x)
            date = get_date(x)
            hash_str = make_hash(title, date)

            if is_article_new(hash_str):
                link = get_link(x)
                r = session.get(link, timeout=8)
                soup = bs(r.text, 'html.parser')
                content = get_content(soup)
                print(link + '\n')
                new_tup = (str(datetime.date.today()), title, content, date,
                           hash_str, link, SOURCE)
                new_articles_tuples.append(new_tup)
                num_new_articles += 1

        #add new articles to database
        dbExecutor.insertMany(new_articles_tuples)
        print(num_new_articles, 'new articles found,', articles_checked,
              'articles checked')