Example #1
def getClanki(page_soup,driver):
    noviClankiHtml = page_soup.findAll("div", class_="art-post-inner art-article")
    clanki = []
    # fetch the new articles
    for clanek in noviClankiHtml:
        title = getTitle(clanek)
        date = getDate(clanek)
        content = getVsebina(clanek)
        source = my_url
        hash = makeHash(title, date)
        # if it is already in the database, stop
        if db.getByHash(hash):
            return clanki
        else:
            clanki.append((str(datetime.date.today()),title,content,date,hash,my_url,source))
    # fetch the older articles
    oldClankiLinks = getLinks(page_soup)
    if oldClankiLinks is not NOT_FOUND:
        for link in oldClankiLinks:
            html = loadOldPage(link,driver)
            clanek = soup(html, "html.parser")
            title = getTitle(clanek)
            date = getDate(clanek)
            content = getVsebina(clanek)
            source = link
            hash = makeHash(title, date)
            if db.getByHash(hash):
                return clanki
            else:
                clanki.append((str(datetime.date.today()), title, content, date, hash, my_url, source))
    return clanki
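
Every snippet in this listing leans on a handful of module-level names that are never shown: the soup alias, the NOT_FOUND sentinel, my_url, MAX_HTTP_RETRIES and makeHash. The following is only a sketch of plausible definitions so the examples can be read in isolation; it is not the original project's code.

# Hypothetical module-level setup assumed by the examples (not original code).
import datetime
import hashlib

from bs4 import BeautifulSoup as soup   # the examples call soup(html, "html.parser")

NOT_FOUND = None                # sentinel returned when a value could not be scraped
MAX_HTTP_RETRIES = 10           # upper bound on page-load retries
my_url = "https://example.com"  # placeholder base URL of the scraped site

def makeHash(title, date):
    # One plausible implementation: hash the title/date pair so the same
    # article always maps to the same key in the database.
    return hashlib.md5((str(title) + str(date)).encode("utf-8")).hexdigest()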
Example #2
def main():
    driver = initDriver()
    html = fullyLoadPage(my_url,driver)
    i = 0
    while html is NOT_FOUND and MAX_HTTP_RETRIES >= i:
        html = fullyLoadPage(my_url,driver)
        i+=1

    page_soup = soup(html, "html.parser")
    # go through each article
    try:
        clanki = page_soup.findAll("div", class_="tl-entry-flex")
        clanki = filterAds(clanki)
        novice = []
        count = 0
        for clanek in clanki:
            date = getDate(clanek)
            title = getTitle(clanek)
            hash = makeHash(title,date)
            if db.getByHash(hash):
                break
            content = getContent(clanek)
            source = getSource(clanek)
            count+=1
            data = (str(datetime.date.today()),title,content,date,hash,my_url,source)
            novice.append(data)
        if len(novice) > 0:
            db.insertMany(novice)
            print("Found " + str(count) + " new articles")
        else:
            print("No new articles found")
    except:
        print("Error while processing articles")
    finally:
        driver.close()
Example #3
def getStariClanki(driver):
    try:
        driver.get(arhiv_novic_url)
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'vsebina')))
        time.sleep(3)
        html = driver.execute_script(
            "return document.documentElement.outerHTML")
        page_soup = soup(html, "html.parser")
        clankihtml = page_soup.find_all('div', class_="novica")
        clanki = []
        for clanek in clankihtml:
            title = getTitle(clanek)
            date = getDate(clanek)
            hash = makeHash(title, date)
            if db.getByHash(hash):
                return NOT_FOUND
            content = getContent(clanek)
            source = arhiv_novic_url
            clanek = (str(datetime.date.today()), title, content, date, hash,
                      my_url, source)
            clanki.append(clanek)
        if len(clanki) < 1:
            return NOT_FOUND
        else:
            return clanki
    except:
        return NOT_FOUND
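
Examples #3 and #4 drive a real browser through Selenium (WebDriverWait, expected_conditions, By) and assume an initDriver() helper. A possible headless Chrome setup, again as an assumption rather than the project's actual code:

# Hypothetical initDriver() plus the Selenium imports the examples rely on.
import time  # Example #3 calls time.sleep() after the explicit wait

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def initDriver():
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # scrape without opening a browser window
    return webdriver.Chrome(options=options)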
Example #4
def getClanek(driver, clanek):
    try:
        link = clanek.find("a")["href"]
        url = my_url + str(link)
        driver.get(url)
        try:
            WebDriverWait(driver, 6).until(
                EC.visibility_of_element_located(
                    (By.CLASS_NAME, "itemFullText")))
        except:
            return NOT_FOUND
        html = driver.execute_script(
            "return document.documentElement.outerHTML")
        openedClanek = soup(html, "html.parser")
        title = getTitle(openedClanek)
        date = getDate(openedClanek)
        content = getContent(openedClanek)
        hash = makeHash(title, date)
        if db.getByHash(hash):
            return NOT_FOUND
        source = str(url)
        novica = (str(datetime.date.today()), title, content, date, hash,
                  my_url, source)
        return novica
    except:
        return NOT_FOUND
Example #5
def main():
    driver = initDriver()
    html = loadPage(my_url,driver,1)
    i = 0
    # if the articles did not load successfully, retry up to MAX_HTTP_RETRIES times
    while i < MAX_HTTP_RETRIES and html is NOT_FOUND:
        html = loadPage(my_url,driver,1)
        i+=1

    NOVICE = []
    STEVILO_VSEH_STRANI = getSteviloVsehStrani(my_url,driver)
    '''
        Currently this walks through every article and stops as soon as it
        reaches one that is already in the database.

        For testing it is best to replace STEVILO_VSEH_STRANI with some small
        number, so that not all articles are loaded, because there are a lot of them.
    '''
    for x in range(1, STEVILO_VSEH_STRANI + 1):
        i = 0
        html = loadPage(my_url, driver, x)
        while i < MAX_HTTP_RETRIES and html is NOT_FOUND:
            html = loadPage(my_url,driver,x)
            i+=1
        page_soup = soup(html, "html.parser")
        clanki = page_soup.find("ul", class_="articles").findAll("li", class_="item bigger")
        count = 0
        # print("PAGE "+str(x)+"**************************")
        done = False
        for clanek in clanki:
            title = getTitle(clanek)
            content = getContent(clanek)
            date = getDate(clanek)
            source = getSource(clanek)
            hash = makeHash(title, date)
            if content is NOT_FOUND and title is NOT_FOUND:
                continue
            if db.getByHash(hash):
                done = True
                break
            else:
                data = (str(datetime.date.today()), title, content, date, hash, my_url, source)
                NOVICE.append(data)
                # print("Datum: "+str(date))
                # print("Naslov: "+str(title))
                # print("Vsebina: "+str(content))
                # print("Source: "+str(source))
                # print("Hash: "+str(hash))
                # print("-------------------------------------------------------")
                count += 1
        if done:
            break
    db.insertMany(NOVICE)
    # print(count)
    # print("STEVILO_VSEH_STRANI: "+str(STEVILO_VSEH_STRANI))
    driver.close()
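
Both main() functions repeat the same retry-until-loaded idiom before parsing. Under the same assumptions about loadPage() and NOT_FOUND, it could be factored into one small helper, for example:

# Hypothetical helper factoring out the retry loop used by the main() examples.
def loadWithRetries(url, driver, page=1):
    html = loadPage(url, driver, page)
    attempts = 0
    while html is NOT_FOUND and attempts < MAX_HTTP_RETRIES:
        html = loadPage(url, driver, page)
        attempts += 1
    return html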
Example #6
def getClanek(clanekHtml):
    try:
        title = getTitle(clanekHtml)
        date = getDate(clanekHtml)
        hash = makeHash(title, date)
        if db.getByHash(hash):
            return NOT_FOUND
        content = getContent(clanekHtml)
        source = getSource(clanekHtml)
        clanek = (str(datetime.date.today()), title, content, date, hash,
                  my_url, source)
        return clanek
    except:
        return NOT_FOUND
Example #7
def getClanek(driver, link):
    try:
        driver.get(link)
        time.sleep(3)
        html = driver.execute_script(
            "return document.documentElement.outerHTML")
        openedClanek = soup(html, "html.parser")
        date = getDate(openedClanek)
        title = getTitle(openedClanek)
        content = getContent(openedClanek)
        source = str(link)
        hash = makeHash(title, date)
        if db.getByHash(hash):
            return NOT_FOUND
        clanek = (str(datetime.date.today()), title, content, date, hash,
                  my_url, source)
        return clanek
    except:
        return NOT_FOUND
Example #8
def getClanki(page_soup):
    try:
        clanki = page_soup.find_all('article', {"data-tpl": "content"})
        novice = []
        for clanek in clanki:
            title = getTitle(clanek)
            date = getDate(clanek)
            hash = makeHash(title, date)
            if db.getByHash(hash):
                break
            content = getContent(clanek)
            source = getSource(clanek)
            novica = (str(datetime.date.today()), title, content, date, hash,
                      my_url, source)
            novice.append(novica)
        if len(novice) < 1:
            return NOT_FOUND
        else:
            return novice
    except:
        return NOT_FOUND
Example #9
def getNoviClanki(page_soup):
    try:
        noviClankiHtml = page_soup.findAll("div", class_="novica")
        date = ""
        title = ""
        content = ""
        source = ""
        hash = ""
        clanki = []
        for clanek in noviClankiHtml:
            date = getDate(clanek)
            try:
                title = str(clanek.find('h2').find('a').text).strip()
            except:
                title = NOT_FOUND
            try:
                content = getContent(clanek)
            except:
                content = NOT_FOUND

            hash = makeHash(title, date)
            if db.getByHash(hash):
                return NOT_FOUND

            try:
                source = my_url + str(clanek.find('h2').find('a')['href'])
            except:
                source = NOT_FOUND
            novica = (str(datetime.date.today()), title, content, date, hash,
                      my_url, source)
            clanki.append(novica)
        if len(clanki) < 1:
            return NOT_FOUND
        else:
            return clanki
    except:
        return NOT_FOUND
Example #10
def is_article_new(hash_str):
    if dbExecutor.getByHash(hash_str):
        return False
    return True
Example #11
def is_article_new(hash_str):
    if dbExecutor.getByHash(hash_str):
        return False
    print('new article found')
    return True
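
Examples #10 and #11 wrap the same db.getByHash check behind a small predicate. In the scrapers above, such a predicate would typically cut the loop short as soon as a known article appears. A sketch of one possible call site, assuming dbExecutor also exposes an insertMany like db does in the earlier examples:

# Hypothetical call site for is_article_new() inside a scraping loop.
novice = []
for clanek in clanki:
    title = getTitle(clanek)
    date = getDate(clanek)
    hash_str = makeHash(title, date)
    if not is_article_new(hash_str):
        break  # everything older than this article is already stored
    novice.append((str(datetime.date.today()), title, getContent(clanek),
                   date, hash_str, my_url, getSource(clanek)))
if novice:
    dbExecutor.insertMany(novice)  # assumed to mirror db.insertMany above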