Example #1
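These snippets are excerpted without their module headers; a plausible shared preamble, assuming a project-local utils module that provides clean_soup() and clean_whitespaces():

import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

import utils  # project helper exposing clean_soup() and clean_whitespaces()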
def scrape(curr_url, hash, soup, results):
    print('Found defencereview.gr...')

    # article
    for t in soup.find_all('body', class_='single-post'):
        if len(soup.find_all('article', class_='post')) > 0:
            print('Getting wordpress article...')

            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            result = result + '\"meta\":\"'
            for c in t.find_all('div', class_='newsmag-post-meta'):
                for d in c.select('a'):
                    result = result + utils.clean_soup(d) + ' '
                for d in c.select('div.newsmag-date'):
                    result = result + utils.clean_soup(d) + ' '
            result = result + '\",'
            result = result + '\"title\":\"'
            for c in t.select('div.newsmag-custom-header'):
                # the custom header block holds the title
                result = result + utils.clean_soup(c)
            result = result + '\"'
            result = result + '},'

            result = result + '\"text\":\"'
            for c in t.find_all('div', class_='entry-content'):
                for d in c.find_all(class_=None, recursive=False):
                    result = result + utils.clean_soup(d) + ' '
            result = result + '\"'
            result = result + '}'

            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)

    # comments
    for t in soup.find_all('div', id='comments'):
        print('Getting wordpress comments...')
        for c in t.find_all('div', class_='comment-content'):

            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"comment\",'
            result = result + '\"source\":\"' + curr_url + '\"'
            result = result + '},'

            result = result + '\"text\":\"'
            for d in c.find_all('p', class_=''):
                # unclassed <p> tags carry the comment body
                result = result + utils.clean_soup(d)
            result = result + '\"'
            result = result + '}'

            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
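Building the JSON by concatenating pre-escaped fragments, as above, produces invalid output as soon as a scraped string contains a double quote or backslash. A minimal sketch of the same record built with json.dumps (build_record and its parameter names are hypothetical, not part of the original):

import json

def build_record(record_id, curr_url, kind, meta, title, text):
    # Hypothetical helper: json.dumps escapes any quotes or backslashes that
    # utils.clean_soup() may leave in the text, which the manual
    # concatenation above does not.
    return json.dumps({
        'meta': {
            'id': str(record_id),
            'type': kind,
            'source': curr_url,
            'meta': meta,
            'title': title,
        },
        'text': text,
    })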
Example #2
def scrape(curr_url, hash, soup, results):
    print('Found disidentia.com...')
    counter = 0

    # article
    for t in soup.find_all('article', class_='type-post'):
        print('Getting wordpress article...')

        counter += 1
        result = '{\"meta\":{'
        result = result + '\"id\":\"' + str(hash) + str(counter) + '\",'
        result = result + '\"type\":\"article\",'
        result = result + '\"source\":\"' + curr_url + '\",'
        for c in t.find_all('div', class_='td-module-meta-info'):
            result = result + '\"meta\":\"' + utils.clean_soup(c) + '\",'
            break
        for c in t.find_all('h1', class_='entry-title'):
            result = result + '\"title\":\"' + utils.clean_soup(c)
            break
        result = result + '\"'
        result = result + '},'

        for c in t.find_all('div', class_='td-post-content tagdiv-type'):
            result = result + '\"text\":\"'
            for d in c.find_all('p', class_=''):
                result = result + utils.clean_soup(d)
            result = result + '\"'
        result = result + '}'

        result = utils.clean_whitespaces(result)
        results.append(result)
        print(result)

    # comments
    if len(soup.find_all('ol', class_='comment-list')) > 0:
        print('Getting custom comments...')
        for t in soup.find_all('div', class_='comment-content'):

            counter += 1
            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + str(counter) + '\",'
            result = result + '\"type\":\"comment\",'
            result = result + '\"source\":\"' + curr_url + '\"'
            result = result + '},'

            result = result + '\"text\":\"' + utils.clean_soup(t) + '\"'
            result = result + '}'

            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
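All of these scrapers share one signature, so a dispatcher is straightforward. A hypothetical harness (the fetch step, the hashing scheme, and the per-site modules are assumptions; each scrape() variant would live in its own module since they all share a name):

import hashlib

import requests
from bs4 import BeautifulSoup

import disidentia  # hypothetical per-site modules, each exporting scrape()
import katohika

SCRAPERS = {
    'disidentia.com': disidentia.scrape,
    'katohika.gr': katohika.scrape,
}

def run(curr_url, results):
    # Fetch, parse, derive an id from the URL, then dispatch by domain.
    html = requests.get(curr_url, timeout=10).text
    soup = BeautifulSoup(html, 'html.parser')
    page_hash = hashlib.md5(curr_url.encode()).hexdigest()
    for domain, scrape_fn in SCRAPERS.items():
        if domain in curr_url:
            scrape_fn(curr_url, page_hash, soup, results)
            break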
Example #3
def scrape(curr_url, hash, soup, results):
    print('Found katohika.gr...')

    # article
    for t in soup.find_all('div', id='content'):
        if len(soup.find_all('div', class_='entry-content')) > 0:
            print('Getting wordpress article...')

            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            for c in t.find_all('div', class_='entry-author'):
                result = result + '\"meta\":\"' + utils.clean_soup(c) + '\",'
            for c in t.find_all('h1', class_='entry-title'):
                result = result + '\"title\":\"' + utils.clean_soup(c) + '\"'
            result = result + '},'

            result = result + '\"text\":\"'
            for c in t.find_all('div', class_='entry-content'):
                for d in c.find_all(class_=None, recursive=False):
                    result = result + utils.clean_soup(d) + ' '
            result = result + '\"'
            result = result + '}'

            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)

    # comments
    for t in soup.find_all('div', id='comments-section'):
        print('Getting wordpress comments...')
        for c in t.find_all('div', class_='comment-content'):

            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"comment\",'
            result = result + '\"source\":\"' + curr_url + '\"'
            result = result + '},'

            result = result + '\"text\":\"'
            for d in c.find_all('p', class_=''):
                result = result + utils.clean_soup(d)
            result = result + '\"'
            result = result + '}'

            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
Example #4
def scrape(curr_url, hash, soup, results):
    print('Found periodistadigital.com...')

    for t in soup.find_all('div', id='m4p-post-detail'):
        print('Getting wordpress article...')

        result = '{\"meta\":{'
        result = result + '\"id\":\"' + str(hash) + '\",'
        result = result + '\"type\":\"article\",'
        result = result + '\"source\":\"' + curr_url + '\",'
        for c in t.find_all('div', class_='m4p-author_time'):
            result = result + '\"meta\":\"' + utils.clean_soup(c)
        result = result + '\",'
        for c in t.find_all('h1', class_='m4p-size-1'):
            result = result + '\"title\":\"' + utils.clean_soup(c)
        result = result + '\"'
        result = result + '},'

        for c in t.find_all('div', class_='m4p-post-content'):
            result = result + '\"text\":\"' + utils.clean_soup(c) + '\"'
        result = result + '}'

        result = utils.clean_whitespaces(result)
        results.append(result)
        print(result)
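The find_all() loops above can be collapsed into descendant CSS selectors via select(); a sketch of the same meta and title extraction (whitespace differences are normalized by clean_whitespaces() anyway):

# One descendant selector per field, equivalent to the loops above.
meta = ' '.join(utils.clean_soup(c)
                for c in soup.select('div#m4p-post-detail div.m4p-author_time'))
title = ' '.join(utils.clean_soup(c)
                 for c in soup.select('div#m4p-post-detail h1.m4p-size-1'))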
Example #5
def scrape(curr_url, hash, soup, results):
    print('Found okdiario.com...')

    for t in soup.find_all('article', class_='post'):
        if len(t.find_all('div', class_='entry-content')) > 0:
            print('Getting wordpress article...')

            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            result = result + '\"meta\":\"'
            for c in t.find_all('address', class_='autor'):
                result = result + utils.clean_soup(c)
            result = result + '\",'
            result = result + '\"title\":\"'
            for c in t.find_all('h1', class_='entry-title'):
                result = result + utils.clean_soup(c)
            result = result + '\"'
            result = result + '},'

            for c in t.find_all('div', class_='entry-content'):
                result = result + '\"text\":\"' + utils.clean_soup(c) + '\"'
            result = result + '}'

            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
Example #6
def scrape(curr_url, hash, soup, results):
    print('Found arxaiaithomi.gr...')

    # article
    for t in soup.find_all('div', class_='post'):
        if len(soup.find_all('body', class_='single-post')) > 0:
            print('Getting wordpress article...')

            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            result = result + '\"meta\":\"'
            for c in t.find_all('div', class_='post-footer'):
                # the post footer holds the author/date meta
                result = result + utils.clean_soup(c)
            result = result + '\",'
            result = result + '\"title\":\"'
            for c in t.select('div.post-headline > h2'):
                # the headline <h2> holds the title
                result = result + utils.clean_soup(c)
            result = result + '\"'
            result = result + '},'

            result = result + '\"text\":\"'
            for c in t.find_all('div', class_='post-bodycopy'):
                for d in c.find_all(recursive=False):
                    if d.name != 'div':
                        result = result + utils.clean_soup(d) + ' '
            result = result + '\"'
            result = result + '}'

            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
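BeautifulSoup's find_all() also accepts a predicate function, so the name check above can move into the query itself; a minimal equivalent sketch:

# Equivalent filtering done by find_all() itself: it accepts a callable
# that receives each Tag and keeps those returning True.
for d in c.find_all(lambda tag: tag.name != 'div', recursive=False):
    result = result + utils.clean_soup(d) + ' '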
Example #7
def scrape(curr_url, hash, soup, results):
    print('Found ekklisiaonline.gr...')

    # article
    for t in soup.find_all('body', class_='single-post'):
        if len(soup.find_all('article', class_='')) > 0:
            print('Getting wordpress article...')

            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            result = result + '\"meta\":\"'
            for c in t.select('h6.entry-date'):
                # the entry date serves as the meta field
                result = result + utils.clean_soup(c) + ' '
            result = result + '\",'
            result = result + '\"title\":\"'
            for c in t.find_all('h1', class_='entry-title'):
                result = result + utils.clean_soup(c)
                break
            result = result + '\"'
            result = result + '},'

            result = result + '\"text\":\"'
            for c in t.select('div#article > article'):
                for d in c.find_all('p', class_=None, recursive=False):
                    result = result + utils.clean_soup(d) + ' '
            result = result + '\"'
            result = result + '}'

            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
Example #8
def scrape(curr_url, hash, soup, results):
    print('Found alertadigital.com...')

    for t in soup.find_all('div', id='homepost'):
        if len(t.find_all('div', class_='post')) > 0:
            print('Getting wordpress article...')

            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            for c in t.find_all('div', id='datemeta'):
                result = result + '\"meta\":\"' + utils.clean_soup(c) + '\",'
            for c in t.find_all('h2', class_=''):
                result = result + '\"title\":\"' + utils.clean_soup(c)
            result = result + '\"'
            result = result + '},'

            for c in t.find_all('div', class_='entry'):
                result = result + '\"text\":\"' + utils.clean_soup(c) + '\"'
            result = result + '}'

            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
Example #9
def scrape(curr_url, hash, soup, results):
    print('Found makeleio.gr...')

    # article
    for t in soup.find_all('div', class_='single-style1-wrap'):
        print('Getting wordpress article...')

        result = '{\"meta\":{'
        result = result + '\"id\":\"' + str(hash) + '\",'
        result = result + '\"type\":\"article\",'
        result = result + '\"source\":\"' + curr_url + '\",'
        for c in t.find_all('div', class_='single-style1-meta-tag'):
            result = result + '\"meta\":\"' + utils.clean_soup(c) + '\",'
        for c in t.find_all('div', class_='single-style1-title'):
            result = result + '\"title\":\"' + utils.clean_soup(c) + '\"'
        result = result + '},'

        for c in t.find_all('div', class_='single-style1-content'):
            result = result + '\"text\":\"' + utils.clean_soup(c) + '\"'
        result = result + '}'

        result = utils.clean_whitespaces(result)
        results.append(result)
        print(result)

    # comments
    for t in soup.find_all('div', class_='comments-area'):
        print('Getting wordpress comments...')
        for c in t.find_all('div', class_='comment-content'):

            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"comment\",'
            result = result + '\"source\":\"' + curr_url + '\"'
            result = result + '},'

            result = result + '\"text\":\"'
            for d in c.find_all('p', class_=''):
                result = result + utils.clean_soup(d)
            result = result + '\"'
            result = result + '}'

            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
Example #10
def scrape(curr_url, hash, soup, results):
    print('Found hellenicns.gr...')

    # articles
    for t in soup.find_all('body', class_='single-post'):
        if len(soup.find_all('article', class_='post')) > 0:
            print('Getting wordpress article...')

            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            result = result + '\"meta\":\"'
            for c in t.find_all('div', class_='below-entry-meta'):
                for d in c.select('time.published'):
                    result = result + utils.clean_soup(d) + ' '
                for d in c.select('span.author > a'):
                    result = result + utils.clean_soup(d) + ' '
                for d in c.select('span.tag-links'):
                    result = result + utils.clean_soup(d) + ' '
                break
            result = result + '\",'
            result = result + '\"title\":\"'
            for c in t.find_all('h1', class_='entry-title'):
                result = result + utils.clean_soup(c)
                break
            result = result + '\",'

            result = result + '\"text\":\"'
            for c in t.find_all('div', class_='entry-content'):
                for d in c.find_all(class_=None, recursive=False):
                    result = result + utils.clean_soup(d) + ' '
            for c in t.find_all('div', class_='single-content'):
                for d in c.find_all(class_=None, recursive=False):
                    result = result + utils.clean_soup(d) + ' '
            result = result + '\"'
            result = result + '}'

            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
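The `for ... break` loops above only want the first match; BeautifulSoup's find() says that directly. A minimal sketch of the title extraction with the same behavior:

# find() returns the first matching tag or None, replacing for/break.
title_tag = t.find('h1', class_='entry-title')
if title_tag is not None:
    result = result + utils.clean_soup(title_tag)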
Example #11
def scrape(curr_url, hash, soup, results):
    print('Found vimaorthodoxias.gr...')

    for t in soup.find_all('div', class_='post-wrap'):
        if len(soup.find_all('body', class_='single-post')) > 0:
            print('Getting wordpress article...')

            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            result = result + '\"meta\":\"'
            for c in t.find_all('div', class_='jeg_meta_container'):
                for d in c.select('div.jeg_meta_author'):
                    result = result + utils.clean_soup(d) + ' '
                for d in c.select('div.jeg_meta_date > a'):
                    result = result + utils.clean_soup(d) + ' '
                for d in c.select('div.jeg_meta_category > span > a'):
                    result = result + utils.clean_soup(d) + ' '
            for c in t.select('div.jeg_post_tags > a'):
                # append the post tags to the meta field
                result = result + utils.clean_soup(c) + ' '
            result = result + '\",'
            result = result + '\"title\":\"'
            for c in t.select('div.entry-header > h1.jeg_post_title'):
                # the post title sits in the entry header
                result = result + utils.clean_soup(c)
            result = result + '\"'
            result = result + '},'

            result = result + '\"text\":\"'
            for c in t.find_all('div', class_='content-inner'):
                for d in c.find_all(class_=None, recursive=False):
                    result = result + utils.clean_soup(d) + ' '
            result = result + '\"'
            result = result + '}'

            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
Example #12
def scrape(curr_url, hash, soup, results):
    print('Found voicenews.gr...')

    # webdriver
    options = FirefoxOptions()
    options.add_argument("--headless")
    driver = webdriver.Firefox(options=options)
    driver.implicitly_wait(5)
    driver.get(curr_url)
    try:
        print('deploying webdriver...')
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'brm-more-link')))
        driver.execute_script(
            "document.getElementsByClassName('brm-more-link')[0].scrollIntoView();")
        time.sleep(2)  # execute_script returns before a JS setTimeout would fire
        driver.execute_script(
            "document.getElementsByClassName('brm-more-link')[0].click();")
        time.sleep(2)  # give the AJAX-loaded content time to render
        soup = BeautifulSoup(driver.page_source, "html.parser")
    except Exception:
        # fall back to the static soup if the 'more' link never appears
        print('webdriver timeout...')
    driver.close()

    # article
    for t in soup.find_all('div', class_='post-inner'):
        if len(soup.find_all('body', class_='single-post')) > 0:
            print('Getting wordpress article...')

            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            result = result + '\"meta\":\"'
            for c in t.find_all('p', class_='post-meta'):
                for d in c.select('span.tie-date'):
                    result = result + utils.clean_soup(d) + ' '
                for d in c.select('span.post-cats > a'):
                    result = result + utils.clean_soup(d) + ' '
            result = result + '\",'
            result = result + '\"title\":\"'
            for c in t.select('div.post-inner > h1.post-title'):
                # the post title sits directly under the inner wrapper
                result = result + utils.clean_soup(c)
            result = result + '\"'
            result = result + '},'

            result = result + '\"text\":\"'
            for c in t.find_all('div', class_='entry'):
                for d in c.find_all(class_=None, recursive=False):
                    result = result + utils.clean_soup(d) + ' '
            for c in t.find_all('div', class_='brm'):
                for d in c.find_all(class_=None, recursive=False):
                    result = result + utils.clean_soup(d) + ' '
            result = result + '\"'
            result = result + '}'

            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)
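Note that a setTimeout() passed to execute_script() never pauses anything: execute_script returns before the timer fires, which is why the snippet above sleeps in Python instead. The more idiomatic route is an explicit wait for clickability; a sketch:

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Wait until the 'load more' link is actually clickable, then click it
# through Selenium rather than injected JavaScript.
more = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.CLASS_NAME, 'brm-more-link')))
driver.execute_script("arguments[0].scrollIntoView();", more)
more.click()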
Example #13
def scrape(curr_url, hash, soup, results):
    print('Found pentapostagma.gr...')

    ds_url = ''
    counter = 0  # was referenced below without being initialized

    # article
    for t in soup.find_all('article', class_='article'):
        if len(soup.find_all('div', class_='article__body')) > 0:
            print('Getting drupal article...')

            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"article\",'
            result = result + '\"source\":\"' + curr_url + '\",'
            result = result + '\"meta\":\"'
            for c in t.find_all('div', class_='article__top-details'):
                for d in c.select('time.default-date'):
                    result = result + utils.clean_soup(d) + ' '
                    break
                for d in c.select('a.default-category'):
                    result = result + utils.clean_soup(d) + ' '
            result = result + '\",'
            result = result + '\"title\":\"'
            for c in t.select('section.article__top > h1'):
                # the article headline is the direct <h1> child
                result = result + utils.clean_soup(c)
            result = result + '\"'
            result = result + '},'

            result = result + '\"text\":\"'
            for c in t.find_all('div', class_='article__body'):
                for d in c.find_all(class_=None, recursive=False):
                    result = result + utils.clean_soup(d) + ' '
            result = result + '\"'
            result = result + '}'

            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)

    # comments
    options = FirefoxOptions()
    options.add_argument("--headless")
    driver = webdriver.Firefox(options=options)
    driver.implicitly_wait(5)

    try:
        driver.get(curr_url)
        driver.execute_script(
            "document.getElementById('disqus_thread').scrollIntoView();")
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'disqus_thread')))
        cont = BeautifulSoup(driver.page_source, "html.parser")
        for i in cont.find_all('iframe'):
            if i.has_attr('src') and i['src'].find('disqus.com/embed') >= 0:
                ds_url = i['src']
                print('found disqus thread with url:', ds_url)
                break
    except Exception:
        # the page never rendered the Disqus container; ds_url stays empty
        print('site webdriver timeout...')

    try:
        driver.get(ds_url)
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'post-message')))
        for i in range(10):
            print('load more')
            driver.execute_script(
                "document.getElementsByClassName('load-more__button')[0].scrollIntoView();"
            )
            driver.execute_script(
                "document.getElementsByClassName('load-more__button')[0].click();"
            )
            time.sleep(2)
    except Exception:
        # no 'load more' button left, or the thread never loaded
        print('disqus webdriver timeout...')

    try:
        print('Getting disqus comments...')
        soup = BeautifulSoup(driver.page_source, "html.parser")
        for t in soup.find_all('div', class_='post-message'):
            counter += 1

            result = '{\"meta\":{'
            result = result + '\"id\":\"' + str(hash) + '\",'
            result = result + '\"type\":\"comment\",'
            result = result + '\"source\":\"' + curr_url + '\"'
            result = result + '},'

            result = result + '\"text\":\"' + utils.clean_soup(t) + '\"'
            result = result + '}'

            result = utils.clean_whitespaces(result)
            results.append(result)
            print(result)

    except Exception:
        # the thread rendered no comments
        print('disqus webdriver empty...')

    driver.close()
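The Disqus dance above (locate the embed iframe, load its src directly, click 'load more' a fixed number of times) is reusable on any site that embeds Disqus. A condensed sketch of the iframe-discovery step, assuming an already-running driver:

from bs4 import BeautifulSoup

def find_disqus_url(driver):
    # Hypothetical helper: scan the rendered page for the Disqus embed
    # iframe and return its src, or '' when no thread is present.
    page = BeautifulSoup(driver.page_source, 'html.parser')
    for iframe in page.find_all('iframe'):
        src = iframe.get('src', '')
        if 'disqus.com/embed' in src:
            return src
    return ''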