import requests
from datetime import datetime

# Config, DbHelper, SoupHelper and Logger are project-local helpers; they are
# assumed to be importable from the surrounding package.


# Example #1
# Online Khabar scraper: the caller passes '<article-url>||||<category>';
# title, date and body text are parsed from the article page.
def scrape_article_data(article_url):
    try:
        db_helper = DbHelper(Config.db)

        article_url = article_url.split('||||')
        category = article_url[1]
        article_url = article_url[0]

        if db_helper.data_present(article_url):
            return

        soup = SoupHelper.get_url_soup(article_url)

        title_card = soup.find('div', {'class': 'nws__title--card'})
        title = SoupHelper.get_txt_soup(title_card).find('h2')

        if title is None:
            Logger.add_error('Dead Link ' + str(article_url))
            return

        date = soup.find('div', {'class': 'post__time'})
        date = SoupHelper.get_txt_soup(date).find('span')

        title = title.text
        date = date.text

        date = date.split(' ')
        month = date[1]
        day = date[2]
        year = date[0]

        date = str(month) + ' ' + str(day) + ',' + str(year)

        article = soup.find('div', {'class': 'ok__news--wrap'})
        article = SoupHelper.get_txt_soup(article).findAll('p')

        article_text = list()
        for data in article:
            article_text.append(data.text.strip())
        article_text = ''.join(article_text)

        db_helper.insert_article(article_url, Config.online_khabar, category,
                                 title, date, article_text, '।')
        db_helper.close_connection()

        Logger.add_log('Scraping: ' + article_url)

    except TimeoutError:
        Logger.add_error('TimeoutError ' + article_url)

    except requests.ConnectionError:
        Logger.add_error('ConnectionError ' + article_url)

    except requests.TooManyRedirects:
        Logger.add_error('Redirect Error ' + article_url)


# Example #2
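# Nagarik News scraper: the caller passes '<article-url>||||<category>'; the
# category is mapped through Config.nagarik_news_sections_dict before the row
# is stored.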
def scrape_article_data(article_url):
    try:
        db_helper = DbHelper(Config.db)

        article_url = article_url.split('||||')
        category = article_url[1]
        article_url = article_url[0]

        if db_helper.data_present(article_url):
            return

        soup = SoupHelper.get_url_soup(article_url)

        title = soup.find('div', {'class': 'inner-section cover-news'})
        title = SoupHelper.get_txt_soup(title).find('div',
                                                    {'class': 'col-sm-12'})
        title = SoupHelper.get_txt_soup(title).find('h1')

        if title is None:
            Logger.add_error('Dead Link ' + str(article_url))
            return

        title = title.text

        date = soup.find('div', {'class': 'author-location'})
        date = SoupHelper.get_txt_soup(date).find('span')
        date = date.text.split(',')
        date = date[1].strip().split(' ')
        month = date[1]
        day = date[0]
        year = date[2]
        date = str(month) + ' ' + str(day) + ',' + str(year)

        article = soup.find('div', {'id': 'newsContent'})
        article = SoupHelper.get_txt_soup(article).findAll('p')

        article_text = list()
        for data in article:
            article_text.append(data.text.strip())
        article_text = ''.join(article_text)

        db_helper.insert_article(
            article_url, Config.nagarik_news,
            Config.nagarik_news_sections_dict.get(category), title, date,
            article_text, '।')
        db_helper.close_connection()

        Logger.add_log('Scraping: ' + article_url)

    except TimeoutError:
        Logger.add_error('TimeoutError ' + article_url)

    except requests.ConnectionError:
        Logger.add_error('ConnectionError ' + article_url)
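

# Example #3
# Kantipur Daily scraper: embedded <script> text and the 'Share on Facebook'
# footer are stripped from the article body; the category is taken from the
# URL path.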
def scrape_article_data(article_url):
    try:
        db_helper = DbHelper(Config.db)

        if db_helper.data_present(article_url):
            return

        soup = SoupHelper.get_url_soup(article_url)

        title = soup.find('div', {'class': 'article-header'})

        if title is None:
            Logger.add_error('Dead Link ' + str(article_url))
            return

        headline = SoupHelper.get_txt_soup(title).find('h1')
        sub_headline = SoupHelper.get_txt_soup(title).find(
            'div', {'class': 'sub-headline'})

        title = str(headline.text)

        if sub_headline is not None:
            title = str(headline.text) + '\n' + str(sub_headline.text)

        date = soup.find('time')
        article = soup.find('div', {'class': 'description'})

        scripts = SoupHelper.get_txt_soup(article).findAll('script')
        article = article.text

        for script in scripts:
            script_text = script.text
            if script_text in article:
                article = article.replace(script_text, '')

        article = article.split('Share on Facebook')
        article = article[0]

        temp = article_url.split('/')
        category = temp[3]

        db_helper.insert_article(article_url, Config.kantipur_daily_, category,
                                 title, date.text, article, '।')
        db_helper.close_connection()

        Logger.add_log('Scraping: ' + article_url)

    except TimeoutError:
        Logger.add_error('TimeoutError ' + article_url)

    except requests.ConnectionError:
        Logger.add_error('ConnectionError ' + article_url)


# Example #4
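# Setopati scraper: the category is taken from the URL path and the
# publication date is rebuilt from the 'pub-date' span.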
def scrape_article_data(article_url):
    try:
        db_helper = DbHelper(Config.db)

        if db_helper.data_present(article_url):
            return

        soup = SoupHelper.get_url_soup(article_url)
        title = soup.find('span', {'class': 'news-big-title'})

        if title is None:
            Logger.add_error('Dead Link ' + str(article_url))
            return

        title = title.text

        article_text = list()

        article = soup.find('div', {'class': 'editor-box'})
        article = SoupHelper.get_txt_soup(article).findAll('p')

        for data in article:
            article_text.append(data.text)

        article_text = ' '.join(article_text)

        pub_date = soup.find('span', {'class': 'pub-date'})

        pub_date = pub_date.text

        month = pub_date.split(',')[1].strip().split(' ')[0]
        day = pub_date.split(',')[1].strip().split(' ')[1]
        year = pub_date.split(',')[2].strip()

        date = str(month) + ' ' + str(day) + ',' + str(year)

        category = article_url.split('/')[3]

        db_helper.insert_article(article_url, Config.setopati, category, title,
                                 date, article_text, '।')
        db_helper.close_connection()

        Logger.add_log('Scraping: ' + article_url)

    except TimeoutError:
        Logger.add_error('TimeoutError ' + article_url)

    except requests.ConnectionError:
        Logger.add_error('ConnectionError ' + article_url)


# Example #5
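# Karobar Daily scraper (stored in the English database, Config.db_english):
# the date is normalised to 'YYYY-MM-DD' with datetime.strptime.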
def scrape_article_data(article_url):
    try:
        db_helper = DbHelper(Config.db_english)

        temp = article_url.split('/')
        category = temp[4]

        if db_helper.data_present(article_url):
            return

        soup = SoupHelper.get_url_soup(article_url)

        title = soup.find('div', {'class': 'col-lg-12'})
        title = SoupHelper.get_txt_soup(title).find('h4')

        if title is None:
            Logger.add_error('Dead Link ' + str(article_url))
            return
        title = title.text

        date = soup.find('div', {'class': 'date-time'})
        date = SoupHelper.get_txt_soup(date).find('span')
        date = date.text
        date = datetime.strptime(date, '%A, %b %d, %Y')
        date = date.strftime('%Y-%m-%d')

        temp_article = soup.find('div', {'class': 'mn-text'})
        temp_article = SoupHelper.get_txt_soup(temp_article).findAll('p')

        article = list()

        for data in temp_article:
            article.append(data.text.strip())

        article = ' '.join(article)

        db_helper.insert_article(article_url, Config.karobar_daily,
                                 category, title, date,
                                 article, '. ')

        db_helper.close_connection()

        Logger.add_log('Scraping: ' + article_url)

    except TimeoutError:
        Logger.add_error('TimeoutError ' + article_url)

    except requests.ConnectionError:
        Logger.add_error('ConnectionError ' + article_url)
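

# Example #6
# Placeholder for an English-database source: only the duplicate-URL check is
# implemented; the insert_article call is left commented out.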
def scrape_article_data(article_url):
    try:
        db_helper = DbHelper(Config.db_english)

        if db_helper.data_present(article_url):
            return

        # db_helper.insert_article(article_url, Config.kantipur_daily_, category, title, date.text, article)
        db_helper.close_connection()

        Logger.add_log('Scraping: ' + article_url)

    except TimeoutError:
        Logger.add_error('TimeoutError ' + article_url)

    except requests.ConnectionError:
        Logger.add_error('ConnectionError ' + article_url)