Example 1
def extract_content(url):
    # Start from the statically extracted content (get_static_content is defined elsewhere)
    content = get_static_content()
    try:
        # Spoof a Chrome user agent (fake_useragent) and ask newspaper to parse the page as Spanish
        ua = UserAgent()
        config = Config()
        config.browser_user_agent = ua.chrome
        config.language = 'es'

        article = Article(url, config=config)
        article.download()
        article.parse()

        content['text'] = article.text
        content['image'] = article.top_image
        content['videos'] = list(article.movies)

    except Exception:
        # traceback.print_exc() takes no exception argument
        print_exc()

    return content
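A minimal usage sketch; the URL is a placeholder, and get_static_content() is assumed to be defined elsewhere in the same module and to return a dict:

# Hypothetical call; replace the URL with a real article.
content = extract_content('https://example.com/noticia')
print(content.get('text', '')[:200])   # first 200 characters of the body
print(content.get('image'))            # top image URL, if any
print(content.get('videos'))           # list of embedded video links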
Example 2
def read_article(url):
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()  # populates article.summary and article.keywords (needs the NLTK 'punkt' data)

    print(article.text)
    # print 'SUMMARY'
    print(article.summary)
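The nlp() call in Example 2 relies on NLTK's 'punkt' tokenizer data; if it is missing, newspaper raises an error asking for it. A one-time setup along these lines should be enough:

import nltk

# One-time download of the sentence tokenizer data used by newspaper's nlp() step.
nltk.download('punkt')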
Example 3
def get_article(url):
    article = Article(url, language='pt')
    # Simple URL handling: use requests to download the HTML as text,
    # do a light clean-up of that HTML, and return the full text so a
    # parser can be built from it afterwards
    article.download()
    # Parse the downloaded HTML
    article.parse()

    return article
Example 4
def crawl_today():
    """   每天定时爬取, 5小时一次即可,每个类别爬取一页  """
    now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('\n', domain, now, "BEGIN")
    while True:
        try:
            conn = connect(host=host, user='******', password='******', port=3306, db='chinaaseanocean')
        except OperationalError as e:
            print(e)
            time.sleep(3)
        else:
            break
    for class_1 in categories:
        params['tag'] = class_1
        params['pgno'] = str(1)
        page_url = base_url + urlencode(params)
        failure = 0
        while failure < 3:
            try:
                doc = pq(page_url, headers=headers, verify=False)
            except Exception as e:
                failure += 1
                print('\rFailed to fetch the news-list page:', e, end='', flush=True)
            else:
                break
        else:
            # Three consecutive failures: skip this category
            continue
        ul = doc('ul.timeline > li')
        for li in ul.items():
            url = li.find('h2 a').attr('href')
            article = Article(url)
            try:
                article.download()
                article.parse()
            except ArticleException as e:
                # print(e)
                continue
            content = article.text
            if content:
                title = article.title
                date = article.publish_date
                class_2 = li.find('div.timeline-content > a').text()
                # print(title)
                cursor = conn.cursor()
                sql = 'REPLACE INTO `asean_news` VALUES (%s)' % (','.join(['%s'] * 8))
                cursor.execute(sql, ('Malaysia', domain, class_1, class_2, title, date, content, url))
                conn.commit()
                cursor.close()
    conn.close()
    now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('\n', domain, now, "DONE")
Example 5
def parseArticle(articles: ResultSet, host: str, src_ele: str,
                 summary_ele: str, date_ele: str, url_ele: str):
    global parse_done
    global config_en
    global articles_count
    articles_count += len(articles)
    for a in articles:
        src = a.find(src_ele)
        summary = a.find(summary_ele)
        date = a.find(date_ele)
        if src is None:
            src = host
        else:
            src = src.text
        if summary is None:
            summary = a.find('description')  # fallback
        if summary is not None:
            summary = summary.text
        url = a.find(url_ele)
        if url is not None:
            url = url.text.strip()
        else:
            url = ''
        if url != '':
            article = Article(url, config=config_en)
            if date is not None:
                try:
                    date = parse(date.text)
                except Exception:
                    date = None
            try:
                article.download()
                article.parse()
            except Exception as ex:
                log(f'{ex}, url is "{url}"')
            finally:
                # Fall back to the publish date extracted by newspaper when the feed gave none
                if date is None and isinstance(article.publish_date, datetime):
                    date = article.publish_date.strftime('%Y-%m-%d %H:%M:%S')
                insert_db((src, date, article.title, summary, article.text,
                           article.url))
        parse_done += 1
Example 6
def crawl_archive():
    """
    爬取过去一周的所有新闻
    """
    conn = connect(host=host, user='******', password='******', port=3306, db='chinaaseanocean')
    for c in categories:
        for i in range(1, MAX_PAGE + 1):
            params['tag'] = c
            params['pgno'] = str(i)
            page_url = base_url + urlencode(params)
            try:
                doc = pq(page_url, headers=headers, verify=False)
            except requests.exceptions.ConnectionError as e:
                print(e)
                doc = pq(page_url, headers=headers, verify=False)
            ul = doc('ul.timeline > li')
            for li in ul.items():
                url = li.find('h2 a').attr('href')
                article = Article(url)
                try:
                    article.download()
                    article.parse()
                except ArticleException as e:
                    print(e)
                    continue
                title = article.title
                date = article.publish_date
                content = article.text
                class_2 = li.find('div.timeline-content > a').text()
                if content:
                    print(title)
                    cursor = conn.cursor()
                    sql = 'REPLACE INTO `asean_news` VALUES (%s)' % (','.join(['%s'] * 9))
                    cursor.execute(sql, ('Malaysia', domain, c, class_2, None, title, date, content, url))
                    conn.commit()
                    cursor.close()
    conn.close()
Example 7
'''
Created on Feb 28, 2015

@author: hoavu
'''
from newspaper.article import Article

url = 'http://www.huffingtonpost.com/2015/02/27/jennifer-lawrence-david-o-russell_n_6772866.html'
article = Article(url)
article.download()
article.parse()
#print(article.html)
print(article.text)
Example 8
def fetch_story(request):
    if request.method == 'GET':

        # List to store all the parsed RSS entries.
        story_list = []

        # Get Source Object from 'item_id' passed through Request
        source_id = request.GET.get('item_id')

        if source_id is None:
            # If none, Return to sources list
            return HttpResponseRedirect('/sources_list/')

        # Get sourcing object
        try:
            rss_obj = Sourcing.objects.get(id=source_id)
        except Sourcing.DoesNotExist:
            messages.info(request,
                          'Source Does Not Exist, Please try another one.')
            return HttpResponseRedirect('/sources_list/')

        # Parse the RSS URL and get the data
        feed_data = feedparser.parse(rss_obj.rss_url)

        # Detects if the Url is not well formed RSS
        if feed_data.bozo == 1:
            url_error = {
                'Possible Wrong URL. Click here to go back to Sources page.'
            }
            return render_to_response('fetch_story.html', {
                'url_error': url_error,
                'user': request.user
            })
        else:
            for data in feed_data.get('entries'):
                story_url = data.get('link')

                # If RSS is Empty return Story listing page
                if story_url is None:
                    rss_error = {
                        'Either RSS is empty or RSS is broken. Click here to go back to Story Listing page'
                    }
                    return render_to_response('fetch_story.html', {
                        'rss_error': rss_error,
                        'user': request.user
                    })

                # Use newspaper library to download the article
                article = Article(story_url)

                try:
                    article.download()
                except ArticleException:
                    logger.debug("Article Download exception in : %s" %
                                 story_url)

                # Try to Parse Article
                try:
                    article.parse()
                except ArticleException:
                    logger.debug("Exception in article parse")

                article_instance = article

                # if Datetime is none, assign current datetime
                if article_instance.publish_date is None:
                    if data.get('published') is None:
                        article_instance.publish_date = datetime.now(
                        ).strftime('%Y-%m-%d %H:%M:%S')
                    else:
                        article_instance.publish_date = datetime.strptime(
                            data.get('published'),
                            '%a, %d %b %Y %H:%M:%S GMT').strftime(
                                '%Y-%m-%d %H:%M:%S')

                        # article_instance.publish_date = datetime.now().strftime('%a, %e %b %Y %H:%M:%S')
                elif not isinstance(article_instance.publish_date, datetime):
                    article_instance.publish_date = datetime.now().strftime(
                        '%Y-%m-%d %H:%M:%S')
                    # article_instance.publish_date = datetime.now().strftime('%a, %e %b %Y %H:%M:%S')

                # If the body is empty, assign placeholder text
                if not article_instance.text:
                    article_instance.text = (
                        "This is a dummy text as some error occurred while fetching the body of this story. "
                        "Click the story title to visit the story page.")

                try:
                    # Check if story exist
                    Stories.objects.select_related('source').get(url=story_url)
                except Stories.DoesNotExist:
                    story = Stories(title=article_instance.title,
                                    source=rss_obj,
                                    pub_date=article_instance.publish_date,
                                    body_text=article_instance.text,
                                    url=article_instance.url)
                    story.save()

                # Add each downloaded article details to Story_list and pass to HTML template.
                story_list += [article_instance]
            return render_to_response('fetch_story.html', {
                'data': story_list,
                'rss_id': rss_obj,
                'user': request.user
            })
    else:
        return HttpResponseRedirect('/sources_list/')
Example 9
    def handle(self, *args, **options):
        source_obj = Sourcing.objects.all()
        stories_list = list(Stories.objects.values_list('url', flat=True))

        # To store time and data iterated count
        not_rss_url = 0
        fetched_story_count = 0
        existing_story_count = len(stories_list)
        download_exception = 0
        parsing_exception = 0
        broken_rss_list = 0

        print("""\n\n
        ------------------------Started fetching Url's:------------------------
        \n
        """)
        start_time = datetime.now()

        sources = tqdm(source_obj)
        for list_item in sources:
                # Sources Progress bar
                sources.set_description('Source Completed  ')

                # Parse data from Rss Url
                feed_data = feedparser.parse(list_item.rss_url)

                # Detects if the Url is not well formed RSS
                if feed_data.bozo == 1:
                    logger.debug("Not a RSS url :    %s" % list_item.rss_url)
                    not_rss_url += 1
                else:
                    # Stories progress bar using tqdm
                    story_entries = tqdm(feed_data.get('entries'))

                    # Iterate through each story URL: URLs already fetched (present in
                    # stories_list) are skipped; otherwise the story is downloaded and
                    # saved to the Stories table.

                    for data in story_entries:
                        # Stories Progress bar Title
                        story_entries.set_description('Stories Completed ')

                        # Get story Url from story_entries list
                        story_url = data.get('link')

                        # If the entry has no link, the RSS feed is empty or broken; log and count it
                        if story_url is None:
                            logger.debug("No feed data in RSS URL:   %s" % list_item.rss_url)
                            broken_rss_list += 1
                        else:

                            # If the story already exists, skip it; otherwise download and save it
                            if story_url in stories_list:
                                stories_list.remove(story_url)
                            else:
                                # Build a newspaper Article for this story URL
                                article = Article(story_url)

                                # Use newspaper library to download the article
                                try:
                                    article.download()
                                except ArticleException:
                                    logger.debug("Article Download exception in : %s" % story_url)
                                    download_exception += 1

                                # Parse Article
                                try:
                                    article.parse()
                                except ArticleException:
                                    logger.debug("Article parse exception in : %s" % story_url)
                                    parsing_exception += 1

                                article_instance = article

                                # if Datetime is none or not a Datetime, assign current datetime
                                if article_instance.publish_date is None:
                                    if data.get('published') is None:
                                        article_instance.publish_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                                    else:
                                        article_instance.publish_date = datetime.strptime(
                                                                data.get('published'), '%a, %d %b %Y %H:%M:%S GMT'
                                                                ).strftime('%Y-%m-%d %H:%M:%S')

                                elif not isinstance(article_instance.publish_date, datetime):
                                    article_instance.publish_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

                                # If the body is empty, assign placeholder text
                                if not article_instance.text:
                                    article_instance.text = (
                                        "This is a dummy text as some error occurred while fetching the body of this story. "
                                        "Click the story title to visit the story page.")

                                # Save story.
                                story = Stories(
                                    title=article_instance.title,
                                    source=list_item,
                                    pub_date=article_instance.publish_date,
                                    body_text=article_instance.text,
                                    url=article_instance.url
                                )
                                story.save()
                                fetched_story_count += 1

        stop_time = datetime.now()
        execution_time = stop_time - start_time
        final_count = len(Stories.objects.values_list('url', flat=True))
        print("""
        
        ------------------------Finished fetching Url's:------------------------


                                  Final Result:

                        No of Existing Stories          :   {0}
                        No of New Stories Fetched       :   {1}
                        No of wrong Rss Url's           :   {2}
                        No of Broken or Empty Rss Url's :   {3}
                        No of Stories not Downloaded    :   {4}
                        No of Stories not Parsed        :   {5}
                    -------------------------------------------------
                        Total Stories                   :   {6}
                    -------------------------------------------------

                        Process Execution time          :   {7}

        ------------------------------------------------------------------------
            
        """.format(existing_story_count, fetched_story_count,
                   not_rss_url, broken_rss_list, download_exception,
                   parsing_exception, final_count, execution_time))
Example 10
def crawl_today():
    """
    用于每日更新
    """
    now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('\n', domain, now, "BEGIN", flush=True)
    while True:
        try:
            conn = connect(host=host,
                           user='******',
                           password='******',
                           port=3306,
                           db='chinaaseanocean')
        except OperationalError as e:
            print(e)
            time.sleep(3)
        else:
            break
    for key in class_.keys():
        class_1 = class_[key]
        # print(class_1)
        page = 1
        while page < 3:
            page_url = base_url + key + '&max_page=' + str(page)
            # print(page_url)
            failure = 0
            while failure < 3:
                try:
                    doc = pq(page_url, headers=headers, verify=False)
                except Exception as e:
                    failure += 1
                    print(e)
                else:
                    break
            else:
                # Three consecutive failures: give up on this category
                # (a plain `continue` here would retry the same page indefinitely)
                break
            a_list = doc('div.w3-justify a')
            for a in a_list.items():
                news_url = 'http://www.bernama.com/en/' + a.attr('href')
                article = Article(news_url)
                try:
                    article.download()
                    article.parse()
                except ArticleException as e:
                    print(e)
                    continue
                content = pattern.sub('', article.text).replace('\n', '')
                if content:
                    url = article.url
                    title = article.title
                    try:
                        # Reverse the slash-separated date found in the HTML (e.g. dd/mm/yyyy -> yyyy-mm-dd)
                        date = '-'.join(
                            pattern.findall(article.html)[0].split('/')[::-1])
                    except Exception:
                        date = ''
                    # print(title, date, content, sep='\n')
                    cursor = conn.cursor()
                    sql = 'REPLACE INTO `asean_news` VALUES (%s)' % (','.join(
                        ['%s'] * 8))
                    cursor.execute(sql, ('Malaysia', domain, class_1, None,
                                         title, date, content, url))
                    conn.commit()
                    cursor.close()
            if len(a_list) < 7:
                break
            page = page + 1
    conn.close()
    now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('\n', domain, now, "DONE")
Example 11

import requests
import urllib.parse
import nltk
from newspaper.article import Article
from crawlerApp.utils import get_cosine, text_to_vector, normalize_text

'''get stop words first '''

with open("stopwords_en.txt", "r") as myfile:
    stopwords = myfile.read().replace('\n', '')

normalized_url = 'http://www.nytimes.com/2015/05/25/science/john-nash-a-beautiful-mind-subject-and-nobel-winner-dies-at-86.html'
article1 = Article(normalized_url)
article1.download()

""" 0.62 is good threshold"""

normalized_url2 = 'http://abcnews.go.com/US/john-nash-beautiful-mind-mathematician-wife-killed-jersey/story?id=31268512'
article2 = Article(normalized_url2)
article2.download()

print("download finished")

article1.parse()
string1 = article1.text
article2.parse()
string2 = article2.text

normalised_string1 = normalize_text(string1)
Example 12
def getArticleTitleText(url):
    article = Article(url)
    article.download()
    # article.html now holds the raw downloaded HTML
    article.parse()
    return [article.title, article.text.encode('utf-8')]
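A small usage sketch for Example 12 (the URL is only illustrative); note that the text comes back as UTF-8 bytes while the title stays a str:

# Hypothetical call with a placeholder URL.
title, text_bytes = getArticleTitleText('https://example.com/some-article')
print(title)
print(text_bytes.decode('utf-8')[:200])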
Example 13
'''
@author: hoavu
'''

import requests
import urllib.parse
import nltk
from newspaper.article import Article
from crawlerApp.utils import get_cosine, text_to_vector, normalize_text
'''get stop words first '''

with open("stopwords_en.txt", "r") as myfile:
    stopwords = myfile.read().replace('\n', '')

normalized_url = 'http://www.nytimes.com/2015/05/25/science/john-nash-a-beautiful-mind-subject-and-nobel-winner-dies-at-86.html'
article1 = Article(normalized_url)
article1.download()
""" 0.62 is good threshold"""

normalized_url2 = 'http://abcnews.go.com/US/john-nash-beautiful-mind-mathematician-wife-killed-jersey/story?id=31268512'
article2 = Article(normalized_url2)
article2.download()

print("download finished")

article1.parse()
string1 = article1.text
article2.parse()
string2 = article2.text

normalised_string1 = normalize_text(string1)
normalised_string2 = normalize_text(string2)
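The snippet stops right after normalising both texts; presumably the comparison continues with the helpers imported from crawlerApp.utils. A sketch, assuming text_to_vector() turns a string into a term-frequency vector and get_cosine() returns the cosine similarity of two such vectors:

# Assumed continuation: compare the two normalised articles against the
# 0.62 threshold mentioned above (helper signatures are assumptions).
vector1 = text_to_vector(normalised_string1)
vector2 = text_to_vector(normalised_string2)
similarity = get_cosine(vector1, vector2)
print('cosine similarity:', similarity)
if similarity > 0.62:
    print('The two articles most likely cover the same story.')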
Example 15
import newspaper
from newspaper import Article


def download_and_parse(article: Article):
    try:
        article.download()
        article.parse()
    except newspaper.article.ArticleException:
        # Ignore download/parse failures; the article simply stays unparsed
        pass
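One possible way to call it, checking whether parsing actually succeeded before using the result (the URL is illustrative; is_parsed is the flag newspaper sets after a successful parse()):

# Hypothetical usage of download_and_parse.
article = Article('https://example.com/news/some-story')
download_and_parse(article)
if article.is_parsed:
    print(article.title)
else:
    print('download or parse failed')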
Example 16
from newspaper.article import Article

import newspaper
# url = 'http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'
#
# article = Article(url)
#
# article.parse()
# article.authors
# article.text
# cnn_paper = newspaper.build('http://cnn.com')

url = 'http://news.163.com/17/0312/10/CFAP3Q9G000189FH.html'
a = Article(url, language='zh')  # Chinese
a.download()
a.parse()
a.nlp()  # needed to populate a.keywords; without it the list stays empty

print(a.keywords)
print("===============")
print(a.title)
print("===============")
print(a.authors)
print("===============")
print(a.text[:150])

# filter_regex = re.compile(r'[^a-zA-Z0-9\ ]')
# title_text_h1 = "我与总书记议国是:建设社会稳定长治久安新AA边疆,总书记 社会稳定 全国人大代表00000ddd"
# filter_title_text_h1 = filter_regex.sub('', title_text_h1).lower()
# print(filter_title_text_h1)