Example #1
def extract_content(url):
    content = get_static_content()
    try:
        ua = UserAgent()

        config = Config()
        config.browser_user_agent = ua.chrome
        config.language = 'es'

        article = Article(url, config=config)

        article.download()
        article.parse()

        text = article.text
        content['text'] = text

        top_image = article.top_image
        content['image'] = top_image

        movielinks = []
        for movie in article.movies:
            movielinks.append(movie)
        content['videos'] = movielinks

    except Exception:
        print_exc()  # print_exc() takes no exception argument; it prints the current traceback

    return content
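For orientation, a caller gets back a dict with 'text', 'image' and 'videos' keys (possibly partial if extraction failed). A small usage sketch, assuming the helpers above (get_static_content, UserAgent, Config, Article) are importable; the URL is a placeholder:

content = extract_content('https://example.com/some-article')  # placeholder URL
print(content.get('text', '')[:200])   # first 200 characters of the article body
print(content.get('image'))            # top image URL, if any
print(content.get('videos', []))       # list of embedded video links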
Example #2
 def write_article(self):
     if self.message['data'] == 1:
         return
     collection = self.db['articles']
     data = json.loads(self.message['data'])
     articles = data['articles']
     payloads = []
     count = 0
     for article_url in articles:
         article = Article(article_url)
         article.build()  # build() downloads, parses and runs nlp(), so keywords below are populated
         payload = {}
         payload['meta_keywords'] = article.meta_keywords
         payload['title'] = article.title
         payload['url'] = article.url
         payload['text'] = article.text
         payload['html'] = article.html
         payload['keywords'] = article.keywords
         payload['_id'] = hashlib.sha1(article.title.encode('utf-8')).hexdigest()
         payload['crawled_at'] = str(int(time.time()))
         payloads.append(payload)
         count += 1
         if count % 100 == 0:
             collection.insert_many(payloads)
             payloads = []
     if payloads:
         collection.insert_many(payloads)
Example #3
    def parse_article(self, response):
        # utilize newspaper for article parsing
        article = Article(url=response.url, config=self.config)
        article.set_html(response.body)

        article.parse()
        item = Art()
        item['title'] = article.title
        item['url'] = article.url
        item['text'] = '\n'.join(nlp.split_sentences(article.text.replace('\n', ' ')))
        yield item
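The Art item used above is not shown in the snippet; under the assumption that it is a plain Scrapy item, its definition would look roughly like this (the real project may declare more fields):

import scrapy

class Art(scrapy.Item):
    # Fields implied by the assignments above; names are taken from the usage, nothing else is assumed
    title = scrapy.Field()
    url = scrapy.Field()
    text = scrapy.Field()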
Example #4
    def parse_article(self, response):
        # utilize newspaper for article parsing
        article = Article(url=response.url, config=self.config)
        article.set_html(response.body)

        article.parse()
        item = Art()
        item["title"] = article.title
        item["url"] = article.url
        item["text"] = "\n".join(nlp.split_sentences(article.text.replace("\n", " ")))
        yield item
Example #5
 def get_summary(self, title, text):
     article = Article(url='')
     article.title = title
     article.text = text
     article.download_state = ArticleDownloadState.SUCCESS
     article.is_parsed = True
     article.nlp()
     return self.preprocess_text(article.summary)
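A minimal standalone sketch of the same trick, for readers who want to summarize text they already have: setting download_state and is_parsed by hand lets nlp() run without ever downloading. This assumes newspaper3k plus NLTK's punkt data are installed; summarize_text is an illustrative name, not part of the original class.

from newspaper import Article
from newspaper.article import ArticleDownloadState

def summarize_text(title, text):
    article = Article(url='')                               # no URL needed, content is injected below
    article.title = title
    article.text = text
    article.download_state = ArticleDownloadState.SUCCESS   # pretend the download already happened
    article.is_parsed = True                                # pretend parsing already happened
    article.nlp()                                           # keyword extraction + summarization
    return article.summary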
Example #6
    def parse_article(self, response):
        if len(response.body) > 0:
            # utilize newspaper for article parsing
            article = Article(url=response.url, config=self.config)
            article.set_html(response.body)
            article.parse()

            #self.sentences.append(nlp.split_sentences(article.text))
            
            item = Art()
            item['title'] = article.title
            item['url'] = article.url
            item['text'] = '\n'.join(nlp.split_sentences(article.text.replace('\n', ' ')))
            yield item
        else:
            print(response.url + ' DEAD LINK')
Example #7
def read_article(url):
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()

    print(article.text)
    # print 'SUMMARY'
    print(article.summary)
Example #8
    def parse(self, response):
        # print(type(response))

        article = None
        try:
            article = NewsPlease.from_html(response.body.encode("utf-8"))
        except Exception:
            article = NewsPlease.from_html(
                response.body.decode('latin-1').encode("utf-8"))
            print("EXCEPTION OCCURRED")

        print(article.date_publish)
        # print(article.text)
        article2 = Article(url="", language="es")
        article2.set_html(response.body)
        article2.parse()

        print(response.url)
        self.db.articles_es.insert({
            "title": article.title,
            "pub_date": article.date_publish,
            "url": response.url,
            "content": article2.text,
            "raw_html": response.body
        })

        links = self.linkExtractor.extract_links(response)
        for link in links:
            yield scrapy.Request(link.url, callback=self.parse)
Example #9
def get_article(url):
    article = Article(url, language='pt')
    # Simple URL handling: use requests to download the HTML
    # as text, do a light clean-up of that HTML, and return the
    # full text so a Parser can later be built from it
    article.download()
    # Analysing
    article.parse()

    return article
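A short usage sketch (the URL is a placeholder): the returned Article exposes the parsed fields directly.

article = get_article('https://example.com/noticia')  # placeholder URL
print(article.title)
print(article.publish_date)
print(article.text[:200])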
Example #10
def crawl_today():
    """   每天定时爬取, 5小时一次即可,每个类别爬取一页  """
    now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('\n', domain, now, "BEGIN")
    while True:
        try:
            conn = connect(host=host, user='******', password='******', port=3306, db='chinaaseanocean')
        except OperationalError as e:
            print(e)
            time.sleep(3)
        else:
            break
    for class_1 in categories:
        params['tag'] = class_1
        params['pgno'] = str(1)
        page_url = base_url + urlencode(params)
        failure = 0
        while failure < 3:
            try:
                doc = pq(page_url, headers=headers, verify=False)
            except Exception as e:
                failure += 1
                print('\rFailed to fetch the news list page, reason:', e, end='', flush=True)
            else:
                break
        else:
            continue
        ul = doc('ul.timeline > li')
        for li in ul.items():
            url = li.find('h2 a').attr('href')
            article = Article(url)
            try:
                article.download()
                article.parse()
            except ArticleException as e:
                # print(e)
                continue
            content = article.text
            if content:
                title = article.title
                date = article.publish_date
                class_2 = li.find('div.timeline-content > a').text()
                # print(title)
                cursor = conn.cursor()
                sql = 'REPLACE INTO `asean_news` VALUES (%s)' % (','.join(['%s'] * 8))
                cursor.execute(sql, ('Malaysia', domain, class_1, class_2, title, date, content, url))
                conn.commit()
                cursor.close()
    conn.close()
    now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('\n', domain, now, "DONE")
Example #11
def parseArticle(articles: ResultSet, host: str, src_ele: str,
                 summary_ele: str, date_ele: str, url_ele: str):
    global parse_done
    global config_en
    global articles_count
    articles_count += len(articles)
    for a in articles:
        src = a.find(src_ele)
        summary = a.find(summary_ele)
        date = a.find(date_ele)
        if src is None:
            src = host
        else:
            src = src.text
        if summary is None:
            summary = a.find('description')  # fallback
        if summary is not None:
            summary = summary.text
        url = a.find(url_ele)
        if url is not None:
            url = url.text.strip()
        else:
            url = ''
        if url != '':
            article = Article(url, config=config_en)
            if date is not None:
                try:
                    date = parse(date.text)
                except Exception:
                    date = None
            try:
                article.download()
                article.parse()
            except Exception as ex:
                log(f'{ex}, url is "{url}"')
            finally:
                if isinstance(article.publish_date, datetime) and date is None:
                    date = article.publish_date.strftime('%Y-%m-%d %H:%M:%S')
                insert_db((src, date, article.title, summary, article.text,
                           article.url))
        parse_done += 1
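parseArticle expects a BeautifulSoup ResultSet of RSS <item> elements plus the tag names to read from each item. A hedged sketch of a caller (the feed URL and tag names follow common RSS conventions and are not taken from the original project):

import requests
from bs4 import BeautifulSoup

feed_xml = requests.get('https://example.com/rss.xml').text   # placeholder feed URL
soup = BeautifulSoup(feed_xml, 'xml')                          # the 'xml' parser requires lxml
items = soup.find_all('item')                                  # a ResultSet, as the signature expects
parseArticle(items, host='example.com', src_ele='source',
             summary_ele='description', date_ele='pubDate', url_ele='link')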
Example #12
def crawl_archive():
    """
    Crawl all of the news from the past week
    """
    conn = connect(host=host, user='******', password='******', port=3306, db='chinaaseanocean')
    for c in categories:
        for i in range(1, MAX_PAGE + 1):
            params['tag'] = c
            params['pgno'] = str(i)
            page_url = base_url + urlencode(params)
            try:
                doc = pq(page_url, headers=headers, verify=False)
            except requests.exceptions.ConnectionError as e:
                print(e)
                doc = pq(page_url, headers=headers, verify=False)
            ul = doc('ul.timeline > li')
            for li in ul.items():
                url = li.find('h2 a').attr('href')
                article = Article(url)
                try:
                    article.download()
                    article.parse()
                except ArticleException as e:
                    print(e)
                    continue
                title = article.title
                date = article.publish_date
                content = article.text
                class_2 = li.find('div.timeline-content > a').text()
                if content:
                    print(title)
                    cursor = conn.cursor()
                    sql = 'REPLACE INTO `asean_news` VALUES (%s)' % (','.join(['%s'] * 9))
                    cursor.execute(sql, ('Malaysia', domain, c, class_2, None, title, date, content, url))
                    conn.commit()
                    cursor.close()
    conn.close()
Example #13
    db_connect.init_database_cont()

    ''' we process homepage'''
    for home_page in washington_post_home_pages:
        print("extracting: " + home_page)
        washington_page = requests.get(home_page)
        html_tree = html.fromstring(washington_page.text)
        article_urls = html_tree.xpath('//a/@href')
        for home_url in article_urls:
            if home_url is not None and len(home_url) > 16:
                if 'http://' not in home_url and 'https://' not in home_url:
                    home_url = WASHINGTON_POST + home_url
                try:
                    article_home = Article(home_url, keep_article_html=True)
                    extract_washington_post_article(article_home, True, washington_post_home_pages.get(home_page))
                except Exception as e:
                    print('Something went wrong when processing homepage ' + home_page + ' article: {} '.format(e) + home_url)

    db_connect.close_database_cont()
except Exception as e:
    print('Something went wrong with database: {}'.format(e))
Example #14
 def download_and_parse(article: Article):
     try:
         article.download()
         article.parse()
     except newspaper.article.ArticleException:
         pass
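For context, a caller could run download_and_parse over a batch and keep only the articles that actually produced text, since failures are swallowed above (a sketch with placeholder URLs, assuming the function is reachable as a plain callable):

from newspaper import Article

urls = ['https://example.com/story-1', 'https://example.com/story-2']  # placeholder URLs
articles = [Article(u) for u in urls]
for a in articles:
    download_and_parse(a)                    # ArticleException is swallowed, see above
parsed = [a for a in articles if a.text]     # drop articles that failed to download or parse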
Example #15
def fetch_story(request):
    if request.method == 'GET':

        # List to store all the parsed RSS entries.
        story_list = []

        # Get Source Object from 'item_id' passed through Request
        source_id = request.GET.get('item_id')

        if source_id is None:
            # If none, Return to sources list
            return HttpResponseRedirect('/sources_list/')

        # Get sourcing object
        try:
            rss_obj = Sourcing.objects.get(id=source_id)
        except Sourcing.DoesNotExist:
            messages.info(request,
                          'Source Does Not Exist, Please try another one.')
            return HttpResponseRedirect('/sources_list/')

        # Parse the RSS URL and get the data
        feed_data = feedparser.parse(rss_obj.rss_url)

        # Detects if the Url is not well formed RSS
        if feed_data.bozo == 1:
            url_error = {
                'Possible Wrong URL. Click here to go back to Sources page.'
            }
            return render_to_response('fetch_story.html', {
                'url_error': url_error,
                'user': request.user
            })
        else:
            for data in feed_data.get('entries'):
                story_url = data.get('link')

                # If RSS is Empty return Story listing page
                if story_url is None:
                    rss_error = {
                        'Either RSS is empty or RSS is broken. Click here to go back to Story Listing page'
                    }
                    return render_to_response('fetch_story.html', {
                        'rss_error': rss_error,
                        'user': request.user
                    })

                # Use newspaper library to download the article
                article = Article(story_url)

                try:
                    article.download()
                except ArticleException:
                    logger.debug("Article Download exception in : %s" %
                                 story_url)

                # Try to Parse Article
                try:
                    article.parse()
                except ArticleException:
                    logger.debug("Exception in article parse")

                article_instance = article

                # if Datetime is none, assign current datetime
                if article_instance.publish_date is None:
                    if data.get('published') is None:
                        article_instance.publish_date = datetime.now(
                        ).strftime('%Y-%m-%d %H:%M:%S')
                    else:
                        article_instance.publish_date = datetime.strptime(
                            data.get('published'),
                            '%a, %d %b %Y %H:%M:%S GMT').strftime(
                                '%Y-%m-%d %H:%M:%S')

                        # article_instance.publish_date = datetime.now().strftime('%a, %e %b %Y %H:%M:%S')
                elif not isinstance(article_instance.publish_date, datetime):
                    article_instance.publish_date = datetime.now().strftime(
                        '%Y-%m-%d %H:%M:%S')
                    # article_instance.publish_date = datetime.now().strftime('%a, %e %b %Y %H:%M:%S')

                # if Body is empty, assign dummy Text
                if article_instance.text == '':
                    article_instance.text = "This is a Dummy text as some error occurred while fetching body of this story. \
                                                Click the Story title to visit the Story page."

                try:
                    # Check if story exist
                    Stories.objects.select_related('source').get(url=story_url)
                except Stories.DoesNotExist:
                    story = Stories(title=article_instance.title,
                                    source=rss_obj,
                                    pub_date=article_instance.publish_date,
                                    body_text=article_instance.text,
                                    url=article_instance.url)
                    story.save()

                # Add each downloaded article details to Story_list and pass to HTML template.
                story_list += [article_instance]
            return render_to_response('fetch_story.html', {
                'data': story_list,
                'rss_id': rss_obj,
                'user': request.user
            })
    else:
        return HttpResponseRedirect('/sources_list/')
Example #16
    def handle(self, *args, **options):
        source_obj = Sourcing.objects.all()
        stories_list = list(Stories.objects.values_list('url', flat=True))

        # To store time and data iterated count
        not_rss_url = 0
        fetched_story_count = 0
        existing_story_count = len(stories_list)
        download_exception = 0
        parsing_exception = 0
        broken_rss_list = 0

        print("""\n\n
        ------------------------Started fetching Url's:------------------------
        \n
        """)
        start_time = datetime.now()

        sources = tqdm(source_obj)
        for list_item in sources:
                # Sources Progress bar
                sources.set_description('Source Completed  ')

                # Parse data from Rss Url
                feed_data = feedparser.parse(list_item.rss_url)

                # Detects if the Url is not well formed RSS
                if feed_data.bozo == 1:
                    logger.debug("Not a RSS url :    %s" % list_item.rss_url)
                    not_rss_url += 1
                else:
                    # Stories progess bar using tqdm
                    story_entries = tqdm(feed_data.get('entries'))

                    """
                        # This will iterate through each story url
                        # If story url is already in list fetched from DB
                        # It will not fetch for those URL.
                        # Else: It will download the story and save to Stories DB
                    """

                    for data in story_entries:
                        # Stories Progress bar Title
                        story_entries.set_description('Stories Completed ')

                        # Get story Url from story_entries list
                        story_url = data.get('link')

                        # If RSS is Empty return to Story listing page
                        if story_url is None:
                            logger.debug("No feed data in RSS URL:   %s" % list_item.rss_url)
                            broken_rss_list += 1
                        else:

                            # If story does not exist, It'll download and save it in database
                            if story_url in stories_list:
                                stories_list.remove(story_url)
                            else:
                                # Use Newspaper Library's
                                article = Article(story_url)

                                # Use newspaper library to download the article
                                try:
                                    article.download()
                                except ArticleException:
                                    logger.debug("Article Download exception in : %s" % story_url)
                                    download_exception += 1

                                # Parse Article
                                try:
                                    article.parse()
                                except ArticleException:
                                    logger.debug("Article parse exception in : %s" % story_url)
                                    parsing_exception += 1

                                article_instance = article

                                # if Datetime is none or not a Datetime, assign current datetime
                                if article_instance.publish_date is None:
                                    if data.get('published') is None:
                                        article_instance.publish_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                                    else:
                                        article_instance.publish_date = datetime.strptime(
                                            data.get('published'), '%a, %d %b %Y %H:%M:%S GMT'
                                        ).strftime('%Y-%m-%d %H:%M:%S')

                                elif not isinstance(article_instance.publish_date, datetime):
                                    article_instance.publish_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

                                # if Body is empty, assign dummy Text
                                if article_instance.text == '':
                                    article_instance.text = "This is a Dummy text as some error occurred while fetching body of this story. \
                                    Click the Story title to visit the Story page."

                                # Save story.
                                story = Stories(
                                    title=article_instance.title,
                                    source=list_item,
                                    pub_date=article_instance.publish_date,
                                    body_text=article_instance.text,
                                    url=article_instance.url
                                )
                                story.save()
                                fetched_story_count += 1

        stop_time = datetime.now()
        execution_time = stop_time - start_time
        final_count = len(Stories.objects.values_list('url', flat=True))
        print("""
        
        ------------------------Finished fetching Url's:------------------------


                                  Final Result:

                        No of Existing Stories          :   {0}
                        No of New Stories Fetched       :   {1}
                        No of wrong Rss Url's           :   {2}
                        No of Broken or Empty Rss Url's :   {3}
                        No of Stories not Downloaded    :   {4}
                        No of Stories not Parsed        :   {5}
                    -------------------------------------------------
                        Total Stories                   :   {6}
                    -------------------------------------------------

                        Process Execution time          :   {7}

        ------------------------------------------------------------------------
            
        """.format(existing_story_count, fetched_story_count,
                   not_rss_url, broken_rss_list, download_exception,
                   parsing_exception, final_count, execution_time))
Example #17
'''
Created on Feb 28, 2015

@author: hoavu
'''
from newspaper.article import Article


url = 'http://www.huffingtonpost.com/2015/02/27/jennifer-lawrence-david-o-russell_n_6772866.html'
article = Article(url)
article.download()
article.parse()
#print(article.html)
print(article.text)

Example #18
import re

from newspaper.article import Article

import newspaper
# url = 'http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'
#
# article = Article(url)
#
# article.parse()
# article.authors
# article.text
# cnn_paper = newspaper.build('http://cnn.com')

url = 'http://news.163.com/17/0312/10/CFAP3Q9G000189FH.html'
a = Article(url, language='zh')  # Chinese
a.download()

a.parse()
a.nlp()  # nlp() is required to populate a.keywords (and a.summary); parse() alone leaves them empty

print(a.keywords)
print("===============")
print(a.title)
print("===============")
print(a.authors)
print("===============")
print(a.text[:150])

# filter_regex = re.compile(r'[^a-zA-Z0-9\ ]')
# title_text_h1 = "我与总书记议国是:建设社会稳定长治久安新AA边疆,总书记 社会稳定 全国人大代表00000ddd"
# filter_title_text_h1 = filter_regex.sub('', title_text_h1).lower()
Example #19
@author: hoavu
'''

import requests
import urllib.parse
import nltk
from newspaper.article import Article
from crawlerApp.utils import get_cosine, text_to_vector, normalize_text

'''get stop words first '''

with open ("stopwords_en.txt", "r") as myfile:
    stopwords=myfile.read().replace('\n', '')

normalized_url = 'http://www.nytimes.com/2015/05/25/science/john-nash-a-beautiful-mind-subject-and-nobel-winner-dies-at-86.html'
article1 = Article(normalized_url)
article1.download()

""" 0.62 is good threshold"""

normalized_url2 = 'http://abcnews.go.com/US/john-nash-beautiful-mind-mathematician-wife-killed-jersey/story?id=31268512'
article2 = Article(normalized_url2)
article2.download()

print("download finished")

article1.parse()
string1 = article1.text
article2.parse()
string2 = article2.text
Example #20
def getArticleTitleText(url):
    article = Article(url)
    article.download()
    article.html
    article.parse()
    return [article.title, article.text.encode('utf-8')]
Example #21
    #             try:
    #                 article_home = Article(home_url)
    #                 extract_vnexpress_article(article_home, True, 'expressing')
    #             except Exception as e:
    #                 print('Smt wrong when process homepage expressing  article:  {}'.format(e) + home_url)
    ''' we process homepage'''
    VNEXPRESS_HOMPAGE = 'http://vnexpress.net/'
    vnexpress_homepage = requests.get(VNEXPRESS_HOMPAGE)
    html_tree = html.fromstring(vnexpress_homepage.text)
    article_urls = html_tree.xpath('//a/@href')
    for home_url in article_urls:
        if home_url is not None and len(home_url) > 16:
            if ('http://' not in home_url and 'https://' not in home_url):
                home_url = VNEXPRESS_HOME + home_url
            try:
                article_home = Article(home_url)
                extract_vnexpress_article(article_home, True)
            except Exception as e:
                print(
                    'Something went wrong when processing homepage article: {} '.format(e) +
                    home_url)

    db_connect.close_database_cont()
except Exception as e:
    print('Something went wrong with database: {}'.format(e))
    '''
    =======================================================================================================================================
    =======================================================================================================================================
    ================================================= VNEpress stop ======================================================================
    =======================================================================================================================================
    =======================================================================================================================================
Example #22
from typing import List

import pytest
from newspaper.article import Article
from kindle_news_assistant.agent import Agent


@pytest.fixture
def agent():
    return Agent()


@pytest.mark.parametrize(
    "articles,expected_len",
    [
        (
            [
                Article("https://cnn.com/0/16/article-title.html",
                        "https://cnn.com"),
                Article(
                    "https://cnn.com/0/16/article-title.html?query=yes",
                    "https://cnn.com",
                ),
                Article("https://cnn.com/1/16/different-title.html",
                        "https://cnn.com"),
            ],
            2,
        ),
    ],
)
def test_filter_duplicates(agent: Agent, articles: List[Article],
                           expected_len: int):
    filtered = agent.filter_duplicates(articles)
    assert len(filtered) == expected_len
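The parametrized case expects the two cnn.com URLs that differ only by query string to collapse into one article. Nothing about the real Agent.filter_duplicates is shown; a minimal sketch of that idea, keyed on scheme, host and path, might be:

from urllib.parse import urlsplit

def dedupe_by_path(articles):
    seen = set()
    unique = []
    for a in articles:
        parts = urlsplit(a.url)
        key = (parts.scheme, parts.netloc, parts.path)   # ignore the query string
        if key not in seen:
            seen.add(key)
            unique.append(a)
    return unique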
Example #23
def crawl_today():
    """
    Used for the daily update
    """
    now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('\n', domain, now, "BEGIN", flush=True)
    while True:
        try:
            conn = connect(host=host,
                           user='******',
                           password='******',
                           port=3306,
                           db='chinaaseanocean')
        except OperationalError as e:
            print(e)
            time.sleep(3)
        else:
            break
    for key in class_.keys():
        class_1 = class_[key]
        # print(class_1)
        page = 1
        while page < 3:
            page_url = base_url + key + '&max_page=' + str(page)
            # print(page_url)
            failure = 0
            while failure < 3:
                try:
                    doc = pq(page_url, headers=headers, verify=False)
                except Exception as e:
                    failure += 1
                    print(e)
                else:
                    break
            else:
                continue
            a_list = doc('div.w3-justify a')
            for a in a_list.items():
                news_url = 'http://www.bernama.com/en/' + a.attr('href')
                article = Article(news_url)
                try:
                    article.download()
                    article.parse()
                except ArticleException as e:
                    print(e)
                    continue
                content = pattern.sub('', article.text).replace('\n', '')
                if content:
                    url = article.url
                    title = article.title
                    try:
                        date = '-'.join(
                            pattern.findall(article.html)[0].split('/')[::-1])
                    except Exception:
                        date = ''
                    # print(title, date, content, sep='\n')
                    cursor = conn.cursor()
                    sql = 'REPLACE INTO `asean_news` VALUES (%s)' % (','.join(
                        ['%s'] * 8))
                    cursor.execute(sql, ('Malaysia', domain, class_1, None,
                                         title, date, content, url))
                    conn.commit()
                    cursor.close()
            if len(a_list) < 7:
                break
            page = page + 1
    conn.close()
    now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print('\n', domain, now, "DONE")
Example #24
'''
Created on Feb 28, 2015

@author: hoavu
'''
from newspaper.article import Article

url = 'http://www.huffingtonpost.com/2015/02/27/jennifer-lawrence-david-o-russell_n_6772866.html'
article = Article(url)
article.download()
article.parse()
#print(article.html)
print(article.text)
Example #25
@author: hoavu
'''

import requests
import urllib.parse
import nltk
from newspaper.article import Article
from crawlerApp.utils import get_cosine, text_to_vector, normalize_text
'''get stop words first '''

with open("stopwords_en.txt", "r") as myfile:
    stopwords = myfile.read().replace('\n', '')

normalized_url = 'http://www.nytimes.com/2015/05/25/science/john-nash-a-beautiful-mind-subject-and-nobel-winner-dies-at-86.html'
article1 = Article(normalized_url)
article1.download()
""" 0.62 is good threshold"""

normalized_url2 = 'http://abcnews.go.com/US/john-nash-beautiful-mind-mathematician-wife-killed-jersey/story?id=31268512'
article2 = Article(normalized_url2)
article2.download()

print("download finished")

article1.parse()
string1 = article1.text
article2.parse()
string2 = article2.text

normalised_string1 = normalize_text(string1)
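The snippet stops mid-pipeline; presumably the second text is normalized the same way, both are vectorized, and get_cosine is compared against the 0.62 threshold mentioned above. A sketch of that final step, assuming the crawlerApp.utils helpers behave as their names suggest:

normalised_string2 = normalize_text(string2)
vector1 = text_to_vector(normalised_string1)
vector2 = text_to_vector(normalised_string2)
similarity = get_cosine(vector1, vector2)
print(similarity, similarity > 0.62)   # 0.62 is the suggested duplicate threshold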