Example #1
    def __init__(self, search_type):
        user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:24.0) Gecko/20100101 Firefox/24.0'
        self.config = Config()
        self.config.browser_user_agent = user_agent
        self.config.request_timeout = 120
        self.search_type = search_type
        self.prepare_input()
Example #2
def home(request):
    main_url = "http://newsapi.org/v2/top-headlines?country=in&pageSize=17&apiKey=813ecab326254348a7d1e8ef73d2838f"

    # fetch the top headlines as JSON
    open_news_page = requests.get(main_url).json()
    # get the list of articles
    article = open_news_page["articles"]
    results = []

    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    config = Config()
    config.browser_user_agent = user_agent
    for ar in article:
        article = Articles()

        intl = []
        intl.append(ar["title"])
        intl.append(ar["description"])
        intl.append(ar["urlToImage"])

        intl.append(ar["url"])
        toi_article = Article(intl[3], language="en", config=config)
        toi_article.download()

        # parse the article
        toi_article.parse()

        # perform natural language processing (NLP)
        toi_article.nlp()
        summ = toi_article.summary
        intl.append(summ)
        results.append(intl)
    content = {'article': results}
    return render(request, 'summnews/index.html', content)
Example #3
def handle(task, progress):
	url = task.url
	progress.set_status("Requesting page...")
	resp = http_downloader.page_text(url, json=False)
	if not resp:
		return False

	config = Config()
	config.memoize_articles = False
	config.verbose = False
	article = Article(url='', config=config)

	article.download()
	article.set_html(resp)
	article.parse()
	if not article.top_image:
		return None

	src = article.top_image
	if 'http' not in src:
		if 'https' in url:
			src = 'https://' + src.lstrip('/ ').strip()
		else:
			src = 'http://' + src.lstrip('/ ').strip()

	progress.set_status("Downloading image...")

	return http_downloader.download_binary(src, task.file, prog=progress, handler_id=tag)
Example #4
def _config():
    config = Config()
    config.fetch_images = False
    config.memoize_articles = False
    config.request_timeout = 10
    config.language = 'en'
    return config
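A minimal usage sketch for the _config() helper above, assuming newspaper3k is installed; the URL below is a placeholder and not part of the original snippet.

from newspaper import Article

# reuse the shared Config for each download (hypothetical URL)
article = Article('https://example.com/some-story', config=_config())
article.download()
article.parse()
print(article.title)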
Example #5
    def __init__(self, url):
        self.url = url
        self.config = Config()
        self.config.memoize_articles = False
        #self.config.fetch_images = False
        self.config.browser_user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.35 Safari/537.36'
        self.config.request_timeout = 30
   
        #logger.info('url is %s', url)
        self.article = Article(url, config=self.config)
        self._import_article()

        self.results = {}
        self.results['sourceurl'] = url
        try:
            self.results['title'] = self.article.title
            self.results['authors'] = self.article.authors
            self.results['canonical_link'] = self.article.canonical_link
            self.results['url'] = self.article.url
            self.results['top_image'] = self.article.top_image
            self.results['images'] = list(self.article.images)
            self.results['movies'] = self.article.movies
            self.results['description'] = self.article.meta_description
            #self.results['meta_favicon'] = self.article.meta_favicon
            self.results['keywords'] = self.article.meta_keywords
            self.results['lang'] = self.article.meta_lang
            self.results['summary'] = self.article.summary
            self.results['tags'] = list(self.article.tags)
            self.results['text'] = self.article.text
        except Exception:
            logger.fatal('Unable to make results for: %s, %s', url, self.results)
Example #6
File: luna.py Project: 5l1v3r1/luna
def get_content_dates(link):
    global keywords
    global dates
    config = Config()
    config.browser_user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    config.memoize_articles = True
    config.fetch_images = False
    config.https_success_only = False

    article = Article(url=link, config=config)
    sleep(0.1)
    article.download()
    if article.download_state == 1:  # has the download failed? e.g. due to a 403
        print(link)
        print("\033[101m\033[1m\033[!] Download failed\033[0m")
        return None
    else:
        article.parse()
        article.nlp()
        if (str(article.publish_date).split(" "))[0] not in dates:
            return None
        else:
            if relevancy_check(link, article.text, article.keywords) == False:
                return None
            else:
                if len(article.text) < 200:  #too short?
                    return None
                else:
                    print(link)
                    print(article.title)
                    print(article.publish_date)
                    print("=" * 80)
                    keywords = keywords + article.keywords
                    return (article.text + "\n=====DATE:" +
                            str(article.publish_date))
Example #7
def retrieveUrlText(url):
    try:
        config = Config()
        config.request_timeout = 1000
        config.memoize_articles = False
        config.fetch_images = False
        config.browser_user_agent = 'Mozilla/5.0'
        article = Article(url, config=config)
        article.download(recursion_counter=5)
        if article.download_state != 2:
            return ''
        article.parse()
        articleText = article.text.replace('\n', ' ')
    except KeyboardInterrupt:
        raise
    except Exception:
        return ''
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set([
        'dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'et', 'al', 'fig', 'figs',
        'chem', 'ph'
    ])
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    articleSentences = validateSentences(
        sentence_splitter.tokenize(articleText))
    return articleSentences
Example #8
def get_text(url):
    """Given URL of the article extract text from it

    Arguments:
        url {str} -- url of the article

    Returns:
        str -- Text of the article
    """
    global CNT
    global NAN
    if (CNT % 100 == 0):
        print(CNT)
    config = Config()
    config.keep_article_html = True
    try:
        article = Article(url, config=config)
        article.download()
        article.parse()
        return article.text
    except Exception as e:
        print(e)
        NAN += 1
        return np.nan
    finally:
        CNT += 1
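Since get_text() returns np.nan on failure and tracks progress through the CNT and NAN globals, it is presumably applied over a pandas column; a hedged sketch of that usage, where the DataFrame df and its 'url' column are assumptions.

import numpy as np
import pandas as pd

CNT, NAN = 0, 0  # module-level counters read by get_text()
df = pd.DataFrame({'url': ['https://example.com/a-story']})  # placeholder data
df['text'] = df['url'].apply(get_text)
print(f'processed: {CNT}, failed: {NAN}')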
Example #9
def fetch_by_newspaper(url):
    is_linkedin_url = url.startswith(linkedinUrl)
    if is_linkedin_url:
        config = Config()
        config.MAX_TITLE = 1000
        article = get_article(url, config)
        article = replace_title_text_from_title_url(article)
    else:
        article = get_article(url)

    return json.dumps({
        "authors": article.authors,
        "html": article.html,
        "images": list(article.images),
        "movies": article.movies,
        "publish_date": article.publish_date.strftime("%s") if article.publish_date else None,
        "text": article.text,
        "title": article.title,
        "topimage": article.top_image
    }), 200, {'Content-Type': 'application/json'}
Example #10
def filterNews(url, keyword):
    """
    对新闻进行筛选,通过关键词,以新闻标题为对象筛查
    通过user_agent和timeout来解决访问超时问题
    https://stackoverflow.com/questions/63061172/newspaper3k-api-article-download-failed-with-httpsconnectionpool-port-443-read
    :param url:
    :param keyword:
    :return:
    """
    user_agent = r"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
    r"Chrome/87.0.4280.67 Safari/537.36 Edg/87.0.664.47 "
    config = Config()
    config.browser_user_agent = user_agent
    config.request_timeout = 30
    # Try-Except-Continue will skip to the next article in the For loop if there is an exception
    # https://github.com/codelucas/newspaper/issues/444  solve the bugs for not found url
    try:
        news = newspaper.Article(url, config=config)
        news.download()
        news.parse()
        for word in keyword:
            if word in news.title:
                # print(word)
                return True
        return False
    except ArticleException:
        return False
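A hedged usage sketch for filterNews(); the URLs and keyword list below are placeholders.

urls = ['https://example.com/markets-rally', 'https://example.com/weather-update']
keywords = ['market', 'stocks']
# keep only the articles whose title mentions one of the keywords
matched = [u for u in urls if filterNews(u, keywords)]
print(matched)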
Example #11
def cache_art(url):
    headers = {
        'User-Agent':
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0",
        'From': '*****@*****.**'
    }
    r = requests.get(url, headers=headers)

    from newspaper import Config
    from newspaper import Article

    config = Config()
    config.browser_user_agent = headers['User-Agent']
    article = Article(url, keep_article_html=True, config=config)
    article.download()
    # article = Article(url='')
    # article.set_html(str(soup))
    article.parse()
    struct = {
        'html': article.article_html,
        'author': article.authors,
        'pubdate': article.publish_date,
        'text': article.text,
        'image': article.top_image,
        'rawhtml': str(r.content),
        'buzotext': scrape(r.content, 'html5lib')
    }
    return struct
Example #12
def get_top_news(news_information, todays_date):
    load_dotenv(find_dotenv(r"path to env variable"))
    news_api_key = os.getenv("news_api")
    params = (
        ('country', 'us'),
        ('apiKey', f'{news_api_key}'),
    )
    response = requests.get(
        """https://newsapi.org/v2/top-headlines""",
        params=params)
    news_articles = response.json()['articles']
    for i in news_articles:
        user_agent = """Mozilla/5.0 (Macintosh;
        Intel Mac OS X 10.15; rv:78.0)
        Gecko/20100101 Firefox/78.0"""
        config = Config()
        config.browser_user_agent = user_agent
        config.request_timeout = 10
        article = Article(i['url'], config=config)
        print(i["url"])
        article.download()
        article.parse()
        article.nlp()
        summary = article.summary  # noqa
        news_information[i["source"]["name"]] = {"title": i["title"], "summary": summary, "url": i["url"], "date": f"{todays_date}"}  # noqa
Example #13
def get_article_text(url: str) -> str:
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.54'
    }
    text = ''
    try:
        config = Config()
        config.browser_user_agent = headers['User-Agent']
        article = Article(url,
                          keep_article_html=True,
                          fetch_images=False,
                          config=config)
        article.download()
        article.parse()
        # article.nlp()
        text = article.text
        meta = {
            "source": "web_article",
            "source_url": article.source_url,
            "article_html": article.article_html,
            "title": article.title,
            "top_image": article.top_image,
            "images": article.images,
            "videos": article.movies,
            "meta_language": article.meta_lang,
            "meta_keywords": article.meta_keywords,
            "authors": article.authors,
            "publish_date": article.publish_date,
        }
        text = clean(text)
    except Exception as e:
        msg = f"Error using newspaper3k extractor: {str(e)}"
        print(msg)
    return text
Example #14
def download_article(url):
    config = Config()
    config.follow_meta_refresh = True
    config.keep_article_html = True
    article = Article(url, config=config)
    article.download()
    return article
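download_article() only downloads the page, so the caller still has to parse it; a short sketch under that assumption, using a placeholder URL.

article = download_article('https://example.com/a-story')
article.parse()
print(article.title)
print(article.article_html[:200])  # populated because keep_article_html=True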
Example #15
def getPolarity(uniName):
    from GoogleNews import GoogleNews
    from newspaper import Article
    from newspaper import Config
    import pandas as pd
    from textblob import TextBlob

    uniName = uniName + ' Coronavirus'
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    config = Config()
    config.browser_user_agent = user_agent
    googlenews = GoogleNews(start='08/01/2020', end='09/26/2020')
    googlenews.search(uniName)
    result = googlenews.result()

    for i in range(0, 5):
        googlenews.getpage(i)
        result = googlenews.result()
    df = pd.DataFrame(result)
    sum = 0
    counter = 1
    for ind in df.index:
        try:
            article = Article(df['link'][ind], config=config)
            article.download()
            article.parse()
            article.nlp()
            testimonial = TextBlob(article.summary)
            counter += 1
            sum += testimonial.sentiment.polarity
        except:
            pass

    return sum / counter
Example #16
def cache_art(url):
    headers = {
        'User-Agent':
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0",
        'From': '*****@*****.**'
    }

    from newspaper import Config
    from newspaper import Article

    config = Config()
    config.browser_user_agent = headers['User-Agent']
    article = Article(url, keep_article_html=True, config=config)
    article.download()

    article.parse()
    struct = {
        'html': article.article_html,
        'link': url,
        'author': article.authors,
        'pubdate': article.publish_date,
        'text': article.text,
        'title': article.title,
        'image': article.top_image,
        'description': article.meta_description
    }

    return struct
Example #17
    def __init__(self,
                 output_dir,
                 dataframe,
                 num_rows=None,
                 user_agent=default_user_agent) -> None:
        self.output_dir = output_dir

        temp_df = dataframe
        if num_rows:
            self.df = temp_df.iloc[:num_rows]
        else:
            self.df = temp_df

        self.user_agent = user_agent
        self.config = Config()
        self.config.browser_user_agent = user_agent

        self.download_logs_dict = {
            "link": [],
            "download_state": [],
            "download_exception_msg": []
        }

        self.number_of_downloaded = 0

        self.preprocess_df()
Example #18
    def extract(self, request: ExtractorRequest) -> ExtractorResponse:
        try:
            user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
            config = Config()
            config.browser_user_agent = user_agent
            article = Article(
                request.url, keep_article_html=True, fetch_images=False, config=config
            )
            article.download()
            article.parse()
            # article.nlp()

            text = article.text
            meta = {
                "source": "web_article",
                "source_url": article.source_url,
                "article_html": article.article_html,
                "title": article.title,
                "top_image": article.top_image,
                "images": article.images,
                "videos": article.movies,
                "meta_language": article.meta_lang,
                "meta_keywords": article.meta_keywords,
                "authors": article.authors,
                "publish_date": article.publish_date,
            }
            # construct response
            response = ExtractorResponse(meta=meta, text=text or "")
        except Exception as e:
            msg = f"Error using newspaper3k extractor: {str(e)}"
            log.error(msg)
            response = ExtractorResponse(error=msg)

        return response
Example #19
    def process_link(self, link=None, nlp=False):
        """Processes the links obtained by the get_links() method, extracting
        both the text and a summary of the article with the newspaper package.

        Args:
            link :: str
                URL of links stored in the dictionary returned by get_links()
            nlp :: bool
                Whether or not to perform NLP on the text of the link. This extracts
                a summary of the text, but is a somewhat expensive operation.

        Returns:
            article :: 'article' object
                object that contains parsed properties for the link, such as
                summary, text and date.
        """

        #parameters for the processing
        config = Config()
        config.fetch_images = False  #no need for images
        config.memoize_articles = False  #no need for article caching

        try:
            article = Article(link, language="en", config=config)
            article.download()
            article.parse()
            if nlp:
                article.nlp()  #extract summary as per the newspaper API
        except:
            return None

        return article
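A hedged usage sketch for process_link(); 'scraper' stands in for an instance of the surrounding class and the link is a placeholder.

article = scraper.process_link(link='https://example.com/a-story', nlp=True)
if article is not None:
    print(article.title)
    print(article.summary)  # only populated when nlp=True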
Example #20
def subjectivity():
    """
    Graphs the posts based on the relationship between subjectivity and popularity
    :return: a graph with the x axis being the subjectivity value, and y axis being upvotes
    """
    scores = []
    for index, row in topics_data.iterrows():
        if index in actual_list:
            scores.append(row['score'])

    subs = []
    for index, row in topics_data.iterrows():
        if index in actual_list:
            url = row['url']
        if 'newsweek' in url or 'democracynow' in url:
            user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
            config = Config()
            config.browser_user_agent = user_agent
            article = Article(url, config=config)
        else:
            article = Article(url)
        article.download()
        article.parse()
        article.nlp()
        text = article.summary
        obj = TextBlob(text)
        subjectivity = obj.sentiment.subjectivity
        subs.append(subjectivity)

    plt.figure(figsize=(50, 10))
    plt.scatter(subs, scores)
    plt.xlabel('Subjectivity')
    plt.ylabel('Score')
    plt.title('Posts in r/politics')
    plt.show()
Example #21
def fetch_and_format(url, fetch_img=True):
    cfg = Config()
    cfg.keep_article_html = True
    cfg.browser_user_agent = USERAGENT
    cfg.drop_text_node = lambda x: x in ('', 'Advertisement', 'advertisement')

    def transform_img(i):
        try:
            i.attrib['src'] = fetch_image_to_b64(i.attrib['src'])
        except Exception as e:
            # print(e)
            pass

    if fetch_img:
        cfg.element_transformers['img'] = transform_img

    def transform_picture(i):
        try:
            img = i.find('img')
            if img is not None:
                i.tag = 'img'
                for k, v in img.items():
                    i.attrib[k] = v
                if 'src' in i.attrib:
                    i.attrib['src'] = fetch_image_to_b64(i.attrib['src'])
        except Exception as e:
            # print(e)
            pass

    if fetch_img:
        cfg.element_transformers['picture'] = transform_picture

    art = Article(url, config=cfg)
    art.download()
    art.parse()
    title = decode_to_entity(art.title)
    # TODO: Newspaper3k parses the same author multiple times. Fix!
    author = art.authors[0] if art.authors else ''
    publish_date = art.publish_date.strftime(
        '%B %d %Y') if art.publish_date else ''
    source = tldextract.extract(
        url).registered_domain  # Is there a better source for this?
    text = art.article_html
    doc = f'''<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"><html>
    <head>
    <title>{title}</title>
    <META http-equiv="Content-Type" content="text/html; charset=utf-8">
    <META name="author" content="{author}">
    <META name="title" content="{title}">
    </head>
    <body><div>
    <h1>{title}</h1>
    <h3>{author}</h3>
    <h4><a href='{url}'>{source}</a></h4>
    <h4>{publish_date}</h4>
    {text}
    </div></body>
    </html>'''
    return title, doc
Example #22
def GetArticle(FeedLink, request):
    user_agent = request.META['HTTP_USER_AGENT']
    config = Config()
    config.browser_user_agent = user_agent
    newarticle = Article(FeedLink, config=config)
    newarticle.download()
    newarticle.parse()
    return newarticle
Example #23
    def _getArticleObj(self):
        config = Config()
        config.keep_article_html = True
        article = Article(self.url, config=config)
        article.download()
        article.parse()
        article.nlp()
        return article
Example #24
def make_news_table(db, articles, logger):
    class News(db.Model):
        uuid = db.Column(UUIDType(), primary_key=True)
        url = db.Column(db.Text, nullable=False)
        title = db.Column(db.Text, nullable=False)
        picture = db.Column(URLType, nullable=True)
        icon = db.Column(URLType, nullable=True)
        body = db.Column(db.Text, nullable=False)
        publish_time = db.Column(db.DateTime, nullable=False)

        __tablename__ = 'News'

        def __repr__(self):
            return f'<News: {self.uuid}>'

    db.create_all()
    db.session.commit()

    urls = tuple(
        set(articles) - set(a[0]
                            for a in News.query.with_entities(News.url).all()))
    if len(urls) > 0:
        from newspaper import Article, Config
        from newspaper.mthreading import NewsPool
        from uuid import uuid4
        config = Config()
        config.memoize_articles = False
        config.fetch_images = True
        config.request_timeout = 20
        config.thread_request_timeout = 20
        config.follow_meta_refresh = True
        config.MIN_WORD_COUNT = 150
        config.MIN_SENT_COUNT = 4
        config.browser_user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
        news_pool = NewsPool(config=config)
        articles = tuple(Article(url, config=config) for url in urls)
        news_pool.set(articles, override_threads=4)
        logger.info(f"Articles being downloaded: {urls}")
        news_pool.join()
        for article in articles:
            logger.debug(f"Processing: {article.url}")
            if article.download_state == 1:
                logger.warn(f'Failed to download: {article.url}')
                continue
            article.parse()
            try:
                db.session.add(
                    News(uuid=uuid4(),
                         url=article.url,
                         title=article.title,
                         picture=article.top_image,
                         icon=article.meta_favicon,
                         body=article.text,
                         publish_time=article.publish_date))
            except Exception as e:
                logger.warn(e)
        db.session.commit()
    return News
Example #25
def parse_submission(url):
    '''
    Input: HTTP URL
    Output: List of keywords compiled from names found in the article title and keywords found in the article text
    '''
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    config = Config()
    config.browser_user_agent = user_agent

    a = Article(url, config=config)
    try:
        a.download()
        a.parse()
    except Exception as e:
        logging.error(e)
        return

    a.nlp()
    keywords = a.keywords
    title = a.title
    content = a.text
    thumbnail_image = get_thumbnail_image(a.html)

    #clean up content
    content = to_lower(content)
    content = strip_numbers(content)

    #clean up title
    title = to_lower(title)
    title = strip_numbers(title)

    #get names and identities from content text
    content_names = get_names(''.join(content))

    top_keywords = []
    for name in content_names:
        if name in title:
            #if the name is in the title, it is significant
            top_keywords.append(name)

    #Find other keywords that are not names and are found in the title
    for keyword in keywords:
        if keyword in title and keyword not in stop_words and not keyword.isdigit():
            top_keywords.append(keyword)

    if not top_keywords:
        top_keywords = title.split(" ")
    else:
        #remove duplicates
        top_keywords = list(dict.fromkeys(top_keywords))
        top_keywords = remove_duplicates(top_keywords)

    return {
        'keywords': top_keywords,
        'title': a.title,
        'thumbnail_url': thumbnail_image
    }
Example #26
def load_and_parse_full_article_text_and_image(url: str) -> Article:
    config = Config()
    config.MAX_SUMMARY_SENT = 8

    article = Article(url, config=config)
    article.set_html(load_page_safe(url))  # safer than article.download()
    article.parse()

    return article
Example #27
def handle(url, data, log):
    try:
        log.out(0, 'Downloading article...')
        resp = requests.get(url, headers={'User-Agent': data['user_agent']})
        if resp.status_code != 200:
            return False  #!cover

        config = Config()
        config.memoize_articles = False
        config.verbose = False
        article = Article(url='', config=config)
        log.out(0, 'Parsing article...')

        article.download()
        article.set_html(resp.text)
        article.parse()
        if article.top_image:
            src = article.top_image
            if 'http' not in src:  #!cover
                if 'https' in url:
                    src = 'https://' + src.lstrip('/ ').strip()
                else:
                    src = 'http://' + src.lstrip('/ ').strip()
            log.out(0, 'Newspaper located image: %s' % src)

            r = requests.get(src,
                             headers={'User-Agent': data['user_agent']},
                             stream=True)
            if r.status_code == 200:
                content_type = r.headers['content-type']
                ext = mimetypes.guess_extension(content_type)
                if not ext or ext == '':  #!cover
                    log.out(
                        1, 'NewsPaper Error locating file MIME Type: %s' % url)
                    return False
                if '.jp' in ext:
                    ext = '.jpg'  #!cover
                path = data['single_file'] % ext
                if not os.path.isfile(path):
                    if not os.path.isdir(data['parent_dir']):  #!cover
                        log.out(1, ("+Building dir: %s" % data['parent_dir']))
                        os.makedirs(
                            data['parent_dir']
                        )  # Parent dir for the full filepath is supplied already.
                    with open(path, 'wb') as f:
                        r.raw.decode_content = True
                        shutil.copyfileobj(r.raw, f)
                return path
            else:  #!cover
                log.out(
                    0, ('\t\tError Reading Image: %s responded with code %i!' %
                        (url, r.status_code)))
                return False
    except Exception as e:
        log.out(0, ('"Newspaper" Generic handler failed. ' + (str(e).strip())))
    return False  #!cover
Example #28
def get_article(url):
    config = Config()
    config.request_timeout = 1.3
    article = Article(url, language='zh', config=config)
    try:
        article.download()
        article.parse()
        # article.nlp()
    except Exception:
        return
    return article.text
Example #29
def getConf(language, parser):
    '''
    get newspaper Config object
    :return:
    '''
    conf = Config()
    conf.fetch_images = False
    conf.memoize_articles = False
    conf.set_language(language)
    conf.parser_class = parser
    return conf
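A hedged sketch of how getConf() might feed newspaper.build(); the source URL and the 'lxml' parser name are assumptions.

import newspaper

conf = getConf('en', 'lxml')
paper = newspaper.build('https://example.com', config=conf)
for article_url in paper.article_urls():
    print(article_url)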
Example #30
    def parse(self, response):

        newsItem = CNNItem()
        newsItem['title'] = uniToAscii(
            response.selector.xpath(
                '//meta[@property="og:title"]/@content').extract_first())
        newsItem['author'] = uniToAscii(
            response.selector.xpath(
                '//meta[@name="author"]/@content').extract_first())
        newsItem['link'] = response.url

        print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', newsItem['title'])

        text = ""
        pars = response.selector.xpath(
            '//*[@class="zn-body__paragraph"]//text()').extract()
        for par in pars:
            par_text = uniToAscii(par)
            if par_text[-1:] != '.':
                par_text += '.'
            text = text + par_text + " "
        newsItem['text'] = text

        keywords = uniToAscii(
            response.xpath(
                "//meta[@name='keywords']/@content").extract_first())
        keywords_list = keywords.split(',')
        newsItem['keywords'] = [word.strip() for word in keywords_list]

        print('^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n', newsItem['title'], ' ', len(pars), ' ', newsItem['keywords'])
        if newsItem['text']:
            print(newsItem['text'][:100])

        yield newsItem

        config = Config()
        config.fetch_images = False
        config.memoize_articles = False

        url_build = newspaper.build(url=response.url, config=config)

        print(response.url)

        storyLinks = [article_url for article_url in url_build.article_urls()]

        print(storyLinks)

        for link in storyLinks:
            if re.match(r'.*/2017/03/0.*\.html', str(link)):
                print('Fetching from ', str(link))
                yield scrapy.Request(str(link))