def extract(self, url):
        try:
            html, after_url = self.extract_html(url)
            a = Article(url, language='zh')
            a.download()
            a.parse()
            try:
                row_t = str(a.publish_date)[0:16]
                create_time = self.extract_time_str(html, row_t)
            except Exception:
                create_time = 0
            if not a.title:
                a.title = ''
            # url = self.url_pattern.search(html).group(1)

            # split() uses the default separator (any whitespace)
            d_r = {
                'title': a.title,
                'article': a.text.split(),
                'html': html,
                'create_time': int(create_time),
                'url': after_url
            }
            return d_r
        except Exception:
            print('Extraction error!!!')
            print(url)
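The method above leans on two helpers of its enclosing class, extract_html(url) and extract_time_str(html, row_t), which are not shown in this excerpt. A minimal sketch of what that surrounding class might look like, with both helpers treated as assumptions inferred from how they are called:

# Illustrative stand-in for the enclosing class; the helper bodies are assumptions, not the original code.
from newspaper import Article


class NewsExtractor:
    def extract_html(self, url):
        """Assumed to fetch the page and return (raw_html, final_url_after_redirects)."""
        raise NotImplementedError

    def extract_time_str(self, html, row_t):
        """Assumed to normalise the rough publish-date string into a unix timestamp."""
        raise NotImplementedError

    # the extract() method shown above would be defined here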
Example #2
def getArticleInfo():
    post_data = literal_eval(request.data.decode('utf8'))
    country = post_data["country"]
    articleInfo = {}
    urls = getNewsUrls(country)
    count = 0
    goodCount = 0
    while count < len(urls):
        article = Article(urls[count])
        try:
            article.download()
            article.parse()
            if isinstance(article.publish_date, datetime):
                date = article.publish_date.strftime('%m/%d/%Y')
            else:
                date = article.publish_date
            authors = []
            for x in article.authors:
                if len(x.split(" ")) == 2:
                    authors.append(x)
            if not authors:
                authors.append("No Author")
            if date is None:
                date = "No Date"
            if article.top_image is None:
                article.top_image = "No imageURL"
            if article.title is None:
                article.title = "No title"
            if count != 0 and goodCount != 0 and urls[count] == articleInfo[
                    goodCount - 1]["url"]:
                print("Inside if statement")
                raise Exception
            articleInfo[goodCount] = {
                "authors": authors,
                "date": date,
                "url": urls[count],
                "imageURL": article.top_image,
                "title": article.title
            }
            count = count + 1
            goodCount = goodCount + 1
        except Exception as e:
            print(e)
            count = count + 1
            print("bad article")
    return articleInfo
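getArticleInfo() is a Flask view: it reads the POSTed body with literal_eval, walks the URLs returned by getNewsUrls(country) (a helper not shown here), and returns a dict of article metadata keyed by a running index. A hedged sketch of the imports and route registration it appears to assume; the endpoint path and app object are assumptions:

# Illustrative wiring only; the endpoint path and app object are assumptions.
from ast import literal_eval
from datetime import datetime

from flask import Flask, request
from newspaper import Article

app = Flask(__name__)
app.add_url_rule('/articleInfo', view_func=getArticleInfo, methods=['POST'])  # hypothetical endpoint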
Example #3
def home(url):

    data = {}
    data['url'] = url

    # Validate url
    if urlparse.urlparse(url).scheme not in ('http', 'https'):
        data['error'] = 'Invalid URL'
        return json.dumps(data)

    a = Article(url)
    a.download()
    a.parse()

    data['title'] = a.title
    data['authors'] = a.authors
    data['text'] = a.text

    try:
        a.nlp()
    except UnicodeDecodeError:
        # Strip non-ascii characters
        a.title = to_ascii(a.title)
        a.text = to_ascii(a.text)
        a.nlp()

    # NLP
    data['summary'] = a.summary
    data['keywords'] = a.keywords
    data['tags'] = list(a.tags)

    # Media
    data['top_image'] = a.top_image
    data['images'] = list(a.images)  # images is a set; convert it for JSON serialization
    data['movies'] = a.movies

    # Meta
    data['source_url'] = a.source_url
    data['published_date'] = str(a.publish_date) if a.publish_date else None  # Article exposes publish_date, a datetime

    data['meta_img'] = a.meta_img
    data['meta_keywords'] = a.meta_keywords
    data['meta_lang'] = a.meta_lang

    return json.dumps(data)
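home() relies on a to_ascii() helper and the Python 2-style urlparse import, neither of which appears in this excerpt. A minimal sketch of that support code, assuming to_ascii simply drops non-ASCII characters as the inline comment suggests:

# Assumed support code for home(); the names match the snippet, the bodies are guesses.
import json

try:
    import urlparse                        # Python 2 module layout used in the snippet
except ImportError:
    from urllib import parse as urlparse   # Python 3 equivalent

from newspaper import Article


def to_ascii(text):
    # drop every character that cannot be encoded as ASCII
    return text.encode('ascii', 'ignore').decode('ascii')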
Example #4
def ProcessPage(keyword, vBrowser, vNews_name, vNews_url, language):
    """
    Process search result page
    get articles and save them to a pandas dataframe (articles_page)
    (1) list results from page
    (2) loop over results, get article
    (3) return dataframe
    """

    # output: pandas dataframe with title, publishing date, article text and url
    articles_page = pd.DataFrame(
        columns=['title', 'publish_date', 'text', 'url'])

    # 1) list results
    search_result_page_source = vBrowser.page_source

    # make url regex-usable
    url_any = vNews_url
    url_any = re.sub(re.escape('?s=' + keyword), '', url_any)
    url_any = re.sub(re.escape('search?k=' + keyword), '', url_any)
    url_any = re.sub(r'\?m=[0-9]{6}', '', url_any)
    url_any = re.escape(url_any) + r'(?=\S*[-])([0-9a-zA-Z/.-]+)'
    regex = re.compile(url_any)
    logger.info('searching for {}'.format(url_any))
    search_results = list(
        set([
            match[0] for match in regex.finditer(search_result_page_source)
            if keyword in match[0].lower()
        ]))

    if vNews_name in ['NewVision']:
        regex = re.compile(
            r'/new_vision/news/(?=\S*[-])([0-9a-zA-Z/.-]+)')
        search_results = list(
            set([
                match[0] for match in regex.finditer(search_result_page_source)
                if keyword in match[0].lower()
            ]))
        search_results = [
            'https://www.newvision.co.ug' + search_result
            for search_result in search_results
        ]

    if len(search_results) > 0:
        logger.info("found {0} article(s):".format(len(search_results)))
        for title in search_results:
            logger.info("url: {0}".format(title))
    else:
        logger.info('no articles found')

    # 2) for each result, get article and save it
    for idx, search_result in enumerate(search_results):

        logger.info('processing {}'.format(search_result))
        # download article
        article = Article(search_result, keep_article_html=True)
        article.download()
        attempts = 0
        while (article.download_state != 2) and (attempts < 5):  # ArticleDownloadState.SUCCESS is 2
            attempts += 1
            time.sleep(1)
        if article.download_state != 2:
            logger.warning(
                'unable to download article: {}'.format(search_result))
            continue
        article.parse()

        article_html = str(article.html)

        # select articles with keyword
        regex = re.compile(keyword, re.IGNORECASE)

        if re.search(regex, article.html) is not None:

            logger.debug('{}'.format(article_html))

            # get date
            date = article.publish_date
            date_str = ""
            search_date = False

            if date is not None:
                # keep date found only if older than today
                if pd.to_datetime(date).date() < pd.to_datetime(
                        datetime.today()).date():
                    date_str = date.strftime(DATE_FORMAT)
                else:
                    search_date = True
            else:
                search_date = True

            if search_date:
                article_html = re.sub(r'\s+', ' ', article_html)
                dates_found = []

                res_date = [
                    re.compile(r'[a-zA-ZÀ-ÿ]\w+\s[0-9]+,\s[0-9]{4}'),
                    re.compile(r'[a-zA-ZÀ-ÿ]\w+\s[0-9]+\s[0-9]{4}'),
                    re.compile(r'[0-9]\w+\s[a-zA-ZÀ-ÿ]+,\s[0-9]{4}'),
                    re.compile(r'[0-9]\w+\s[a-zA-ZÀ-ÿ]+\s[0-9]{4}'),
                    re.compile(r'[0-9]+\s[a-zA-ZÀ-ÿ]+,\s[0-9]{4}'),
                    re.compile(r'[0-9]+\s[a-zA-ZÀ-ÿ]+\s[0-9]{4}'),
                    re.compile(r'[0-9]{2}/[0-9]{2}/[0-9]{4}'),
                    re.compile(r'[0-9]{2}-[0-9]{2}-[0-9]{4}'),
                    re.compile(r'[0-9]{2}\.[0-9]{2}\.[0-9]{4}')
                ]
                for re_date in res_date:
                    for match in re_date.finditer(article_html):
                        if is_date(match.group(), language):
                            dates_found.append((match.start(), match.group()))
                if len(dates_found) > 0:
                    logger.info('{}'.format(dates_found))
                    dates_found.sort(key=lambda tup: tup[0])
                    for res in dates_found:
                        try:
                            res_date = dateparser.parse(res[1],
                                                        languages=[language],
                                                        settings={
                                                            'DATE_ORDER': 'DMY'
                                                        }).date()
                            if (res_date < pd.to_datetime(
                                    datetime.today()).date() and res_date >
                                    pd.to_datetime('30/04/1993',
                                                   format="%d/%m/%Y").date()):
                                date_str = res_date.strftime(DATE_FORMAT)
                                break
                        except Exception:
                            pass

            if date_str == "":
                logger.warning(
                    'Publication date not found or wrongly assigned, skipping article'
                )
                continue
            else:
                logger.info('Publication date assigned: {}'.format(date_str))

            # Take newspaper name out of article title
            article.title = remove_newspaper_name_from_title(
                article.title, vNews_name)

            # if no text is present (e.g. only video), use title as text
            article_text = article.text
            if len(str(article.text)) == 0:
                article_text = article.title

            # add to dataframe
            logger.info('{0} : {1}'.format(article.title, date_str))
            articles_page.loc[idx] = [
                article.title, date_str, article_text, article.url
            ]

    # 3) return dataframe
    if len(search_results) > 0:
        logger.info('{}'.format(articles_page.head()))
    return articles_page
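ProcessPage() expects an already-loaded Selenium results page plus several module-level helpers that this excerpt does not define (logger, DATE_FORMAT, is_date, remove_newspaper_name_from_title). A hedged usage sketch; the driver, keyword and search URL below are illustrative assumptions:

# Illustrative call only; the site, keyword and driver choice are not part of the original code.
from selenium import webdriver

browser = webdriver.Chrome()
news_url = 'https://www.newvision.co.ug/search?k=flood'   # hypothetical search page
browser.get(news_url)
articles = ProcessPage('flood', browser, 'NewVision', news_url, 'en')
browser.quit()
print(articles.head())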
def ProcessPage(vBrowser, vArticles_all):
    """Process search result page
    get articles and save them to a pandas dataframe (articles_page)
    (1) list results from page
    (2) loop over results, get article and store it
    (3) return dataframe
    """

    # output: pandas dataframe with title, publishing date, article text and url
    articles_page = pd.DataFrame(
        columns=['title', 'publish_date', 'text', 'url'])

    # 1) list results
    search_result_page_source = vBrowser.page_source
    # for ZambianObserver *********************************
    regex = re.compile(
        r'https://www\.zambianobserver\.com/(?=\S*[-])([0-9a-zA-Z-]+)/')
    search_results = list(
        set([
            match[0] for match in regex.finditer(search_result_page_source)
            if "flood" in match[0].lower()
        ]))
    # for TimesOfZambia ***********************************
    # search_results_we = vBrowser.find_elements_by_class_name("readmore");
    # search_results = [search_result.get_attribute("href") for search_result in search_results_we]
    # for Lusakatimes ***********************************
    # search_results_we = vBrowser.find_elements_by_class_name("td-image-wrap");
    # regex_prefilter = re.compile(r'flood', re.IGNORECASE)
    # search_results = [search_result.get_attribute("href") for search_result in search_results_we if re.search(regex_prefilter, search_result.get_attribute("title")) is not None]
    # for ZambiaDailyMail ***********************************
    # regex = re.compile('http:\/\/www\.daily-mail\.co\.zm\/(?=\S*[-])([0-9a-zA-Z-]+)\/')
    # search_results = list(set([ match[0] for match in regex.finditer(search_result_page_source) if "flood" in match[0].lower()]))
    # for ZambianWatchdog ***********************************
    # regex = re.compile('https:\/\/www\.zambiawatchdog\.com\/(?=\S*[-])([0-9a-zA-Z-]+)\/')
    # search_results = list(set([ match[0] for match in regex.finditer(search_result_page_source) if "flood" in match[0].lower()]))

    if len(search_results) > 0:
        print("found article(s):")
        for title in search_results:
            print("url: {0}".format(title))

    # 2) for each result, get article and save it
    for idx, search_result in enumerate(search_results):

        # download article
        article = Article(search_result)
        article.download()
        while article.download_state != 2:  #ArticleDownloadState.SUCCESS is 2
            time.sleep(1)
        article.parse()

        # select articles with "flood"
        regex = re.compile(r'flood', re.IGNORECASE)

        if re.search(regex, article.title) is not None:

            # get date
            date = article.publish_date
            if date is not None:
                date_str = date.strftime('%m/%d/%Y')
            else:
                # for TimesOfZambia *******************************************
                # date_re = re.search('[a-zA-z]\w+\s[0-9][0-9]\,\s[0-9]{4}', article.html)
                # date_str = date_re[0]
                # for ZambiaDailyMail, LusakaTimes ****************************
                dates_all = [
                    m.group(0) for m in re.finditer(
                        r'[a-zA-Z]\w+\s[0-9]+,\s[0-9]{4}', article.html)
                ]
                if len(dates_all) > 1:
                    date_str = dates_all[1]
                else:
                    date_str = ""
                # for ZambianWatchdog *****************************************
                # dates_all = [m.group(0) for m in re.finditer(r'[a-zA-z]\w+\s[0-9]+\,\s[0-9]{4}', article.html)]
                # if len(dates_all) > 1:
                #     date_str = dates_all[0]
                # else:
                #     date_str = ""
                # *************************************************************

            # fix title, if necessary (only for LusakaTimes)
            article.title = re.sub('Zambia : ', '', article.title)

            # add to dataframe
            articles_page.loc[idx] = [
                article.title, date_str, article.text, article.url
            ]

            # print dataframe head, to check that things make sense
            if idx == 3:
                print(articles_page.head())

    # 3) return dataframe
    vArticles_all = pd.concat([vArticles_all, articles_page], ignore_index=True)
    return vArticles_all
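This Zambia-specific variant folds each page's results into the dataframe handed in by the caller. A hedged usage sketch, with the search URL and driver choice as assumptions:

# Illustrative call only; the URL and driver are assumptions.
import pandas as pd
from selenium import webdriver

articles_all = pd.DataFrame(columns=['title', 'publish_date', 'text', 'url'])
browser = webdriver.Chrome()
browser.get('https://www.zambianobserver.com/?s=flood')   # hypothetical search page
articles_all = ProcessPage(browser, articles_all)
browser.quit()
print(articles_all.head())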