Beispiel #1
0
def run_dateguesser(htmlstring):
    """Guess a date for ``htmlstring`` with date_guesser.

    ``guess_date`` is called with a fixed placeholder URL, so only the HTML
    content is passed in from the caller.

    Returns ``None`` when the guess carries no date; otherwise the result of
    ``convert_date`` applied to the guessed date with the format arguments
    ``'%Y-%m-%d %H:%M:%S'`` and ``'%Y-%m-%d'``.
    """
    result = guess_date(url='https://www.example.org/test/', html=htmlstring)
    if result.date is not None:
        return convert_date(result.date, '%Y-%m-%d %H:%M:%S', '%Y-%m-%d')
    return None
Beispiel #2
0
def test_guess_date():
    """Smoke test: ``guess_date`` yields an empty guess for this URL/HTML pair."""
    result = guess_date(
        'https://www.nytimes.com/opinion/catalonia-spain-puigdemont.html',
        '<could be anything></could>',
    )

    # Nothing should be detected: no date, no accuracy, no method.
    assert result.date is None
    assert result.accuracy is Accuracy.NONE
    assert result.method is NO_METHOD
Beispiel #3
0
async def get_article(session, url):
    """Download ``url`` through ``session`` and append a record to ``finalJSON``.

    On an HTTP 200 response the page is parsed with newspaper and the
    appended record holds ``status='ok'``, a ``last_updated`` timestamp and
    an ``article`` dict (source URL, publication-date guess, title, summary,
    keywords, authors, media, meta fields, detected languages and the
    ``pos``/``neu``/``neg`` sentiment scores of title + summary).
    On any other status an error record is appended instead.

    Args:
        session: Object whose ``get(url, timeout=...)`` is used as an async
            context manager yielding the response.
        url: Address of the article to process.

    Returns:
        The tuple ``('url parameter is required', 400)`` when ``url`` is
        ``None``; otherwise ``None`` (the result is delivered through
        ``finalJSON``).
    """
    if url is None:
        return 'url parameter is required', 400

    print(f'Processing URL: {url}')

    # NOTE(review): this semaphore is created anew on every call, so it never
    # limits concurrency across calls; share one instance if throttling is
    # actually intended.
    sem = asyncio.Semaphore(10)
    async with sem:
        # NOTE(review): total=0 presumably disables the overall timeout - confirm.
        async with session.get(url, timeout=ClientTimeout(total=0)) as response:
            content = await response.read()
            article = newspaper.Article(url)
            article.set_html(content)
            article_dict = {}
            article_dict['last_updated'] = datetime.now().strftime(
                "%d/%m/%Y %H:%M:%S")

            if response.status != 200:
                # The error record replaces the dict built above, so it
                # carries no 'last_updated' key (unchanged behaviour).
                finalJSON.append({
                    'status': 'error',
                    'article': 'An error occurred parsing this article',
                })
                return

            article.parse()
            article_dict['status'] = 'ok'

            details = {}
            article_dict['article'] = details
            details['source_url'] = article.source_url

            # Publication date: prefer guess_date, fall back to newspaper's
            # own publish_date if guessing fails for any reason.
            try:
                guess = guess_date(url=url, html=article.html)
                details['published'] = guess.date
                details['published_method_found'] = guess.method
                if guess.accuracy is Accuracy.PARTIAL:
                    accuracy_label = 'partial'
                elif guess.accuracy is Accuracy.DATE:
                    accuracy_label = 'date'
                elif guess.accuracy is Accuracy.DATETIME:
                    accuracy_label = 'datetime'
                else:
                    accuracy_label = None
                details['published_guess_accuracy'] = accuracy_label
            except Exception:
                # Not a bare except: task cancellation and interpreter exit
                # must not be swallowed here.
                details['published'] = article.publish_date
                details['published_method_found'] = None
                details['published_guess_accuracy'] = None

            article.nlp()

            details['title'] = article.title
            details['summary'] = article.summary
            details['keywords'] = list(article.keywords)
            details['authors'] = list(article.authors)

            sentiment_text = article.title + article.summary
            analyzer = SentimentIntensityAnalyzer()
            vs = analyzer.polarity_scores(sentiment_text)

            # Language detection is best-effort; a failure yields None.
            try:
                title_lang = detect(article.title)
            except Exception:
                title_lang = None

            try:
                text_lang = detect(article.text)
            except Exception:
                text_lang = None

            details['images'] = list(article.images)
            details['top_image'] = article.top_image
            details['meta_image'] = article.meta_img
            details['movies'] = list(article.movies)
            details['meta_keywords'] = list(article.meta_keywords)
            details['tags'] = list(article.tags)
            details['meta_description'] = article.meta_description
            details['meta_lang'] = article.meta_lang
            # str() turns a failed detection (None) into the text 'None';
            # kept as-is so the emitted records do not change.
            details['title_lang'] = str(title_lang)
            details['text_lang'] = str(text_lang)
            details['meta_favicon'] = article.meta_favicon
            details['url'] = str(url)
            details['positive'] = vs['pos']
            details['neutral'] = vs['neu']
            details['negative'] = vs['neg']
            finalJSON.append(article_dict)
def get_article(url: str = Query(
    ..., title="Article URL",
    description="The URL of the requested article.")):
    """Download and parse the article at ``url`` and return it as ``ArticleOut``.

    The response combines newspaper's parsed fields (title, text, authors,
    meta fields, tags, images) with a publication-date guess from
    ``guess_date`` and the detected languages of the title and the text.

    Args:
        url: Address of the requested article.

    Returns:
        An ``ArticleOut`` built from the collected fields.

    Raises:
        HTTPException: With status 404 when the download did not succeed.
    """
    article = Article(url)
    article.download()

    # NOTE(review): 2 is presumably newspaper's ArticleDownloadState.SUCCESS
    # - confirm against the library.
    if article.download_state != 2:
        raise HTTPException(status_code=404, detail="Article was not found")

    article.parse()

    # Article
    article_response = {}
    article_response['source_url'] = article.url
    article_response['title'] = article.title
    article_response['text'] = article.text
    article_response['authors'] = list(article.authors)

    # Date Detection: prefer guess_date, fall back to newspaper's own value.
    try:
        guess = guess_date(url=url, html=article.html)
        article_response['published_date'] = guess.date
        article_response['published_date_method'] = guess.method
        # Date Accuracy
        if guess.accuracy is Accuracy.PARTIAL:
            accuracy_label = 'partial'
        elif guess.accuracy is Accuracy.DATE:
            accuracy_label = 'date'
        elif guess.accuracy is Accuracy.DATETIME:
            accuracy_label = 'datetime'
        else:
            accuracy_label = None
        article_response['published_date_accuracy'] = accuracy_label
    except Exception:
        # The attribute is ``publish_date``; the previous ``published_date``
        # spelling would itself raise AttributeError inside this handler.
        article_response['published_date'] = article.publish_date
        article_response['published_date_method'] = None
        article_response['published_date_accuracy'] = None

    # Language Detection is best-effort; a failure yields None.
    try:
        article_response['title_lang'] = detect(article.title)
    except Exception:
        article_response['title_lang'] = None

    try:
        article_response['text_lang'] = detect(article.text)
    except Exception:
        article_response['text_lang'] = None

    # Meta / Other
    article_response['meta_description'] = article.meta_description
    article_response['meta_lang'] = article.meta_lang
    article_response['meta_favicon'] = article.meta_favicon
    article_response['meta_keywords'] = list(article.meta_keywords)
    article_response['tags'] = list(article.tags)

    # Images
    article_response['images'] = list(article.images)
    article_response['meta_image'] = article.meta_img
    article_response['top_image'] = article.top_image

    return ArticleOut(**article_response)
Beispiel #5
0
def dateGuesser(urlParam):
    """Fetch ``urlParam`` and return the ``guess_date`` result for that page.

    The page body is downloaded with ``requests.get`` and handed to
    ``guess_date`` together with the URL itself.
    """
    page_html = requests.get(urlParam).text
    return guess_date(url=urlParam, html=page_html)
Beispiel #6
0
def get_article():
    """Download the article named by the ``url`` query argument and return it as JSON.

    On a successful download the JSON body holds ``status='ok'`` and an
    ``article`` dict (source URL, publication-date guess, title, text,
    authors, media, meta fields and detected languages). When the download
    fails the body holds ``status='error'`` and the download exception
    message under ``article``.

    Returns:
        ``('url parameter is required', 400)`` when the ``url`` argument is
        missing; otherwise the ``jsonify`` response described above.
    """
    url = request.args.get('url', type=str)

    if url is None:
        return 'url parameter is required', 400

    article = Article(url)
    article.download()

    # NOTE(review): 2 is presumably newspaper's ArticleDownloadState.SUCCESS
    # - confirm against the library.
    if article.download_state != 2:
        return jsonify({
            'status': 'error',
            'article': article.download_exception_msg,
        })

    article.parse()
    article_dict = {}
    article_dict['status'] = 'ok'

    details = {}
    article_dict['article'] = details
    details['source_url'] = article.source_url

    # Publication date: prefer guess_date, fall back to newspaper's own
    # publish_date if guessing fails for any reason.
    try:
        guess = guess_date(url=url, html=article.html)
        details['published'] = guess.date
        details['published_method_found'] = guess.method
        if guess.accuracy is Accuracy.PARTIAL:
            accuracy_label = 'partial'
        elif guess.accuracy is Accuracy.DATE:
            accuracy_label = 'date'
        elif guess.accuracy is Accuracy.DATETIME:
            accuracy_label = 'datetime'
        else:
            accuracy_label = None
        details['published_guess_accuracy'] = accuracy_label
    except Exception:
        # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
        details['published'] = article.publish_date
        details['published_method_found'] = None
        details['published_guess_accuracy'] = None

    details['title'] = article.title
    details['text'] = article.text
    details['authors'] = list(article.authors)

    # Language detection is best-effort; a failure yields None.
    try:
        title_lang = detect(article.title)
    except Exception:
        title_lang = None

    try:
        text_lang = detect(article.text)
    except Exception:
        text_lang = None

    details['images'] = list(article.images)
    details['top_image'] = article.top_image
    details['meta_image'] = article.meta_img
    details['movies'] = list(article.movies)
    details['meta_keywords'] = list(article.meta_keywords)
    details['tags'] = list(article.tags)
    details['meta_description'] = article.meta_description
    details['meta_lang'] = article.meta_lang
    # str() turns a failed detection (None) into the text 'None'; kept as-is
    # so the response payload does not change.
    details['title_lang'] = str(title_lang)
    details['text_lang'] = str(text_lang)
    details['meta_favicon'] = article.meta_favicon
    return jsonify(article_dict)
Beispiel #7
0
def get_dates(article_html: str, url: str) -> tuple:
    """
    Given the html and the url of an article,
    return the publication date and the modification date in isoformat as a tuple.

    # format is (date_published_iso, date_modified_iso)

    ("2020-05-27T21:59:25+01:00", "2020-05-28T18:34:13+01:00")

    If either the publication date or the modification date cannot be found, it will be an
    empty string in the tuple.

    For instance, here is the result if the modification date was not found

    ("2020-05-27T21:59:25+01:00", "")

    How it works (each step only runs while no publication date has been found):

    1) Look for the dates in the website's json.

    2) If date not found, look for a year/month/day pattern in the url.
       A match that is not a real calendar date is ignored.

    3) If date still not found, look for date in html.

    4) If date still not found, use media cloud's dateguesser. A partial
       guess is truncated to "YYYY-MM".

    The modification date only ever comes from the json step.
    """
    # 1) json metadata
    datedict = datefind_json(article_html)
    pubtime = datedict.get("datePublished", '')
    modtime = datedict.get("dateModified", "")

    # 2) date embedded in the url; only a fallback, so a richer timestamp
    #    found in the json is never overwritten by a bare date
    if pubtime == "":
        url_date = re.search(
            r"(19|20)\d{2}[/\-_]"  # year
            r"[0-1]?\d[/\-_]"  # month
            r"[0-3]?\d",  # day
            url)

        if url_date is not None:
            year, month, day = (
                int(part) for part in re.split(r"[/\-_]", url_date.group()))
            try:
                pubtime = datetime.date(year, month, day).isoformat()
            except ValueError:
                # The pattern also accepts impossible values (month 0 or
                # 13-19, day 0 or 32-39); treat those as "no date in url".
                pass

    # 3) html; some news sources only expose the date this way
    if pubtime == "":
        pubtime = datefind_html(article_html, url)

    # 4) date-guesser
    if pubtime == "":
        guess = guess_date(url, article_html)
        if guess.accuracy in (Accuracy.DATE, Accuracy.DATETIME):
            pubtime = guess.date.isoformat()
        elif guess.accuracy == Accuracy.PARTIAL:
            # Keep only "YYYY-MM" of a partial guess.
            pubtime = guess.date.isoformat()[:7]

    return pubtime, modtime