Example #1
0
def parse_url():
    """Handle a URL-based analysis request.

    Reads 'url' from the Flask request query string, preprocesses it into
    news/twitter objects, runs the models, and returns a JSON response via
    return_result. All failures are reported as JSON error payloads rather
    than propagated exceptions.
    """
    print("Got request", request.args)
    url = request.args.get('url', None)
    print(url)

    # No URL supplied -> report the error immediately (no need to
    # raise-and-catch just to build the error response).
    if url is None:
        return return_result(ApplicationError(*error_list["URL_NT_FND"]))

    news_obj, twitter_obj, error = preprocessor(url, published=True)
    if error is not None:
        return return_result(error)
    # Reject articles too short to analyse meaningfully.
    if len(news_obj.content.split(' ')) < MIN_CONTENT_LEN:
        return return_result(ApplicationError(*error_list["CONTENT_TOO_SHORT"]))

    aggregator = Aggregator(news=news_obj, tweet=twitter_obj, is_twitter=twitter_obj is not None)
    try:
        aggregator.run_models()
    except ApplicationError as model_error:
        return return_result(model_error)

    # Success: error is None at this point, which signals success to return_result.
    return return_result(None, True, aggregator, twitter_obj, news_obj)
Example #2
0
def parse_file():
    """Handle a file-upload analysis request.

    Expects a 'file' entry in the multipart form data whose name ends in
    doc/docx, preprocesses its contents into a news object, runs the models,
    and returns a JSON response via return_result. All failures are reported
    as JSON error payloads.
    """
    print("Got request", request.args)

    # Validate the upload before reading it into memory (guard clauses
    # replace the original raise-and-catch pyramid; behaviour unchanged).
    if 'file' not in request.files:
        return return_result(ApplicationError(*error_list["FILE_NT_FND"]))
    filest = request.files['file']
    # NOTE: matches any filename *ending* in doc/docx (no dot required),
    # preserving the original acceptance behaviour.
    if not filest.filename.endswith(('doc', 'docx')):
        return return_result(ApplicationError(*error_list["FILE_NT_SUP"]))
    file_obj = io.BytesIO(filest.read())

    news_obj, twitter_obj, error = preprocessor(file_obj, published=False)
    if error is not None:
        return return_result(error)
    # Reject documents too short to analyse meaningfully.
    if len(news_obj.content.split(' ')) < MIN_CONTENT_LEN:
        return return_result(ApplicationError(*error_list["CONTENT_TOO_SHORT"]))

    aggregator = Aggregator(news=news_obj, tweet=twitter_obj, is_twitter=False)
    try:
        aggregator.run_models()
    except ApplicationError as model_error:
        return return_result(model_error)

    # Success: error is None at this point, which signals success to return_result.
    return return_result(None, False, aggregator, twitter_obj, news_obj)
Example #3
0
def get_news_from_file(path: str) -> "tuple[NewsObject | None, ApplicationError | None]":
    """Load a news article from an uploaded (word) file.

    :param path: Path or file-like object accepted by NewsObject.
    :return: (news, None) on success, (None, error) on failure.
    """
    # The original annotation `[NewsObject, ApplicationError]` was a list
    # literal, not a type; a string tuple annotation is correct and has no
    # runtime cost.
    try:
        word_file = NewsObject(path)
        word_file.fetch_from_file()
        # NewsObject signals fetch failure via its .error attribute.
        if word_file.error:
            raise ApplicationError(*error_list["UNBL_FTCH_NEWS"])
        return word_file, None
    except ApplicationError as err:
        return None, err
Example #4
0
def get_news_from_url(url: str) -> "tuple[NewsObject | None, ApplicationError | None]":
    """Fetch a news article from a web URL.

    :param url: Article URL (already validated/whitelisted by the caller).
    :return: (news, None) on success, (None, error) on failure.
    """
    # The original annotation `[NewsObject, ApplicationError]` was a list
    # literal, not a type; a string tuple annotation is correct and has no
    # runtime cost.
    try:
        news = NewsObject(url)
        news.fetch_from_url()
        # NewsObject signals fetch failure via its .error attribute.
        if news.error:
            raise ApplicationError(*error_list["UNBL_FTCH_NEWS"])
        return news, None
    except ApplicationError as err:
        return None, err
Example #5
0
def is_whitelisted_url(url):
    """
    Parse a given url to determine whether it is a valid url and whether the source is supported.

    :param url: A url given by user.
    :return: Return the source (the registered domain, e.g. "twitter").
    :raises ApplicationError: MAL_URL if the url is malformed, UNSUP_SRC if
        the source is not in the whitelist.
    """
    if not validators.url(url):
        print('invalid url')
        raise ApplicationError(*error_list["MAL_URL"])

    # tldextract splits subdomain/domain/suffix; the domain names the source.
    _, source, _ = tldextract.extract(url)
    if source not in supported_sources:
        print('unsupported source')
        raise ApplicationError(*error_list["UNSUP_SRC"])
    return source
Example #6
0
def return_result(error: ApplicationError, published=None, aggregator=None, tweet=None, news_obj=None):
    """Build the JSON response for a request.

    When error is set, the payload carries only the serialized error;
    otherwise it carries the model outputs, article details and tweet
    metrics, with input_type describing where the input came from.
    """
    if error is not None:
        return jsonify({"error": error.to_dict()})

    def serialize(obj):
        # Convert a project object to a dict, tolerating missing pieces.
        return obj.to_dict() if obj is not None else None

    if published:
        input_type = 'Twitter' if tweet is not None else "NonTwitter"
    else:
        input_type = "UnPub"

    return jsonify({
        "input_type": input_type,
        "models": serialize(aggregator),
        "details": serialize(news_obj),
        "metrics": serialize(tweet),
        "error": ""
    })
Example #7
0
def preprocessor(url, published):
    """
    Turn user input into the objects the downstream models consume.

    :param url: A web URL string (when published is True) or a file-like
        object (when published is False — the name is historical).
    :param published: True when the input is a published web article/tweet.
    :returns: Tuple (news, tweet, error). On any failure, (None, None, error);
        tweet is only non-None for Twitter URLs.
    """
    news, tweet, error = None, None, None
    # article is published
    if published:
        # Normalise bare domains to https:// so validation below accepts them.
        if url is not None and not url.startswith("https://"):
            url = "https://" + url

        try:
            source = is_whitelisted_url(url)
        except ApplicationError as error:
            return None, None, error

        # if the url is from twitter
        if source == "twitter":
            tweet, error = get_tweet(url)
            if error is not None:
                return None, None, error
            # check expanded url to make sure it is supported
            try:
                is_whitelisted_url(tweet.expanded_url)
                news, error = get_news_from_url(tweet.expanded_url)
            except ApplicationError as error:
                # Any validation failure on the embedded link is reported as
                # UNSUP_EMB_URL — the original cause is intentionally replaced.
                error = ApplicationError(*error_list["UNSUP_EMB_URL"])
                return None, None, error
        else:
            news, error = get_news_from_url(url)

        if error is not None:
            return None, None, error
    # article is not published
    else:
        # Unpublished path: url is treated as a file object.
        news, error = get_news_from_file(url)
        if error is not None:
            return None, None, error

    return news, tweet, error
Example #8
0
def ratelimit_handler(e):
    """Rate-limit hook: respond with the RATE_LIMIT_EXCEEDED error payload."""
    exceeded = ApplicationError(*error_list["RATE_LIMIT_EXCEEDED"])
    return return_result(exceeded)