Example No. 1
def twitter(url):
    """
    Response/Returns::

        {
            "count":19514340,
            "url":"http:\/\/www.google.com\/"
        }

    Returns the count.

    .. note::

        This is an undocumented/unofficial endpoint,
        so it could be gone at any moment.
    """
    retries = 0
    while retries < 5:
        try:
            data = _request(
                'https://cdn.api.twitter.com/1/urls/count.json?url=', url)
            return int(data.get('count', 0))

        except error.HTTPError as e:
            logger.exception(
                'Error getting score for `twitter` ({0}): {1}'.format(url, e))
            return 0

        # This Twitter endpoint occasionally, for some reason, returns undecodable bytes.
        # This is often resolved after a few tries.
        except UnicodeDecodeError as e:
            sleep(1 * retries)
            retries += 1
    return 0
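All of these scoring functions rely on a shared `_request` helper that is not shown in the excerpts. A minimal sketch of what it might look like, assuming it appends the url-encoded target URL to the endpoint and decodes a JSON response (the `format='xml'` variant used by `facebook` in Example No. 7 is omitted here):

import json
from urllib import parse, request

def _request(endpoint, url):
    # Append the url-encoded target URL to the endpoint and fetch it.
    res = request.urlopen(endpoint + parse.quote_plus(url))
    # Decode the body and parse it as JSON.
    return json.loads(res.read().decode('utf-8'))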
Example No. 2
def extract_entry_data(url):
    """
    Fetch the full content for a feed entry url.

    Args:
        | url (str)    -- the url of the entry.

    Returns:
        | entry_data -- Goose object.
        | str        -- the full text, including html.
    """

    html = _get_html(url)

    try:
        # Use Goose to extract data from the raw html,
        # and readability to give us the html of the main document.

        # Some HTML comes with additional characters prior
        # to the actual document, so we want to strip everything up
        # to the first tag.
        html = html[html.index(b'<'):]

        return g.extract(raw_html=html), Document(html).summary()

    except UnicodeDecodeError as e:
        logger.exception('UnicodeDecodeError with html: {0}'.format(html))
        return None, ''
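A hypothetical usage, assuming `g` is a module-level Goose instance and `Document` comes from the readability library (neither is shown in the excerpt); the URL here is a placeholder:

entry_data, html = extract_entry_data('http://example.com/some-article')
if entry_data is not None:
    # Goose exposes the extracted title and cleaned body text.
    print(entry_data.title)
    print(entry_data.cleaned_text[:200])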
Example No. 3
def stumbleupon(url):
    """
    Response::

        {
            "result": {
                "url":"http:\/\/www.google.com\/",
                "in_index":true,
                "publicid":"2pI1xR",
                "views":254956,
                "title":"Google",
                "thumbnail":"http:\/\/cdn.stumble-upon.com\/mthumb\/31\/10031.jpg",
                "thumbnail_b":"http:\/\/cdn.stumble-upon.com\/bthumb\/31\/10031.jpg",
                "submit_link":"http:\/\/www.stumbleupon.com\/submit\/?url=http:\/\/www.google.com\/",
                "badge_link":"http:\/\/www.stumbleupon.com\/badge\/?url=http:\/\/www.google.com\/",
                "info_link":"http:\/\/www.stumbleupon.com\/url\/www.google.com\/"
            },
            "timestamp":1393894952,
            "success":true
        }

    Returns the view count.
    """
    try:
        data = _request(
            'http://www.stumbleupon.com/services/1.01/badge.getinfo?url=', url)
        return int(data.get('result', {}).get('views', 0))
    except error.HTTPError as e:
        logger.exception(
            'Error getting score for `stumbleupon` ({0}): {1}'.format(url, e))
        return 0
Example No. 4
def _query_live(query):
    """
    Query the DBpedia live endpoint. We should be careful not to
    use this endpoint too heavily; if we find that we are,
    we should set up our own synchronized DBpedia live endpoint.
    """
    data = {'query': '{0} {1}'.format(PREFIXES, query).replace('\n', ' ')}
    endpoint = 'http://dbpedia-live.openlinksw.com/sparql'
    url = '{endpoint}?{query}'.format(endpoint=endpoint, query=urlencode(data))
    req = request.Request(url,
                          headers={
                              'Accept': 'application/json',
                              'Content-Type': 'application/sparql-query'
                          })
    try:
        res = request.urlopen(req)
    except error.HTTPError as e:
        logger.exception('Error with query: {0}\n\nError: {1}'.format(
            query, e.read()))
        raise e
    if res.status != 200:
        raise Exception('Response error, status was not 200')
    else:
        content = res.read()
        return json.loads(content.decode('utf-8'))
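`PREFIXES` is defined elsewhere in the module. Assuming it declares the usual DBpedia prefixes, a hypothetical call and traversal of the standard SPARQL JSON results structure might look like:

query = 'SELECT ?abstract WHERE { dbr:Argentina dbo:abstract ?abstract } LIMIT 1'
results = _query_live(query)
for binding in results['results']['bindings']:
    # Each binding maps variable names to {'type': ..., 'value': ...} dicts.
    print(binding['abstract']['value'])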
Example No. 5
def save_from_url(url, filename):
    """
    Saves a remote file to S3 and returns
    its S3 URL.
    """
    try:
        res = make_request(url)

    except error.HTTPError as e:
        # Wikimedia 404 errors are very common, since images may go
        # out of date.
        # So common that for now these exceptions are just ignored.
        if e.code == 404 and 'wikimedia' in url:
            logger.warning('Error requesting {0} : {1}'.format(url, e))

        # Other exceptions are more remarkable and should be brought up.
        else:
            logger.exception('Error requesting {0} : {1}'.format(url, e))
        return None

    except (ConnectionResetError, BadStatusLine, ValueError) as e:
        logger.exception('Error requesting {0} : {1}'.format(url, e))
        return None

    data = BytesIO(res.read())
    return save_from_file(data, filename)
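`make_request` and `save_from_file` are not shown. As one possible shape for the latter, a minimal sketch using boto3 (the bucket name and URL format here are assumptions, not the project's actual configuration):

import boto3

def save_from_file(data, filename):
    # Upload the file-like object and return a public S3 URL for it.
    # 'my-bucket' is a placeholder bucket name.
    s3 = boto3.client('s3')
    s3.upload_fileobj(data, 'my-bucket', filename)
    return 'https://my-bucket.s3.amazonaws.com/{0}'.format(filename)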
Example No. 6
def collect():
    """
    Looks for a feed which has not been
    updated for at least an hour
    and fetches new articles for it.
    """
    # Get a feed which has not yet been updated
    # and is not currently being updated.
    feeds = Feed.query.filter(Feed.updated_at < datetime.utcnow() - timedelta(hours=1), ~Feed.updating).all()
    if feeds:
        feed = random.choice(feeds)

        # "Claim" this feed,
        # so other workers won't pick it.
        feed.updating = True
        db.session.commit()

        try:
            collector.collect(feed)
            feed.updated_at = datetime.utcnow()

        except Exception:
            logger.exception('Exception while collecting for feed {0}'.format(feed.ext_url))
            raise

        finally:
            feed.updating = False
            db.session.commit()
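collect() is presumably invoked on a schedule by a worker process; a minimal sketch of such a loop (the interval and the plain-loop approach are assumptions, the real project may use a task queue or scheduler instead):

import time

def run_collector():
    while True:
        # Each pass claims at most one stale feed and updates it.
        collect()
        time.sleep(60)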
Example No. 7
def facebook(url):
    """
    Response::

        <links_getStats_response xmlns="http://api.facebook.com/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://api.facebook.com/1.0/ http://api.facebook.com/1.0/facebook.xsd" list="true">
            <link_stat>
                <url>www.google.com</url>
                <normalized_url>http://www.google.com/</normalized_url>
                <share_count>5402940</share_count>
                <like_count>1371562</like_count>
                <comment_count>1728901</comment_count>
                <total_count>8503403</total_count>
                <click_count>265614</click_count>
                <comments_fbid>381702034999</comments_fbid>
                <commentsbox_count>841</commentsbox_count>
            </link_stat>
        </links_getStats_response>

    In JSON::

        {
            'click_count': '265614',
            'comment_count': '1728901',
            'comments_fbid': '381702034999',
            'commentsbox_count': '841',
            'like_count': '1371562',
            'normalized_url': 'http://www.google.com/',
            'share_count': '5403040',
            'total_count': '8503503',
            'url': 'www.google.com'
        }

    Returns the click count (weighted by 0.25) plus the share + like + comment counts
    plus the external comments count (`commentsbox_count`).

    Note: `total_count` is the same as `shares` in `facebook_graph`.
    `total_count` is the sum of `comment_count`, `like_count`, and `share_count`.

    Note: `commentsbox_count` refers to the number of comments external to Facebook, i.e.
    those that occur on their embedded widgets.

    This differs from `facebook_graph` in that the click count is incorporated (though weighted less).
    """

    try:
        data = _request(
            'https://api.facebook.com/restserver.php?method=links.getStats&urls=',
            url,
            format='xml')
        data_ = dict(data['links_getStats_response']['link_stat'])
        return int(data_['click_count']) / 4 + int(data_['total_count']) + int(
            data_['commentsbox_count'])
    except (error.HTTPError, KeyError) as e:
        logger.exception(
            'Error getting score for `facebook` ({0}): {1}'.format(url, e))
        return 0
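Applied to the sample JSON response above, the weighting works out as follows (true division makes the result a float):

>>> 265614 / 4 + 8503503 + 841
8570747.5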
Example No. 8
def facebook_graph(url):
    """
    Response/Returns::

        {
            'comments': 841,
            'id': 'http://www.google.com',
            'shares': 8503503
        }

    Returns total shares (i.e. likes, shares, and comments) plus the external comments.
    """
    try:
        data = _request("https://graph.facebook.com/", url)
        return data["comments"] + data["shares"]
    except error.HTTPError as e:
        logger.exception("Error getting score for `facebook_graph` ({0}): {1}".format(url, e))
        return 0
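With the sample response above, the score is simply the two counts added together:

>>> 8503503 + 841
8504344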
Example No. 9
def linkedin(url):
    """
    Response::

        {
            "count":12815,
            "fCnt":"12K",
            "fCntPlusOne":"12K",
            "url":"http:\/\/www.google.com\/"
        }

    Returns the count.
    """
    try:
        data = _request("https://www.linkedin.com/countserv/count/share?format=json&url=", url)
        return int(data["count"])
    except (error.HTTPError, ValueError) as e:
        logger.exception("Error getting score for `linkedin` ({0}): {1}".format(url, e))
        return 0
Example No. 10
def _query(query):
    data = '{0} {1}'.format(PREFIXES, query).encode('utf-8')
    req = request.Request('http://{host}:3030/knowledge/query'.format(host=APP['KNOWLEDGE_HOST']),
            headers={
                'Accept': 'application/sparql-results+json',
                'Content-Type': 'application/sparql-query'
            },
            data=data)
    try:
        res = request.urlopen(req)
    except error.HTTPError as e:
        logger.exception('Error with query: {0}'.format(query))
        raise e
    if res.status != 200:
        raise Exception('Response error, status was not 200')
    else:
        content = res.read()
        return json.loads(content.decode('utf-8'))
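Both `_query` and `_query_live` return the standard SPARQL JSON results structure. A hypothetical convenience helper (not part of the source) for pulling out the bare values of one variable:

def _values(results, var):
    # Each binding maps variable names to {'type': ..., 'value': ...} dicts;
    # collect just the values for the requested variable.
    return [b[var]['value'] for b in results['results']['bindings']]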
Example No. 11
def get_articles(feed, fn):
    """
    Parse the specified feed,
    gathering the latest new articles.

    If an article matches one that already exists,
    it is skipped.

    The minimum length of an entry is
    400 characters. Anything shorter will be ignored.

    This will silently skip articles for which the full text
    can't be retrieved (e.g. if it returns 404).

    Some feeds, for whatever reason, do not include a `published`
    date in their entry data; in that case, the publish date
    extracted from the page is used instead (which may itself be empty).

    Args:
        | feed (Feed)    -- the feed to fetch from.
        | fn (Callable)  -- function to call with each new article.
    """
    # Fetch the feed data.
    data = feedparser.parse(feed.ext_url)

    # If the `bozo` value is anything but 0,
    # there was an error parsing the feed (or connecting to it).
    if data.bozo:
        # Some errors are ok.
        if not isinstance(
                data.bozo_exception,
                feedparser.CharacterEncodingOverride) and not isinstance(
                    data.bozo_exception, feedparser.NonXMLContentType):
            raise data.bozo_exception

    for entry in data.entries:

        # URL for this entry.
        url = entry['links'][0]['href']

        # Check for an existing Article.
        # If one exists, skip.
        if Article.query.filter_by(ext_url=url).count():
            continue

        # Complete HTML content for this entry.
        try:
            entry_data, html = extractor.extract_entry_data(url)
        except (error.HTTPError, error.URLError, ConnectionResetError,
                BadStatusLine):
            # Can't reach the page or the response is malformed
            # (not all of these exceptions carry a `code` attribute),
            # so just log it and skip so things don't break.
            logger.exception(
                'Error extracting data for url {0}'.format(url))
            continue

        if entry_data is None:
            continue

        full_text = entry_data.cleaned_text

        # Skip over entries that are too short.
        if len(full_text) < 400:
            continue

        url = entry_data.canonical_link or url
        published = parse(entry.get('published')) if entry.get(
            'published') else entry_data.publish_date
        updated = parse(
            entry.get('updated')) if entry.get('updated') else published
        title = entry.get('title', entry_data.title)

        # Secondary check for an existing Article,
        # by checking the title and source.
        existing = Article.query.filter_by(title=title).first()
        if existing and existing.source == feed.source:
            continue

        # Download and save the top article image.
        image_url = extractor.extract_image(entry_data, filename=hash(url))

        fn(
            Article(ext_url=url,
                    source=feed.source,
                    feed=feed,
                    html=html,
                    text=fix_text_segment(full_text),
                    authors=extractor.extract_authors(entry),
                    tags=extractor.extract_tags(entry,
                                                known_tags=entry_data.tags),
                    title=fix_text_segment(title),
                    created_at=published,
                    updated_at=updated,
                    image=image_url,
                    score=evaluator.score(url)))
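A hypothetical invocation, assuming new articles should simply be added to the database session (the actual callable passed as `fn` is not shown in these excerpts):

feed = Feed.query.first()
get_articles(feed, db.session.add)
db.session.commit()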
Example No. 12
def internal_error(error):
    logger.exception('Internal server error when requesting {0}: {1}'.format(
        request.path, error))
    return jsonify(status=500, message='Internal server error.'), 500
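Given the use of `jsonify` and `request`, this is presumably registered as a Flask error handler. A minimal sketch of the registration, with `app` assumed to be the Flask application object (not shown in the excerpt):

app.register_error_handler(500, internal_error)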