Example #1
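All six variants below rely on the same imports and on two project-local modules, models (which provides the FeedlyMap model and the Elasticsearch client) and elastic (which provides convert_for_bulk). The following header is inferred from the names used in the snippets, not taken from the original files:

from datetime import datetime, time
import requests
from bs4 import BeautifulSoup
from elasticsearch.helpers import bulk

import models   # project-local: FeedlyMap model and models.client (Elasticsearch)
import elastic  # project-local: convert_for_bulk() builds a bulk-API action dict
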
def crawl_feedly(from_date, rss_field):
    global headers

    # Feedly's newerThan parameter takes a Unix timestamp in milliseconds;
    # build it from midnight at the start of from_date.
    t = time(0, 0)
    dt = datetime.combine(from_date, t)
    ms = dt.timestamp() * 1000
    newerthan = "{:.0f}".format(ms)
    headers = {
        #[email protected] (expires on 2017-dec-07)
        "Authorization":
        "AzNr8sCFyRuIX3upnzA-VnUUebvthkUEF0R9bccg352muEznNt9hK9m4kj8ljkQvFfoVGDYHZcLBFKuFgXRVy4HN1sVV2WYowIsQZ7lTGxB9WYNqxRGimPyZUAijHL7ugMo9hxRgYij_rOonwruuus3O2BQe7U_sNGy_SKL6nmEVDh-DsQL5EOVM34C3-0tcATwEMoaQxUUQ78bAJ6i3HrnLy8NPUg:feedlydev"
    }

    params_streams = {
        "count": "1000",
        "ranked": "newest",
        "unreadOnly": "false",
        "newerThan": newerthan
    }
    url = "http://cloud.feedly.com/v3/subscriptions"
    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        return False
    feeds = r.json()
    for feed in feeds:
        feed_id = feed['id']
        feed_title = feed['title'].encode("ascii", 'replace')
        # the category label can contain the subset and category name
        category_label = feed['categories'][0]['label']
        label_split = category_label.split('-')
        if len(label_split) > 1:
            feed_subset = label_split[0].strip()
            feed_category = label_split[1].strip()
        else:
            feed_subset = 'SI'
            feed_category = label_split[0].strip()
        print("crawl_feedly: scraping feed category/title", feed_category,
              feed_title)
        if rss_field == '' or category_label == rss_field:
            url = "http://cloud.feedly.com/v3/streams/contents"
            params_streams['streamId'] = feed_id
            r = requests.get(url, headers=headers, params=params_streams)
            stream = r.json()
            if 'items' in stream:
                bulk_data = []
                for entry in stream['items']:
                    feedlymap = models.FeedlyMap()
                    feedlymap.post_id = entry['id']
                    try:
                        feedlymap.published_date = datetime.fromtimestamp(
                            entry['published'] / 1000)
                    except (KeyError, ValueError, OverflowError, OSError):
                        # missing or out-of-range timestamp: default to Jan 1 of last year
                        last_year = datetime.now().year - 1
                        feedlymap.published_date = datetime(last_year, 1, 1)
                    feedlymap.subset = feed_subset
                    feedlymap.category = feed_category
                    feedlymap.feed = feed_title
                    if 'topics' in feed:
                        feedlymap.feed_topics = feed['topics']
                    if 'keywords' in entry:
                        feedlymap.body_topics = entry['keywords']
                    if 'title' in entry:
                        feedlymap.title = entry['title']
                    if 'canonicalUrl' in entry:
                        feedlymap.url = entry['canonicalUrl']
                    else:
                        if 'originId' in entry and 'http' in entry['originId']:
                            # the originId often embeds the source URL
                            n = entry['originId'].find('http')
                            feedlymap.url = entry['originId'][n:]
                        elif 'origin' in entry:
                            origin = entry['origin']
                            feedlymap.url = origin['htmlUrl']
                    feedlymap.post_id = feedlymap.url
                    bs = BeautifulSoup("", "lxml")  # fallback: empty body if neither field is present
                    if 'summary' in entry:
                        bs = BeautifulSoup(entry['summary']['content'],
                                           "lxml")  # in case of RSS feed
                    if 'content' in entry:
                        bs = BeautifulSoup(
                            entry['content']['content'],
                            "lxml")  # in case of Google News feed
                    feedlymap.body = bs.get_text().encode("ascii", 'replace')
                    data = elastic.convert_for_bulk(feedlymap, 'update')
                    bulk_data.append(data)
                bulk(models.client, actions=bulk_data, stats_only=True)
    return True
Example #2
def crawl_feedly(from_date, rss_field):
    global headers

    bulk_data = []
    # Feedly's newerThan parameter takes a Unix timestamp in milliseconds;
    # build it from midnight at the start of from_date.
    t = time(0, 0)
    dt = datetime.combine(from_date, t)
    ms = dt.timestamp() * 1000
    newerthan = "{:.0f}".format(ms)
    headers = {
        #[email protected] (expires on 2017-07-20)
        "Authorization":
        "A2JxorrfeTBQbMUsDIU3_zexSwY8191e3P9EvewYowjfbhKwOgHk84ErlXAWXpucZ_McfTDHLZN6yLxWqxgjWM8Upp1c-6Nb_RpZd0jWA9mJkVLN1JTETefaVNZtZqzTGTf8_qeT2ZE8z6Bf4LqLOUfQaQH2-jj8XIaxAyWMZ5BDRtfpgwVYrEEM2ii5KXnMJZxGNEvcqAV4Dke_subaM-wlnC8N63g:feedlydev"
    }

    params_streams = {
        "count": "1000",
        "ranked": "newest",
        "unreadOnly": "false",
        "newerThan": newerthan
    }
    url = "http://cloud.feedly.com/v3/subscriptions"
    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        return False
    feeds = r.json()
    for feed in feeds:
        feed_id = feed['id']
        feed_title = feed['title'].encode("ascii", 'replace')
        feed_category = feed['categories'][0]['label']
        print("crawl_feedly: scraping feed category/title", feed_category,
              feed_title)
        if rss_field == '' or feed_category == rss_field:
            url = "http://cloud.feedly.com/v3/streams/contents"
            params_streams['streamId'] = feed_id
            r = requests.get(url, headers=headers, params=params_streams)
            stream = r.json()
            if 'items' in stream:
                for entry in stream['items']:
                    feedlymap = models.FeedlyMap()
                    feedlymap.post_id = entry['id']
                    try:
                        feedlymap.published_date = datetime.fromtimestamp(
                            entry['published'] / 1000)
                    except (KeyError, ValueError, OverflowError, OSError):
                        # missing or out-of-range timestamp: default to 2010-01-01
                        feedlymap.published_date = datetime(2010, 1, 1)
                    feedlymap.category = feed_category
                    feedlymap.feed = feed_title
                    if 'topics' in feed:
                        feedlymap.feed_topics = feed['topics']
                    if 'keywords' in entry:
                        feedlymap.body_topics = entry['keywords']
                    if 'title' in entry:
                        feedlymap.title = entry['title']
                    if 'canonicalUrl' in entry:
                        feedlymap.url = entry['canonicalUrl']
                    else:
                        n = entry['originId'].find('http')
                        if n >= 0:  # the originId often embeds the source URL
                            feedlymap.url = entry['originId'][n:]
                        else:
                            feedlymap.url = entry['originId']
                    feedlymap.post_id = feedlymap.url
                    bs = BeautifulSoup("", "lxml")  # fallback: empty body if neither field is present
                    if 'summary' in entry:
                        bs = BeautifulSoup(entry['summary']['content'],
                                           "lxml")  # in case of RSS feed
                    if 'content' in entry:
                        bs = BeautifulSoup(
                            entry['content']['content'],
                            "lxml")  # in case of Google News feed
                    feedlymap.body = bs.get_text().encode("ascii", 'replace')
                    data = elastic.convert_for_bulk(feedlymap, 'update')
                    bulk_data.append(data)

    bulk(models.client, actions=bulk_data, stats_only=True)
    return True
Example #3
def crawl_feedly(from_date, rss_field):
    global headers

    # Feedly's newerThan parameter takes a Unix timestamp in milliseconds;
    # build it from midnight at the start of from_date.
    t = time(0, 0)
    dt = datetime.combine(from_date, t)
    ms = dt.timestamp() * 1000
    newerthan = "{:.0f}".format(ms)
    headers = {
        #[email protected] (expires on 2018-feb-02)
        "Authorization":
        "A1j2bsImQdCENT7FyWxSWABu7_KwSQOKNvAySLwJQlQT3QoRlpur6iG56Xju8owoOfMF7byi1ApQUIHbUpsEBoFH-CijTCUi72hl1U1MG7eaY07ctFiEbL-e9D17yUdq3OT3iRoE04F0_1h-JcUBP513gnObI0JxD0LQk4bagAv3b22ot3jbXLoLoQgBPbBf4eKS97oyGntWM_3GMa66m1ElrAeP5R42V25WPqXZmmEwAouivQp31kDLxqFLIA:feedlydev"
    }

    params_streams = {
        "count": "1000",
        "ranked": "newest",
        "unreadOnly": "false",
        "newerThan": newerthan
    }
    url = "http://cloud.feedly.com/v3/subscriptions"
    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        return False
    feeds = r.json()
    for feed in feeds:
        feed_id = feed['id']
        feed_title = feed['title'].encode("ascii", 'replace')
        # the category label can contain the subset and category name
        category_label = feed['categories'][0]['label']
        label_split = category_label.split('-')
        if len(label_split) > 1:
            feed_subset = label_split[0].strip()
            feed_category = label_split[1].strip()
        else:
            feed_subset = 'SI'
            feed_category = label_split[0].strip()
        print("crawl_feedly: scraping feed category/title", feed_category,
              feed_title)
        if rss_field == '' or category_label == rss_field:
            url = "http://cloud.feedly.com/v3/streams/contents"
            params_streams['streamId'] = feed_id
            r = requests.get(url, headers=headers, params=params_streams)
            stream = r.json()
            if 'items' in stream:
                bulk_data = []
                for entry in stream['items']:
                    feedlymap = models.FeedlyMap()
                    feedlymap.post_id = entry['id']
                    try:
                        feedlymap.published_date = datetime.fromtimestamp(
                            entry['published'] / 1000)
                    except (KeyError, ValueError, OverflowError, OSError):
                        # missing or out-of-range timestamp: default to Jan 1 of last year
                        last_year = datetime.now().year - 1
                        feedlymap.published_date = datetime(last_year, 1, 1)
                    feedlymap.subset = feed_subset
                    feedlymap.category = feed_category
                    feedlymap.feed = feed_title
                    if 'topics' in feed:
                        feedlymap.feed_topics = feed['topics']
                    if 'keywords' in entry:
                        feedlymap.body_topics = entry['keywords']
                    if 'title' in entry:
                        feedlymap.title = entry['title']
                    if 'canonicalUrl' in entry:
                        feedlymap.url = entry['canonicalUrl']
                    else:
                        if 'originId' in entry and 'http' in entry['originId']:
                            # the originId often embeds the source URL
                            n = entry['originId'].find('http')
                            feedlymap.url = entry['originId'][n:]
                        elif 'origin' in entry:
                            origin = entry['origin']
                            feedlymap.url = origin['htmlUrl']
                    feedlymap.post_id = feedlymap.url
                    bs = BeautifulSoup("", "lxml")  # fallback: empty body if neither field is present
                    if 'summary' in entry:
                        bs = BeautifulSoup(entry['summary']['content'],
                                           "lxml")  # in case of RSS feed
                    if 'content' in entry:
                        bs = BeautifulSoup(
                            entry['content']['content'],
                            "lxml")  # in case of Google News feed
                    feedlymap.body = bs.get_text().encode("ascii", 'replace')
                    data = elastic.convert_for_bulk(feedlymap, 'update')
                    bulk_data.append(data)
                bulk(models.client, actions=bulk_data, stats_only=True)
    return True
Example #4
def crawl_feedly(from_dt, rss_field):
    global headers

    # Feedly's newerThan parameter takes a Unix timestamp in milliseconds;
    # build it from midnight at the start of from_dt.
    t = time(0, 0)
    dt = datetime.combine(from_dt, t)
    ms = dt.timestamp() * 1000
    newerthan = "{:.0f}".format(ms)
    headers = {
        #[email protected] (expires on 2018-aug-26)
        "Authorization":
        "A3iuGsp9UjnsSiLwl5ZoPrLZj3mO4d16muxgezgpLesPhJ4YoKgC0XdiW_ucnm7b1Z-o5DKK6oLqoW9SRNUkoTcQ8npBBmqbOF03zF3tFWaNI0Lir_hrAahmVuypG5BXVZidJJ4PuaXr4zg5pYRE32OxO0N05X_A2sdZC93oWwQU1GVLJ9evh3qmu0WXYPVXpxffytgnFjUg2JB1zGK3KJkbDl-6ioJudiD2IZczA0R52tPwFZZ0FimkE3zV:feedlydev"
    }

    params_streams = {
        "count": "1000",
        "ranked": "newest",
        "unreadOnly": "false",
        "newerThan": newerthan
    }
    url = "http://cloud.feedly.com/v3/subscriptions"
    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        return False
    feeds = r.json()
    for feed in feeds:
        feed_id = feed['id']
        feed_title = feed['title'].encode("ascii", 'replace')
        # the category label can contain the subset and category name
        category_label = feed['categories'][0]['label']
        label_split = category_label.split('-')
        if len(label_split) > 1:
            feed_subset = label_split[0].strip()
            feed_category = label_split[1].strip()
        else:
            feed_subset = 'SI'
            feed_category = label_split[0].strip()
        print("crawl_feedly: scraping feed category/title", feed_category,
              feed_title)
        if rss_field == '' or category_label == rss_field:
            url = "http://cloud.feedly.com/v3/streams/contents"
            params_streams['streamId'] = feed_id
            r = requests.get(url, headers=headers, params=params_streams)
            stream = r.json()
            if 'items' in stream:
                bulk_data = []
                for entry in stream['items']:
                    feedlymap = models.FeedlyMap()
                    feedlymap.post_id = entry['id']
                    feedlymap.url = ""
                    try:
                        feedlymap.published_date = datetime.fromtimestamp(
                            entry['published'] / 1000)
                    except (KeyError, ValueError, OverflowError, OSError):
                        # missing or out-of-range timestamp: default to Jan 1 of last year
                        last_year = datetime.now().year - 1
                        feedlymap.published_date = datetime(last_year, 1, 1)
                    feedlymap.subset = feed_subset
                    feedlymap.category = feed_category
                    feedlymap.feed = feed_title
                    if 'topics' in feed:
                        feedlymap.feed_topics = feed['topics']
                    if 'keywords' in entry:
                        feedlymap.body_topics = entry['keywords']
                    if 'title' in entry:
                        feedlymap.title = entry['title']
                    if 'canonicalUrl' in entry:
                        feedlymap.url = entry['canonicalUrl']
                    if len(feedlymap.url) == 0:
                        if 'originId' in entry:
                            n = entry['originId'].find('http')
                            if n >= 0:  # n == 0 means originId is itself the URL
                                feedlymap.url = entry['originId'][n:]
                    if len(feedlymap.url) == 0:
                        if 'origin' in entry:
                            origin = entry['origin']
                            feedlymap.url = origin['htmlUrl']
                    feedlymap.post_id = feedlymap.url
                    bs = BeautifulSoup("", "lxml")  # fallback: empty body if neither field is present
                    if 'summary' in entry:
                        bs = BeautifulSoup(entry['summary']['content'],
                                           "lxml")  # in case of RSS feed
                    if 'content' in entry:
                        bs = BeautifulSoup(
                            entry['content']['content'],
                            "lxml")  # in case of Google News feed
                    feedlymap.body = bs.get_text().encode("ascii", 'replace')
                    data = elastic.convert_for_bulk(feedlymap, 'update')
                    bulk_data.append(data)
                bulk(models.client, actions=bulk_data, stats_only=True)
    return True
Example #5
def crawl_feedly(from_dt, rss_field):
    global headers

    # Feedly's newerThan parameter takes a Unix timestamp in milliseconds;
    # build it from midnight at the start of from_dt.
    t = time(0, 0)
    dt = datetime.combine(from_dt, t)
    ms = dt.timestamp() * 1000
    newerthan = "{:.0f}".format(ms)
    headers = {
        #[email protected] (expires on 2020-jul-10)
        "Authorization":
        "Azsr6uaruKGMnymDVmYUkDrF33mC2csnyv1OScN4hpsnH5w2ngb0zEBlwyAo4izpB3W3a2RYDAW99xYFM61U5g0U13M59tiAjZFqHkVpAXVeG8PAYl5Y060wwErrxvjj12UNeQ4bk23mzCcoa9AAJtBvUMl_DZl2-jaX0cf_vmlZuVMQh-B2Srv1FUEkno3fbVJtTdZeOc1YP29aRluNyYndpm2CWYKFjaeL1LicHbObhdjgHQAZ-EFUUDCA:feedlydev"
    }

    params_streams = {
        "count": "1000",
        "ranked": "newest",
        "unreadOnly": "false",
        "newerThan": newerthan
    }
    url = "http://cloud.feedly.com/v3/subscriptions"
    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        return False
    feeds = r.json()
    for feed in feeds:
        feed_id = feed['id']
        feed_title = feed['title'].encode("ascii", 'replace')
        # the category label can contain the subset and category name
        category_label = feed['categories'][0]['label']
        label_split = category_label.split('-')
        if len(label_split) > 1:
            feed_subset = label_split[0].strip()
            feed_category = label_split[1].strip()
        else:
            feed_subset = 'SI'
            feed_category = label_split[0].strip()
        print("crawl_feedly: scraping feed category/title", feed_category,
              feed_title)
        if rss_field == '' or category_label == rss_field:
            url = "http://cloud.feedly.com/v3/streams/contents"
            params_streams['streamId'] = feed_id
            r = requests.get(url, headers=headers, params=params_streams)
            stream = r.json()
            if 'items' in stream:
                bulk_data = []
                for entry in stream['items']:
                    feedlymap = models.FeedlyMap()
                    feedlymap.post_id = entry['id']
                    feedlymap.url = ""
                    try:
                        feedlymap.published_date = datetime.fromtimestamp(
                            entry['published'] / 1000)
                    except (KeyError, ValueError, OverflowError, OSError):
                        # missing or out-of-range timestamp: default to Jan 1 of last year
                        last_year = datetime.now().year - 1
                        feedlymap.published_date = datetime(last_year, 1, 1)
                    feedlymap.subset = feed_subset
                    feedlymap.category = feed_category
                    feedlymap.feed = feed_title
                    if 'topics' in feed:
                        feedlymap.feed_topics = feed['topics']
                    if 'keywords' in entry:
                        feedlymap.body_topics = entry['keywords']
                    if 'title' in entry:
                        feedlymap.title = entry['title']
                    if 'canonicalUrl' in entry:
                        feedlymap.url = entry['canonicalUrl']
                    if len(feedlymap.url) == 0:
                        if 'originId' in entry:
                            n = entry['originId'].find('http')
                            if n >= 0:  # n == 0 means originId is itself the URL
                                feedlymap.url = entry['originId'][n:]
                    if len(feedlymap.url) == 0:
                        if 'origin' in entry:
                            origin = entry['origin']
                            feedlymap.url = origin['htmlUrl']
                    feedlymap.post_id = feedlymap.url
                    bs = BeautifulSoup("", "lxml")  # fallback: empty body if neither field is present
                    if 'summary' in entry:
                        bs = BeautifulSoup(entry['summary']['content'],
                                           "lxml")  # in case of RSS feed
                    if 'content' in entry:
                        bs = BeautifulSoup(
                            entry['content']['content'],
                            "lxml")  # in case of Google News feed
                    feedlymap.body = bs.get_text().encode("ascii", 'replace')
                    data = elastic.convert_for_bulk(feedlymap, 'update')
                    bulk_data.append(data)
                bulk(models.client, actions=bulk_data, stats_only=True)
    return True
Example #6
File: crawl.py  Project: VinACE/FMI
def crawl_feedly(from_date, rss_field):
    global headers

    # Feedly's newerThan parameter takes a Unix timestamp in milliseconds;
    # build it from midnight at the start of from_date.
    t = time(0, 0)
    dt = datetime.combine(from_date, t)
    ms = dt.timestamp() * 1000
    newerthan = "{:.0f}".format(ms)
    headers = {
        #[email protected] (expires on 2017-10-04)
        "Authorization" : "A2nU8r1LuQ_wUuYHftraCIc0imow9HY7GYB1qxm-OeaU--I-cVt69lCZfEkvsOSX8R9qI6C6ABH5Nq1XKFnKX6JlkY_myGM_hfksTQe4wmWlqRxj-LBQ7n9UhIL1oXfAf80jAVhiz6w8tB9ToYV_YwB47sHASzTMlybx-5bXgmu9gtR-N-FUKByfgihrIjpShy6hMwHYYnKhz73DfQ3JhMCAdAqL1RA:feedlydev"
        }

    params_streams = {
        "count"     : "1000",
        "ranked"    : "newest",
        "unreadOnly": "false",
        "newerThan" : newerthan
        }
    url = "http://cloud.feedly.com/v3/subscriptions"
    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        return False
    feeds = r.json()
    for feed in feeds:
        feed_id = feed['id']
        feed_title = feed['title'].encode("ascii", 'replace')
        # the category label can contain the subset and category name
        category_label = feed['categories'][0]['label']
        label_split = category_label.split('-')
        if len(label_split) > 1:
            feed_subset = label_split[0].strip()
            feed_category = label_split[1].strip()
        else:
            feed_subset = 'SI'
            feed_category = label_split[0].strip()
        print("crawl_feedly: scraping feed category/title", feed_category, feed_title)
        if rss_field == '' or category_label == rss_field:
            url = "http://cloud.feedly.com/v3/streams/contents"
            params_streams['streamId'] = feed_id
            r = requests.get(url, headers=headers, params=params_streams)
            stream = r.json()
            if 'items' in stream:
                bulk_data = []
                for entry in stream['items']:
                    feedlymap = models.FeedlyMap()
                    feedlymap.post_id = entry['id']
                    try:
                        feedlymap.published_date = datetime.fromtimestamp(entry['published']/1000)
                    except (KeyError, ValueError, OverflowError, OSError):
                        # missing or out-of-range timestamp: default to Jan 1 of last year
                        last_year = datetime.now().year - 1
                        feedlymap.published_date = datetime(last_year, 1, 1)
                    feedlymap.subset = feed_subset
                    feedlymap.category = feed_category
                    feedlymap.feed = feed_title
                    if 'topics' in feed:
                        feedlymap.feed_topics = feed['topics']
                    if 'keywords' in entry:
                        feedlymap.body_topics = entry['keywords']
                    if 'title' in entry:
                        feedlymap.title = entry['title']
                    if 'canonicalUrl' in entry:
                        feedlymap.url = entry['canonicalUrl']
                    else:
                        if 'originId' in entry and 'http' in entry['originId']:
                            # the originId often embeds the source URL
                            n = entry['originId'].find('http')
                            feedlymap.url = entry['originId'][n:]
                        elif 'origin' in entry:
                            origin = entry['origin']
                            feedlymap.url = origin['htmlUrl']
                    feedlymap.post_id = feedlymap.url
                    bs = BeautifulSoup("", "lxml")  # fallback: empty body if neither field is present
                    if 'summary' in entry:
                        bs = BeautifulSoup(entry['summary']['content'], "lxml")  # in case of RSS feed
                    if 'content' in entry:
                        bs = BeautifulSoup(entry['content']['content'], "lxml")  # in case of Google News feed
                    feedlymap.body = bs.get_text().encode("ascii", 'replace')
                    data = elastic.convert_for_bulk(feedlymap, 'update')
                    bulk_data.append(data)
                bulk(models.client, actions=bulk_data, stats_only=True)
    return True
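
A minimal invocation sketch for any of the variants above; the date and the category label here are illustrative, not taken from the original projects:

from datetime import date

# Crawl every subscribed feed for items published since 2017-05-01
# (an empty rss_field matches all categories):
crawl_feedly(date(2017, 5, 1), '')

# Restrict the crawl to a single Feedly category label (hypothetical name,
# following the "subset - category" convention the code splits on):
crawl_feedly(date(2017, 5, 1), 'SI - Energy')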