from datetime import datetime, time, timedelta

import requests
from bs4 import BeautifulSoup
from elasticsearch.helpers import bulk

import elastic  # project-local helper: converts model instances to bulk actions
import models   # project-local module: provides FeedlyMap and the Elasticsearch client


def crawl_feedly(from_dt, rss_field):
    """Crawl Feedly subscriptions newer than from_dt and index the entries in bulk.

    Feeds are filtered to the category label given in rss_field; an empty
    rss_field crawls every subscription. Returns False if the subscriptions
    request fails, True otherwise.
    """
    # NOTE: 'headers' stays global as in the original; other module code may read it.
    global headers
    # Build the 'newerThan' cutoff as milliseconds since the epoch,
    # starting at midnight of from_dt.
    dt = datetime.combine(from_dt, time(0, 0))
    ms = dt.timestamp() * 1000
    newerthan = "{:.0f}".format(ms)
    headers = {
        # [email protected] (expires on 2020-jul-10)
        # NOTE: hardcoded developer token; in practice this would come from configuration.
        "Authorization": "Azsr6uaruKGMnymDVmYUkDrF33mC2csnyv1OScN4hpsnH5w2ngb0zEBlwyAo4izpB3W3a2RYDAW99xYFM61U5g0U13M59tiAjZFqHkVpAXVeG8PAYl5Y060wwErrxvjj12UNeQ4bk23mzCcoa9AAJtBvUMl_DZl2-jaX0cf_vmlZuVMQh-B2Srv1FUEkno3fbVJtTdZeOc1YP29aRluNyYndpm2CWYKFjaeL1LicHbObhdjgHQAZ-EFUUDCA:feedlydev"
    }
    params_streams = {
        "count": "1000",
        "ranked": "newest",
        "unreadOnly": "false",
        "newerThan": newerthan
    }
    url = "http://cloud.feedly.com/v3/subscriptions"
    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        return False
    feeds = r.json()
    for feed in feeds:
        feed_id = feed['id']
        feed_title = feed['title'].encode("ascii", 'replace')
        # The category label can contain both the subset and the category name,
        # separated by a dash; without a dash, the subset defaults to 'SI'.
        category_label = feed['categories'][0]['label']
        label_split = category_label.split('-')
        if len(label_split) > 1:
            feed_subset = label_split[0].strip()
            feed_category = label_split[1].strip()
        else:
            feed_subset = 'SI'
            feed_category = label_split[0].strip()
        print("crawl_feedly: scraping feed category/title", feed_category, feed_title)
        if rss_field == '' or category_label == rss_field:
            url = "http://cloud.feedly.com/v3/streams/contents"
            params_streams['streamId'] = feed_id
            r = requests.get(url, headers=headers, params=params_streams)
            stream = r.json()
            if 'items' in stream:
                bulk_data = []
                for entry in stream['items']:
                    feedlymap = models.FeedlyMap()
                    feedlymap.post_id = entry['id']
                    feedlymap.url = ""
                    try:
                        # Feedly timestamps are in milliseconds.
                        feedlymap.published_date = datetime.fromtimestamp(
                            entry['published'] / 1000)
                    except (KeyError, TypeError, ValueError, OverflowError, OSError):
                        # Missing or out-of-range timestamp: fall back to Jan 1 of last year.
                        last_year = datetime.now().year - 1
                        feedlymap.published_date = datetime(last_year, 1, 1, 0, 0, 0)
                    feedlymap.subset = feed_subset
                    feedlymap.category = feed_category
                    feedlymap.feed = feed_title
                    if 'topics' in feed:
                        feedlymap.feed_topics = feed['topics']
                    if 'keywords' in entry:
                        feedlymap.body_topics = entry['keywords']
                    if 'title' in entry:
                        feedlymap.title = entry['title']
                    # Resolve the entry URL: prefer canonicalUrl, then a URL embedded
                    # in originId, then the feed's own htmlUrl.
                    if 'canonicalUrl' in entry:
                        feedlymap.url = entry['canonicalUrl']
                    if not feedlymap.url and 'originId' in entry:
                        n = entry['originId'].find('http')
                        if n >= 0:  # find() returns -1 when 'http' is absent
                            feedlymap.url = entry['originId'][n:]
                    if not feedlymap.url and 'origin' in entry:
                        feedlymap.url = entry['origin']['htmlUrl']
                    feedlymap.post_id = feedlymap.url
                    bs = None
                    if 'summary' in entry:
                        bs = BeautifulSoup(entry['summary']['content'], "lxml")  # RSS feed
                    if 'content' in entry:
                        bs = BeautifulSoup(entry['content']['content'], "lxml")  # Google News feed
                    feedlymap.body = bs.get_text().encode("ascii", 'replace') if bs else b""
                    data = elastic.convert_for_bulk(feedlymap, 'update')
                    bulk_data.append(data)
                # Index this feed's entries in a single bulk request.
                bulk(models.client, actions=bulk_data, stats_only=True)
    return True
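# Usage sketch (illustrative, assuming this module is run as a script): how the
# crawler might be driven from a cron job or management command. The 31-day
# window mirrors the cutoff an earlier revision of the function hard-coded;
# crawl_feedly returns False when the subscriptions request is rejected,
# e.g. because the developer token has expired. The names below are
# hypothetical and not part of the module.
if __name__ == "__main__":
    from datetime import date

    from_dt = date.today() - timedelta(days=31)
    ok = crawl_feedly(from_dt, '')  # '' crawls every subscription category
    if not ok:
        print("crawl_feedly failed: refresh the Feedly developer token")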