# Crawler / indexing helpers. Reconstructed imports for the functions below;
# `models`, `elastic`, `survey`, and `mail` are project-local modules, and
# `editor_map`, `categoy_map`, and `scrape_body` are assumed to be defined
# elsewhere in this module.
import email
import re
from datetime import datetime, time, timedelta
from urllib.parse import urlparse

import html2text
import pandas as pd
import requests
from bs4 import BeautifulSoup
from elasticsearch.helpers import bulk
from imapclient import IMAPClient
from pandas import DataFrame

import elastic
import mail
import models
import survey


def index_products_data():
    count = 1
    data = []
    for product_data in models.scrape_li:
        review_count = 1
        reviews = product_data['reviews']
        for review in reviews:
            r = models.Review()
            r.reviewid = product_data['url'] + "?review=" + str(review_count)
            review_count = review_count + 1
            r.perfume = product_data['perfume']
            r.site = product_data['site']
            r.brand_name = product_data['brand_name']
            r.brand_variant = product_data['brand_variant']
            r.review_date = datetime.strptime(review['date'], '%b %d %Y').date()
            r.review = review['body']
            r.label = review['label']
            r.accords = product_data.get('accords', {})
            notespyramid = product_data.get('notespyramid', {})
            r.notespyramid = notespyramid.get('top', [])
            r.moods = product_data.get('moods', {})
            r.notes = product_data.get('notes', {})
            r.longevity = product_data.get('longevity', {})
            r.sillage = product_data.get('sillage', {})
            r.ratings = product_data.get('ratings', {})
            r.img_src = product_data.get('img_src', "")
            data.append(elastic.convert_for_bulk(r, 'update'))
            count = count + 1
            if count > 100:
                bulk(models.client, actions=data, stats_only=True)
                count = 1
                data = []  # reset the batch; the original kept appending and re-sent earlier actions
    # flush the final partial batch
    bulk(models.client, actions=data, stats_only=True)
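
# Illustration (not part of the original module): `elastic.convert_for_bulk` is
# project-local, so its exact output is an assumption; for
# elasticsearch.helpers.bulk with an 'update' op it would need to yield action
# dicts shaped roughly like this.
def _demo_bulk_update_action():
    """A minimal sketch of one bulk 'update' action dict (assumed shape)."""
    return {
        "_op_type": "update",                     # update an existing document
        "_index": "review",                       # hypothetical index name
        "_id": "http://example.com/p?review=1",   # document id, here the reviewid
        "doc": {"perfume": "...", "label": "..."},  # fields to merge into the doc
        "doc_as_upsert": True,                    # create the doc if it does not exist yet
    }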
def scrape_page_map(self, sub_site, url, bs):
    id = url
    site_url = urlparse(url).netloc.split('.')[1]
    sub_site_url = urlparse(url).path.split('/')
    # join the URL path segments into a readable name; the original joined the
    # `sub_site` argument here, which would chain its characters with '-'
    sub_site_name = '-'.join(sub_site_url[1:-1])
    if sub_site_name == '':
        sub_site_name = 'Home'
    pagemap = models.PageMap()
    pagemap.page_id = id
    pagemap.site = self.site
    pagemap.sub_site = sub_site
    pagemap.url = url
    pagemap.section = ''
    try:  # get posted date
        pagemap.published_date = datetime.today()
    except:
        pass
    try:  # get page
        pagemap.page = bs.get_text()
    except:
        pass
    try:  # get title
        if bs.title is not None:
            pagemap.title = bs.title.text
        else:
            pagemap.title = ''
    except:
        pass
    data = elastic.convert_for_bulk(pagemap, 'update')
    return data
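
# Illustration (not part of the original module): how the urlparse-based
# site / sub-site extraction above behaves on a typical URL.
def _demo_urlparse_split():
    from urllib.parse import urlparse
    url = 'https://www.example.com/blog/2017/my-post'  # hypothetical URL
    parts = urlparse(url)
    site = parts.netloc.split('.')[1]         # 'example' (assumes a 'www.' prefix)
    segments = parts.path.split('/')          # ['', 'blog', '2017', 'my-post']
    sub_site_name = '-'.join(segments[1:-1])  # 'blog-2017'
    return site, sub_site_name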
def scrape_page_map(self, sub_site, url, bs):
    id = url
    pagemap = models.PageMap()
    pagemap.page_id = id
    pagemap.site = self.site
    pagemap.sub_site = sub_site
    pagemap.url = url
    # get posted date
    try:
        pagemap.posted_date = datetime.today()
        author_info_tag = bs.find("div", class_="author_info")
        published = author_info_tag.find('p', class_='date').text
        pagemap.posted_date = datetime.strptime(published, '%d-%b-%Y')
    except:
        pass
    try:
        box_1_tag = bs.find("div", class_="box_1")
        product_info_bar_tag = box_1_tag.find("div", class_="product_info_bar")
        # the original referenced the undefined name `product_info_bar` here,
        # so this branch always fell through to the bare except
        published = re.search(r'([0-9]{2}-[a-z,A-Z]{3}-[0-9]{4})',
                              product_info_bar_tag.text, re.MULTILINE)
        pagemap.posted_date = datetime.strptime(published.group(0), '%d-%b-%Y')
    except:
        pass
    # get page
    try:
        pagemap.page = bs.get_text()
        box_1_tag = bs.find("div", class_="box_1")
        pagemap.page = box_1_tag.text
        product_main_text_tag = box_1_tag.find("div", class_="product_main_text")
        if product_main_text_tag is not None:
            pagemap.page = product_main_text_tag.text
        else:
            story_tag = box_1_tag.find("div", class_="story")
            pagemap.page = story_tag.text
    except:
        pass
    # get title
    try:
        if bs.title is not None:
            pagemap.title = bs.title.text
        else:
            pagemap.title = ''
        box_1_tag = bs.find("div", class_="box_1")
        pagemap.title = box_1_tag.find("h1").text
    except:
        pass
    # get section
    try:
        box_2_tag = bs.find("div", class_="box_2")
        pagemap.section = box_2_tag.text.strip(' \t\n\r')
    except:
        pass
    data = elastic.convert_for_bulk(pagemap, 'update')
    return data
def crawl_studies_facts(survey_field, facts_d):
    bulk_data = []
    count = 0
    total_count = 0
    # facts_d is keyed by (blindcode, fact, answer) tuples; orient='index'
    # turns those keys into the DataFrame index with the value in column 0
    facts_df = DataFrame.from_dict(facts_d, orient='index')
    facts_df['blindcode'] = [ix[0] for ix in facts_df.index]
    facts_df['fact'] = [ix[1] for ix in facts_df.index]
    facts_df['answer'] = [ix[2] for ix in facts_df.index]
    for blindcode, facts_blindcode_df in facts_df.groupby(facts_df['blindcode']):
        se = models.StudiesMap()
        se.cft_id = blindcode
        se.dataset = survey_field
        se.ingr_name = blindcode
        se.IPC = blindcode
        percentile = {}
        for idx, fact_s in facts_blindcode_df.iterrows():
            fact = fact_s['fact']
            answer = fact_s['answer']
            #se.supplier = "CI"
            #se.olfactive = cft_s.olfactive
            #se.region = cft_s.region
            #se.review = cft_s.review
            #se.dilution = cft_s.dilution
            #se.intensity = cft_s.intensity
            if fact not in percentile.keys():
                percentile[fact] = []
            val = answer
            prc = fact_s[0]  # column 0 holds the percentage value
            if prc > 0:
                percentile[fact].append((val, prc))
        for fact in percentile.keys():
            if fact == 'emotion':
                se.emotion = percentile[fact]
            if fact == 'suitable_stage':
                se.suitable_stage = percentile[fact]
            if fact == 'hedonics':
                se.hedonics = percentile[fact]
            if fact == 'freshness':
                se.freshness = percentile[fact]
        data = elastic.convert_for_bulk(se, 'update')
        bulk_data.append(data)
        count = count + 1
        if count > 100:
            bulk(models.client, actions=bulk_data, stats_only=True)
            total_count = total_count + count
            print("crawl_studies_facts: written another batch, total written {0:d}"
                  .format(total_count))
            bulk_data = []
            count = 1
    bulk(models.client, actions=bulk_data, stats_only=True)
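
# Illustration (not part of the original module): the shape
# DataFrame.from_dict(..., orient='index') produces when the dict is keyed by
# (blindcode, fact, answer) tuples, which is what the loop above relies on.
def _demo_facts_frame():
    from pandas import DataFrame
    facts_d = {('B001', 'emotion', 'happy'): 0.42,   # hypothetical fact entries
               ('B001', 'hedonics', 'like'): 0.77}
    facts_df = DataFrame.from_dict(facts_d, orient='index')
    # the dict keys become the index and the values land in column 0,
    # which is why crawl_studies_facts reads the percentage via fact_s[0]
    facts_df['blindcode'] = [ix[0] for ix in facts_df.index]
    facts_df['fact'] = [ix[1] for ix in facts_df.index]
    facts_df['answer'] = [ix[2] for ix in facts_df.index]
    return facts_df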
def load_scentemotion(cft_filename):
    ml_file = 'data/' + cft_filename
    cft_df = pd.read_csv(ml_file, sep=';', encoding='ISO-8859-1', low_memory=False)
    cft_df.fillna(0, inplace=True)
    cft_df.index = cft_df['cft_id']
    bulk_data = []
    count = 0
    total_count = 0
    for cft_id, cft_s in cft_df.iterrows():
        se = models.ScentemotionMap()
        se.cft_id = cft_id
        se.dataset = "ingredients"
        se.ingr_name = cft_s.ingr_name
        se.IPC = cft_s.IPC
        se.supplier = cft_s.supplier
        se.olfactive = cft_s.olfactive
        se.region = cft_s.region
        se.review = cft_s.review
        se.dilution = cft_s.dilution
        se.intensity = cft_s.intensity
        percentile = {}
        for col in cft_s.index:
            col_l = col.split("_", 1)
            fct = col_l[0]
            if fct not in ["mood", "smell", "negative", "descriptor", "color", "texture"]:
                continue
            if fct not in percentile.keys():
                percentile[fct] = []
            val = col_l[1]
            prc = cft_s[col]
            if prc > 0:
                #percentile[fct].append((val, "{0:4.2f}".format(prc)))
                percentile[fct].append((val, prc))
        se.mood = percentile["mood"]
        se.smell = percentile["smell"]
        se.negative = percentile["negative"]
        se.descriptor = percentile["descriptor"]
        se.color = percentile["color"]
        se.texture = percentile["texture"]
        data = elastic.convert_for_bulk(se, 'update')
        bulk_data.append(data)
        count = count + 1
        if count > 100:
            bulk(models.client, actions=bulk_data, stats_only=True)
            total_count = total_count + count
            print("load_scentemotion: written another batch, total written {0:d}".format(total_count))
            bulk_data = []
            count = 1
    bulk(models.client, actions=bulk_data, stats_only=True)
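
# Illustration (not part of the original module): how the column names in the
# scentemotion CSV are assumed to split into a fact prefix and an answer label.
def _demo_fact_column_split():
    col = "mood_cheerful"          # hypothetical CSV column name
    fct, val = col.split("_", 1)   # maxsplit=1 keeps underscores in the label
    assert (fct, val) == ("mood", "cheerful")
    # "descriptor_woody_dry" -> ("descriptor", "woody_dry")
    return fct, val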
def scrape_page_map(self, sub_site, url, bs):
    id = url
    pagemap = models.PageMap()
    pagemap.page_id = id
    pagemap.site = self.site
    pagemap.sub_site = sub_site
    pagemap.url = url
    # get posted date
    # <span class="entry-date">May 23, 2017</span>
    try:
        pagemap.posted_date = datetime.today()
        entry_date_tag = bs.find("span", class_="entry-date")
        published = entry_date_tag.text
        pagemap.posted_date = datetime.strptime(published, '%B %d, %Y')
    except:
        pass
    #try:
    #    box_1_tag = bs.find("div", class_="box_1")
    #    product_info_bar_tag = box_1_tag.find("div", class_="product_info_bar")
    #    published = re.search(r'([0-9]{2}-[a-z,A-Z]{3}-[0-9]{4})', product_info_bar.text, re.MULTILINE)
    #    pagemap.posted_date = datetime.strptime(published.group(0), '%d-%b-%Y')
    #except:
    #    pass
    # get page
    # <section class="entry-content">
    try:
        pagemap.page = bs.get_text()
        entry_content_tag = bs.find("section", class_="entry-content")
        pagemap.page = entry_content_tag.text
    except:
        pass
    # get title
    # <h1 class="entry-title"></h1> text
    try:
        if bs.title is not None:
            pagemap.title = bs.title.text
        else:
            pagemap.title = ''
        entry_title_tag = bs.find("h1", class_="entry-title")
        pagemap.title = entry_title_tag.text
    except:
        pass
    # get section
    try:
        pagemap.section = sub_site
    except:
        pass
    data = elastic.convert_for_bulk(pagemap, 'update')
    return data
def scrape_page_map(self, sub_site, url, bs):
    id = url
    pagemap = models.PageMap()
    pagemap.page_id = id
    pagemap.site = self.site
    pagemap.sub_site = sub_site
    pagemap.url = url
    # get posted date
    # <span class="entry-date">May 23, 2017</span>
    try:
        pagemap.published_date = datetime.today()
        entry_date_tag = bs.find("span", class_="entry-date")
        published = entry_date_tag.text
        pagemap.published_date = datetime.strptime(published, '%B %d, %Y').date()
    except:
        pass
    # get page
    # <section class="entry-content">
    try:
        pagemap.page = bs.get_text()
        entry_content_tag = bs.find("section", class_="entry-content")
        pagemap.page = entry_content_tag.text
    except:
        pass
    # get title
    # <h1 class="entry-title"></h1> text
    try:
        if bs.title is not None:
            pagemap.title = bs.title.text
        else:
            pagemap.title = ''
        entry_title_tag = bs.find("h1", class_="entry-title")
        pagemap.title = entry_title_tag.text
    except:
        pass
    # get section
    try:
        pagemap.section = sub_site
    except:
        pass
    data = elastic.convert_for_bulk(pagemap, 'update')
    return data
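
# Illustration (not part of the original module): parsing the WordPress-style
# entry date shown in the comment above into a datetime.date.
def _demo_entry_date_parse():
    from datetime import datetime
    published = "May 23, 2017"  # text of <span class="entry-date">
    return datetime.strptime(published, '%B %d, %Y').date()  # datetime.date(2017, 5, 23)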
def scrape_page_map(self, sub_site, url, bs):
    id = url
    pagemap = models.PageMap()
    pagemap.page_id = id
    pagemap.site = self.site
    pagemap.sub_site = sub_site
    pagemap.url = url
    article_tag = bs.find('article')
    try:  # posted date
        published = article_tag.find('time').text
        pagemap.published_date = datetime.strptime(published, '%d-%b-%Y').date()
    except:
        pass
    try:  # title
        if bs.title is not None:
            pagemap.title = bs.title.text
        else:
            pagemap.title = article_tag.header.h1.text
    except:
        pass
    try:  # section
        if sub_site in ['Skin-care', 'Hair-care']:
            pagemap.section = article_tag.header.p.text.strip()
        else:
            pagemap.section = 'blog'
    except:
        pass
    try:  # img_src
        pagemap.img_src = article_tag.header.figure.img.attrs['src']
    except:
        pass
    try:  # page
        pagemap.page = article_tag.find('div', class_='Detail-content').text
    except:
        pass
    data = elastic.convert_for_bulk(pagemap, 'update')
    return data
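
# Illustration (not part of the original module): the BeautifulSoup attribute
# navigation used above (article_tag.header.figure.img) on a minimal document.
def _demo_bs4_navigation():
    from bs4 import BeautifulSoup
    html = ('<article><header><h1>Title</h1>'
            '<figure><img src="/img/x.jpg"></figure></header></article>')
    bs = BeautifulSoup(html, "lxml")
    article_tag = bs.find('article')
    title = article_tag.header.h1.text                     # 'Title'
    img_src = article_tag.header.figure.img.attrs['src']   # '/img/x.jpg'
    return title, img_src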
def push_review_to_index():
    count = 1
    data = []
    for scrape_perfume in models.scrape_li:
        review_count = 1
        scrape_reviews = scrape_perfume[1][4]
        for scrape_review in scrape_reviews:
            review = models.Review()
            review.reviewid = scrape_perfume[1][0] + "?review=" + str(review_count)
            review_count = review_count + 1
            review.perfume = scrape_perfume[0]
            review.review_date = datetime.strptime(scrape_review[0], '%b %d %Y').date()
            review.review = scrape_review[1]
            review.label = scrape_review[2]
            review.accords = scrape_perfume[1][1]
            review.img_src = scrape_perfume[1][5]
            # note: this caps indexing at the first ~100 reviews overall
            # instead of flushing every 100 as index_products_data does
            if count < 100:
                data.append(elastic.convert_for_bulk(review, 'update'))
                count = count + 1
    bulk(models.client, actions=data, stats_only=True)
def push_posts_to_index():
    id = 1
    data = []
    for index, sp_post in models.posts_df.iterrows():
        mi_post = models.PostMap()
        mi_post.post_id = sp_post.post_id
        if sp_post.editor_id in editor_map:
            mi_post.editor_id = editor_map[sp_post.editor_id]
        else:
            mi_post.editor_id = sp_post.editor_id
        mi_post.published_date = datetime.strptime(
            sp_post.published_date[0:10], '%Y-%m-%d').date()
        if len(sp_post.post_category_id['results']) > 0:
            post_category_id = sp_post.post_category_id['results'][0]
        else:
            post_category_id = 0
        if post_category_id in categoy_map:
            mi_post.post_category_id = categoy_map[post_category_id]
        else:
            mi_post.post_category_id = post_category_id
        mi_post.title = sp_post.title.encode("ascii", 'replace')
        mi_post.relevance, mi_post.subject, mi_post.topline, mi_post.source, mi_post.article = scrape_body(
            mi_post.title, sp_post.body.encode("ascii", 'replace'))
        try:
            mi_post.average_rating = float(sp_post.average_rating)
            mi_post.rating_count = int(sp_post.rating_count)
            mi_post.num_comments_id = int(sp_post.num_comments_id)
        except:
            print("conversion failed", sp_post.average_rating)
        data.append(elastic.convert_for_bulk(mi_post, 'update'))
        id = id + 1
    # To add a link the next URL is needed:
    # https://iffconnect.iff.com/Fragrances/marketintelligence/Lists/Posts/ViewPost.aspx?ID=2922
    bulk(models.client, actions=data, stats_only=True)
def crawl_feedly(from_dt, rss_field):
    global headers
    today = datetime.now()
    days = timedelta(days=31)
    yesterday = today - days
    s = yesterday.timestamp()  # 31-day default, overwritten below from from_dt
    t = time(0, 0)
    dt = datetime.combine(from_dt, t)
    s = dt.timestamp()
    #datetime.datetime.fromtimestamp(s).strftime('%c')
    ms = s * 1000
    newerthan = "{:.0f}".format(ms)
    headers = {
        #[email protected] (expires on 2020-jul-10)
        "Authorization": "Azsr6uaruKGMnymDVmYUkDrF33mC2csnyv1OScN4hpsnH5w2ngb0zEBlwyAo4izpB3W3a2RYDAW99xYFM61U5g0U13M59tiAjZFqHkVpAXVeG8PAYl5Y060wwErrxvjj12UNeQ4bk23mzCcoa9AAJtBvUMl_DZl2-jaX0cf_vmlZuVMQh-B2Srv1FUEkno3fbVJtTdZeOc1YP29aRluNyYndpm2CWYKFjaeL1LicHbObhdjgHQAZ-EFUUDCA:feedlydev"
    }
    params_streams = {
        # "count" : "100",
        "count": "1000",
        "ranked": "newest",
        "unreadOnly": "false",
        "newerThan": newerthan
    }
    #url = "http://cloud.feedly.com/v3/profile"
    #r = requests.get(url, headers=headers)
    url = "http://cloud.feedly.com/v3/subscriptions"
    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        return False
    feeds = r.json()
    for feed in feeds:
        feed_id = feed['id']
        feed_title = feed['title'].encode("ascii", 'replace')
        # the category label can contain the subset and category name
        category_label = feed['categories'][0]['label']
        label_split = category_label.split('-')
        if len(label_split) > 1:
            feed_subset = label_split[0].strip()
            feed_category = label_split[1].strip()
        else:
            feed_subset = 'SI'
            feed_category = label_split[0].strip()
        print("crawl_feedly: scraping feed category/title", feed_category, feed_title)
        if rss_field == '' or category_label == rss_field:
            url = "http://cloud.feedly.com/v3/streams/contents"
            params_streams['streamId'] = feed_id
            r = requests.get(url, headers=headers, params=params_streams)
            stream = r.json()
            if 'items' in stream:
                bulk_data = []
                for entry in stream['items']:
                    feedlymap = models.FeedlyMap()
                    feedlymap.post_id = entry['id']
                    feedlymap.url = ""
                    try:
                        feedlymap.published_date = datetime.fromtimestamp(entry['published'] / 1000)
                    except:
                        last_year = datetime.now().year - 1
                        feedlymap.published_date = datetime(last_year, 1, 1, 00, 00, 00)
                    feedlymap.subset = feed_subset
                    feedlymap.category = feed_category
                    feedlymap.feed = feed_title
                    if 'topics' in feed:
                        feedlymap.feed_topics = feed['topics']
                    if 'keywords' in entry:
                        feedlymap.body_topics = entry['keywords']
                    if 'title' in entry:
                        feedlymap.title = entry['title']
                    if 'canonicalUrl' in entry:
                        feedlymap.url = entry['canonicalUrl']
                    if len(feedlymap.url) == 0:
                        if 'originId' in entry:
                            # note: find() returns 0 when originId itself starts
                            # with 'http'; that case is skipped here
                            n = entry['originId'].find('http')
                            if n > 0:
                                feedlymap.url = entry['originId'][n:]
                    if len(feedlymap.url) == 0:
                        if 'origin' in entry:
                            origin = entry['origin']
                            feedlymap.url = origin['htmlUrl']
                    feedlymap.post_id = feedlymap.url
                    if 'summary' in entry:
                        bs = BeautifulSoup(entry['summary']['content'], "lxml")  # in case of RSS feed
                    if 'content' in entry:
                        bs = BeautifulSoup(entry['content']['content'], "lxml")  # in case of Google News feed
                    feedlymap.body = bs.get_text().encode("ascii", 'replace')
                    data = elastic.convert_for_bulk(feedlymap, 'update')
                    bulk_data.append(data)
                bulk(models.client, actions=bulk_data, stats_only=True)
    return True
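
# Illustration (not part of the original module): how the feedly `newerThan`
# parameter above is derived, i.e. midnight of the given date in epoch milliseconds.
def _demo_newerthan():
    from datetime import date, datetime, time
    from_dt = date(2018, 7, 1)                  # hypothetical start date
    dt = datetime.combine(from_dt, time(0, 0))  # midnight, local time
    newerthan = "{:.0f}".format(dt.timestamp() * 1000)
    return newerthan                            # e.g. '1530396000000' (timezone-dependent)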
def crawl_feedly(from_date, rss_field):
    global headers
    bulk_data = []
    today = datetime.now()
    days = timedelta(days=31)
    yesterday = today - days
    s = yesterday.timestamp()
    t = time(0, 0)
    dt = datetime.combine(from_date, t)
    s = dt.timestamp()
    #datetime.datetime.fromtimestamp(s).strftime('%c')
    ms = s * 1000
    newerthan = "{:.0f}".format(ms)
    headers = {
        #[email protected] (expires on 2017-07-20)
        "Authorization": "A2JxorrfeTBQbMUsDIU3_zexSwY8191e3P9EvewYowjfbhKwOgHk84ErlXAWXpucZ_McfTDHLZN6yLxWqxgjWM8Upp1c-6Nb_RpZd0jWA9mJkVLN1JTETefaVNZtZqzTGTf8_qeT2ZE8z6Bf4LqLOUfQaQH2-jj8XIaxAyWMZ5BDRtfpgwVYrEEM2ii5KXnMJZxGNEvcqAV4Dke_subaM-wlnC8N63g:feedlydev"
    }
    params_streams = {
        # "count" : "100",
        "count": "1000",
        "ranked": "newest",
        "unreadOnly": "false",
        "newerThan": newerthan
    }
    #url = "http://cloud.feedly.com/v3/profile"
    #r = requests.get(url, headers=headers)
    url = "http://cloud.feedly.com/v3/subscriptions"
    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        return False
    feeds = r.json()
    for feed in feeds:
        feed_id = feed['id']
        feed_title = feed['title'].encode("ascii", 'replace')
        feed_category = feed['categories'][0]['label']
        print("crawl_feedly: scraping feed category/title", feed_category, feed_title)
        if rss_field == '' or feed_category == rss_field:
            url = "http://cloud.feedly.com/v3/streams/contents"
            params_streams['streamId'] = feed_id
            r = requests.get(url, headers=headers, params=params_streams)
            stream = r.json()
            if 'items' in stream:
                for entry in stream['items']:
                    feedlymap = models.FeedlyMap()
                    feedlymap.post_id = entry['id']
                    try:
                        feedlymap.published_date = datetime.fromtimestamp(entry['published'] / 1000)
                    except:
                        feedlymap.published_date = datetime(2010, 1, 1, 00, 00, 00)
                    feedlymap.category = feed_category
                    feedlymap.feed = feed_title
                    if 'topics' in feed:
                        feedlymap.feed_topics = feed['topics']
                    if 'keywords' in entry:
                        feedlymap.body_topics = entry['keywords']
                    feedlymap.title = entry['title']
                    if 'canonicalUrl' in entry:
                        feedlymap.url = entry['canonicalUrl']
                    else:
                        n = entry['originId'].find('http')
                        feedlymap.url = entry['originId'][n:]
                    feedlymap.post_id = feedlymap.url
                    if 'summary' in entry:
                        bs = BeautifulSoup(entry['summary']['content'], "lxml")  # in case of RSS feed
                    if 'content' in entry:
                        bs = BeautifulSoup(entry['content']['content'], "lxml")  # in case of Google News feed
                    feedlymap.body = bs.get_text().encode("ascii", 'replace')
                    data = elastic.convert_for_bulk(feedlymap, 'update')
                    bulk_data.append(data)
    bulk(models.client, actions=bulk_data, stats_only=True)
    return True
def crawl_survey(survey_filename):
    ml_file = 'data/' + survey_filename
    survey_df = pd.read_csv(ml_file, sep=';', encoding='ISO-8859-1', low_memory=False)
    survey_df.fillna(0, inplace=True)
    # col_map[column]: (field, question, answer, dashboard)
    # field_map[field]: [(question=0, answer=1, column=2)]
    field_map, col_map = survey.map_columns(survey_df.columns)
    survey_df.index = survey_df[field_map['resp_id'][0][2]]
    bulk_data = []
    count = 0
    total_count = 0
    for resp_id, survey_s in survey_df.iterrows():
        sl = models.SurveyMap()
        resp_id = survey.answer_value_to_string(survey_s[field_map['resp_id'][0][2]])
        blindcode = survey.answer_value_to_string(survey_s[field_map['blindcode'][0][2]])
        sl.resp_id = resp_id + "_" + blindcode
        sl.survey = survey_filename
        sl.children = {}
        sl.concept = {}
        sl.emotion = {}
        sl.fragrattr = {}
        sl.mood = {}
        sl.smell = {}
        sl.suitable_product = {}
        sl.suitable_stage = {}
        sl.question = {}
        for field, maps in field_map.items():
            # resp_id is the unique id of the record, this is already set above
            if field == 'resp_id':
                continue
            # map: 0=question, 1=answer, 2=column
            map = maps[0]
            answer_value = survey_s[map[2]]
            answer_value = survey.answer_value_to_string(answer_value)
            answer_value = survey.answer_value_encode(map[1], answer_value)
            # column mapping, no question
            if map[0] is None:
                # in case of multiple mappings search for the column that has a value
                for ix in range(1, len(maps)):
                    map = maps[ix]
                    answer_value_2 = survey_s[map[2]]
                    answer_value_2 = survey.answer_value_to_string(answer_value_2)
                    if field == 'blindcode':
                        answer_value = answer_value + '-' + answer_value_2[:3]
                    else:
                        if len(answer_value_2) > len(answer_value):
                            answer_value = answer_value_2
                setattr(sl, field, answer_value)
            # answer mapping
            else:
                setattr(sl, field, {map[1]: answer_value})
                attr = getattr(sl, field)
                for ix in range(1, len(maps)):
                    map = maps[ix]
                    answer_value = survey_s[map[2]]
                    answer_value = survey.answer_value_to_string(answer_value)
                    answer_value = survey.answer_value_encode(map[1], answer_value)
                    attr[map[1]] = answer_value
                    #attr.append({map[1]: answer_value})
        data = elastic.convert_for_bulk(sl, 'update')
        bulk_data.append(data)
        count = count + 1
        if count > 100:
            bulk(models.client, actions=bulk_data, stats_only=True)
            total_count = total_count + count
            print("crawl_survey: written another batch, total written {0:d}".format(total_count))
            bulk_data = []
            count = 1
        #break
    bulk(models.client, actions=bulk_data, stats_only=True)
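
# Illustration (not part of the original module): the mapping shapes implied by
# the comments in crawl_survey above; survey.map_columns is project-local, so
# these literal values are assumptions.
def _demo_survey_maps():
    # col_map[column] -> (field, question, answer, dashboard)
    col_map = {'Q5_emotion_happy': ('emotion', 'Q5', 'happy', True)}
    # field_map[field] -> list of (question, answer, column) tuples;
    # a question of None marks a plain column mapping
    field_map = {
        'resp_id': [(None, None, 'Respondent')],
        'emotion': [('Q5', 'happy', 'Q5_emotion_happy'),
                    ('Q5', 'calm', 'Q5_emotion_calm')],
    }
    column = field_map['resp_id'][0][2]  # 'Respondent', indexed as above
    return col_map, field_map, column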
def crawl_feedly(from_date, rss_field):
    global headers
    today = datetime.now()
    days = timedelta(days=31)
    yesterday = today - days
    s = yesterday.timestamp()
    t = time(0, 0)
    dt = datetime.combine(from_date, t)
    s = dt.timestamp()
    #datetime.datetime.fromtimestamp(s).strftime('%c')
    ms = s * 1000
    newerthan = "{:.0f}".format(ms)
    headers = {
        #[email protected] (expires on 2018-feb-02)
        "Authorization": "A1j2bsImQdCENT7FyWxSWABu7_KwSQOKNvAySLwJQlQT3QoRlpur6iG56Xju8owoOfMF7byi1ApQUIHbUpsEBoFH-CijTCUi72hl1U1MG7eaY07ctFiEbL-e9D17yUdq3OT3iRoE04F0_1h-JcUBP513gnObI0JxD0LQk4bagAv3b22ot3jbXLoLoQgBPbBf4eKS97oyGntWM_3GMa66m1ElrAeP5R42V25WPqXZmmEwAouivQp31kDLxqFLIA:feedlydev"
    }
    params_streams = {
        # "count" : "100",
        "count": "1000",
        "ranked": "newest",
        "unreadOnly": "false",
        "newerThan": newerthan
    }
    #url = "http://cloud.feedly.com/v3/profile"
    #r = requests.get(url, headers=headers)
    url = "http://cloud.feedly.com/v3/subscriptions"
    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        return False
    feeds = r.json()
    for feed in feeds:
        feed_id = feed['id']
        feed_title = feed['title'].encode("ascii", 'replace')
        # the category label can contain the subset and category name
        category_label = feed['categories'][0]['label']
        label_split = category_label.split('-')
        if len(label_split) > 1:
            feed_subset = label_split[0].strip()
            feed_category = label_split[1].strip()
        else:
            feed_subset = 'SI'
            feed_category = label_split[0].strip()
        print("crawl_feedly: scraping feed category/title", feed_category, feed_title)
        if rss_field == '' or category_label == rss_field:
            url = "http://cloud.feedly.com/v3/streams/contents"
            params_streams['streamId'] = feed_id
            r = requests.get(url, headers=headers, params=params_streams)
            stream = r.json()
            if 'items' in stream:
                bulk_data = []
                for entry in stream['items']:
                    feedlymap = models.FeedlyMap()
                    feedlymap.post_id = entry['id']
                    try:
                        feedlymap.published_date = datetime.fromtimestamp(entry['published'] / 1000)
                    except:
                        last_year = datetime.now().year - 1
                        feedlymap.published_date = datetime(last_year, 1, 1, 00, 00, 00)
                    feedlymap.subset = feed_subset
                    feedlymap.category = feed_category
                    feedlymap.feed = feed_title
                    if 'topics' in feed:
                        feedlymap.feed_topics = feed['topics']
                    if 'keywords' in entry:
                        feedlymap.body_topics = entry['keywords']
                    if 'title' in entry:
                        feedlymap.title = entry['title']
                    if 'canonicalUrl' in entry:
                        feedlymap.url = entry['canonicalUrl']
                    else:
                        if 'originId' in entry:
                            n = entry['originId'].find('http')
                            feedlymap.url = entry['originId'][n:]
                        elif 'origin' in entry:
                            origin = entry['origin']
                            feedlymap.url = origin['htmlUrl']
                    feedlymap.post_id = feedlymap.url
                    if 'summary' in entry:
                        bs = BeautifulSoup(entry['summary']['content'], "lxml")  # in case of RSS feed
                    if 'content' in entry:
                        bs = BeautifulSoup(entry['content']['content'], "lxml")  # in case of Google News feed
                    feedlymap.body = bs.get_text().encode("ascii", 'replace')
                    data = elastic.convert_for_bulk(feedlymap, 'update')
                    bulk_data.append(data)
                bulk(models.client, actions=bulk_data, stats_only=True)
    return True
def crawl_feedly(from_dt, rss_field):
    global headers
    today = datetime.now()
    days = timedelta(days=31)
    yesterday = today - days
    s = yesterday.timestamp()
    t = time(0, 0)
    dt = datetime.combine(from_dt, t)
    s = dt.timestamp()
    #datetime.datetime.fromtimestamp(s).strftime('%c')
    ms = s * 1000
    newerthan = "{:.0f}".format(ms)
    headers = {
        #[email protected] (expires on 2018-aug-26)
        "Authorization": "A3iuGsp9UjnsSiLwl5ZoPrLZj3mO4d16muxgezgpLesPhJ4YoKgC0XdiW_ucnm7b1Z-o5DKK6oLqoW9SRNUkoTcQ8npBBmqbOF03zF3tFWaNI0Lir_hrAahmVuypG5BXVZidJJ4PuaXr4zg5pYRE32OxO0N05X_A2sdZC93oWwQU1GVLJ9evh3qmu0WXYPVXpxffytgnFjUg2JB1zGK3KJkbDl-6ioJudiD2IZczA0R52tPwFZZ0FimkE3zV:feedlydev"
    }
    params_streams = {
        # "count" : "100",
        "count": "1000",
        "ranked": "newest",
        "unreadOnly": "false",
        "newerThan": newerthan
    }
    #url = "http://cloud.feedly.com/v3/profile"
    #r = requests.get(url, headers=headers)
    url = "http://cloud.feedly.com/v3/subscriptions"
    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        return False
    feeds = r.json()
    for feed in feeds:
        feed_id = feed['id']
        feed_title = feed['title'].encode("ascii", 'replace')
        # the category label can contain the subset and category name
        category_label = feed['categories'][0]['label']
        label_split = category_label.split('-')
        if len(label_split) > 1:
            feed_subset = label_split[0].strip()
            feed_category = label_split[1].strip()
        else:
            feed_subset = 'SI'
            feed_category = label_split[0].strip()
        print("crawl_feedly: scraping feed category/title", feed_category, feed_title)
        if rss_field == '' or category_label == rss_field:
            url = "http://cloud.feedly.com/v3/streams/contents"
            params_streams['streamId'] = feed_id
            r = requests.get(url, headers=headers, params=params_streams)
            stream = r.json()
            if 'items' in stream:
                bulk_data = []
                for entry in stream['items']:
                    feedlymap = models.FeedlyMap()
                    feedlymap.post_id = entry['id']
                    feedlymap.url = ""
                    try:
                        feedlymap.published_date = datetime.fromtimestamp(entry['published'] / 1000)
                    except:
                        last_year = datetime.now().year - 1
                        feedlymap.published_date = datetime(last_year, 1, 1, 00, 00, 00)
                    feedlymap.subset = feed_subset
                    feedlymap.category = feed_category
                    feedlymap.feed = feed_title
                    if 'topics' in feed:
                        feedlymap.feed_topics = feed['topics']
                    if 'keywords' in entry:
                        feedlymap.body_topics = entry['keywords']
                    if 'title' in entry:
                        feedlymap.title = entry['title']
                    if 'canonicalUrl' in entry:
                        feedlymap.url = entry['canonicalUrl']
                    if len(feedlymap.url) == 0:
                        if 'originId' in entry:
                            n = entry['originId'].find('http')
                            if n > 0:
                                feedlymap.url = entry['originId'][n:]
                    if len(feedlymap.url) == 0:
                        if 'origin' in entry:
                            origin = entry['origin']
                            feedlymap.url = origin['htmlUrl']
                    feedlymap.post_id = feedlymap.url
                    if 'summary' in entry:
                        bs = BeautifulSoup(entry['summary']['content'], "lxml")  # in case of RSS feed
                    if 'content' in entry:
                        bs = BeautifulSoup(entry['content']['content'], "lxml")  # in case of Google News feed
                    feedlymap.body = bs.get_text().encode("ascii", 'replace')
                    data = elastic.convert_for_bulk(feedlymap, 'update')
                    bulk_data.append(data)
                bulk(models.client, actions=bulk_data, stats_only=True)
    return True
def crawl_feedly(from_date, rss_field):
    global headers
    today = datetime.now()
    days = timedelta(days=31)
    yesterday = today - days
    s = yesterday.timestamp()
    t = time(0, 0)
    dt = datetime.combine(from_date, t)
    s = dt.timestamp()
    #datetime.datetime.fromtimestamp(s).strftime('%c')
    ms = s * 1000
    newerthan = "{:.0f}".format(ms)
    headers = {
        #[email protected] (expires on 2017-dec-07)
        "Authorization": "AzNr8sCFyRuIX3upnzA-VnUUebvthkUEF0R9bccg352muEznNt9hK9m4kj8ljkQvFfoVGDYHZcLBFKuFgXRVy4HN1sVV2WYowIsQZ7lTGxB9WYNqxRGimPyZUAijHL7ugMo9hxRgYij_rOonwruuus3O2BQe7U_sNGy_SKL6nmEVDh-DsQL5EOVM34C3-0tcATwEMoaQxUUQ78bAJ6i3HrnLy8NPUg:feedlydev"
    }
    params_streams = {
        # "count" : "100",
        "count": "1000",
        "ranked": "newest",
        "unreadOnly": "false",
        "newerThan": newerthan
    }
    #url = "http://cloud.feedly.com/v3/profile"
    #r = requests.get(url, headers=headers)
    url = "http://cloud.feedly.com/v3/subscriptions"
    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        return False
    feeds = r.json()
    for feed in feeds:
        feed_id = feed['id']
        feed_title = feed['title'].encode("ascii", 'replace')
        # the category label can contain the subset and category name
        category_label = feed['categories'][0]['label']
        label_split = category_label.split('-')
        if len(label_split) > 1:
            feed_subset = label_split[0].strip()
            feed_category = label_split[1].strip()
        else:
            feed_subset = 'SI'
            feed_category = label_split[0].strip()
        print("crawl_feedly: scraping feed category/title", feed_category, feed_title)
        if rss_field == '' or category_label == rss_field:
            url = "http://cloud.feedly.com/v3/streams/contents"
            params_streams['streamId'] = feed_id
            r = requests.get(url, headers=headers, params=params_streams)
            stream = r.json()
            if 'items' in stream:
                bulk_data = []
                for entry in stream['items']:
                    feedlymap = models.FeedlyMap()
                    feedlymap.post_id = entry['id']
                    try:
                        feedlymap.published_date = datetime.fromtimestamp(entry['published'] / 1000)
                    except:
                        last_year = datetime.now().year - 1
                        feedlymap.published_date = datetime(last_year, 1, 1, 00, 00, 00)
                    feedlymap.subset = feed_subset
                    feedlymap.category = feed_category
                    feedlymap.feed = feed_title
                    if 'topics' in feed:
                        feedlymap.feed_topics = feed['topics']
                    if 'keywords' in entry:
                        feedlymap.body_topics = entry['keywords']
                    if 'title' in entry:
                        feedlymap.title = entry['title']
                    if 'canonicalUrl' in entry:
                        feedlymap.url = entry['canonicalUrl']
                    else:
                        if 'originId' in entry:
                            n = entry['originId'].find('http')
                            feedlymap.url = entry['originId'][n:]
                        elif 'origin' in entry:
                            origin = entry['origin']
                            feedlymap.url = origin['htmlUrl']
                    feedlymap.post_id = feedlymap.url
                    if 'summary' in entry:
                        bs = BeautifulSoup(entry['summary']['content'], "lxml")  # in case of RSS feed
                    if 'content' in entry:
                        bs = BeautifulSoup(entry['content']['content'], "lxml")  # in case of Google News feed
                    feedlymap.body = bs.get_text().encode("ascii", 'replace')
                    data = elastic.convert_for_bulk(feedlymap, 'update')
                    bulk_data.append(data)
                bulk(models.client, actions=bulk_data, stats_only=True)
    return True
def load_mail(email_choices, email_address, email_password):
    #server = IMAPClient('imap.kpnmail.nl', use_uid=True)
    server = IMAPClient('imap.deheerlijkekeuken.nl', use_uid=True, ssl=False)
    resp = server.login(email_address, email_password)
    resp = resp.decode()
    if resp != "LOGIN Ok.":
        return False
    select_info = server.select_folder('INBOX')
    print('%d messages in INBOX' % select_info[b'EXISTS'])
    messages = server.search(['ALL'])
    response = server.fetch(messages, ['ENVELOPE', 'RFC822', 'BODY[TEXT]'])
    server.logout()
    text_maker = html2text.HTML2Text()
    text_maker.ignore_links = True
    text_maker.bypass_tables = False
    bulk_data = []
    count = 0
    total_count = 0
    for msgid, raw_email in response.items():
        envelope = raw_email[b'ENVELOPE']
        post_id = envelope.message_id.decode()
        subject = envelope.subject.decode()
        from_addr = envelope.from_[0].mailbox.decode() + '@' + envelope.from_[0].host.decode()
        to_addr = envelope.to[0].mailbox.decode() + '@' + envelope.to[0].host.decode()
        print('ID #%d: "%s" received %s' % (msgid, subject, envelope.date))
        raw_email_string = raw_email[b'RFC822'].decode('utf-8')
        email_message = email.message_from_string(raw_email_string)
        body_text = ""
        links = set()
        link_bodies = []  # initialized here so plain-text mails do not leave it undefined
        # this will loop through all the available multiparts in mail
        for part in email_message.walk():
            if part.get_content_type() == "text/plain":  # ignore attachments
                body = part.get_payload(decode=True)
                body_text = body.decode('utf-8')
            if part.get_content_type() == "text/html":  # ignore attachments
                body = part.get_payload(decode=True)
                body = body.decode('utf-8').strip()
                bs = BeautifulSoup(body, "lxml")
                body_tag = bs.find('body')
                body_text = body_tag.text
                links = mail.get_href_links(subject, bs)
                link_bodies = mail.get_href_link_bodies(links)
                #body = text_maker.handle(body)
                break
        # for forwarded mail, recover the original headers from the body text
        from_index = body_text.find("From:")
        if from_index > 0:
            nl_index = body_text.find("\n", from_index)
            txt = body_text[from_index + 6:nl_index].replace('\r', '')
            from_addr = txt
        sent_index = body_text.find("Sent:")
        if sent_index > 0:
            nl_index = body_text.find("\n", sent_index)
            txt = body_text[sent_index + 5:nl_index].strip().split(' ')
            txt = ' '.join(txt[1:4])
            #conversion fails because of month in local language
            #published_date = datetime.strptime(txt, "%d %B %Y").date()
        to_index = body_text.find("To:")
        if to_index > 0:
            nl_index = body_text.find("\n", to_index)
            txt = body_text[to_index + 3:nl_index]
            to_addr = txt
        subject_index = body_text.find("Subject:")
        if subject_index > 0:
            nl_index = body_text.find("\n", subject_index)
            txt = body_text[subject_index + 9:nl_index]
            subject = txt
        # str.replace returns a new string; the original discarded these results
        body_text = body_text.replace("\r\n", " ")
        body_text = body_text.replace("\n", " ")
        body_text = body_text.replace("  ", " ")
        mail_doc = models.MailMap()
        mail_doc.post_id = msgid
        mail_doc.to_addr = to_addr
        mail_doc.from_addr = from_addr
        mail_doc.published_date = envelope.date.date()
        #mail_doc.links = [link[0] for link in links]
        mail_doc.links = link_bodies
        mail_doc.subject = subject
        mail_doc.url = ""
        mail_doc.body = body_text
        data = elastic.convert_for_bulk(mail_doc, 'update')
        bulk_data.append(data)
        count = count + 1
        if count > 100:
            bulk(models.client, actions=bulk_data, stats_only=True)
            total_count = total_count + count
            print("load_mail: written another batch, total written {0:d}".format(total_count))
            bulk_data = []
            count = 1
    bulk(models.client, actions=bulk_data, stats_only=True)
    return True
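
# Illustration (not part of the original module): how an IMAPClient ENVELOPE
# address is assembled into "mailbox@host", as done in load_mail above.
# imapclient.response_types.Address is a namedtuple in the imapclient package;
# the literal values here are made up.
def _demo_envelope_addr():
    from imapclient.response_types import Address
    addr = Address(name=b'Jane', route=None, mailbox=b'jane', host=b'example.com')
    return addr.mailbox.decode() + '@' + addr.host.decode()  # 'jane@example.com'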
def crawl_feedly(from_date, rss_field):
    global headers
    today = datetime.now()
    days = timedelta(days=31)
    yesterday = today - days
    s = yesterday.timestamp()
    t = time(0, 0)
    dt = datetime.combine(from_date, t)
    s = dt.timestamp()
    #datetime.datetime.fromtimestamp(s).strftime('%c')
    ms = s * 1000
    newerthan = "{:.0f}".format(ms)
    headers = {
        #[email protected] (expires on 2017-10-04)
        "Authorization": "A2nU8r1LuQ_wUuYHftraCIc0imow9HY7GYB1qxm-OeaU--I-cVt69lCZfEkvsOSX8R9qI6C6ABH5Nq1XKFnKX6JlkY_myGM_hfksTQe4wmWlqRxj-LBQ7n9UhIL1oXfAf80jAVhiz6w8tB9ToYV_YwB47sHASzTMlybx-5bXgmu9gtR-N-FUKByfgihrIjpShy6hMwHYYnKhz73DfQ3JhMCAdAqL1RA:feedlydev"
    }
    params_streams = {
        # "count" : "100",
        "count": "1000",
        "ranked": "newest",
        "unreadOnly": "false",
        "newerThan": newerthan
    }
    #url = "http://cloud.feedly.com/v3/profile"
    #r = requests.get(url, headers=headers)
    url = "http://cloud.feedly.com/v3/subscriptions"
    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        return False
    feeds = r.json()
    for feed in feeds:
        feed_id = feed['id']
        feed_title = feed['title'].encode("ascii", 'replace')
        # the category label can contain the subset and category name
        category_label = feed['categories'][0]['label']
        label_split = category_label.split('-')
        if len(label_split) > 1:
            feed_subset = label_split[0].strip()
            feed_category = label_split[1].strip()
        else:
            feed_subset = 'SI'
            feed_category = label_split[0].strip()
        print("crawl_feedly: scraping feed category/title", feed_category, feed_title)
        if rss_field == '' or category_label == rss_field:
            url = "http://cloud.feedly.com/v3/streams/contents"
            params_streams['streamId'] = feed_id
            r = requests.get(url, headers=headers, params=params_streams)
            stream = r.json()
            if 'items' in stream:
                bulk_data = []
                for entry in stream['items']:
                    feedlymap = models.FeedlyMap()
                    feedlymap.post_id = entry['id']
                    try:
                        feedlymap.published_date = datetime.fromtimestamp(entry['published'] / 1000)
                    except:
                        last_year = datetime.now().year - 1
                        feedlymap.published_date = datetime(last_year, 1, 1, 00, 00, 00)
                    feedlymap.subset = feed_subset
                    feedlymap.category = feed_category
                    feedlymap.feed = feed_title
                    if 'topics' in feed:
                        feedlymap.feed_topics = feed['topics']
                    if 'keywords' in entry:
                        feedlymap.body_topics = entry['keywords']
                    feedlymap.title = entry['title']
                    if 'canonicalUrl' in entry:
                        feedlymap.url = entry['canonicalUrl']
                    else:
                        if 'originId' in entry:
                            n = entry['originId'].find('http')
                            feedlymap.url = entry['originId'][n:]
                        elif 'origin' in entry:
                            origin = entry['origin']
                            feedlymap.url = origin['htmlUrl']
                    feedlymap.post_id = feedlymap.url
                    if 'summary' in entry:
                        bs = BeautifulSoup(entry['summary']['content'], "lxml")  # in case of RSS feed
                    if 'content' in entry:
                        bs = BeautifulSoup(entry['content']['content'], "lxml")  # in case of Google News feed
                    feedlymap.body = bs.get_text().encode("ascii", 'replace')
                    data = elastic.convert_for_bulk(feedlymap, 'update')
                    bulk_data.append(data)
                bulk(models.client, actions=bulk_data, stats_only=True)
    return True