def mapper1(key, value):
    """Emit one ``((key, issue_id, word), 1)`` pair per feature word of a document.

    Input::

        K => row_id
        V => '1\\t<html>...Earthquake strikes in <b>Chile!</b>...</html>'

    where the text before the first tab (``'1'`` above) is the issue ID and
    everything after it is the raw HTML document.

    Output (illustrative; the exact words depend on ``search_features``)::

        K => (row_id, '1', 'earthquake'), V => 1
        K => (row_id, '1', 'strike'),     V => 1
        K => (row_id, '1', 'chile'),      V => 1
        ...

    Args:
        key: Row identifier; passed through unchanged as the first element
            of every emitted key.
        value: String of the form ``"<issue_id>\\t<html>"``.

    Yields:
        ``((key, issue_id, word), 1)`` for every word returned by
        ``search_features``. ``issue_id`` is the substring taken from
        ``value`` and is not converted to a number.

    Raises:
        ValueError: If ``value`` contains no tab separator.
    """
    # Split on the first tab only: the HTML payload may itself contain tab
    # characters, and an unbounded split would break the two-way unpacking.
    issue_id, doc = value.split('\t', 1)
    doc = html_to_story(doc)
    for word in search_features(doc):
        yield (key, issue_id, word), 1
def fetch_url(self, url):
    """Fetch ``url`` through the parent crawler and pass the result to ``html_to_story``.

    Args:
        url: Address to fetch. A falsy value (``None``, ``''``) skips the
            fetch entirely.

    Returns:
        ``''`` when ``url`` is falsy; the parent's result unchanged when that
        result is falsy; otherwise ``html_to_story`` applied to the parent's
        result.
    """
    # Nothing to fetch: mirror the empty-string result for a missing URL.
    if not url:
        return ''

    fetched = super(SocialMediaCrawler, self).fetch_url(url)

    # An empty/failed fetch is handed back untouched, without conversion.
    if not fetched:
        return fetched

    return html_to_story(fetched)