Esempio n. 1
0
def mapper1(key, value):
    """Emit one ``((key, issue_id, word), 1)`` pair per feature word of a document.

    Input looks like:
    K=>row_id, V=>"1<TAB><html>...Earthquake strikes in <b>Chile!</b>...</html>"
    where the text before the first tab (here "1") is the issue ID.

    Output, one pair per word returned by ``search_features`` (the exact
    words depend on that helper; these are illustrative):
    K=>(row_id, '1', 'earthquake'), V=>1
    K=>(row_id, '1', 'strike'), V=>1
    K=>(row_id, '1', 'chile'), V=>1
    ...

    Raises:
        ValueError: on first iteration, if ``value`` contains no tab.
    """
    # Split on the first tab only: the document part may itself contain tab
    # characters, which would otherwise break the two-name unpacking.
    issue_id, doc = value.split('\t', 1)

    doc = html_to_story(doc)

    for word in search_features(doc):
        yield (key, issue_id, word), 1
def mapper1(key, value):
    """Map a ``issue_id<TAB>html`` record to per-word count pairs.

    Starting with input like:
    K=>row_id, V=>"1<TAB><html>...Earthquake strikes in <b>Chile!</b>...</html>"
    where 1 (everything before the first tab) is the issue ID.

    Output is a stream of ``((row_id, issue_id, word), 1)`` pairs, e.g.
    (illustrative; the actual words come from ``search_features``):
    K=>(row_id, '1', 'earthquake'), V=>1
    K=>(row_id, '1', 'strike'), V=>1
    K=>(row_id, '1', 'chile'), V=>1
    ...

    Raises:
        ValueError: when the generator is first advanced and ``value`` has
            no tab separating the issue ID from the document.
    """
    # maxsplit=1 keeps any tabs inside the HTML body as part of the document
    # instead of producing extra fields that the unpacking cannot absorb.
    issue_id, doc = value.split('\t', 1)

    doc = html_to_story(doc)

    for word in search_features(doc):
        yield (key, issue_id, word), 1
Esempio n. 3
0
 def fetch_url(self, url):
     """Fetch ``url`` through the parent class and pass the content to ``html_to_story``.

     Returns ``''`` when ``url`` is falsy. If the parent's ``fetch_url``
     returns a falsy value, that value is returned unchanged; otherwise the
     result of ``html_to_story`` on the fetched data is returned.
     """
     # No URL: nothing to fetch.
     if not url:
         return ''

     fetched = super(SocialMediaCrawler, self).fetch_url(url)
     # Leave an empty/failed fetch untouched rather than converting it.
     if not fetched:
         return fetched

     return html_to_story(fetched)
Esempio n. 4
0
    def fetch_url(self, url):
        """Return the story text for ``url``, or a falsy value if nothing was fetched.

        A falsy ``url`` short-circuits to ``''`` without calling the parent
        class. Otherwise the parent's ``fetch_url`` result is run through
        ``html_to_story`` when it is truthy, and returned as-is when it is not.
        """
        content = ''
        if url:
            content = super(SocialMediaCrawler, self).fetch_url(url)

        # Only non-empty content is converted; falsy results pass through.
        return html_to_story(content) if content else content