Exemple #1
0
def html_to_story(doc, strip_control_chars=True):
    """Extract the story text from an HTML document via Data Science Toolkit.

    ``doc`` is sent as the request body to the ``html2story`` endpoint under
    ``settings.DSTK_API_BASE`` (urllib2 issues a POST when ``data`` is
    supplied), and the ``story`` field of the JSON reply is returned.

    Args:
        doc: The HTML payload to send.
        strip_control_chars: When true (the default), the story is passed
            through ``strip_control_characters`` before being returned.

    Returns:
        The story text, or ``''`` when the request fails with
        ``urllib2.URLError`` or the reply carries no ``story`` key.

    Raises:
        ValueError: If the reply body is not valid JSON.
    """
    endpoint = '/'.join([settings.DSTK_API_BASE, 'html2story'])

    # Only the network round trip is guarded: that is the only part that can
    # raise URLError, and a failed lookup is treated as "no story".
    try:
        # Send the HTML over to Data Science Toolkit
        response = urllib2.urlopen(endpoint, data=doc)
        try:
            payload = response.read()
        finally:
            # Release the connection explicitly rather than waiting for GC.
            response.close()
    except urllib2.URLError:
        return ''

    story = json.loads(payload).get('story', '')

    if strip_control_chars:
        story = strip_control_characters(story)
    return story
Exemple #2
0
def html_to_story(doc, strip_control_chars=True):
    """Ask Data Science Toolkit to pull the story text out of an HTML page.

    The HTML in ``doc`` is submitted as the request body to
    ``<settings.DSTK_API_BASE>/html2story`` (urllib2 performs a POST whenever
    ``data`` is given). The reply is parsed as JSON and its ``story`` value
    is returned.

    Args:
        doc: The HTML payload to submit.
        strip_control_chars: If true (the default), run the story through
            ``strip_control_characters`` before returning it.

    Returns:
        The extracted story, or ``''`` if the request raised
        ``urllib2.URLError`` or the reply has no ``story`` key.

    Raises:
        ValueError: If the reply body cannot be decoded as JSON.
    """
    endpoint = '/'.join([settings.DSTK_API_BASE, 'html2story'])

    # Keep the try body down to the network call; nothing after it can raise
    # URLError, and an unreachable service simply yields an empty story.
    try:
        # Send the HTML over to Data Science Toolkit
        response = urllib2.urlopen(endpoint, data=doc)
        try:
            payload = response.read()
        finally:
            # Close the response explicitly so the socket is not left to GC.
            response.close()
    except urllib2.URLError:
        return ''

    story = json.loads(payload).get('story', '')

    if strip_control_chars:
        story = strip_control_characters(story)
    return story
    def fetch_url(self, line):
        """Fetch one feed item and store it in a hash-named ``.out`` file.

        ``line`` is a tab-separated record: ``issue_id``, ``url``, ``data``.
        The output file lives under ``self.outfile_base_path`` and is named
        after the MD5 hex digest of ``issue_id`` joined with ``url`` (or with
        ``data`` when ``url`` is empty). If ``url`` is non-empty and the file
        does not exist yet, the parent class's ``fetch_url`` is called with
        the URL and any truthy result replaces ``data``.

        Args:
            line: The tab-separated input record.

        Returns:
            ``'Wrote data'`` if this call created the output file,
            ``'Nada'`` if the file was already present.

        Raises:
            ValueError: If ``line`` has fewer than three tab-separated fields.
        """
        # Split on the first two tabs only: the trailing data field may itself
        # contain tabs, which would otherwise break the three-way unpacking.
        issue_id, url, data = line.split('\t', 2)
        url = url.strip('"')

        digest = hashlib.md5(''.join([issue_id, url or data])).hexdigest()
        outfile = os.sep.join([self.outfile_base_path, digest]) + '.out'

        if url and not os.path.exists(outfile):
            new_data = super(FeedItemWebCrawler, self).fetch_url(url)
            if new_data:
                data = new_data

        # Checked again after the fetch so an existing file is never
        # overwritten.
        if os.path.exists(outfile):
            return 'Nada'

        with open(outfile, 'w') as f:
            f.write('\t'.join([issue_id, strip_control_characters(data)]))
        return 'Wrote data'
Exemple #4
0
    def fetch_url(self, line):
        """Download a feed item's content and persist it to a ``.out`` file.

        ``line`` holds three tab-separated fields: ``issue_id``, ``url`` and
        ``data``. The target file sits under ``self.outfile_base_path`` and
        its name is the MD5 hex digest of ``issue_id`` concatenated with
        ``url`` (falling back to ``data`` when ``url`` is empty), plus the
        ``.out`` suffix. When ``url`` is non-empty and the file is absent, the
        parent class's ``fetch_url`` is called with the URL; a truthy result
        takes the place of ``data``.

        Args:
            line: The tab-separated input record.

        Returns:
            ``'Wrote data'`` when the output file was written by this call,
            ``'Nada'`` when it already existed.

        Raises:
            ValueError: If ``line`` contains fewer than three tab-separated
                fields.
        """
        # Limit the split to two tabs so that tab characters inside the data
        # field stay part of the data instead of breaking the unpacking.
        issue_id, url, data = line.split('\t', 2)
        url = url.strip('"')

        digest = hashlib.md5(''.join([issue_id, url or data])).hexdigest()
        outfile = os.sep.join([self.outfile_base_path, digest]) + '.out'

        if url and not os.path.exists(outfile):
            new_data = super(FeedItemWebCrawler, self).fetch_url(url)
            if new_data:
                data = new_data

        # Existence is tested a second time after the fetch; a file that is
        # present at this point is left untouched.
        if os.path.exists(outfile):
            return 'Nada'

        with open(outfile, 'w') as f:
            f.write('\t'.join([issue_id, strip_control_characters(data)]))
        return 'Wrote data'