import json
import urllib2

# `settings` and `strip_control_characters` are assumed to be provided by the
# surrounding project: a settings module exposing DSTK_API_BASE, and a helper
# that strips control characters from text.


def html_to_story(doc, strip_control_chars=True):
    try:
        # Send the HTML over to Data Science Toolkit
        story = urllib2.urlopen(
            '/'.join([settings.DSTK_API_BASE, 'html2story']),
            data=doc).read()
        story = json.loads(story).get('story', '')
        if strip_control_chars:
            story = strip_control_characters(story)
        return story
    except urllib2.URLError:
        # Treat any network failure as "no story extracted".
        return ''
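# Minimal usage sketch (assumption: settings.DSTK_API_BASE points at a running
# Data Science Toolkit instance, self-hosted or the public server; the sample
# HTML below is made up for illustration):
#
#     html = '<html><body><p>Breaking news: something happened.</p></body></html>'
#     print html_to_story(html)   # extracted story text, or '' if the call failed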
import hashlib
import os


# A method of FeedItemWebCrawler; it overrides the parent class's fetch_url,
# which it calls via super() to do the actual HTTP fetch.
def fetch_url(self, line):
    # Each input line is tab-separated: issue id, quoted URL, cached data.
    issue_id, url, data = line.split('\t')
    url = url.strip('"')
    # Name the output file after a hash of the issue id plus the URL (or the
    # cached data when no URL is present), so reruns are idempotent.
    outfile = os.sep.join([
        self.outfile_base_path,
        hashlib.md5(''.join([issue_id, url or data])).hexdigest()
    ]) + '.out'
    if url and not os.path.exists(outfile):
        # Re-fetch the page; fall back to the cached data if the fetch fails.
        new_data = super(FeedItemWebCrawler, self).fetch_url(url)
        if new_data:
            data = new_data
    if not os.path.exists(outfile):
        with open(outfile, 'w') as f:
            f.write('\t'.join([issue_id, strip_control_characters(data)]))
        return 'Wrote data'
    else:
        return 'Nada'
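# Minimal usage sketch (assumptions: FeedItemWebCrawler's constructor and its
# outfile_base_path attribute are hypothetical here; the tab-separated line
# format is inferred from the split above):
#
#     crawler = FeedItemWebCrawler()
#     crawler.outfile_base_path = '/tmp/crawl'
#     line = '\t'.join(['1234', '"http://example.com/item"', 'cached body'])
#     print crawler.fetch_url(line)   # 'Wrote data' on first run, 'Nada' after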