Example #1
def detect_fetch_data(source):
    """Work out how a source should be fetched. Returns a (fetch_data, feed) tuple, where
    fetch_data describes the fetch method and feed is the already-parsed feed when available."""
    url = util.first_present([source.fetch_url_override, source.url])
    
    twitter_data = twitter_source_fetch.twitter_fetch_data_from_url(url)
    if twitter_data:
        return twitter_data, None
    
    markup = util.url_fetch(url)
    if not markup:
        return None, None
    
    # is this an rss feed itself?
    feed = parse_as_feed(markup)
    if feed:
        return {"type": "rss", "url": url}, feed
    
    # try finding some linked rss:
    soup = bs4.BeautifulSoup(markup, 'lxml')
    feed_url = rss_tools.find_linked_rss(soup, url)
    if feed_url:
        return {"type": "rss", "url": feed_url}, None
    
    # WordPress sites usually expose a feed at /?feed=rss:
    wp_rss_link = url + "/?feed=rss"
    wp_markup = util.url_fetch(wp_rss_link)
    feed = parse_as_feed(wp_markup) if wp_markup else None
    if feed:
        return {"type": "rss", "url": wp_rss_link}, feed
    
    # is there a twitter account linked?
    twitter_data = twitter_source_fetch.linked_twitter_fetch_data(soup)
    if twitter_data:
        return twitter_data, None
    
    return None, None
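The parse_as_feed helper this example leans on isn't shown anywhere in these snippets. A minimal sketch of what it might look like, assuming the feedparser library and treating a parse with no entries and no detected feed version as a failure:

import feedparser

def parse_as_feed(markup):
    # Hypothetical helper (not from the original codebase): return the parsed
    # feed, or None if the markup doesn't look like a usable RSS/Atom feed.
    if not markup:
        return None
    parsed = feedparser.parse(markup)
    if not parsed.entries and not parsed.get('version'):
        return None
    return parsed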
Example #2
 def get(self):
     self.response.headers['Content-Type'] = 'application/json'
     resp = {
         "url": self.request.get('url')
     }
     html = util.url_fetch(self.request.get('url'))
     if html:        
         def text(node):
             return node.text if node else None
     
         def content(node):
             return node['content'] if node and node.has_attr('content') else None
     
         def src(node):
             return node['src'] if node and node.has_attr('src') else None
         
         soup = bs4.BeautifulSoup(html, 'lxml')
         resp['title'] = first([
             content(soup.find('meta', {'property': 'og:title'})),
             text(soup.find('title'))
         ])
         resp['description'] = first([
             content(soup.find('meta', {'property': 'og:description'})),
             content(soup.find('meta', {'name': 'description'}))
         ])
         resp['image'] = first([
             content(soup.find('meta', {'property': 'og:image'})),
             content(soup.find('meta', {'name': 'og:image'})),
             src(soup.find('img'))
         ])
         url = content(soup.find('meta', {"property": "og:url"}))
         if url: resp['url'] = url
     self.response.write(json.dumps(resp))
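The first helper used above isn't included in the excerpt; presumably it just returns the first non-empty candidate, something like:

def first(items):
    # Hypothetical helper: return the first truthy item, or None if there isn't one.
    for item in items:
        if item:
            return item
    return None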
Example #3
def _source_fetch(source):
    fetch_type = None
    url = util.first_present([source.fetch_url_override, source.url])
    markup = url_fetch(url)
    if markup:
        results = []
        rpcs = []
        def got_result(res):
            if res: results.append(res)
        def add_rpc(rpc):
            rpcs.append(rpc)
        for fn in fetch_functions_for_source(source):
            fn(source, markup, url, add_rpc, got_result)
        # Wait for every pending RPC; callbacks can append more RPCs via add_rpc
        # while we wait, so keep draining the list from the front.
        while len(rpcs):
            rpcs[0].wait()
            del rpcs[0]
        result = results[0] if len(results) else None
        if result:
            debug("SF: Fetched {0} as {1} source with {2} entries".format(url, result.method, len(result.entries)))
            debug("SF: starting brand fetch")
            result.brand = extract_brand(markup, source.url)
            debug("SF: done with brand fetch")
        else:
            warning("SF: Couldn't fetch {0} using any method".format(url))
        return result
    else:
        print "URL error fetching {0}".format(source.url)
    return None
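fetch_functions_for_source isn't shown, but each function it returns is called as fn(source, markup, url, add_rpc, got_result). A purely synchronous fetch function that fits this contract could be as simple as the sketch below (asynchronous ones would instead kick off an RPC and register it via add_rpc):

def fetch_as_rss(source, markup, url, add_rpc, got_result):
    # Hypothetical fetch function: if the markup already parses as a feed,
    # hand a FetchResult back through the got_result callback.
    feed = parse_as_feed(markup)
    if feed:
        got_result(rss_fetch({"url": url}, feed))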
Example #4
def rss_fetch(data, feed_content):
    """Build a FetchResult from an RSS/Atom feed, fetching and parsing the URL first
    if no pre-parsed feed_content was supplied."""
    url = data['url']
    if not feed_content:
        markup = util.url_fetch(url)
        if markup:
            feed_content = parse_as_feed(markup)
    
    if not feed_content:
        return None
    
    parsed = feed_content
    
    source_entry_processor = create_source_entry_processor(url)
    feed_title = parsed['feed'].get('title')
    entries = []
    latest_date = None
    for entry in parsed['entries']:
        if 'link' in entry and 'title' in entry:
            # print entry
            link_url = urljoin(url, entry['link'].strip())
            title = entry['title']
            
            pub_time = entry.get('published_parsed', entry.get('updated_parsed'))
            if pub_time:
                published = datetime.datetime.fromtimestamp(mktime(pub_time))
            else:
                published = None
            result_entry = {"title": title, "url": link_url, "published": published}
            source_entry_processor(result_entry, entry)
            entries.append(result_entry)
    
    return FetchResult('rss', feed_title, entries)
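FetchResult itself isn't defined in these examples; judging from how _source_fetch reads result.method and result.entries and later assigns result.brand, a plain container along these lines would do:

class FetchResult(object):
    # Hypothetical container inferred from how the results are used above.
    def __init__(self, method, title, entries):
        self.method = method    # e.g. 'rss'
        self.title = title
        self.entries = entries  # list of {"title": ..., "url": ..., "published": ...} dicts
        self.brand = None       # filled in later by extract_brand()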
Example #5
 def get(self):
     from bs4 import BeautifulSoup as bs
     from article_extractor import extract
     url = self.request.get('url')
     markup = util.url_fetch(url)
     if not markup:
         self.response.set_status(502)
         return
     soup = bs(markup, 'lxml')
     text = u""
     if soup.title:
         title = soup.title.string
         h1 = soup.new_tag('h1')
         h1.string = title
         text += unicode(h1)
     # print create_soup_with_ids(markup).prettify()
     text += extract(markup, url)
     self.response.headers['Access-Control-Allow-Origin'] = '*'
     self.response.write(text)
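A handler like this would normally be registered on a webapp2 route; a hypothetical wiring (the path and class name are made up, since the excerpt doesn't show them):

import webapp2

app = webapp2.WSGIApplication([
    ('/article-text', ArticleTextHandler),  # hypothetical route and handler class
], debug=False)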
Example #6
 def fetch_normal():
     response = url_fetch(url, return_response_obj=True)
     # print 'INFO', response.info()
     content_type = response.info().getheader('content-type', 'text/html') if response else None
     mime_type = content_type.lower().split(';')[0].strip() if content_type else None
     if mime_type == 'text/html':
         markup = response.read()
     else:
         print 'BAD MIME TYPE' if response else 'NO SUCCESSFUL RESPONSE'
         markup = None
 
     if markup:
         # process markup:
         markup_soup = BeautifulSoup(markup, 'lxml')
         og_title = find_meta_value(markup_soup, 'og:title')
         og_image = find_meta_value(markup_soup, 'og:image')
         og_description = find_meta_value(markup_soup, 'og:description')
         title_field = find_title(markup_soup)
     
         article.site_name = find_meta_value(markup_soup, 'og:site_name')
     
         # find author:
         article.author = find_author(markup_soup)
     
         # parse and process article content:
         content.html = article_extractor.extract(markup, article.url)
         doc_soup = BeautifulSoup(content.html, 'lxml')
     
         article.title = first_present([og_title, title_field, article.title])
         article.top_image = make_url_absolute(first_present([article.top_image, og_image]))
     
         populate_article_json(article, content)
     
         # compute description:
         description = None
         if og_description and len(og_description.strip()):
             description = truncate(og_description.strip(), words=40)
         elif content.text and len(content.text.strip()) > 0:
             description = truncate(content.text, words=40)
         article.description = re.sub(r"[\r\n\t ]+", " ", description).strip() if description else None
             
         return True
     else:
         return False
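find_meta_value and find_title aren't shown either; BeautifulSoup-based versions consistent with how they're called above might look like:

def find_meta_value(soup, prop):
    # Hypothetical helper: look up a <meta> tag by property= (falling back to name=)
    # and return its content attribute.
    tag = soup.find('meta', {'property': prop}) or soup.find('meta', {'name': prop})
    return tag['content'] if tag and tag.has_attr('content') else None

def find_title(soup):
    # Hypothetical helper: the text of the <title> tag, if any.
    return soup.title.string if soup.title else None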
Example #7
def create_source_entry_processor(url):
    url = canonical_url(url)
    print "SEARCHING FOR SOURCE ENTRY PROCESSOR FOR:", url
    
    if url.startswith('http://www.reddit.com') and url.endswith('.rss'):
        print 'using reddit entry processor'
        json_url = url[:-len('.rss')] + '.json'
        api_resp_markup = url_fetch(json_url)
        api_resp = json.loads(api_resp_markup) if api_resp_markup else {'data': {'children': []}}
        url_map = {}
        for item_ in api_resp['data']['children']:
            item = item_['data']
            submission_url = 'https://www.reddit.com' + item['permalink']
            actual_url = item['url']
            url_map[submission_url] = actual_url
        print 'url map: {0}'.format(url_map)
        def process_reddit(entry, feed_entry):
            print 'entry url: {0}'.format(entry['url'])
            submission_url = entry.get('url', entry.get('link'))
            if submission_url in url_map:
                print 'MATCHING {0} -> {1}'.format(submission_url, url_map[submission_url])
                entry['url'] = url_map[submission_url]
                entry['submission_url'] = submission_url
        return process_reddit
    
    if url.startswith('http://longform.org/'):
        def longform_override(result_entry, feed_entry):
            if 'content' in feed_entry and len(feed_entry['content']) > 0:
                content = feed_entry['content'][0]['value']
                matches = re.findall(r"\"(.+)\"", content)
                if len(matches):
                    result_entry['url'] = matches[-1]
        return longform_override
    
    if url == 'http://www.designernews.co/?format=atom':
        def dn_override(result_entry, feed_entry):
            if 'summary' in feed_entry: result_entry['url'] = feed_entry['url']
        return dn_override
    
    def process_vanilla(result_entry, feed_entry):
        pass
    
    return process_vanilla
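Whichever processor comes back is applied per entry by rss_fetch above. A standalone usage sketch for the longform.org case (the URLs and feed entry are made up):

processor = create_source_entry_processor('http://longform.org/feed.rss')
result_entry = {"title": "Example", "url": "http://longform.org/posts/example", "published": None}
feed_entry = {"content": [{"value": 'Originally appeared as "http://example.com/story"'}]}
processor(result_entry, feed_entry)
# result_entry['url'] now holds the last quoted URL pulled out of the post body.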
Example #8
	def get(self):
		# allow cross-origin use:
		self.response.headers.add_header('Access-Control-Allow-Origin', '*')
		self.response.headers.add_header('Cache-Control', 'max-age=604800')
		url = self.request.get('url')
		response = url_fetch(url, timeout=3, return_response_obj=True)
		if not response:
			self.response.set_status(502)
			return
		original_headers = {}
		for header in response.info().headers:
			key, val = header.split(':', 1)
			original_headers[key.lower()] = val.strip()
		content_type = original_headers.get('content-type', 'application/octet-stream')
		data = response.read()
		
		def sanitize_filename(filename):
			chars = 'abcdefghijklmnopqrstuvwxyz0123456789-_.,'
			return ''.join([c for c in filename if c in chars])
		
		force_download_with_filename = self.request.get('force_download_with_filename')
		if force_download_with_filename:
			self.response.headers.add_header('Content-Disposition', str('attachment; filename=' + sanitize_filename(force_download_with_filename)))
		
		resize = self.request.get('resize')
		if resize:
			w,h = map(float, resize.split(','))
			img = images.Image(data)
			ow, oh = img.width, img.height
			scale = min(w/ow, h/oh, 1)
			img.resize(int(ow*scale), int(oh*scale))
			# Always re-encode as JPEG, never PNG, and label the response accordingly
			output_encoding = images.JPEG
			content_type = 'image/jpeg'
			data = img.execute_transforms(output_encoding=output_encoding)
		
		self.response.headers['Content-Type'] = content_type
		self.response.write(data)
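Typical requests against this proxy/resize handler (the /proxy path is hypothetical; the query parameters are the ones the handler actually reads):

# GET /proxy?url=https://example.com/photo.png
#     Pass the image through unchanged.
# GET /proxy?url=https://example.com/photo.png&resize=400,300
#     Downscale to fit within 400x300; the result is always re-encoded as JPEG.
# GET /proxy?url=https://example.com/report.pdf&force_download_with_filename=report.pdf
#     Serve with a Content-Disposition header that forces a download prompt.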
Example #9
def url_fetch_and_time(url, timeout):
    t1 = time.time()
    res = url_fetch(url, timeout=timeout)
    t2 = time.time()
    return res, (t2 - t1)
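A quick usage sketch, e.g. for comparing fetch times across a few URLs (the URLs are placeholders):

urls = ['http://example.com/feed', 'http://example.org/feed']
timings = {}
for u in urls:
    body, seconds = url_fetch_and_time(u, timeout=5)
    timings[u] = seconds if body else None  # None marks a failed fetch
print timings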