def detect_fetch_data(source):
    url = util.first_present([source.fetch_url_override, source.url])
    twitter_data = twitter_source_fetch.twitter_fetch_data_from_url(url)
    if twitter_data:
        return twitter_data, None
    markup = util.url_fetch(url)
    if not markup:
        return None, None
    # is this an rss feed itself?
    feed = parse_as_feed(markup)
    if feed:
        return {"type": "rss", "url": url}, feed
    # try finding some linked rss:
    soup = bs4.BeautifulSoup(markup, 'lxml')
    feed_url = rss_tools.find_linked_rss(soup, url)
    if feed_url:
        return {"type": "rss", "url": feed_url}, None
    wp_rss_link = url + "/?feed=rss"
    feed = parse_as_feed(util.url_fetch(wp_rss_link))
    if feed:
        return {"type": "rss", "url": wp_rss_link}, feed
    # is there a twitter account linked?
    twitter_data = twitter_source_fetch.linked_twitter_fetch_data(soup)
    if twitter_data:
        return twitter_data, None
    return None, None
def get(self):
    self.response.headers['Content-Type'] = 'application/json'
    resp = {"url": self.request.get('url')}
    html = util.url_fetch(self.request.get('url'))
    if html:
        def text(node):
            return node.text if node else None

        def content(node):
            return node['content'] if node and node.has_attr('content') else None

        def src(node):
            return node['src'] if node and node.has_attr('src') else None

        soup = bs4.BeautifulSoup(html, 'lxml')
        resp['title'] = first([
            content(soup.find('meta', {'property': 'og:title'})),
            text(soup.find('title'))
        ])
        resp['description'] = first([
            content(soup.find('meta', {'property': 'og:description'})),
            content(soup.find('meta', {'name': 'description'}))
        ])
        resp['image'] = first([
            content(soup.find('meta', {'property': 'og:image'})),
            # some pages use name= rather than property= for og tags, so check both
            content(soup.find('meta', {'name': 'og:image'})),
            src(soup.find('img'))
        ])
        url = content(soup.find('meta', {'property': 'og:url'}))
        if url:
            resp['url'] = url
    self.response.write(json.dumps(resp))
def _source_fetch(source):
    fetch_type = None
    url = util.first_present([source.fetch_url_override, source.url])
    markup = url_fetch(url)
    if markup:
        results = []
        rpcs = []

        # fetch functions either report a result synchronously via got_result
        # or register an async rpc to wait on via add_rpc
        def got_result(res):
            if res:
                results.append(res)

        def add_rpc(rpc):
            rpcs.append(rpc)

        for fn in fetch_functions_for_source(source):
            fn(source, markup, url, add_rpc, got_result)
        # wait for any outstanding rpcs before reading the results
        while len(rpcs):
            rpcs[0].wait()
            del rpcs[0]
        result = results[0] if len(results) else None
        if result:
            debug("SF: Fetched {0} as {1} source with {2} entries".format(url, result.method, len(result.entries)))
        else:
            warning("SF: Couldn't fetch {0} using any method".format(url))
        if result:
            debug("SF: starting brand fetch")
            result.brand = extract_brand(markup, source.url)
            debug("SF: done with brand fetch")
        return result
    else:
        print "URL error fetching {0}".format(source.url)
        return None
def rss_fetch(data, feed_content):
    url = data['url']
    if not feed_content:
        markup = util.url_fetch(url)
        if markup:
            feed_content = parse_as_feed(markup)
    if not feed_content:
        return None
    parsed = feed_content
    source_entry_processor = create_source_entry_processor(url)
    feed_title = parsed['feed']['title']
    entries = []
    latest_date = None
    for entry in parsed['entries']:
        if 'link' in entry and 'title' in entry:
            # print entry
            link_url = urljoin(url, entry['link'].strip())
            title = entry['title']
            pub_time = entry.get('published_parsed', entry.get('updated_parsed'))
            if pub_time:
                published = datetime.datetime.fromtimestamp(mktime(pub_time))
            else:
                published = None
            result_entry = {"title": title, "url": link_url, "published": published}
            source_entry_processor(result_entry, entry)
            entries.append(result_entry)
    return FetchResult('rss', feed_title, entries)
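# Illustrative usage sketch (not part of the original code): it wires detect_fetch_data
# and rss_fetch together. FakeSource is a hypothetical stand-in that only provides the
# two attributes detect_fetch_data reads (url and fetch_url_override); the real source
# model presumably lives elsewhere in the codebase.
def example_fetch_feed(url):
    import collections
    FakeSource = collections.namedtuple('FakeSource', ['url', 'fetch_url_override'])
    source = FakeSource(url=url, fetch_url_override=None)
    fetch_data, feed = detect_fetch_data(source)
    # detect_fetch_data may already have parsed the feed; passing it along lets
    # rss_fetch skip a second network round-trip.
    if isinstance(fetch_data, dict) and fetch_data.get('type') == 'rss':
        return rss_fetch(fetch_data, feed)
    return None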
def get(self):
    from bs4 import BeautifulSoup as bs
    from article_extractor import extract
    url = self.request.get('url')
    markup = util.url_fetch(url)
    soup = bs(markup, 'lxml')
    text = u""
    if soup.title:
        title = soup.title.string
        h1 = soup.new_tag('h1')
        h1.string = title
        text += unicode(h1)
    # print create_soup_with_ids(markup).prettify()
    text += extract(markup, url)
    self.response.headers['Access-Control-Allow-Origin'] = '*'
    self.response.write(text)
def fetch_normal():
    response = url_fetch(url, return_response_obj=True)
    # print 'INFO', response.info()
    if response and response.info().getheader('content-type', 'text/html').lower().split(';')[0].strip() == 'text/html':
        markup = response.read()
    else:
        print 'BAD MIME TYPE' if response else 'NO SUCCESSFUL RESPONSE'
        markup = None
    if markup:
        # process markup:
        markup_soup = BeautifulSoup(markup, 'lxml')
        og_title = find_meta_value(markup_soup, 'og:title')
        og_image = find_meta_value(markup_soup, 'og:image')
        og_description = find_meta_value(markup_soup, 'og:description')
        title_field = find_title(markup_soup)
        article.site_name = find_meta_value(markup_soup, 'og:site_name')
        # find author:
        article.author = find_author(markup_soup)
        # parse and process article content:
        content.html = article_extractor.extract(markup, article.url)
        doc_soup = BeautifulSoup(content.html, 'lxml')
        article.title = first_present([og_title, title_field, article.title])
        article.top_image = make_url_absolute(first_present([article.top_image, og_image]))
        populate_article_json(article, content)
        # compute description:
        description = None
        if og_description and len(og_description.strip()):
            description = truncate(og_description.strip(), words=40)
        elif content.text and len(content.text.strip()) > 0:
            description = truncate(content.text, words=40)
        article.description = re.sub(r"[\r\n\t ]+", " ", description).strip() if description else None
        return True
    else:
        return False
def create_source_entry_processor(url):
    url = canonical_url(url)
    print "SEARCHING FOR SOURCE ENTRY PROCESSOR FOR:", url
    if url.startswith('http://www.reddit.com') and url.endswith('.rss'):
        print 'using reddit entry processor'
        # reddit's rss entries link to the comments page; use the json api to map
        # each submission url to the article it actually links to
        json_url = url[:-len('.rss')] + '.json'
        api_resp = json.loads(url_fetch(json_url))
        url_map = {}
        for item_ in api_resp['data']['children']:
            item = item_['data']
            submission_url = 'https://www.reddit.com' + item['permalink']
            actual_url = item['url']
            url_map[submission_url] = actual_url
        print 'url map: {0}'.format(url_map)

        def process_reddit(entry, feed_entry):
            print 'entry url: {0}'.format(entry['url'])
            submission_url = entry.get('url', entry.get('link'))
            if submission_url in url_map:
                print 'MATCHING {0} -> {1}'.format(submission_url, url_map[submission_url])
                entry['url'] = url_map[submission_url]
                entry['submission_url'] = submission_url
        return process_reddit

    if url.startswith('http://longform.org/'):
        def longform_override(result_entry, feed_entry):
            if 'content' in feed_entry and len(feed_entry['content']) > 0:
                content = feed_entry['content'][0]['value']
                matches = re.findall(r"\"(.+)\"", content)
                if len(matches):
                    result_entry['url'] = matches[-1]
        return longform_override

    if url == 'http://www.designernews.co/?format=atom':
        def dn_override(result_entry, feed_entry):
            if 'summary' in feed_entry:
                result_entry['url'] = feed_entry['url']
        return dn_override

    def process_vanilla(result_entry, feed_entry):
        pass
    return process_vanilla
def get(self):
    # allow cross-origin use:
    self.response.headers.add_header('Access-Control-Allow-Origin', '*')
    self.response.headers.add_header('Cache-Control', 'max-age=604800')
    url = self.request.get('url')
    response = url_fetch(url, timeout=3, return_response_obj=True)
    original_headers = {}
    for header in response.info().headers:
        key, val = header.split(':', 1)
        original_headers[key.lower()] = val.strip()
    content_type = original_headers.get('content-type', 'application/octet-stream')
    data = response.read()

    def sanitize_filename(filename):
        chars = 'abcdefghijklmnopqrstuvwxyz0123456789-_.,'
        return ''.join([c for c in filename if c in chars])

    force_download_with_filename = self.request.get('force_download_with_filename')
    if force_download_with_filename:
        self.response.headers.add_header(
            'Content-Disposition',
            str('attachment; filename=' + sanitize_filename(force_download_with_filename)))
    resize = self.request.get('resize')
    if resize:
        w, h = map(float, resize.split(','))
        img = images.Image(data)
        ow, oh = img.width, img.height
        scale = min(w / ow, h / oh, 1)  # never scale up
        img.resize(int(ow * scale), int(oh * scale))
        if content_type == 'image/jpeg' or True:  # ALWAYS send jpeg, never png
            output_encoding = images.JPEG
            content_type = 'image/jpeg'  # keep the header consistent with the re-encoded data
        else:
            output_encoding = images.PNG
            content_type = 'image/png'
        data = img.execute_transforms(output_encoding=output_encoding)
    self.response.headers['Content-Type'] = content_type
    self.response.write(data)
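# Example requests (the route name here is an assumption; the actual URL mapping
# isn't shown in this snippet):
#   GET /proxy?url=https://example.com/photo.png&resize=300,300
#   GET /proxy?url=https://example.com/photo.png&force_download_with_filename=photo.jpg
# resize=w,h scales the image down to fit within w x h (never upscales, since the
# scale factor is capped at 1) and always re-encodes the result as JPEG.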
def url_fetch_and_time(url, timeout):
    t1 = time.time()
    res = url_fetch(url, timeout=timeout)
    t2 = time.time()
    return res, (t2 - t1)
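# Minimal usage sketch (illustrative only): time a fetch and report failures.
# The example URL and the 10-second timeout are arbitrary, not values taken
# from the original code.
def example_timed_fetch():
    res, elapsed = url_fetch_and_time('http://example.com/feed.xml', timeout=10)
    if not res:
        print 'fetch failed after {0:.2f}s'.format(elapsed)
    else:
        print 'fetched {0} chars in {1:.2f}s'.format(len(res), elapsed)
    return res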