Code Example #1
File: crawler.py  Project: vandanab/Blog2Microblog
import json
import re
import urllib

import httplib2

# Method of the crawler class in crawler.py (Python 2); the module-level
# imports it relies on are shown above for completeness.
def getnytimes(self, url):
    # API keys for the NYT Article Search and Community APIs.
    community_key = '35fb5c7778409b89b6869e2ac8f79838:0:65748530'
    article_key = '8870ced3a7299320d2efd36791e636c8:5:65748530'
    # expandURL (a helper defined elsewhere in the project) resolves a
    # shortened URL; the regex drops any digits after "www" and strips
    # the query string.
    expanded_content = expandURL(url)
    self.url = exp_url = re.sub(r'(http://www)\d*(\..*)\?.*', r'\1\2',
                                expanded_content['long-url'])
    # Look the article up by exact URL in the Article Search API.
    q = 'url:"' + exp_url + '"'
    article_api_query = ('http://api.nytimes.com/svc/search/v1/article'
                         '?format=json&query={0}&api-key={1}'.format(
                             urllib.quote_plus(q), urllib.quote_plus(article_key)))
    http = httplib2.Http()

    (response, content) = http.request(article_api_query, 'GET')
    try:
        if response['status'] == '200':
            content = json.loads(content)
            self.body = content['results'][0]['body']
            print self.body
    except Exception:
        print 'body not initialized:', self.url

    try:
        # Fall back to crawling the page directly (handles redirection failure).
        (res1, cont1) = http.request(exp_url, 'GET')
        if res1['status'] == '200':
            print cont1
    except Exception:
        print 'could not get data by crawling', exp_url

    # Fetch reader comments for the same URL from the Community API.
    comm_api_query = ('http://api.nytimes.com/svc/community/v2/comments/url/'
                      'exact-match.json?&url={0}&api-key={1}'.format(
                          urllib.quote_plus(exp_url), urllib.quote_plus(community_key)))
    (response, content) = http.request(comm_api_query, 'GET')
    if response['status'] == '200':
        content = json.loads(content)
        self.comments = content['results']['comments']
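
A minimal usage sketch for the method above, assuming a crawler class whose instances hold the results in body and comments (the class name NYTCrawler, its no-argument constructor, and the example URL are hypothetical; only getnytimes appears in the source):

    nyt = NYTCrawler()  # hypothetical class name and constructor
    nyt.getnytimes('http://www.nytimes.com/2012/01/01/example.html')  # example URL
    print nyt.body           # article body from the Article Search API
    print len(nyt.comments)  # reader comments from the Community API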
Code Example #2
import re

import crawler  # project module providing BlogHtml
import util     # project module providing getitems

# Request-handler method (Python 2); expandURL and geterrorresponse are
# defined elsewhere in the project.
def processrequest(self):
    # Map the request URL to a known blog via its full or shortened form.
    domain = ""
    if re.search(r'(eng*\.co)|(engadget\.com.*)', self.url) is not None:
        domain = 'engadget'
    elif re.search(r'(mash*\.to)|(mashable\.*)', self.url) is not None:
        domain = 'mashable'
    elif re.search(r'ndtv', self.url) is not None:
        domain = 'ndtv'
    elif re.search(r'fakingnews', self.url) is not None:
        domain = 'fakingnews'
    elif (re.search(r'treehugger', self.url) is not None
          or 'treehugger' in expandURL(self.url)['long-url']):
        domain = 'treehugger'
    elif re.search(r'news\.cnet', self.url) is not None:
        domain = 'cnetnews'
    if domain == "":
        return self.geterrorresponse("Url Not Valid...")

    # Crawl the blog page and split it into a title plus paragraph items.
    self.domain = domain
    bh = crawler.BlogHtml(self.url, domain)
    self.title = bh.title
    self.items = util.getitems(bh.title, bh.blogparas)
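
A minimal usage sketch for the dispatcher above, assuming a request-handler class that carries the URL in self.url (the class name Request, its constructor, and the example URL are hypothetical; only processrequest appears in the source):

    req = Request('http://news.cnet.com/some-story/')  # hypothetical class and URL
    req.processrequest()  # sets domain, title, items; unmatched URLs get geterrorresponse()
    print req.domain      # 'cnetnews'
    print req.title       # title extracted by crawler.BlogHtml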