Exemple #1
0
	def run(self, query):
		"""Fetch news articles for *query* through YQL and store the unseen ones.

		The YQL endpoint is queried with the URL-quoted *query*. For every
		returned article the publisher's DiscourseSource is looked up (and
		created when missing), and a NewsArticle keyed by the article URL is
		built unless one already exists. New articles are written with
		``self.db.put`` and an OpenCalaisIdentity is started on the 'data'
		queue for each value that ``put`` returns.

		Args:
			query: search term sent to the agendatrends YQL table.

		Returns:
			``str()`` of whatever ``self.db.put`` returned for the new
			articles (presumably a list of datastore keys).

		Raises:
			Exception: if the YQL response status code is not 200.
			ValueError: if an article date does not match the expected format.
		"""
		yql_url = (
			"http://query.yahooapis.com/v1/public/yql?q=use%20'http%3A%2F%2Flimechile.com%2Fhacks%2Fagendatrends%2Fagendatrends.xml'%20as%20agendatrends.news%3B%20select%20*%20from%20agendatrends.news%20where%20q%20%3D%20%22"
			+ urllib.quote(query)
			+ "%22&format=json&diagnostics=false&env=store%3A%2F%2Fdatatables.org%2Falltableswithkeys"
		)

		# Lazy %-formatting: avoids str() on the query, which can fail for
		# non-ASCII unicode input under Python 2.
		logging.info('GETTING NEWS FOR QUERY: %s', query)
		logging.info('ENCODED: %s', yql_url)

		respObj = self.urlfetch.fetch(yql_url)

		if respObj.status_code != 200:
			raise Exception('CODE WAS NOT 200! FAILURE: '+str(respObj.status_code))

		query_info = json.loads(respObj.content)['query']
		result_count = query_info['count']

		logging.info('RESULT_COUNT: %s', result_count)

		# Only dereference 'results' when the service reports rows; with a
		# zero count the original never touched it either.
		# NOTE(review): the loop bound is query.count, not len(articles) --
		# confirm the two always agree for this data table.
		if result_count:
			entries = query_info['results']['result']['articles']
		else:
			entries = []

		articles = []

		for entry in entries[:result_count]:

			## Pull out results
			unescapedUrl = entry['url']
			publisher = entry['publisher']
			date = entry['date']

			## Find publisher
			# Look up with the same slug the entity is stored under; the raw
			# publisher name would miss every source whose name contains
			# spaces or capitals.
			source_key_name = publisher.replace(' ', '-').lower()
			source = DiscourseSource.get_by_key_name(source_key_name)
			if source is None:
				source = DiscourseSource(key_name=source_key_name, name=publisher).put()

			if NewsArticle.get_by_key_name(unescapedUrl) is not None:
				logging.info('URL ALREADY FOUND IN DB! Skipping article, it has already been analyzed')
				continue

			## Create news article
			a = NewsArticle(key_name=unescapedUrl)
			a.url = unescapedUrl
			a.source = source
			# The last six characters are dropped before parsing (presumably
			# a " +HHMM" offset). %H (24-hour) is required here: without an
			# AM/PM field, %I rejects hours 13-23 and reads 12 as midnight.
			a.published = datetime.datetime.strptime(date[0:-6], '%a, %d %b %Y %H:%M:%S')
			articles.append(a)

			logging.info('URL for article %s: %s', query, unescapedUrl)

		articles = self.db.put(articles)
		for article in articles:
			logging.info('PUT ARTICLE: %s', article)
			OpenCalaisIdentity(str(article)).start(queue_name='data')

		return str(articles)
Exemple #2
0
	def run(self, article_key):
		"""Tag a stored news article with the social tags Calais reports for it.

		Loads the NewsArticle identified by *article_key*, sends its URL to
		Calais, makes sure a Topic entity exists for every returned social
		tag (keyed by the tag name with spaces replaced by dashes), and
		stores one Tag per topic with the article as parent.

		Args:
			article_key: string form of the NewsArticle datastore key.

		Returns:
			A list with ``str()`` of each value returned by ``self.db.put``
			for the new Tag entities.
		"""
		from calaislib import Calais
		c = Calais(self.config['api_key'], submitter=self.config['submitter'])

		article = NewsArticle.get(self.db.Key(article_key))

		object_result = c.analyze_url(article.url)

		# slug -> human-readable tag name. Store the name string, not the
		# whole social-tag dict, since it is used as Topic.name below.
		topic_names = {}
		for social_tag in object_result.socialTag:
			tag_name = social_tag['name']
			topic_names[tag_name.replace(' ', '-')] = tag_name

		# Freeze one ordering of the slugs so each fetched Topic (or None)
		# is paired with the slug it was requested for.
		slugs = list(topic_names)
		existing_topics = Topic.get_by_key_name(slugs)

		tags = []
		for slug, topic in zip(slugs, existing_topics):

			if topic is None:
				topic_key = Topic(key_name=slug, name=topic_names[slug]).put()
			else:
				topic_key = topic.key()

			tags.append(Tag(article, key_name=str(topic_key), topic=topic_key, discourse=article))

		return [str(key) for key in self.db.put(tags)]
Exemple #3
0
    def run(self, query):
        """Fetch news articles for *query* through YQL and store the unseen ones.

        The YQL endpoint is queried with the URL-quoted *query*. For every
        returned article the publisher's DiscourseSource is looked up (and
        created when missing), and a NewsArticle keyed by the article URL is
        built unless one already exists. New articles are written with
        ``self.db.put`` and an OpenCalaisIdentity is started on the 'data'
        queue for each value that ``put`` returns.

        Args:
            query: search term sent to the agendatrends YQL table.

        Returns:
            ``str()`` of whatever ``self.db.put`` returned for the new
            articles (presumably a list of datastore keys).

        Raises:
            Exception: if the YQL response status code is not 200.
            ValueError: if an article date does not match the expected format.
        """
        yql_url = (
            "http://query.yahooapis.com/v1/public/yql?q=use%20'http%3A%2F%2Flimechile.com%2Fhacks%2Fagendatrends%2Fagendatrends.xml'%20as%20agendatrends.news%3B%20select%20*%20from%20agendatrends.news%20where%20q%20%3D%20%22"
            + urllib.quote(query)
            + "%22&format=json&diagnostics=false&env=store%3A%2F%2Fdatatables.org%2Falltableswithkeys"
        )

        # Lazy %-formatting: avoids str() on the query, which can fail for
        # non-ASCII unicode input under Python 2.
        logging.info('GETTING NEWS FOR QUERY: %s', query)
        logging.info('ENCODED: %s', yql_url)

        respObj = self.urlfetch.fetch(yql_url)

        if respObj.status_code != 200:
            raise Exception('CODE WAS NOT 200! FAILURE: ' +
                            str(respObj.status_code))

        query_info = json.loads(respObj.content)['query']
        result_count = query_info['count']

        logging.info('RESULT_COUNT: %s', result_count)

        # Only dereference 'results' when the service reports rows; with a
        # zero count the original never touched it either.
        # NOTE(review): the loop bound is query.count, not len(articles) --
        # confirm the two always agree for this data table.
        if result_count:
            entries = query_info['results']['result']['articles']
        else:
            entries = []

        articles = []

        for entry in entries[:result_count]:

            ## Pull out results
            unescapedUrl = entry['url']
            publisher = entry['publisher']
            date = entry['date']

            ## Find publisher
            # Look up with the same slug the entity is stored under; the raw
            # publisher name would miss every source whose name contains
            # spaces or capitals.
            source_key_name = publisher.replace(' ', '-').lower()
            source = DiscourseSource.get_by_key_name(source_key_name)
            if source is None:
                source = DiscourseSource(key_name=source_key_name,
                                         name=publisher).put()

            if NewsArticle.get_by_key_name(unescapedUrl) is not None:
                logging.info(
                    'URL ALREADY FOUND IN DB! Skipping article, it has already been analyzed'
                )
                continue

            ## Create news article
            a = NewsArticle(key_name=unescapedUrl)
            a.url = unescapedUrl
            a.source = source
            # The last six characters are dropped before parsing (presumably
            # a " +HHMM" offset). %H (24-hour) is required here: without an
            # AM/PM field, %I rejects hours 13-23 and reads 12 as midnight.
            a.published = datetime.datetime.strptime(
                date[0:-6], '%a, %d %b %Y %H:%M:%S')
            articles.append(a)

            logging.info('URL for article %s: %s', query, unescapedUrl)

        articles = self.db.put(articles)
        for article in articles:
            logging.info('PUT ARTICLE: %s', article)
            OpenCalaisIdentity(str(article)).start(queue_name='data')

        return str(articles)