class YahooKeywordQueryBuilder(QueryBuilder):
    """
      Builds a query by making a query for every relevant keyword returned from
      Yahoo for an entity description
    """

    def __init__(self):
        # Cache the data retrieved
        self.cache = YQLCache()

        # Template for the keyword-extraction request; "####" is replaced by the
        # text to analyse.
        # NOTE(review): as visible here the template has no scheme/host prefix
        # (it starts at "*%20from...") -- confirm the full endpoint URL is
        # intact in the real file.
        self.apiUrl = "*%20from%20search.termextract%20where%20context%3D%22####%22&format=json"

    def buildQueries(self, entity, idField="name"):
        """
          Builds the queries for the given entity, starting with the given id
          field. The first query is the entity id on its own; every further
          query is the entity id followed by one extracted keyword.

          @param  entity      The entity, given as a dictionary, for which to generate queries
          @param  idField     The field that uniquely identifies this entity
          @return             A list of query strings
        """
        entityId = entity[idField]

        # Get the lists of name & value queries
        attributeNames = self.getAttributeNames(entity, idField)
        attributeValues = self.getAttributeValues(entity, idField)
        attributeNamesAndValues = set(attributeNames).union(attributeValues)

        # The id is already part of every query, so keep it out of the
        # description. discard() rather than remove(): the id field is skipped
        # by the getters above, so the id is usually not in the set at all.
        attributeNamesAndValues.discard(entityId)

        # Generate the text. Sorting gives the same description (and therefore
        # the same request URL / cache key) on every run.
        entityDescription = ", ".join(sorted(attributeNamesAndValues))

        # Get the Yahoo keywords for this entity
        keywords = self.extractKeywords(entityDescription)

        # Build the queries
        queries = [entityId]
        for keyword in keywords:
            queries.append(entityId + " " + keyword)

        pprint(queries)
        return queries

    def getAttributeNames(self, entity, idField):
        """
          Lists the names of every field of the entity except the id field.

          @param  entity      The entity dictionary
          @param  idField     The name of the field to leave out
          @return             A list of field names, as strings
        """
        # Compare by value: two equal strings are not guaranteed to be the
        # same object, so an identity test can let the id field through.
        return [str(fieldName) for fieldName in entity if fieldName != idField]

    def getAttributeValues(self, entity, idField):
        """
          Lists the values of every field of the entity except the id field.
          List values contribute one entry per element; None values are
          skipped.

          @param  entity      The entity dictionary
          @param  idField     The name of the field to leave out
          @return             A list of values, as strings
        """
        attributeValues = []

        for fieldName in entity:
            if fieldName == idField:
                continue

            fieldValue = entity[fieldName]
            if fieldValue is None:
                continue

            if isinstance(fieldValue, list):
                # One entry per element of a list-valued field
                attributeValues.extend(str(item) for item in fieldValue)
            else:
                attributeValues.append(str(fieldValue))

        return attributeValues

    def extractKeywords(self, entityDescription):
        """
          Retrieve the set of keywords for the entity description

          @param  entityDescription   The text from which to extract keywords
          @return                     A list of distinct keywords (possibly empty)
        """
        keywords = set()

        # Nothing to analyse, so there is no point in issuing a request
        if not entityDescription:
            return []

        # Fill the URL template
        url = self.apiUrl.replace("####", entityDescription)
        url = url.replace(" ", "%20")
        url = url.replace("AT&T", "")  # Blows up Yahoo API for some reason...

        # Get the keyword data, from the cache if possible
        # NOTE(review): the cache lookup call was missing from this line in the
        # reviewed source; read() is assumed to be the counterpart of the
        # write() used below -- confirm against YQLCache.
        keywordJson = self.cache.read(url)
        if keywordJson is None:
            keywordJson = loadFromUrl(url)
            self.cache.write(url, keywordJson)
        keywordData = loads(keywordJson)

        # Best effort: when "results" is not a populated dictionary (e.g. null
        # because nothing was extracted) there are simply no keywords.
        results = keywordData["query"]["results"]
        if not isinstance(results, dict) or not results:
            return list(keywords)

        # Only the first entry of the results is used. next(iter(...)) works
        # with both list-returning and view-returning values().
        fetchedKeywords = next(iter(results.values()))
        if isinstance(fetchedKeywords, (list, tuple, set, dict)):
            keywords.update(fetchedKeywords)
        elif fetchedKeywords is not None:
            # A lone keyword delivered as a scalar must stay one keyword;
            # feeding it to set() would split it into characters.
            keywords.add(fetchedKeywords)

        return list(keywords)
class YQLKeywordExtension(Extension):
    """
      An extension that allows keyword detection through Yahoo's YQL web interface.
    """

    def __init__(self):
        # The template information for querying the Yahoo keyword service;
        # "####" is replaced by the text to analyse.
        # NOTE(review): as visible here the template has no scheme/host prefix
        # (it starts at "*%20from...") -- confirm the full endpoint URL is
        # intact in the real file.
        self.apiUrl = '*%20from%20search.termextract%20where%20context%3D%22####%22&format=json'

        # Maximum number of characters of content sent per request
        self.contentSize = 5000

        # Cache the data retrieved
        self.cache = YQLCache()

    def getKeywordsFromContent(self, content):
        """
          Retrieve the Yahoo keywords from some content, by splitting the content and
          joining the results if the content is too long to be sent via a URL.

          @param  content     The content from which to retrieve the keyword information.
          @return             A list of distinct keywords (possibly empty)
        """
        # Split the content
        content = self.__cleanResults(content)
        contentChunks = self.__group(content, self.contentSize)

        # Get the keywords for each chunk
        keywords = set()
        for chunk in contentChunks:
            # Fill the URL template
            url = self.apiUrl.replace('####', chunk)
            url = url.replace(' ', '%20')
            keywords.update(self.__fetchKeywords(url))

        return list(keywords)

    def __fetchKeywords(self, url):
        """
          Fetches the keywords for one request URL, using the cache if possible.

          @param  url         The fully filled-in request URL
          @return             A set of keywords (possibly empty)
        """
        keywords = set()

        # Get the keyword data, from the cache if possible
        # NOTE(review): the cache lookup call was missing from this line in the
        # reviewed source; read() is assumed to be the counterpart of the
        # write() used below -- confirm against YQLCache.
        keywordJson = self.cache.read(url)
        if keywordJson is None:
            keywordJson = loadFromUrl(url)
            self.cache.write(url, keywordJson)
        keywordData = loads(keywordJson)

        # Best effort: when 'results' is not a populated dictionary (e.g. null
        # because nothing was extracted) there are simply no keywords.
        results = keywordData['query']['results']
        if not isinstance(results, dict) or not results:
            return keywords

        # Only the first entry of the results is used. next(iter(...)) works
        # with both list-returning and view-returning values().
        fetchedKeywords = next(iter(results.values()))
        if isinstance(fetchedKeywords, (list, tuple, set, dict)):
            keywords.update(fetchedKeywords)
        elif fetchedKeywords is not None:
            # A lone keyword delivered as a scalar must stay one keyword;
            # feeding it to set() would split it into characters.
            keywords.add(fetchedKeywords)

        return keywords

    def __loadStopWords(self):
        """
          Loads the stop word list from src/analysis/StopWordList.json, located
          relative to the 'EntityQuerier' directory found in the current
          working directory's path.

          @return             The set of stop words
          @raise  IOError     If the working directory is not inside an
                              'EntityQuerier' tree or the file cannot be read
        """
        projectName = 'EntityQuerier'
        workingDirectory = str(os.getcwd())
        projectIndex = workingDirectory.find(projectName)
        if projectIndex == -1:
            # Without this check the slice below would silently build a
            # meaningless path from the first few characters of the cwd.
            raise IOError("Cannot locate the stop word list: '%s' is not part of "
                          "the working directory '%s'" % (projectName, workingDirectory))

        stopWordsListPath = workingDirectory[:projectIndex + len(projectName)]
        stopWordsListPath += "/src/analysis/StopWordList.json"

        # Context manager so the file handle is always closed
        with open(stopWordsListPath) as stopWordsFile:
            return set(loads(stopWordsFile.read()))

    def __cleanResults(self, content):
        """
          Cleans a piece of page content for keyword analysis: the content is
          lower-cased, <script> and <style> elements are dropped, all remaining
          markup is stripped, and only words that are purely alphabetic, are
          not stop words and do not contain 'http' are kept.

          @param  content     The raw (HTML) content
          @return             The kept words joined by single spaces
        """
        # No content means no words; avoids calling .lower() on None
        if not content:
            return ''

        # Remove stop words from the cleaned content
        stopWords = self.__loadStopWords()

        # Extract <script> tags, then <style> tags
        soup = BeautifulSoup(content.lower())
        for tagName in ('script', 'style'):
            for item in soup.findAll(tagName):
                item.extract()

        # Extract all other tags
        cleanContent = ' '.join(soup.findAll(text=True))

        # Add spaces for HTML spaces
        # NOTE(review): the reviewed source replaced a space with a space here
        # (a no-op); going by the comment above, the intent was to turn the
        # HTML non-breaking-space entity into a real space -- confirm.
        cleanContent = cleanContent.replace('&nbsp;', ' ')

        # Replace stop words & links; compile the pattern once, not per word
        nonLetter = re.compile('[^a-zA-Z]')
        cleanWords = [
            word for word in cleanContent.split()
            if word not in stopWords
            and 'http' not in word
            and nonLetter.search(word) is None
        ]

        return ' '.join(cleanWords)

    def __group(self, content, groupSize):
        """
          Splits the content into consecutive pieces of at most groupSize
          characters. Pieces are cut purely by character count, so a word may
          be split across two pieces.

          @param  content     The text to split
          @param  groupSize   The maximum length of each piece
          @return             The list of pieces, in order (empty for empty content)
        """
        # range() rather than xrange() so this runs on Python 2 and 3 alike
        return [content[i:i + groupSize] for i in range(0, len(content), groupSize)]

    def run(self, resultDictionary):
        """
          Stores the keywords extracted from resultDictionary['content'] under
          resultDictionary['yqlKeywords'].

          @param  resultDictionary    The result to annotate, modified in place
        """
        resultDictionary['yqlKeywords'] = self.getKeywordsFromContent(resultDictionary['content'])