def _getTopics(self, Keywords=None, Category=None):
    '''
    Purpose:  Get topics for keywords and/or categories.
    Arguments:
        Keywords - str - keywords to filter topics
                         (abbreviation / words that could be an abbreviation)
        Category - str - category to search for topics under
    Returns:
        TopicsDict - dict - dictionary of topic : topic_url.
                     topic_url can be used to further filter the
                     abbreviation results or to get a page containing
                     abbreviations for the topic. Empty dict when the
                     category is unknown or no inputs are given.
    '''
    # VALIDATE USER INPUT
    # BUG FIX: return an empty dict (the original returned a list here,
    # contradicting the documented return type on every other path)
    if Category is not None and Category not in self.categories:
        return {}

    TopicsDict = {}

    if Keywords is None and Category is not None:
        # GET TOPICS JUST FOR CATEGORY
        # BUILD TOPICS PAGE URL (Category was already validated above)
        if 'any' in Category.lower():
            # no specific category selected -> site-wide top topics page
            TopicPageURL = SITE['root'] + SITE['top_topics']
        else:
            # category-specific top topics page
            TopicPageURL = SITE['root'] + self._categories[Category] + \
                SITE['top_topics']

        # PARSE TOPICS ON PAGE
        TopicResponse = requests.get(TopicPageURL)
        # BUG FIX: pass the response body to get_xml, not the Response
        # object, matching the Keywords branch below
        TopicResultXML = get_xml(TopicResponse.text)
        Topics = TopicResultXML.xpath(
            '//div[@class="popular"]/ul/li/a/text()')
        TopicURLs = TopicResultXML.xpath(
            '//div[@class="popular"]/ul/li/a/@href')
        TopicsDict = dict(zip(Topics, TopicURLs))

    elif Keywords is not None:
        # GET TOPICS FOR KEYWORDS (optionally narrowed by category)
        # BUILD TOPICS PAGE URL
        TopicPageURL = SITE['root']
        if Category is not None and 'any' not in Category.lower():
            # Category membership was already validated above
            TopicPageURL += '/' + self._categories[Category]
        TopicPageURL += '/' + Keywords

        # PARSE TOPICS ON PAGE
        TopicResponse = requests.get(TopicPageURL)
        TopicResultXML = get_xml(TopicResponse.text)
        TopicsDict = self._extractTopicsFromSearchResult(TopicResultXML)

    return TopicsDict
def _search(self, Keywords, Reverse=False, TopCount=1):
    '''
    Purpose:  Search the site with the given set of search criteria.
    Arguments:
        Keywords - str - keywords to search under; can be an
                         abbreviation or a definition
        Reverse  - bool - retained for interface compatibility
                          (currently unused by this method)
        TopCount - int - number of results to return
    Returns:
        Abbs - list of Abbreviations - Abbreviations returned from search.
    Raises:
        ValueError - when Keywords is None or not a string.
    '''
    Abbs = []

    # VALIDATE USER INPUTS
    if Keywords is None:
        raise ValueError('ERROR : KEYWORDS CANNOT BE NONE')
    elif not isinstance(Keywords, str):
        raise ValueError('ERROR : KEYWORDS MUST BE A STRING')

    SearchURL = SITE['root'] + SITE['search'].format(keywords=Keywords)

    # INITIAL SEARCH
    SearchResponse = requests.get(SearchURL, headers=self.headers)
    if SearchResponse.status_code != 200:
        # error -> return no results
        return []
    SearchResultXML = get_xml(SearchResponse.text)

    # GET ABBREVIATIONS UNTIL TOPCOUNT MET OR SEARCH RESULTS END
    Abbs += self._extractAbbreviations(SearchResultXML)
    if len(Abbs) < TopCount:
        PageCountElms = SearchResultXML.xpath(
            '//div[@class="aa-pagination"]'
            '/a[contains(@class,"counter")]'
            '/text()')
        if len(PageCountElms) > 0:
            # counter text looks like "current/total" -> total page count
            PageCount = int(PageCountElms[0].rsplit('/')[1])
            Search_Base_URL = SearchResponse.url
            iPage = 2
            # BUG FIX: use <= so the final results page is fetched too
            # (the original stopped one page early)
            while len(Abbs) < TopCount and iPage <= PageCount:
                Next_Search_URL = Search_Base_URL + '/' + str(iPage)
                # send the same headers as the initial request for
                # consistency
                NextSearchResponse = requests.get(
                    Next_Search_URL, headers=self.headers)
                NextSearchXML = get_xml(NextSearchResponse.text)
                Abbs += self._extractAbbreviations(NextSearchXML, False)
                iPage += 1

    Abbs = self._calculateConfidences(Abbs)
    return Abbs
def getRandom(self):
    '''
    Purpose:  Fetch the site's random-abbreviations page and return the
              abbreviations parsed from it.
    Returns:
        Abbreviations extracted from the random page.
    '''
    Response = requests.get(SITE['root'] + SITE['random'],
                            headers=self.headers)
    return self._extractAbbreviations(get_xml(Response.text), Random=True)
def _getCategories(self):
    '''
    Purpose:  Get all available categories, in the order the website
              uses.
    Returns:
        CategoriesDict - OrderedDict - ordered dict of
                         (category, category_url_path)
    '''
    PageXML = get_xml(requests.get(SITE['root']).text)
    # category names and their URL paths come from parallel xpath queries
    # over the same anchor elements, so zipping pairs them up correctly
    Names = PageXML.xpath(
        '//div[contains(@class,"category")]/ul/li/a/text()')
    Paths = PageXML.xpath(
        '//div[contains(@class,"category")]/ul/li/a/@href')
    return OrderedDict(zip(Names, Paths))
def _search(self, Keywords, Category=None, Topic=None, TopCount=1):
    '''
    Purpose:  Search the site with the given set of search criteria.
    Arguments:
        Keywords - str - keywords to search under; can be an
                         abbreviation or a definition
        Category - str - category to search under
        Topic    - str - topic to search under
        TopCount - int - number of results to return
    Returns:
        Abbs - list of Abbreviations - Abbreviations returned from search.
    Raises:
        ValueError - when Keywords is None or not a string.
    '''
    Abbs = []

    # VALIDATE USER INPUTS
    if Keywords is None:
        raise ValueError('ERROR : KEYWORDS CANNOT BE NONE')
    elif not isinstance(Keywords, str):
        raise ValueError('ERROR : KEYWORDS MUST BE A STRING')
    if Category is not None and Category not in self.categories:
        return []

    # BUILD API URL FOR QUERY - the site automatically detects whether an
    # abbreviation or a definition was given when this GET request is used
    if Category is None:
        iCategory = 0  # default category index when none is given
    else:
        iCategory = list(self._categories.keys()).index(Category)
    SearchURL = SITE['root'] + SITE['search'].format(
        keywords=Keywords, icategory=iCategory)

    # INITIAL SEARCH
    SearchResponse = requests.get(SearchURL)
    if SearchResponse.status_code != 200:
        # error -> return no results
        return []
    SearchResultXML = get_xml(SearchResponse.text)

    # IF TOPIC PROVIDED, FIND THE TOPIC URL FROM THE INITIAL SEARCH AND
    # RUN THE REFINED SEARCH WITH THAT TOPIC
    if Topic is not None:
        TopicsDict = self._extractTopicsFromSearchResult(SearchResultXML)
        if Topic not in TopicsDict:
            # topic not offered for this search -> no results
            return []
        SearchURL = SearchResponse.url + '/' + TopicsDict[Topic]
        SearchResponse = requests.get(SearchURL)
        SearchResultXML = get_xml(SearchResponse.text)

    # GET ABBREVIATIONS UNTIL TOPCOUNT MET OR SEARCH RESULTS END
    Abbs += self._extractAbbreviations(SearchResultXML)
    if len(Abbs) < TopCount:
        PageCountElms = SearchResultXML.xpath(
            '//div[@class="aa-pagination"]'
            '/a[contains(@class,"counter")]'
            '/text()')
        if len(PageCountElms) > 0:
            # counter text looks like "current/total" -> total page count
            PageCount = int(PageCountElms[0].rsplit('/')[1])
            Search_Base_URL = SearchResponse.url
            iPage = 2
            # BUG FIX: use <= so the final results page is fetched too
            # (the original stopped one page early)
            while len(Abbs) < TopCount and iPage <= PageCount:
                Next_Search_URL = Search_Base_URL + '/' + str(iPage)
                NextSearchResponse = requests.get(Next_Search_URL)
                NextSearchXML = get_xml(NextSearchResponse.text)
                Abbs += self._extractAbbreviations(NextSearchXML, False)
                iPage += 1

    Abbs = self._calculateConfidences(Abbs)
    return Abbs