Python get_xml Examples, lxml.etree.get_xml Python Examples

Example #1

0

Show file

File: allacronyms.py Project: ConnorSMaynes/allacronyms

    def _getTopics(self, Keywords=None, Category=None):
        '''
        Purpose:    Get topics for keywords and/or categories.
        Arguments:
            Keywords - str - keywords to filter topics ( abbreviation / words that could be abbreviation )
            Category - str - category to search for topics under
        Returns:
            TopicsDict - dict - dictionary of topic : topic_url
                                topic_url could be used to further filter the abbreviation results
                                or be used to get a page containing abbreviations for the topic
        '''

        # VALIDATE USER INPUT
        if Category != None and not Category in self.categories:
            return []

        TopicsDict = {}

        # GET TOPICS JUST FOR CATEGORY
        if Keywords == None and Category != None:  # handle case where just category provided to filter topics
            if Category in self.categories:

                # BUILD TOPICS PAGE URL
                if 'any' in Category.lower(
                ):  # handle case where no category selected
                    TopicPageURL = SITE['root'] + SITE['top_topics']
                else:  # handle case where any other category selected
                    TopicPageURL = SITE['root'] + self._categories[ Category ] + \
                                   SITE['top_topics']

                # PARSE TOPICS ON PAGE
                TopicResponse = requests.get(TopicPageURL)
                TopicResultXML = get_xml(TopicResponse)
                Topics = \
                    TopicResultXML.xpath( '//div[@class="popular"]/ul/li/a/text()' )
                TopicURLs = \
                    TopicResultXML.xpath( '//div[@class="popular"]/ul/li/a/@href' )
                TopicsDict = dict(zip(Topics, TopicURLs))

        # GET TOPICS FOR KEYWORDS
        elif Keywords != None:

            # BUILD TOPICS PAGE URL
            TopicPageURL = SITE['root']
            if Category != None and Category in self.categories and \
                not 'any' in Category.lower():                                      # handle case where no category selected
                TopicPageURL += '/' + self._categories[Category]
            TopicPageURL += '/' + Keywords

            # PARSE TOPICS ON PAGE
            TopicResponse = requests.get(TopicPageURL)
            TopicResultXML = get_xml(TopicResponse.text)
            TopicsDict = self._extractTopicsFromSearchResult(TopicResultXML)

        return TopicsDict

Example #2

0

Show file

    def _search(self, Keywords, Reverse=False, TopCount=1):
        '''
        Purpose:    Search the site with the given set of search criteria.
        Arguments:
            Keywords - str - str of keywords to search under. Can be abbreviation
                                or definition
            TopCount - int - number of results to return
        Returns:
            Abbs - list of Abbreviations - Abbreviations returned from search.
        '''

        Abbs = []

        # VALIDATE USER INPUTS
        if Keywords == None:
            raise ValueError('ERROR : KEYWORDS CANNOT BE NONE')
        elif not isinstance(Keywords, str):
            raise ValueError('ERROR : KEYWORDS MUST BE A STRING')

        SearchURL = SITE['root'] + SITE['search']. \
                    format(keywords=Keywords)

        # INITIAL SEARCH
        SearchResponse = requests.get(SearchURL, headers=self.headers)
        if SearchResponse.status_code != 200:  # error return no results
            return []
        SearchResultXML = get_xml(SearchResponse.text)

        # GET ABBREVIATIONS UNTIL TOPCOUNT MET OR SEARCH RESULTS END
        Abbs += self._extractAbbreviations(SearchResultXML)
        if len(Abbs) < TopCount:
            PageCountElms = SearchResultXML.xpath(
                '//div[@class="aa-pagination"]' +
                '/a[contains(@class,"counter")]' + '/text()')
            if len(PageCountElms) > 0:
                PageCount = int(PageCountElms[0].rsplit('/')[1])
                Search_Base_URL = SearchResponse.url
                iPage = 2
                while len( Abbs ) < TopCount and \
                    iPage < PageCount:
                    Next_Search_URL = Search_Base_URL + '/' + str(iPage)
                    NextSearchResponse = requests.get(Next_Search_URL)
                    NextSearchXML = get_xml(NextSearchResponse.text)
                    Abbs += self._extractAbbreviations(NextSearchXML, False)
                    iPage += 1
                Abbs = self._calculateConfidences(Abbs)

        return Abbs

Example #3

0

Show file

    def getRandom(self):
        '''
        Purpose:    Request the site's random page and extract abbreviations from it.
        Returns:
            Whatever _extractAbbreviations returns for the random page
            ( it is called with Random=True ).
        '''
        Response = requests.get(SITE['root'] + SITE['random'],
                                headers=self.headers)
        PageXML = get_xml(Response.text)
        return self._extractAbbreviations(PageXML, Random=True)

Example #4

0

Show file

File: allacronyms.py Project: ConnorSMaynes/allacronyms

 def _getCategories(self):
     '''
     Purpose:    Read the category links from the site's root page.
     Returns:
         CategoriesDict - OrderedDict - category link text mapped to its href,
                          in the order the XPath queries return them
     '''
     HomeXML = get_xml(requests.get(SITE['root']).text)
     # link texts and hrefs come from the same anchors, so they pair up by position
     Names = HomeXML.xpath(
         '//div[contains(@class,"category")]/ul/li/a/text()')
     Paths = HomeXML.xpath(
         '//div[contains(@class,"category")]/ul/li/a/@href')
     return OrderedDict(zip(Names, Paths))

Example #5

0

Show file

File: allacronyms.py Project: ConnorSMaynes/allacronyms

    def _search(self, Keywords, Category=None, Topic=None, TopCount=1):
        '''
        Purpose:    Search the site with the given set of search criteria.
        Arguments:
            Keywords - str - str of keywords to search under. Can be abbreviation
                                or definition
            Category - str - category to search under
            Topic - str - topic to search under
            TopCount - int - number of results to return
        Returns:
            Abbs - list of Abbreviations - Abbreviations returned from search.
        '''

        Abbs = []

        # VALIDATE USER INPUTS
        if Keywords == None:
            raise ValueError('ERROR : KEYWORDS CANNOT BE NONE')
        elif not isinstance(Keywords, str):
            raise ValueError('ERROR : KEYWORDS MUST BE A STRING')
        if Category != None and not Category in self.categories:
            return []

        # IF NOT DEFINITION, LET THE SITE FIGURE OUT IF DEF OR ABB GIVEN
        # BUILD API URL FOR QUERY / VALIDATE - site automatically detects if abb or def given
        # and generates the url based on this, if this GET request is used
        if Category == None:
            iCategory = 0
        else:
            iCategory = list(self._categories.keys()).index(Category)
        SearchURL = SITE['root'] + SITE['search']. \
                    format(keywords=Keywords
                           , icategory=iCategory)

        # INITIAL SEARCH
        SearchResponse = requests.get(SearchURL)
        if SearchResponse.status_code != 200:  # error return no results
            return []
        SearchResultXML = get_xml(SearchResponse.text)

        # IF TOPIC PROVIDED, FIND TOPIC URL FROM INITIAL SEARCH AND GET REFINED SEARCH WITH TOPIC
        if Topic != None:
            TopicsDict = self._extractTopicsFromSearchResult(SearchResultXML)
            if not Topic in TopicsDict:  # if topic not found for search, return empty results
                return []
            SearchURL = SearchResponse.url + '/' + TopicsDict[Topic]
            SearchResponse = requests.get(SearchURL)
            SearchResultXML = get_xml(SearchResponse.text)

        # GET ABBREVIATIONS UNTIL TOPCOUNT MET OR SEARCH RESULTS END
        Abbs += self._extractAbbreviations(SearchResultXML)
        if len(Abbs) < TopCount:
            PageCountElms = SearchResultXML.xpath(
                '//div[@class="aa-pagination"]' +
                '/a[contains(@class,"counter")]' + '/text()')
            if len(PageCountElms) > 0:
                PageCount = int(PageCountElms[0].rsplit('/')[1])
                Search_Base_URL = SearchResponse.url
                iPage = 2
                while len( Abbs ) < TopCount and \
                    iPage < PageCount:
                    Next_Search_URL = Search_Base_URL + '/' + str(iPage)
                    NextSearchResponse = requests.get(Next_Search_URL)
                    NextSearchXML = get_xml(NextSearchResponse.text)
                    Abbs += self._extractAbbreviations(NextSearchXML, False)
                    iPage += 1
                Abbs = self._calculateConfidences(Abbs)

        return Abbs