Python Query Exemples, wordhoard.utilities.basic_soup.Query Python Exemples

Exemple #1

0

Afficher le fichier

    def find_hyponyms(self):
        """
        Purpose
        ----------
        Query classicthesaurus.com for the hyponyms ("narrower" terms) of the
        word held by this instance, following the paginated result pages.

        Returns
        ----------
        :returns:
            hyponyms: sorted list of hyponyms on success, or the flattened
            cached values when the cache already holds an entry.

            A message string is returned when the site reports that no
            hyponyms exist for the word.

            None is returned when the word fails validation, when the site
            answers with a 404, or when one of the handled exceptions is
            logged.

        :rtype: list

        Raises
        ----------
        The exceptions below are caught and logged inside this method; they
        are not propagated to the caller.

        :raises
            AttributeError: Raised when an attribute reference or assignment fails.

            IndexError: Raised when a sequence subscript is out of range

            KeyError: Raised when a mapping (dictionary) key is not found in the set of existing keys.

            TypeError: Raised when an operation or function is applied to an object of inappropriate type.

            bs4.FeatureNotFound: raised by the BeautifulSoup constructor if no parser with the requested features
            is found
        """
        valid_word = self._validate_word()
        if not valid_word:
            return None

        check_cache = self._check_cache()
        if check_cache is not False:
            # Cache hit: the cached mapping's values are flattened and returned as-is.
            hyponym = cleansing.flatten_multidimensional_list(
                [val for val in check_cache.values()])
            return hyponym

        def fetch(url):
            # Keep the two original call shapes: proxies are only passed
            # to Query when they were actually configured.
            if self._proxies is None:
                return Query(url).get_single_page_html()
            return Query(url, self._proxies).get_single_page_html()

        try:
            base_url = f'https://www.classicthesaurus.com/{self._word}/narrower'
            response = fetch(base_url)
            if response.status_code == 404:
                logger.info(
                    f'Classic Thesaurus had no hyponyms reference for the word {self._word}'
                )
                return None

            soup = BeautifulSoup(response.text, "lxml")
            hyponym = _get_hyponyms(soup)
            if 'no hyponyms found' in hyponym:
                return f'No hyponyms were found for the word: {self._word}'

            number_of_pages = _get_number_of_pages(soup)
            if number_of_pages >= 2:
                # NOTE(review): range(2, number_of_pages) never requests page
                # `number_of_pages` itself -- confirm against
                # _get_number_of_pages whether that last page should be read.
                for page in range(2, number_of_pages):
                    sub_html = fetch(f'{base_url}/{page}')
                    sub_soup = BeautifulSoup(sub_html.text, 'lxml')
                    additional_hyponym = _get_hyponyms(sub_soup)
                    # union() returns a new set instead of mutating in place,
                    # so the result must be rebound or the extra pages are lost.
                    # Strings are skipped: presumably a sub-page can yield the
                    # 'no hyponyms found' sentinel, and union() with a string
                    # would add its individual characters.
                    if additional_hyponym and not isinstance(additional_hyponym, str):
                        hyponym = hyponym.union(additional_hyponym)

            self._update_cache(sorted(hyponym))
            return sorted(set(hyponym))
        except bs4.FeatureNotFound as error:
            logger.error(
                'An error occurred in the following code segment:')
            logger.error(''.join(
                traceback.format_tb(error.__traceback__)))
        except AttributeError as error:
            logger.error(
                'An AttributeError occurred in the following code segment:'
            )
            logger.error(''.join(
                traceback.format_tb(error.__traceback__)))
        except IndexError as error:
            logger.error(
                'An IndexError occurred in the following code segment:'
            )
            logger.error(''.join(
                traceback.format_tb(error.__traceback__)))
        except KeyError as error:
            logger.error(
                'A KeyError occurred in the following code segment:')
            logger.error(''.join(
                traceback.format_tb(error.__traceback__)))
        except TypeError as error:
            logger.error(
                'A TypeError occurred in the following code segment:')
            logger.error(''.join(
                traceback.format_tb(error.__traceback__)))

Exemple #2

0

Afficher le fichier

    def _query_merriam_webster(self):
        """
        Look up the word held by this instance on merriam-webster.com and
        collect the definition texts of its first dictionary entry.

        :returns:
            definitions: sorted, lower-cased definitions for the word; None
            when the site has no entry for the word or when one of the
            handled exceptions is logged

        :rtype: list

        The exceptions below are caught and logged here rather than propagated.

        :raises
            AttributeError: Raised when an attribute reference or assignment fails.

            IndexError: Raised when a sequence subscript is out of range

            KeyError: Raised when a mapping (dictionary) key is not found in the set of existing keys.

            TypeError: Raised when an operation or function is applied to an object of inappropriate type.

            bs4.FeatureNotFound: raised by the BeautifulSoup constructor if no parser with the requested features
            is found
        """
        try:
            proxies = self._proxies
            url = f'https://www.merriam-webster.com/dictionary/{self._word}'
            # Proxies are only handed to Query when they were configured.
            query = Query(url) if proxies is None else Query(url, proxies)
            response = query.get_single_page_html()

            if response.status_code == 404:
                logger.info(
                    f'Merriam-webster.com has no definition reference for the word {self._word}'
                )
                return None

            soup = BeautifulSoup(response.text, "lxml")

            # The site's "Words fail us" text marks an unknown word.
            if soup.find(text=regex.compile(r'Words fail us')):
                logger.info(
                    f'Merriam-webster.com has no reference for the word {self._word}'
                )
                return None

            # A misspelled-word heading also means no definition is available.
            if soup.find('h1', {'class': 'mispelled-word'}):
                logger.info(
                    f'Merriam-webster.com has no definition reference for the word {self._word}'
                )
                return None

            first_entry = soup.find('div', {'id': 'dictionary-entry-1'})
            sense_block = first_entry.find('div', {'class': 'vg'})
            leading_sense = sense_block.find_all('span', {'class': 'sb-0'})[0]
            raw_texts = [
                tag.text.lower().replace(':', '').strip()
                for tag in leading_sense.find_all('span', {'class': 'dtText'})
            ]
            definitions = sorted(
                cleansing.normalize_space(text) for text in raw_texts)
            self._update_cache(definitions)
            return definitions
        except bs4.FeatureNotFound as error:
            logger.error('An error occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except AttributeError as error:
            logger.error('An AttributeError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except IndexError as error:
            logger.error('An IndexError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except KeyError as error:
            logger.error('A KeyError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except TypeError as error:
            logger.error('A TypeError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))

Exemple #3

0

Afficher le fichier

    def _query_synonym_com(self):
        """
        Look up the word held by this instance on synonym.com and extract the
        definition paragraph that follows the first section title.

        :returns:
            definitions: sorted, lower-cased definition for the word; None
            when the site has no entry, when the page type is not 'Term', or
            when one of the handled exceptions is logged

        :rtype: list

        The exceptions below are caught and logged here rather than propagated.

        :raises
            AttributeError: Raised when an attribute reference or assignment fails.

            IndexError: Raised when a sequence subscript is out of range

            KeyError: Raised when a mapping (dictionary) key is not found in the set of existing keys.

            TypeError: Raised when an operation or function is applied to an object of inappropriate type.

            bs4.FeatureNotFound: raised by the BeautifulSoup constructor if no parser with the requested features
            is found
        """
        try:
            proxies = self._proxies
            url = f'https://www.synonym.com/synonyms/{self._word}'
            # Proxies are only handed to Query when they were configured.
            query = Query(url) if proxies is None else Query(url, proxies)
            response = query.get_single_page_html()

            if response.status_code == 404:
                logger.info(
                    f'Synonym.com had no definition reference for the word {self._word}'
                )
                return None

            soup = BeautifulSoup(response.text, "lxml")
            status_tag = soup.find("meta", {"name": "pagetype"})

            # The page body may carry a 404 banner even when the status code
            # check above passed.
            if soup.find(text=regex.compile(r'Oops, 404!')):
                logger.info(
                    f'Synonym.com had no definition reference for the word {self._word}'
                )
                return None

            # Only pages whose pagetype meta tag is 'Term' are parsed.
            if status_tag.attrs['content'] != 'Term':
                return None

            section_heading = soup.find('h3', {'class': 'section-title'})
            paragraph_text = section_heading.find_next('p').text
            # Drop everything up to and including each bracketed segment.
            without_brackets = regex.sub(r'.*?\[.*?\]', '', paragraph_text)
            definitions = sorted([without_brackets.strip().lower()])
            self._update_cache(definitions)
            return sorted(definitions)
        except bs4.FeatureNotFound as error:
            logger.error('An error occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except AttributeError as error:
            logger.error('An AttributeError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except IndexError as error:
            logger.error('An IndexError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except KeyError as error:
            logger.error('A KeyError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except TypeError as error:
            logger.error('A TypeError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))

Exemple #4

0

Afficher le fichier

Fichier : hypernyms.py Projet : johnbumgarner/wordhoard

    def find_hypernyms(self):
        """
        Purpose
        ----------
        Query classicthesaurus.com for the hypernyms ("broader" terms) of the
        word held by this instance, following the paginated result pages.

        Returns
        ----------
        :returns:
            hypernym: the hypernyms rendered according to the instance's
            output format ('list', 'dictionary' or 'json').

            A colorized message string is returned when the site reports that
            no hypernyms exist for the word.

            None is returned when the word fails validation, when the site
            answers with a 404, when the page is behind Cloudflare
            protection, when the output format is not one of the three
            above, or when one of the handled exceptions is logged.

        :rtype: list

        Raises
        ----------
        The exceptions below are caught and logged inside this method; they
        are not propagated to the caller.

        :raises
            AttributeError: Raised when an attribute reference or assignment fails

            IndexError: Raised when a sequence subscript is out of range

            KeyError: Raised when a mapping (dictionary) key is not found in the set of existing keys

            TypeError: Raised when an operation or function is applied to an object of inappropriate type

            bs4.FeatureNotFound: raised by the BeautifulSoup constructor if no parser with the requested features
            is found
        """

        def format_output(words, dedupe_list):
            # Render the collected words in the configured output format.
            # `dedupe_list` preserves the existing difference between the
            # cache path (plain sorted list) and the network path (sorted set).
            output_format = self._output_format
            if output_format == 'list':
                return sorted(set(words)) if dedupe_list else sorted(words)
            if output_format == 'dictionary':
                return {self._word: sorted(set(words))}
            if output_format == 'json':
                return json.dumps({'hypernyms': {self._word: sorted(set(words))}},
                                  indent=4, ensure_ascii=False)
            return None

        def fetch(url):
            # Pass user_agent / proxies to Query only when they are set, which
            # yields exactly the four call shapes the original code spelled out.
            options = {}
            if self._user_agent is not None:
                options['user_agent'] = self._user_agent
            if self._proxies is not None:
                options['proxies'] = self._proxies
            return Query(url, **options).get_single_page_html()

        valid_word = self._validate_word()
        if not valid_word:
            return None

        check_cache = self._check_cache()
        if check_cache[0] is True:
            hypernym = cleansing.flatten_multidimensional_list(list(check_cache[1]))
            return format_output(hypernym, dedupe_list=False)
        if check_cache[0] is not False:
            return None

        try:
            base_url = f'https://www.classicthesaurus.com/{self._word}/broader'
            response = fetch(base_url)
            if response.status_code == 404:
                logger.info(f'Classic Thesaurus had no hypernyms reference for the word {self._word}')
                return None

            soup = BeautifulSoup(response.text, "lxml")
            cloudflare_protection = CloudflareVerification('https://www.classicthesaurus.com',
                                                           soup).cloudflare_protected_url()
            if cloudflare_protection is True:
                logger.info('-' * 80)
                logger.info('The following URL has Cloudflare DDoS mitigation service protection.')
                logger.info('https://www.classicthesaurus.com')
                logger.info('-' * 80)
                return None
            if cloudflare_protection is not False:
                return None

            hypernym = _get_hypernyms(soup)
            if 'no hypernyms found' in hypernym:
                return _colorized_text(255, 0, 255,
                                       f'No hypernyms were found for the word: {self._word} \n'
                                       f'Please verify that the word is spelled correctly.')

            number_of_pages = _get_number_of_pages(soup)
            if number_of_pages >= 2:
                # NOTE(review): range(2, number_of_pages) never requests page
                # `number_of_pages` itself -- confirm against
                # _get_number_of_pages whether that last page should be read.
                for page in range(2, number_of_pages):
                    sub_html = fetch(f'{base_url}/{page}')
                    sub_soup = BeautifulSoup(sub_html.text, 'lxml')
                    additional_hypernym = _get_hypernyms(sub_soup)
                    # union() returns a new set instead of mutating in place,
                    # so the result must be rebound or the extra pages are lost.
                    # Strings are skipped: presumably a sub-page can yield the
                    # 'no hypernyms found' sentinel, and union() with a string
                    # would add its individual characters.
                    if additional_hypernym and not isinstance(additional_hypernym, str):
                        hypernym = hypernym.union(additional_hypernym)

            self._update_cache(hypernym)
            return format_output(hypernym, dedupe_list=True)

        except bs4.FeatureNotFound as error:
            logger.error('An error occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except AttributeError as error:
            logger.error('An AttributeError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except IndexError as error:
            logger.error('An IndexError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except KeyError as error:
            logger.error('A KeyError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except TypeError as error:
            logger.error('A TypeError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))

Exemple #5

0

Afficher le fichier

    def _query_collins_dictionary(self):
        """
        Look up the word held by this instance on collinsdictionary.com and
        return the text of the first definition element that follows the
        definition header container.

        :returns:
            definition: definition text for the word; None when the site
            answers with a 404, when the header container is absent, or when
            one of the handled exceptions is logged

        :rtype: str

        The exceptions below are caught and logged here rather than propagated.

        :raises
            AttributeError: Raised when an attribute reference or assignment fails.

            IndexError: Raised when a sequence subscript is out of range

            KeyError: Raised when a mapping (dictionary) key is not found in the set of existing keys.

            TypeError: Raised when an operation or function is applied to an object of inappropriate type.

            bs4.FeatureNotFound: raised by the BeautifulSoup constructor if no parser with the requested features
            is found
        """
        try:
            proxies = self._proxies
            url = f'https://www.collinsdictionary.com/dictionary/english-thesaurus/{self._word}'
            # Proxies are only handed to Query when they were configured.
            query = Query(url) if proxies is None else Query(url, proxies)
            response = query.get_single_page_html()

            if response.status_code == 404:
                logger.error(
                    f'Collins Dictionary had no definition reference for the word {self._word}'
                )
                return None

            soup = BeautifulSoup(response.text, "lxml")
            query_results = soup.find(
                'div', {'class': 'form type-def titleTypeSubContainer'})
            if query_results is None:
                logger.error(
                    f'Collins Dictionary had no definition reference for the word {self._word}'
                )
                return None

            # find_next is the current name of the legacy findNext alias.
            definition = query_results.find_next('div', {'class': 'def'})
            self._update_cache(definition.text)
            return definition.text
        except bs4.FeatureNotFound as error:
            logger.error('An error occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except AttributeError as error:
            logger.error(
                'An AttributeError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except IndexError as error:
            logger.error(
                'An IndexError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except KeyError as error:
            logger.error('A KeyError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except TypeError as error:
            logger.error('A TypeError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))

Exemple #6

0

Afficher le fichier

    def _query_thesaurus_com(self):
        """
        This function queries thesaurus.com for synonyms associated
        with the specific word provided to the Class Synonyms.

        :returns:
            synonyms: list of synonyms

        :rtype: list

        :raises
            AttributeError: Raised when an attribute reference or assignment fails.

            IndexError: Raised when a sequence subscript is out of range

            KeyError: Raised when a mapping (dictionary) key is not found in the set of existing keys.

            TypeError: Raised when an operation or function is applied to an object of inappropriate type.

            bs4.FeatureNotFound: raised by the BeautifulSoup constructor if no parser with the requested features
            is found
        """
        try:
            synonyms_list = []
            if self._proxies is None:
                response = Query(f'https://www.thesaurus.com/browse/{self._word}').get_single_page_html()
                if response.status_code == 404:
                    logger.info(f'Thesaurus.com had no synonym reference for the word {self._word}')
                else:
                    soup = BeautifulSoup(response.text, "lxml")
                    status_tag = soup.find("h1")
                    if status_tag.text.startswith('0 results for'):
                        logger.info(f'Thesaurus.com had no synonym reference for the word {self._word}')
                    else:
                        synonyms = []
                        word_container = soup.find('div', {'data-testid': 'word-grid-container'})
                        for list_item in word_container.find('ul').find_all('li'):
                            for link in list_item.find_all('a', href=True):
                                synonyms_list.append(link.text)
                            synonyms = sorted([cleansing.normalize_space(i) for i in synonyms_list])
                            synonyms = sorted([x.lower() for x in synonyms])
                        self._update_cache(synonyms)
                        return synonyms
            elif self._proxies is not None:
                response = Query(f'https://www.thesaurus.com/browse/{self._word}',
                                 self._proxies).get_single_page_html()
                if response.status_code == 404:
                    logger.info(f'Thesaurus.com had no synonym reference for the word {self._word}')
                else:
                    soup = BeautifulSoup(response.text, "lxml")
                    status_tag = soup.find("h1")
                    if status_tag.text.startswith('0 results for'):
                        logger.info(f'Thesaurus.com had no synonym reference for the word {self._word}')
                    else:
                        synonyms = []
                        word_container = soup.find('div', {'data-testid': 'word-grid-container'})
                        for list_item in word_container.find('ul').find_all('li'):
                            for link in list_item.find_all('a', href=True):
                                synonyms_list.append(link.text)
                            synonyms = sorted([cleansing.normalize_space(i) for i in synonyms_list])
                            synonyms = sorted([x.lower() for x in synonyms])
                        self._update_cache(synonyms)
                        return synonyms
        except bs4.FeatureNotFound as error:
            logger.error('An error occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except AttributeError as error:
            logger.error('An AttributeError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except IndexError as error:
            logger.error('An IndexError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except KeyError as error:
            logger.error('A KeyError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except TypeError as error:
            logger.error('A TypeError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))

Exemple #7

0

Afficher le fichier

    def _query_wordnet(self):
        """
        Look up synonyms for ``self._word`` on the Princeton WordNet web page.

        The request goes through ``self._proxies`` when proxies are configured.
        On success the harvested words are lower-cased, sorted, passed to
        ``self._update_cache`` and returned.

        :returns:
            synonyms: sorted list of lower-cased synonyms, or None when the site
            answers with HTTP 404, reports no results, shows no 'Noun' heading,
            or one of the handled errors below occurs

        :rtype: list or None

        Handled errors (caught here, logged with their traceback, never re-raised):
            bs4.FeatureNotFound: no parser with the requested features is available

            AttributeError, IndexError, KeyError, TypeError: the response or the
            page markup did not have the expected shape
        """
        try:
            url = f'http://wordnetweb.princeton.edu/perl/webwn?s={self._word}'
            if self._proxies is None:
                page_query = Query(url)
            else:
                page_query = Query(url, self._proxies)
            response = page_query.get_single_page_html()

            if response.status_code == 404:
                logger.info(f'Wordnet had no synonym reference for the word {self._word}')
                return None

            soup = BeautifulSoup(response.text, "lxml")
            no_results = regex.compile(r'Your search did not return any results')
            # Both "no results" and "no Noun section" are reported with the same message.
            if soup.find(text=no_results) or not soup.findAll('h3', text='Noun'):
                logger.info(f'Wordnet had no synonym reference for the word {self._word}')
                return None

            # Only the first <ul> on the page is scanned.
            harvested = []
            for item in soup.findAll("ul")[0].findAll('li'):
                for anchor in item.find_all(href=True):
                    label = anchor.contents[0]
                    # Links whose first child contains 'S:' are skipped
                    # (presumably WordNet's sense links -- confirm against the page).
                    if 'S:' not in label:
                        harvested.append(label)

            synonyms = sorted(word.lower() for word in harvested)
            self._update_cache(synonyms)
            return synonyms
        except bs4.FeatureNotFound as error:
            logger.error('An error occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except AttributeError as error:
            logger.error('An AttributeError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except IndexError as error:
            logger.error('An IndexError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except KeyError as error:
            logger.error('A KeyError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except TypeError as error:
            logger.error('A TypeError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))

Exemple #8

0

Afficher le fichier

    def _query_synonym_com(self):
        """
        Look up synonyms for ``self._word`` on synonym.com.

        The request goes through ``self._proxies`` when proxies are configured.
        Synonyms are only extracted when the page's ``pagetype`` meta tag has the
        content 'Term' and a synonyms section is present; they are lower-cased,
        sorted and passed to ``self._update_cache`` before being returned.

        :returns:
            synonyms: sorted list of lower-cased synonyms, or None when nothing
            could be extracted or one of the handled errors below occurs

        :rtype: list or None

        Handled errors (caught here, logged with their traceback, never re-raised):
            bs4.FeatureNotFound: no parser with the requested features is available

            AttributeError, IndexError, KeyError, TypeError: the response or the
            page markup did not have the expected shape
        """
        try:
            url = f'https://www.synonym.com/synonyms/{self._word}'
            if self._proxies is None:
                page_query = Query(url)
            else:
                page_query = Query(url, self._proxies)
            response = page_query.get_single_page_html()

            if response.status_code == 404:
                logger.info(f'Synonym.com had no synonym reference for the word {self._word}')
                return None

            soup = BeautifulSoup(response.text, "lxml")
            page_type = soup.find("meta", {"name": "pagetype"})
            # The site can serve its own "not found" page inside a non-404 response.
            if soup.find(text=regex.compile(r'Oops, 404!')):
                logger.info(f'Synonym.com had no synonym reference for the word {self._word}')
                return None

            # Any page type other than 'Term' is ignored without logging.
            if page_type.attrs['content'] != 'Term':
                return None

            section = soup.find('div', {'data-section': 'synonyms'})
            if not section:
                logger.info(f'Synonym.com had no synonym reference for the word {self._word}')
                return None

            entries = section.find('ul', {'class': 'section-list'}).find_all('li')
            synonyms = sorted(entry.text.lower() for entry in entries)
            self._update_cache(synonyms)
            # sorted() returns a new list, so the caller does not receive the
            # same list object that was passed to the cache.
            return sorted(synonyms)
        except bs4.FeatureNotFound as error:
            logger.error('An error occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except AttributeError as error:
            logger.info('\n')
            logger.info(self._word)
            logger.error('An AttributeError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
            logger.info('\n')
        except IndexError as error:
            logger.error('An IndexError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except KeyError as error:
            logger.error('A KeyError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except TypeError as error:
            logger.error('A TypeError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))

Exemple #9

0

Afficher le fichier

    def _query_merriam_webster(self):
        """
        Look up synonyms for ``self._word`` in the merriam-webster.com thesaurus.

        The request goes through ``self._proxies`` when proxies are configured.
        When the page carries a function label, the result (possibly an empty
        list, if the label does not start with 'Synonyms for') is passed to
        ``self._update_cache`` and returned.

        :returns:
            synonyms: sorted list of whitespace-normalised, lower-cased synonyms,
            or None when the word is unknown to the site, the page has no
            function label, or one of the handled errors below occurs

        :rtype: list or None

        Handled errors (caught here, logged with their traceback, never re-raised):
            bs4.FeatureNotFound: no parser with the requested features is available

            AttributeError, IndexError, KeyError, TypeError: the response or the
            page markup did not have the expected shape
        """
        try:
            url = f'https://www.merriam-webster.com/thesaurus/{self._word}'
            if self._proxies is None:
                page_query = Query(url)
            else:
                page_query = Query(url, self._proxies)
            response = page_query.get_single_page_html()

            if response.status_code == 404:
                logger.info(f'Merriam-webster.com had no synonym reference for the word {self._word}')
                return None

            soup = BeautifulSoup(response.text, "lxml")
            if soup.find(text=regex.compile(r'Words fail us')):
                logger.info(f'Merriam-webster.com had no synonym reference for the word {self._word}')
                return None
            if soup.find('h1', {'class': 'mispelled-word'}):
                logger.info(f'Merriam-webster.com had no synonym reference for the word {self._word}')
                return None

            label = soup.find('p', {'class': 'function-label'})
            if not label:
                # No function label: nothing is cached or logged.
                return None

            synonyms = []
            if label.text.startswith('Synonyms for'):
                syn_span = soup.find("span", {'class': 'thes-list syn-list'})
                container = syn_span.find('div', {'class': 'thes-list-content synonyms_list'})
                raw_words = [
                    anchor.text
                    for word_list in container.find_all("ul", {'class': 'mw-list'})
                    for anchor in word_list.find_all('a', href=True)
                ]
                cleaned = sorted(cleansing.normalize_space(word) for word in raw_words)
                synonyms = sorted(word.lower() for word in cleaned)

            self._update_cache(synonyms)
            return synonyms
        except bs4.FeatureNotFound as error:
            logger.error('An error occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except AttributeError as error:
            logger.error('An AttributeError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except IndexError as error:
            logger.error('An IndexError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except KeyError as error:
            logger.error('A KeyError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except TypeError as error:
            logger.error('A TypeError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))

Exemple #10

0

Afficher le fichier

    def _query_collins_dictionary(self):
        """
        Look up synonyms for ``self._word`` in the collinsdictionary.com thesaurus.

        The request goes through ``self._proxies`` when proxies are configured.
        Words are collected from the 'blockSyn' section of the page: the text of
        each 'form type-syn orth' div, plus the 'orth' span inside each
        'form type-syn' div. The lower-cased, sorted result (an empty list when
        the section is absent) is passed to ``self._update_cache`` and returned.

        :returns:
            synonyms: sorted list of lower-cased synonyms, or None when the site
            answers with HTTP 404, shows its "no results" heading, or one of the
            handled errors below occurs

        :rtype: list or None

        Handled errors (caught here, logged with their traceback, never re-raised):
            bs4.FeatureNotFound: no parser with the requested features is available

            AttributeError, IndexError, KeyError, TypeError: the response or the
            page markup did not have the expected shape
        """
        try:
            url = f'https://www.collinsdictionary.com/dictionary/english-thesaurus/{self._word}'
            if self._proxies is None:
                response = Query(url).get_single_page_html()
            else:
                response = Query(url, self._proxies).get_single_page_html()

            if response.status_code == 404:
                logger.info(f'Collins Dictionary had no synonym reference for the word {self._word}')
                return None

            soup = BeautifulSoup(response.text, 'lxml')
            if soup.find('h1', text=f'Sorry, no results for “{self._word}” in the English Thesaurus.'):
                logger.info(f'Collins Dictionary had no synonym reference for the word {self._word}')
                return None

            synonyms = []
            query_results = soup.find('div', {'class': 'blockSyn'})
            if query_results:
                # attrs must be a dict; a set literal here would be treated as a
                # list of alternative class values and also match class="class".
                for primary_syn in query_results.find_all('div', {'class': 'form type-syn orth'}):
                    synonyms.append(primary_syn.text)

                for sub_syn in query_results.find_all('div', {'class': 'form type-syn'}):
                    child = sub_syn.find('span', {'class': 'orth'})
                    # Skip entries without an 'orth' span instead of letting one
                    # malformed entry discard every synonym collected so far.
                    if child is not None:
                        synonyms.append(child.text)

            synonyms = sorted(x.lower() for x in synonyms)
            self._update_cache(synonyms)
            # Return a fresh list rather than the object handed to the cache
            # (NOTE(review): whether the cache keeps a reference is not visible here).
            return sorted(synonyms)
        except bs4.FeatureNotFound as error:
            logger.error('An error occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except AttributeError as error:
            logger.error('An AttributeError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except IndexError as error:
            logger.error('An IndexError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except KeyError as error:
            logger.error('A KeyError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except TypeError as error:
            logger.error('A TypeError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))

Exemple #11

0

Afficher le fichier

    def _query_wordhippo(self):
        """
        Look up antonyms for ``self._word`` on wordhippo.com.

        ``self._user_agent`` and ``self._proxies`` are forwarded to ``Query`` only
        when they are not None. Antonyms are read from the links inside the
        'relatedwords' section: from its 'wb' divs when present, otherwise from
        its table cells. The lower-cased, sorted result is passed to
        ``self._update_cache`` and returned.

        :returns:
            antonyms: sorted list of lower-cased antonyms, or None when the site
            answers with HTTP 404, reports no known antonyms, is detected as
            Cloudflare protected, or one of the handled errors below occurs

        :rtype: list or None

        Handled errors (caught here, logged with their traceback, never re-raised):
            bs4.FeatureNotFound: no parser with the requested features is available

            AttributeError, IndexError, KeyError, TypeError: the response or the
            page markup did not have the expected shape
        """
        try:
            # Pass only the options that are actually configured.
            query_options = {}
            if self._user_agent is not None:
                query_options['user_agent'] = self._user_agent
            if self._proxies is not None:
                query_options['proxies'] = self._proxies
            response = Query(f'https://www.wordhippo.com/what-is/the-opposite-of/{self._word}.html',
                             **query_options).get_single_page_html()

            if response.status_code == 404:
                logger.info(f'Wordhippo.com had no antonym reference for the word {self._word}')
                return None

            soup = BeautifulSoup(response.text, "lxml")
            cloudflare_protection = CloudflareVerification('https://www.wordhippo.com',
                                                           soup).cloudflare_protected_url()
            if cloudflare_protection is True:
                logger.info('-' * 80)
                logger.info('The following URL has Cloudflare DDoS mitigation service protection.')
                logger.info('https://www.wordhippo.com')
                logger.info('-' * 80)
                return None

            if cloudflare_protection is False:
                pattern = regex.compile(r'We do not currently know of any antonyms for')
                if soup.find(text=pattern):
                    logger.info(f'Wordhippo.com had no antonym reference for the word {self._word}')
                    return None

                related_tag = soup.find("div", {'class': 'relatedwords'})
                # Prefer the 'wb' word boxes; fall back to table cells.
                containers = related_tag.find_all("div", {'class': 'wb'})
                if not containers:
                    containers = related_tag.find_all('td')

                # find_all (not find) so every link in a container is visited;
                # iterating the result of find() would walk one tag's children
                # and fail with TypeError when the container has no link.
                antonyms = sorted(
                    link.text.lower()
                    for container in containers
                    for link in container.find_all('a', href=True)
                )
                self._update_cache(antonyms)
                return antonyms

        except bs4.FeatureNotFound as error:
            logger.error('An error occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except AttributeError as error:
            logger.error('An AttributeError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except IndexError as error:
            logger.error('An IndexError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except KeyError as error:
            logger.error('A KeyError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except TypeError as error:
            logger.error('A TypeError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))

Exemple #12

0

Afficher le fichier

    def _query_thesaurus_com(self):
        """
        Look up antonyms for ``self._word`` on thesaurus.com.

        ``self._user_agent`` and ``self._proxies`` are forwarded to ``Query`` only
        when they are not None. When the page has an element with id 'antonyms',
        the link texts of the second 'word-grid-container' div are stripped,
        lower-cased, sorted, passed to ``self._update_cache`` and returned.

        :returns:
            antonyms: sorted list of lower-cased antonyms, or None when the site
            answers with HTTP 404, has no antonyms section, is detected as
            Cloudflare protected, or one of the handled errors below occurs

        :rtype: list or None

        Handled errors (caught here, logged with their traceback, never re-raised):
            bs4.FeatureNotFound: no parser with the requested features is available

            AttributeError, IndexError, KeyError, TypeError: the response or the
            page markup did not have the expected shape
        """
        try:
            # Build the keyword arguments from whichever options are configured.
            options = {}
            if self._user_agent is not None:
                options['user_agent'] = self._user_agent
            if self._proxies is not None:
                options['proxies'] = self._proxies
            page_query = Query(f'https://www.thesaurus.com/browse/{self._word}', **options)
            response = page_query.get_single_page_html()

            if response.status_code == 404:
                logger.info(f'Thesaurus.com had no antonym reference for the word {self._word}')
                return None

            soup = BeautifulSoup(response.text, "lxml")
            verifier = CloudflareVerification('https://www.thesaurus.com', soup)
            cloudflare_protection = verifier.cloudflare_protected_url()

            if cloudflare_protection is True:
                logger.info('-' * 80)
                logger.info('The following URL has Cloudflare DDoS mitigation service protection.')
                logger.info('https://www.thesaurus.com')
                logger.info('-' * 80)
                return None

            if cloudflare_protection is False:
                if not soup.find("div", {'id': 'antonyms'}):
                    logger.info(f'Thesaurus.com had no antonym reference for the word {self._word}')
                    return None

                # The second word grid on the page is the one that is read.
                antonym_grid = soup.find_all("div", {'data-testid': 'word-grid-container'})[1]
                stripped = [anchor.text.strip() for anchor in antonym_grid.find_all('a', {'class': 'css-pc0050'})]
                antonyms = sorted(word.lower() for word in stripped)
                self._update_cache(antonyms)
                return antonyms

        except bs4.FeatureNotFound as error:
            logger.error('An error occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except AttributeError as error:
            logger.error('An AttributeError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except IndexError as error:
            logger.error('An IndexError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except KeyError as error:
            logger.error('A KeyError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))
        except TypeError as error:
            logger.error('A TypeError occurred in the following code segment:')
            logger.error(''.join(traceback.format_tb(error.__traceback__)))