Beispiel #1
0
    def scrape_url(
        self,
        url,
        parser='html.parser',
        tag_to_find='p',
        sent_delimiter='。',
    ):
        """
        Download a page and split the text of the selected tags into sentences.

        :param url:            address of the page to fetch
        :param parser:         parser name handed to BeautifulSoup
        :param tag_to_find:    name of the tags whose text is extracted
        :param sent_delimiter: string on which the text of every tag is split
                               into sentences (defaults to the full stop '。')
        :return: list of trimmed, non-empty sentences in document order, or
                 None if anything failed (the error is logged, not raised)
        """
        try:
            sents = []
            # NOTE(review): no timeout is passed to requests.get; consider
            # adding one if this may run against slow or unresponsive hosts.
            resp = requests.get(url=url)
            soup = BeautifulSoup(resp.content, parser)
            for cont in soup.find_all(tag_to_find):
                txt = StringUtils.trim(cont.get_text())
                # Trim first and filter afterwards: a fragment made only of
                # whitespace is truthy before trimming but empty after it, and
                # must not end up in the result as ''.
                trimmed = (
                    StringUtils.trim(s) for s in txt.split(sent_delimiter) if s
                )
                sent_list = [s for s in trimmed if s]
                sents += sent_list
                Log.debug('Split "' + str(txt) + '" into:' + str(sent_list))

            return sents
        except Exception as ex:
            Log.error(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Error scraping url "' + str(url) + '", exception: ' +
                str(ex))
            # Failure is signalled to the caller by None (kept from the
            # original implicit return).
            return None
Beispiel #2
0
 def __segment_words(self, text):
     """
     Turn a piece of text into a list of lower-cased tokens.

     The text is trimmed, lower-cased and split on single spaces; the
     resulting list is then handed to BasicPreprocessor.clean_punctuations.

     :param text: text to segment
     :return: whatever BasicPreprocessor.clean_punctuations returns for the
              token list
     """
     tokens = StringUtils.trim(text).lower().split(' ')
     # Split out punctuations
     return BasicPreprocessor.clean_punctuations(sentence=tokens)
Beispiel #3
0
    def process_common_words(self, word_split_token=' '):
        """
        Normalise self.raw_words and derive self.common_words from it.

        Step 1 trims self.raw_words, replaces non-breaking spaces, tabs and
        line breaks with word_split_token and lower-cases the result.
        Step 2 splits on word_split_token, drops empty entries, prepends the
        result of self.add_word_stems() when it is non-empty, and stores the
        sorted unique words in self.common_words.

        :param word_split_token: separator between words in self.raw_words
        :return: None
        :raises Exception: if either step fails; the message is also logged
        """
        # Step 1: clean up the raw text
        try:
            self.raw_words = StringUtils.trim(self.raw_words)
            self.raw_words = re.sub(
                '[\xa0\t\n\r]', word_split_token, self.raw_words
            )
            self.raw_words = self.raw_words.lower()
        except Exception as ex:
            errmsg = (
                f'{__name__} {getframeinfo(currentframe()).lineno}'
                f': Error processing raw words. Exception: {ex}'
            )
            Log.error(errmsg)
            raise Exception(errmsg)

        # Step 2: build the word list
        try:
            # Split and remove None, '', {}, etc. in one go
            self.common_words = [
                word
                for word in self.raw_words.split(word_split_token)
                if word
            ]

            # self.common_words is assigned before this call on purpose
            stems = self.add_word_stems()
            if stems:
                self.common_words = stems + self.common_words

            unique_words = set(self.common_words)
            self.common_words = sorted(unique_words)
            Log.info(
                f'{__name__} {getframeinfo(currentframe()).lineno}'
                f': Loaded {len(self.common_words)} common words of lang'
                f' "{self.lang}".'
            )
        except Exception as ex:
            errmsg = (
                f'{__name__} {getframeinfo(currentframe()).lineno}'
                f': Error processing common words. Exception: {ex}'
            )
            Log.error(errmsg)
            raise Exception(errmsg)
Beispiel #4
0
 def import_form_fields(
         list_json,
         mex_form_model
 ):
     """
     Build form field objects from parallel lists of field dicts and mex
     expressions.

     Each entry of list_json gets its ffld.FormField.KEY_MEX_EXPR key set
     (in place) to the trimmed expression at the same position of
     mex_form_model, and is then converted with
     ffld.FormField.import_form_field.

     :param list_json:      sequence of field descriptions (modified in place)
     :param mex_form_model: sequence of mex expressions, same length
     :return: list of imported form fields, in input order
     :raises Exception: if the two sequences differ in length, or if a field
                        cannot be imported (the latter is also logged)
     """
     if len(list_json) != len(mex_form_model):
         raise Exception(
             f'{__name__} {getframeinfo(currentframe()).lineno}'
             ': List of fields must be same length with mex expr list.'
             f' Fields: {list_json}, Mex Expr List: {mex_form_model}'
         )

     form_fields = []
     for json_field, mex_expr in zip(list_json, mex_form_model):
         json_field[ffld.FormField.KEY_MEX_EXPR] = StringUtils.trim(mex_expr)
         try:
             field = ffld.FormField.import_form_field(json_obj=json_field)
         except Exception as ex_field:
             errmsg = (
                 f'{__name__} {getframeinfo(currentframe()).lineno}'
                 f': Error importing field: {json_field}'
                 f'. Exception: {ex_field}'
             )
             Log.error(errmsg)
             raise Exception(errmsg)
         form_fields.append(field)
     return form_fields
Beispiel #5
0
 def confirm_answer(self, answer):
     """
     Confirm the current field if the answer is one of the confirm words.

     The answer is trimmed and lower-cased before it is looked up in
     self.text_list_confirm_words.

     :param answer: text of the user's reply
     :return: True if the current field was confirmed, otherwise False
     """
     normalized = StringUtils.trim(answer).lower()
     if normalized not in self.text_list_confirm_words:
         # No form confirmation
         return False
     self.confirm_current_field()
     return True
Beispiel #6
0
 def get_training_data_by_scraping(
     self,
     url,
     tag_to_find='p',
     min_char_per_sent=0,
     max_char_per_sent=np.inf,
     rm_html_markup=False,
     unquote_html=False,
 ):
     """
     Scrape sentences from a page and keep those within a length range.

     :param url:               address of the page to scrape
     :param tag_to_find:       tag name passed on to Scrape.scrape_url
     :param min_char_per_sent: minimum length (inclusive) of a kept sentence
     :param max_char_per_sent: maximum length (inclusive) of a kept sentence
     :param rm_html_markup:    if True, strip every '<...>' pattern
     :param unquote_html:      if True, decode %xx escapes, e.g.
                               '%3Fmode%3DLSD' becomes '?mode=LSD'
     :return: list of cleaned sentences whose length is within the range
     """
     scraped = Scrape().scrape_url(url=url, tag_to_find=tag_to_find)
     if scraped is None:
         # The scraper appears to signal failure with None (after logging
         # it); treat that as "nothing scraped" instead of failing on len().
         scraped = []
     Log.info(
         str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) +
         ': Scraped ' + str(len(scraped)) +
         ' sentences from url "' + str(url) + '"')

     # Matches all patterns '<...>'; compiled once, outside the loop
     html_tags_re = re.compile(r'<[^>]+>')

     sentences_list = []
     for s in scraped:
         s = StringUtils.trim(s)
         # NOTE(review): no parser is named here, so BeautifulSoup chooses
         # one itself - confirm whether a fixed parser is wanted.
         s = BeautifulSoup(s).text
         s_clean = s
         if rm_html_markup:
             s_clean = html_tags_re.sub('', s_clean)
         if unquote_html:
             # Work on s_clean, not s, so that markup removal is kept when
             # both options are enabled.
             s_clean = urllib.parse.unquote(string=s_clean)
         if min_char_per_sent <= len(s_clean) <= max_char_per_sent:
             sentences_list.append(s_clean)
         Log.debug('From\n\r\t"' + str(s) + '" to\n\r\t"' + str(s_clean) +
                   '"')
     Log.info(
         str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) +
         ': Filtered to ' + str(len(sentences_list)) +
         ' sentences from url "' + str(url) + '"')
     return sentences_list
Beispiel #7
0
    def confirm_form(self, answer):
        """
        Handle the user's reply to the form confirmation question.

        If the trimmed, lower-cased answer is one of
        self.text_list_confirm_words, the form is marked completed and
        confirmed and the error counter is reset. Otherwise the trimmed
        answer is used to try to update all fields; the error counter is
        reset on a successful update and incremented otherwise, and the form
        is reset once the error threshold is hit.

        :param answer: text of the user's reply
        :return: True if the form was confirmed, otherwise False
        """
        trimmed = StringUtils.trim(answer)

        if trimmed.lower() in self.text_list_confirm_words:
            self.set_state_form_completed_and_confirmed()
            self.reset_continuous_error_count()
            return True

        # Try to update all fields strictly, maybe user wants to change something
        result = self.set_all_field_value_from_answer(answer=trimmed)
        if not result.is_updated:
            self.increment_continuous_error_count()
        else:
            self.reset_continuous_error_count()

        if self.is_error_threshold_hit():
            Log.warning(
                f'{self.__class__} {getframeinfo(currentframe()).lineno}'
                f': Reset form after {self.fill_form_continuous_err_count}'
                ' error counts.'
            )
            self.reset()

        # No form confirmation
        return False
Beispiel #8
0
 def trim_lower(x):
     """
     Return the string form of x, trimmed with StringUtils.trim and
     lower-cased.

     :param x: any value; it is converted with str() first
     :return: the trimmed, lower-cased string
     """
     return StringUtils.trim(str(x)).lower()