Exemple #1
0
    def extract(self, html_text: str, strategy: Strategy=Strategy.ALL_TEXT) \
            -> List[Extraction]:
        """
        Pull the readable text out of an HTML page.

        Args:
            html_text (): the HTML page as a string
            strategy (): Strategy.ALL_TEXT joins every text node accepted by
                self.tag_visible; any other strategy runs the page through
                Document(...).summary() first, with recallPriority switched on
                only for Strategy.MAIN_CONTENT_RELAXED

        Returns: a singleton list holding an Extraction of the extracted text, or an
            empty list when html_text is empty or when any step raises (the error
            is printed, not propagated)
        """

        try:
            if not html_text:
                return []

            if strategy == Strategy.ALL_TEXT:
                parsed_page = BeautifulSoup(html_text, 'html.parser')
                text_nodes = parsed_page.findAll(text=True)
                # Lazy generator: nodes are checked and stripped one at a time.
                shown = (node for node in text_nodes if self.tag_visible(node))
                joined = u" ".join(node.strip() for node in shown)
                return [Extraction(joined, self.name)]

            # Every non-ALL_TEXT strategy goes through the readability summary.
            prefer_recall = strategy == Strategy.MAIN_CONTENT_RELAXED
            summary_html = Document(html_text, recallPriority=prefer_recall).summary(html_partial=False)
            summary_strings = BeautifulSoup(summary_html.encode('utf-8'), 'lxml').strings
            main_text = ' '.join(summary_strings)
            return [Extraction(main_text, self.name)]
        except Exception as e:
            print('Error in extracting readability %s' % e)
            return []
Exemple #2
0
    def extract(self, html_text: str, threshold=0.5) -> List[Extraction]:
        """
        Apply every rule of self.rule_set to the page and collect the values found.

        Args:
            html_text (): str of the html page to be extracted
            threshold (): minimum ratio of rules that produced a value over all
                rules; when the ratio is below it (or there are no rules at all)
                an empty list is returned instead of the partial results

        Returns: a list of Extractions, one per rule whose value is not None, each
            carrying the rule's start_char / end_char and the rule name as tag; an
            empty list if the threshold is not met or an exception occurs (the
            error is printed, not propagated)

        """

        extractions = []
        try:
            rules = self.rule_set.rules
            for rule in rules:
                rule.apply(html_text)
                found = rule.value
                if found is None:
                    continue
                # The rule name travels along as the Extraction's tag.
                extractions.append(
                    Extraction(found,
                               self.name,
                               start_char=rule.start_char,
                               end_char=rule.end_char,
                               tag=rule.name))

            # Keep the results only if enough of the rules fired.
            rule_count = len(rules)
            enough_hits = rule_count > 0 and len(extractions) / rule_count >= threshold
            return extractions if enough_hits else []
        except Exception as e:
            print('Error in extracting landmark %s' % e)
            return []
Exemple #3
0
    def extract(self, str_date: str, settings: dict=None) -> List[Extraction]:
        """
        Parse one date string and wrap the converted result in an Extraction.

        Args:
            str_date (): a date in string form; inputs longer than 100 characters are
                rejected, and only the first 20 characters (with CR, LF, '<' and '>'
                removed) are handed to the parser
            settings (): settings passed to dateparser.parse; when empty or None,
                {'STRICT_PARSING': True} is used. Example:
                {
                    'DATE_ORDER': 'MDY',    # default to be 'MDY', shuffled Y, M, D representing Year, Month, Date
                    'STRICT_PARSING': True,
                    'FUZZY': True,
                    'PREFER_DAY_OF_MONTH': 'current',   # default to be 'current'; can be 'first' or 'last' instead;
                                                        # specify the date when the date is missing
                    'PREFER_DATES_FROM': 'current_period',  # default to be 'current_period'; can be 'future',
                                                            # or 'past' instead; specify the date when the date is missing
                    'RELATIVE_BASE': datetime.datetime(2020, 1, 1),  # default to be current date and time
                    'SKIP_TOKENS_PARSER': ['t']    # default to be ['t']; a list of tokens to discard while detecting language
                }
            see more on https://github.com/scrapinghub/dateparser/blob/master/docs/usage.rst

        Returns: a singleton list with an Extraction whose value is
            str(self.convert_to_iso_format(parsed_date)) and whose provenance records
            the untouched input under 'original_date_str'; an empty list when the
            input is too long, cannot be parsed, is older than
            self.ignore_past_years years, lies in the future while
            self.ignore_future_dates is set, or when any exception occurs (the
            error is printed, not propagated)

        """
        customized_settings = settings if settings else {'STRICT_PARSING': True}
        ori_str = str_date
        try:
            if len(str_date) > 100:
                return list()
            # Slicing already copes with strings shorter than 20 characters.
            str_date = str_date[:20]
            for unwanted in ('\r', '\n', '<', '>'):
                str_date = str_date.replace(unwanted, '')

            parsed_date = dateparser.parse(str_date, settings=customized_settings)
            if parsed_date is None:
                # Not a date: stop here rather than wrapping a converted None.
                return list()

            current_year = datetime.datetime.now().year
            if current_year - self.ignore_past_years > parsed_date.year:
                return list()
            if self.ignore_future_dates:
                # Use the parsed value's tzinfo so naive and aware datetimes are
                # never compared (that raises TypeError and would drop the date).
                now = datetime.datetime.now(parsed_date.tzinfo)
                if now < parsed_date:
                    return list()

            extracted_date = Extraction(str(self.convert_to_iso_format(parsed_date)), self.name)
            # should be better if re-consider the construction of provenance:
            extracted_date._provenance['original_date_str'] = ori_str
            return [extracted_date]
        except Exception as e:
            print('Exception: {}, failed to parse {} as date'.format(e, str_date))
            return list()
 def wrap_value_with_context(self, value: dict, field_name: str,
                             start: int = 0, end: int = 0) -> Extraction:
     """
     Build the Extraction for a final result.

     Args:
         value (): the extracted value
         field_name (): stored as the Extraction's tag
         start (): passed as start_token (defaults to 0)
         end (): passed as end_token (defaults to 0)

     Returns: an Extraction attributed to this extractor's name
     """
     wrapped = Extraction(value, self.name, start_token=start, end_token=end, tag=field_name)
     return wrapped
Exemple #5
0
    def extract(self, text: str=None, ignore_future_dates: bool=True, ignore_past_years: int=20,
                settings: dict=None) -> List[Extraction]:
        """
        Find date-like substrings in the text and turn each parseable one into an Extraction.

        Args:
            text (): the text to scan; candidates come from self.extract_date_str
            ignore_future_dates (): forwarded to the DateParser constructor
            ignore_past_years (): forwarded to the DateParser constructor
            settings (): forwarded to DateParser.extract for every candidate

        Returns: a list of Extractions, one per candidate the DateParser accepted,
            with start_char / end_char derived from the candidate's 'start' and the
            length of its 'value'
        """

        parser = DateParser(ignore_future_dates, ignore_past_years)
        extractions = []
        for candidate in self.extract_date_str(text):
            raw = candidate['value']
            parsed = parser.extract(raw, settings)
            if not parsed:
                # Nothing came back for this candidate; move on.
                continue
            begin = candidate['start']
            wrapped = Extraction(str(parsed[0].value),
                                 self.name,
                                 start_char=begin,
                                 end_char=begin + len(raw))
            # should be better if re-consider the construction of provenance:
            wrapped._provenance['original_date_str'] = raw
            extractions.append(wrapped)
        return extractions
 def wrap_split_extraction(self, items: List[str]) -> List[Extraction]:
     """
     Wrap each string in items in an Extraction with running character offsets.

     Offsets are cumulative item lengths: the first item starts at 0 and each
     item starts where the previous one ended.

     Args:
         items (): the strings to wrap, in order

     Returns: one Extraction per item, in the same order
     """
     wrapped = []
     offset = 0
     for piece in items:
         next_offset = offset + len(piece)
         wrapped.append(Extraction(value=piece,
                                   extractor_name=self.name,
                                   start_char=offset,
                                   end_char=next_offset))
         offset = next_offset
     return wrapped
Exemple #7
0
 def wrap_value_with_context(self, value: str, start: int, end: int) -> Extraction:
     """
     Build the Extraction for a final result.

     Args:
         value (): the extracted value
         start (): passed as start_token
         end (): passed as end_token

     Returns: an Extraction attributed to this extractor's name
     """
     wrapped = Extraction(value,
                          self.name,
                          start_token=start,
                          end_token=end)
     return wrapped
Exemple #8
0
 def wrap_data(self, key: str, value) -> Extraction:
     """
     Wrap a key/value pair in an Extraction.

     Args:
         key (): stored as the Extraction's tag
         value (): stored as the Extraction's value

     Returns: an Extraction attributed to this extractor's name
     """
     return Extraction(value=value,
                       extractor_name=self.name,
                       tag=key)
 def wrap_extraction(self, group_idx: int, matches: object) -> Extraction:
     """
     Wrap one group of a match in an Extraction.

     Args:
         group_idx (): the group whose text and span are used
         matches (): an object offering start(), end() and group() for that group

     Returns: an Extraction of the group's text with its start / end character
         offsets, tagged with self.general_tag
     """
     span_start = matches.start(group_idx)
     span_end = matches.end(group_idx)
     matched_text = matches.group(group_idx)
     return Extraction(value=matched_text,
                       extractor_name=self.name,
                       start_char=span_start,
                       end_char=span_end,
                       tag=self.general_tag)