def extract(self,
            html_text: str,
            strategy: Strategy=Strategy.ALL_TEXT) -> List[Extraction]:
    """
    Extract text from an HTML page using the selected strategy.

    Args:
        html_text (str): the HTML page as a string
        strategy (Strategy): one of Strategy.ALL_TEXT, Strategy.MAIN_CONTENT_STRICT
            and Strategy.MAIN_CONTENT_RELAXED

    Returns:
        A list of Extraction(s): a singleton list holding the extracted text, or an
        empty list when html_text is empty or when any exception was raised while
        extracting (the error is printed, not propagated).
    """
    try:
        if not html_text:
            return []

        if strategy == Strategy.ALL_TEXT:
            # Keep only the text nodes accepted by self.tag_visible, stripped
            # and joined with single spaces.
            nodes = BeautifulSoup(html_text, 'html.parser').findAll(text=True)
            joined = u" ".join(node.strip() for node in nodes if self.tag_visible(node))
            return [Extraction(joined, self.name)]

        # Every other strategy goes through Document; only the relaxed
        # strategy switches recallPriority on.
        prefer_recall = strategy == Strategy.MAIN_CONTENT_RELAXED
        document = Document(html_text, recallPriority=prefer_recall)
        summary_html = document.summary(html_partial=False)
        fragments = BeautifulSoup(summary_html.encode('utf-8'), 'lxml').strings
        return [Extraction(' '.join(fragments), self.name)]
    except Exception as e:
        print('Error in extracting readability %s' % e)
        return []
def extract(self, html_text: str, threshold=0.5) -> List[Extraction]:
    """
    Apply every rule of self.rule_set to the page and collect the values found.

    Args:
        html_text (str): the html page to be extracted
        threshold (float): minimum ratio of rules that extracted a value over all
            rules; when the ratio is greater than or equal to the threshold the
            results are returned, otherwise an empty list is returned

    Returns:
        a list of Extractions; each one carries the extracted value, this
        extractor's name, the start/end characters reported by the rule and the
        rule name as its tag. An empty list is returned when there are no rules,
        when the threshold is not met, or when an exception was raised (the error
        is printed, not propagated).
    """
    extractions = []
    try:
        for rule in self.rule_set.rules:
            rule.apply(html_text)
            found = rule.value
            if found is None:
                continue
            # The rule name is passed through as the Extraction tag.
            extractions.append(
                Extraction(found,
                           self.name,
                           start_char=rule.start_char,
                           end_char=rule.end_char,
                           tag=rule.name))

        # Only report results when enough of the rules produced a value.
        rule_count = len(self.rule_set.rules)
        if rule_count <= 0:
            return []
        if len(extractions) / rule_count >= threshold:
            return extractions
        return []
    except Exception as e:
        print('Error in extracting landmark %s' % e)
        return []
def extract(self, str_date: str, settings: dict=None) -> List[Extraction]:
    """
    Parse a date string with dateparser and wrap the result in an Extraction.

    Args:
        str_date (str): a date as a string. Inputs longer than 100 characters are
            rejected; otherwise only the first 20 characters are parsed, after
            removing carriage returns, newlines, '<' and '>'.
        settings (dict): settings used when parsing the date; defaults to
            {'STRICT_PARSING': True} when empty or None. Example:
            {
                'DATE_ORDER': 'MDY',  # default to be 'MDY', shuffled Y, M, D representing Year, Month, Date
                'STRICT_PARSING': True,
                'FUZZY': True,
                'PREFER_DAY_OF_MONTH': 'current',  # default to be 'current'; can be 'first' or 'last' instead;
                                                   # specify the date when the date is missing
                'PREFER_DATES_FROM': 'current_period',  # default to be 'current_period'; can be 'future',
                                                        # or 'past' instead; specify the date when the date is missing
                'RELATIVE_BASE': datetime.datetime(2020, 1, 1),  # default to be current date and time
                'SKIP_TOKENS_PARSER': ['t']  # default to be ['t']; a list of tokens to discard while detecting language
            }
            see more on https://github.com/scrapinghub/dateparser/blob/master/docs/usage.rst

    Returns:
        A singleton list with an Extraction whose value is
        str(self.convert_to_iso_format(parsed_date)) and whose provenance records
        the original input under 'original_date_str'. An empty list is returned
        when the input is too long, cannot be parsed, is older than
        self.ignore_past_years years, lies in the future while
        self.ignore_future_dates is set, or when any exception was raised (the
        error is printed, not propagated).
    """
    customized_settings = settings if settings else {'STRICT_PARSING': True}
    ori_str = str_date
    try:
        if len(str_date) > 100:
            return list()

        # Truncate first, then drop characters that confuse the parser.
        str_date = str_date[:20]
        for unwanted in ('\r', '\n', '<', '>'):
            str_date = str_date.replace(unwanted, '')

        parsed_date = dateparser.parse(str_date, settings=customized_settings)
        if not parsed_date:
            # Always honour the declared return type instead of falling
            # through and implicitly returning None.
            return list()

        # dateparser may return a timezone-aware datetime; take "now" in the
        # same timezone (tzinfo is None for naive results) so the comparison
        # below cannot raise on mixed naive/aware operands.
        now = datetime.datetime.now(tz=parsed_date.tzinfo)
        if now.year - self.ignore_past_years > parsed_date.year:
            return list()
        if self.ignore_future_dates and now < parsed_date:
            return list()

        extracted_date = Extraction(str(self.convert_to_iso_format(parsed_date)), self.name)
        # should be better if re-consider the construction of provenance:
        extracted_date._provenance['original_date_str'] = ori_str
        return [extracted_date]
    except Exception as e:
        print('Exception: {}, failed to parse {} as date'.format(e, str_date))
        return list()
def wrap_value_with_context(self,
                            value: dict,
                            field_name: str,
                            start: int = 0,
                            end: int = 0) -> Extraction:
    """
    Wrap the final result in an Extraction.

    Args:
        value (dict): the extracted value
        field_name (str): used as the tag of the Extraction
        start (int): passed as start_token, defaults to 0
        end (int): passed as end_token, defaults to 0

    Returns:
        an Extraction built from the value and this extractor's name
    """
    token_span = {'start_token': start, 'end_token': end}
    return Extraction(value, self.name, tag=field_name, **token_span)
def extract(self,
            text: str=None,
            ignore_future_dates: bool=True,
            ignore_past_years: int=20,
            settings: dict=None) -> List[Extraction]:
    """
    Find date-like substrings in the text and parse each of them.

    The candidate substrings come from self.extract_date_str(text); each one is
    parsed by a DateParser built from ignore_future_dates and ignore_past_years.
    Candidates for which the parser returns nothing are skipped.

    Args:
        text (str): the text to search
        ignore_future_dates (bool): forwarded to DateParser
        ignore_past_years (int): forwarded to DateParser
        settings (dict): forwarded to DateParser.extract

    Returns:
        a list of Extractions, one per parsed candidate, each holding the parsed
        value as a string and the character span of the candidate in the text
    """
    parser = DateParser(ignore_future_dates, ignore_past_years)
    extractions = []
    for candidate in self.extract_date_str(text):
        raw = candidate['value']
        parsed = parser.extract(raw, settings)
        if not parsed:
            continue

        begin = candidate['start']
        found = Extraction(str(parsed[0].value),
                           self.name,
                           start_char=begin,
                           end_char=begin + len(raw))
        # should be better if re-consider the construction of provenance:
        found._provenance['original_date_str'] = raw
        extractions.append(found)
    return extractions
def wrap_split_extraction(self, items: List[str]) -> List[Extraction]:
    """
    Wrap each item in an Extraction with consecutive character offsets.

    The first item starts at offset 0 and every following item starts where the
    previous one ended, so the offsets are running sums of the item lengths.

    Args:
        items (List[str]): the pieces to wrap, in order

    Returns:
        a list with one Extraction per item, in the same order
    """
    # boundaries[i] is the start of items[i]; boundaries[i + 1] is its end.
    boundaries = [0]
    for piece in items:
        boundaries.append(boundaries[-1] + len(piece))

    return [
        Extraction(value=piece, extractor_name=self.name, start_char=begin, end_char=finish)
        for piece, begin, finish in zip(items, boundaries, boundaries[1:])
    ]
def wrap_value_with_context(self, value: str, start: int, end: int) -> Extraction:
    """
    Wrap the final result in an Extraction.

    Args:
        value (str): the extracted value
        start (int): passed as start_token
        end (int): passed as end_token

    Returns:
        an Extraction built from the value and this extractor's name
    """
    token_span = {'start_token': start, 'end_token': end}
    return Extraction(value, self.name, **token_span)
def wrap_data(self, key: str, value) -> Extraction:
    """
    Wrap a value in an Extraction tagged with its key.

    Args:
        key (str): used as the tag of the Extraction
        value: the value to wrap

    Returns:
        an Extraction built from the value and this extractor's name
    """
    return Extraction(value=value, extractor_name=self.name, tag=key)
def wrap_extraction(self, group_idx: int, matches: object) -> Extraction:
    """
    Wrap one group of a match in an Extraction.

    Args:
        group_idx (int): index of the group to wrap
        matches (object): match object exposing group() and the group's
            start/end offsets

    Returns:
        an Extraction holding the group's text and character span, tagged with
        self.general_tag
    """
    # span() yields the same (start, end) pair as start() and end().
    begin, finish = matches.span(group_idx)
    return Extraction(value=matches.group(group_idx),
                      extractor_name=self.name,
                      start_char=begin,
                      end_char=finish,
                      tag=self.general_tag)