def _exclude_selectors(self):
    """
    :return: list of css selectors
    """
    site_name = toolkit.get_site_name(self._url)
    site_exclude = self._rules[site_name]['exclude']
    return site_exclude + self._always_exlude
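# A minimal sketch of the data this helper (and _include_selectors below) assumes;
# the site name and selectors here are hypothetical examples, not part of the code:
# self._rules maps a site name to per-site css selector lists, e.g.
#     {'example.com': {'include': ['div.article-body'], 'exclude': ['div.comments']}}
# and self._always_exlude is assumed to be a list of selectors stripped for every site.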
def main():
    # parse args
    parser = argparse.ArgumentParser()
    parser.add_argument('-u', '--url', help='Target page url')
    parser.add_argument('-t', '--target', help='Css selector to process text.')
    parser.add_argument('-e', '--exclude', help='Css selector to exclude text.')
    parser.add_argument('-c', '--config', help='Path to config file')
    parser.add_argument('-d', '--debug', action='store_true')
    parser.set_defaults(debug=False, config='config.json', exclude=None)
    args = parser.parse_args()
    # /parse args

    if args.debug:
        logging.basicConfig(level=logging.DEBUG)

    # getting config
    if os.path.exists(args.config):
        with codecs.open(args.config, 'r', encoding='utf-8') as fh:
            config = json.load(fh)
    else:
        config = dict(urls=[], rules={})

    # getting rules and urls for processing
    if args.url:
        url = args.url
        site_name = toolkit.get_site_name(url)
        if args.target:
            exclude = args.exclude
            rule = dict(include=[args.target],
                        exclude=[exclude] if exclude else [])
            rules = {site_name: rule}
        else:
            rules = config['rules']
        urls = [url]
    else:
        rules = config['rules']
        urls = config['urls']

    # process urls
    text_extractor = htmltoreadable.HtmlTextExtractor(rules)
    for url in urls:
        text = text_extractor.get_text(url)
        write_to_file(url, text)
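# A minimal usage sketch for main() above; the script name, urls and selectors are
# hypothetical examples:
#
#     python html_to_readable.py -u http://example.com/article -t div.article-body -e div.comments
#     python html_to_readable.py -c config.json -d
#
# The config file read by main() is assumed to mirror the fallback it builds when
# the file is missing: a list of urls plus a per-site rules mapping whose values
# carry 'include'/'exclude' css selector lists, e.g.
#
#     {
#         "urls": ["http://example.com/article"],
#         "rules": {
#             "example.com": {
#                 "include": ["div.article-body"],
#                 "exclude": ["div.comments"]
#             }
#         }
#     }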
def _include_selectors(self):
    """
    :return: list of css selectors
    """
    site_name = toolkit.get_site_name(self._url)
    return self._rules[site_name]['include']