def run(self):
    selectorClass = getattr(
        eval(self.config['selector_type']),
        self.config['selector_type'].title() + 'Selector')
    results = dict()
    results['project'] = self.args['<projectname>']
    results['data'] = list()
    try:
        result = dict()
        print()
        print(Back.YELLOW + Fore.BLUE + "Loading page ", self.config['scraping']['url']
              + Back.RESET + Fore.RESET)
        selector = selectorClass(self.config['scraping']['url'])
        for attribute in self.config['scraping']['data']:
            if attribute['field'] != "":
                print("\nExtracting", attribute['field'], "attribute", sep=' ')
                result[attribute['field']] = selector.extract_content(
                    attribute['selector'], attribute['attr'], attribute['default'])
        if not self.config['scraping'].get('next'):
            results['data'].append(result)
        else:
            for next in self.config['scraping']['next']:
                for r in traverse_next(selector, next, result):
                    results['data'].append(r)
    except KeyboardInterrupt:
        pass
    except Exception as e:
        print(e)
    finally:
        if self.args['--output_type'] == 'json':
            import json
            with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.json'), 'w') as f:
                json.dump(results, f)
        elif self.args['--output_type'] == 'csv':
            import csv
            with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.csv'), 'w') as f:
                fields = extract_fieldnames(self.config)
                writer = csv.DictWriter(f, fieldnames=fields)
                writer.writeheader()
                writer.writerows(results['data'])
        print()
        print(Back.WHITE + Fore.RED + self.args['<output_filename>'],
              ".", self.args['--output_type'], " has been created"
              + Back.RESET + Fore.RESET, sep="")
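For context, here is a minimal sketch of the configuration dict that this run() method consumes. The keys are inferred from the attribute accesses in the code above; the exact schema, including the structure of the optional 'next' entries handled by traverse_next(), is defined elsewhere in the project.

# Minimal sketch of self.config, inferred from the keys accessed in run().
# In practice this configuration is loaded from <projectname>.json.
config = {
    "selector_type": "xpath",          # used to resolve the selector class
    "scraping": {
        "url": "http://example.com/page.html",
        "data": [
            {
                "field": "title",       # output field name; "" entries are skipped
                "selector": "//h1",     # selector expression passed to extract_content()
                "attr": "text",         # attribute (or text content) to extract
                "default": ""           # fallback value when nothing matches
            }
        ]
        # an optional "next" list describes links to follow; its entries are
        # consumed by traverse_next() and are not expanded in this sketch
    }
}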
def execute_command(self):
    """
    The generate command uses `Jinja2 <http://jinja.pocoo.org/>`_ templates
    to create Python scripts, according to the specification in the configuration
    file. The predefined templates use the extract_content() method of the
    :ref:`selector classes <implementation-selectors>` to implement linear extractors
    and use recursive for loops to implement multiple levels of link crawlers. This
    implementation is effectively a representation of the traverse_next()
    :ref:`utility function <implementation-utils>`, using the loop depth to
    differentiate between levels of the crawler execution.

    According to the --output_type argument in the CLI input, the results are
    written into a JSON document or a CSV document.

    The Python script is written into <output_filename>.py - running this file
    is the equivalent of using the Scrapple :ref:`run command <command-run>`.
    """
    print(Back.GREEN + Fore.BLACK + "Scrapple Generate")
    print(Back.RESET + Fore.RESET)
    directory = os.path.join(scrapple.__path__[0], 'templates', 'scripts')
    with open(os.path.join(directory, 'generate.txt'), 'r') as f:
        template_content = f.read()
    template = Template(template_content)
    try:
        with open(self.args['<projectname>'] + '.json', 'r') as f:
            config = json.load(f)
        if self.args['--output_type'] == 'csv':
            from scrapple.utils.config import extract_fieldnames
            config['fields'] = str(extract_fieldnames(config))
        config['output_file'] = self.args['<output_filename>']
        config['output_type'] = self.args['--output_type']
        rendered = template.render(config=config)
        with open(self.args['<output_filename>'] + '.py', 'w') as f:
            f.write(rendered)
        print(Back.WHITE + Fore.RED + self.args['<output_filename>'],
              ".py has been created" + Back.RESET + Fore.RESET, sep="")
    except IOError:
        print(Back.WHITE + Fore.RED + self.args['<projectname>'], ".json does not ",
              "exist. Use ``scrapple genconfig``." + Back.RESET + Fore.RESET, sep="")
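As a rough illustration of the rendering step above, the following hedged sketch shows the Jinja2 usage involved; the inline string here is a hypothetical stand-in for the project's actual generate.txt template, which expands the full scraper script from the configuration.

from jinja2 import Template

# Hypothetical stand-in for generate.txt, for illustration only.
template = Template(
    "# Generated scraper script (illustrative)\n"
    "URL = '{{ config['scraping']['url'] }}'\n"
    "OUTPUT_FILE = '{{ config['output_file'] }}.{{ config['output_type'] }}'\n"
)

config = {
    "scraping": {"url": "http://example.com/page.html"},
    "output_file": "results",
    "output_type": "json",
}

# render() substitutes the configuration values into the template text,
# which execute_command() then writes out as <output_filename>.py
print(template.render(config=config))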
def run(self):
    selectorClass = getattr(
        eval(self.config['selector_type']),
        self.config['selector_type'].title() + 'Selector')
    results = dict()
    results['project'] = self.args['<projectname>']
    results['data'] = list()
    try:
        result = dict()
        tabular_data_headers = dict()
        if self.args['--verbosity'] > 0:
            print()
            print(Back.YELLOW + Fore.BLUE + "Loading page ", self.config['scraping']['url']
                  + Back.RESET + Fore.RESET, end='')
        selector = selectorClass(self.config['scraping']['url'])
        for attribute in self.config['scraping']['data']:
            if attribute['field'] != "":
                if self.args['--verbosity'] > 1:
                    print("\nExtracting", attribute['field'], "attribute", sep=' ', end='')
                result[attribute['field']] = selector.extract_content(**attribute)
        if not self.config['scraping'].get('table'):
            result_list = [result]
        else:
            tables = self.config['scraping'].get('table', [])
            for table in tables:
                if table.get('selector', '').strip() != '':
                    table.update({
                        'result': result,
                        'verbosity': self.args['--verbosity']
                    })
                    table_headers, result_list = selector.extract_tabular(**table)
                    for th in table_headers:
                        if th not in tabular_data_headers:
                            tabular_data_headers[th] = len(tabular_data_headers)
        if not self.config['scraping'].get('next'):
            results['data'].extend(result_list)
        else:
            for nextx in self.config['scraping']['next']:
                for tdh, r in traverse_next(selector, nextx, result, verbosity=self.args['--verbosity']):
                    results['data'].append(r)
                    for th in tdh:
                        if th not in tabular_data_headers:
                            tabular_data_headers[th] = len(tabular_data_headers)
    except KeyboardInterrupt:
        pass
    except Exception as e:
        print(e)
    finally:
        if self.args['--output_type'] == 'json':
            import json
            with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.json'), 'w') as f:
                json.dump(results, f, indent=3)
        elif self.args['--output_type'] == 'csv':
            import csv
            with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.csv'), 'w') as f:
                fields = extract_fieldnames(self.config)
                data_headers = sorted(tabular_data_headers, key=lambda x: tabular_data_headers[x])
                fields.extend(data_headers)
                writer = csv.DictWriter(f, fieldnames=fields)
                writer.writeheader()
                writer.writerows(results['data'])
        if self.args['--verbosity'] > 0:
            print()
            print(Back.WHITE + Fore.RED + self.args['<output_filename>'],
                  ".", self.args['--output_type'], " has been created"
                  + Back.RESET + Fore.RESET, sep="")
def run(self):
    selectorClassMapping = {
        'xpath': XpathSelector,
        'css': CssSelector
    }
    selectorClass = selectorClassMapping.get(self.config['selector_type'].lower())
    results = dict()
    results['project'] = self.args['<projectname>']
    results['data'] = list()
    try:
        result = dict()
        tabular_data_headers = dict()
        if self.args['--verbosity'] > 0:
            print()
            print(Back.YELLOW + Fore.BLUE + "Loading page ", self.config['scraping']['url']
                  + Back.RESET + Fore.RESET, end='')
        selector = selectorClass(self.config['scraping']['url'])
        for attribute in self.config['scraping']['data']:
            if attribute['field'] != "":
                if self.args['--verbosity'] > 1:
                    print("\nExtracting", attribute['field'], "attribute", sep=' ', end='')
                result[attribute['field']] = selector.extract_content(**attribute)
        if not self.config['scraping'].get('table'):
            result_list = [result]
        else:
            tables = self.config['scraping'].get('table', [])
            for table in tables:
                if table.get('selector', '').strip() != '':
                    table.update({
                        'result': result,
                        'verbosity': self.args['--verbosity']
                    })
                    table_headers, result_list = selector.extract_tabular(**table)
                    for th in table_headers:
                        if th not in tabular_data_headers:
                            tabular_data_headers[th] = len(tabular_data_headers)
        if not self.config['scraping'].get('next'):
            results['data'].extend(result_list)
        else:
            for nextx in self.config['scraping']['next']:
                for tdh, r in traverse_next(selector, nextx, result, verbosity=self.args['--verbosity']):
                    results['data'].append(r)
                    for th in tdh:
                        if th not in tabular_data_headers:
                            tabular_data_headers[th] = len(tabular_data_headers)
    except KeyboardInterrupt:
        pass
    except Exception as e:
        print(e)
    finally:
        if self.args['--output_type'] == 'json':
            import json
            with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.json'), 'w') as f:
                json.dump(results, f, indent=4)
        elif self.args['--output_type'] == 'csv':
            import csv
            with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.csv'), 'w') as f:
                fields = extract_fieldnames(self.config)
                data_headers = sorted(tabular_data_headers, key=lambda x: tabular_data_headers[x])
                fields.extend(data_headers)
                writer = csv.DictWriter(f, fieldnames=fields)
                writer.writeheader()
                writer.writerows(results['data'])
        if self.args['--verbosity'] > 0:
            print()
            print(Back.WHITE + Fore.RED + self.args['<output_filename>'],
                  ".", self.args['--output_type'], " has been created"
                  + Back.RESET + Fore.RESET, sep="")
def run(self):
    selectorClass = getattr(
        eval(self.config['selector_type']),
        self.config['selector_type'].title() + 'Selector'
    )
    results = dict()
    results['project'] = self.args['<projectname>']
    results['data'] = list()
    try:
        result = dict()
        tabular_data_headers = dict()
        print()
        print(Back.YELLOW + Fore.BLUE + "Loading page ", self.config['scraping']['url']
              + Back.RESET + Fore.RESET)
        selector = selectorClass(self.config['scraping']['url'])
        for attribute in self.config['scraping']['data']:
            if attribute['field'] != "":
                print("\nExtracting", attribute['field'], "attribute", sep=' ')
                result[attribute['field']] = selector.extract_content(
                    attribute['selector'], attribute['attr'], attribute['default'])
        if not self.config['scraping'].get('table'):
            result_list = [result]
        else:
            tables = self.config['scraping'].get('table')
            for table in tables:
                table_headers, result_list = selector.extract_tabular(
                    result=result,
                    table_type=table.get('table_type', 'rows'),
                    header=table.get('header', []),
                    prefix=table.get('prefix', ''),
                    suffix=table.get('suffix', ''),
                    selector=table.get('selector', ''),
                    attr=table.get('attr', 'text'),
                    default=table.get('default', '')
                )
                for th in table_headers:
                    if th not in tabular_data_headers:
                        tabular_data_headers[th] = len(tabular_data_headers)
        if not self.config['scraping'].get('next'):
            results['data'].extend(result_list)
        else:
            for next in self.config['scraping']['next']:
                for tdh, r in traverse_next(selector, next, result):
                    results['data'].append(r)
                    for th in tdh:
                        if th not in tabular_data_headers:
                            tabular_data_headers[th] = len(tabular_data_headers)
    except KeyboardInterrupt:
        pass
    except Exception as e:
        print(e)
    finally:
        if self.args['--output_type'] == 'json':
            import json
            with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.json'), 'w') as f:
                json.dump(results, f)
        elif self.args['--output_type'] == 'csv':
            import csv
            with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.csv'), 'w') as f:
                fields = extract_fieldnames(self.config)
                data_headers = sorted(tabular_data_headers, key=lambda x: tabular_data_headers[x])
                fields.extend(data_headers)
                writer = csv.DictWriter(f, fieldnames=fields)
                writer.writeheader()
                writer.writerows(results['data'])
        print()
        print(Back.WHITE + Fore.RED + self.args['<output_filename>'],
              ".", self.args['--output_type'], " has been created"
              + Back.RESET + Fore.RESET, sep="")
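The keyword arguments passed to extract_tabular() above suggest the shape of a single entry in the configuration's "table" list. The sketch below is inferred from the .get() fallbacks in the code; the selector expression is only an example, and any keys beyond these are not assumed.

# Sketch of one entry in config['scraping']['table'], mirroring the
# table.get(...) defaults used in run() above.
table_entry = {
    "table_type": "rows",   # orientation of the tabular data
    "header": [],           # explicit column headers, if any
    "prefix": "",           # prepended to each generated header name
    "suffix": "",           # appended to each generated header name
    "selector": "//table//tr",  # example selector expression for the table rows
    "attr": "text",         # attribute/text to extract from each cell
    "default": ""           # fallback value for empty cells
}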