def run(self):
    """Run one scraping pass: load the configured URL, extract each
    configured field, optionally follow 'next' links, and write the
    collected results as JSON or CSV.

    Reads ``self.config`` (project config dict) and ``self.args``
    (docopt-style CLI args). Returns None; output is written to
    ``<output_filename>.json`` / ``.csv`` in the current directory.
    """
    # Resolve the selector implementation, e.g. selector_type 'xpath'
    # -> module `xpath`, class `XpathSelector`.  A globals() lookup on
    # the module name replaces the original eval(), which would execute
    # arbitrary text taken from the config file.
    selector_type = self.config['selector_type']
    selectorClass = getattr(globals()[selector_type],
                            selector_type.title() + 'Selector')
    results = {
        'project': self.args['<projectname>'],
        'data': [],
    }
    try:
        result = dict()
        print()
        print(Back.YELLOW + Fore.BLUE + "Loading page ",
              self.config['scraping']['url'] + Back.RESET + Fore.RESET)
        selector = selectorClass(self.config['scraping']['url'])
        for attribute in self.config['scraping']['data']:
            if attribute['field'] != "":
                print("\nExtracting", attribute['field'], "attribute", sep=' ')
                result[attribute['field']] = selector.extract_content(
                    attribute['selector'],
                    attribute['attr'],
                    attribute['default'])
        if not self.config['scraping'].get('next'):
            results['data'].append(result)
        else:
            # 'next_page' instead of 'next' — avoid shadowing the builtin
            for next_page in self.config['scraping']['next']:
                for r in traverse_next(selector, next_page, result):
                    results['data'].append(r)
    except KeyboardInterrupt:
        pass
    except Exception as e:
        # Best-effort: report the error but still write what was collected
        print(e)
    finally:
        if self.args['--output_type'] == 'json':
            import json
            with open(os.path.join(os.getcwd(),
                                   self.args['<output_filename>'] + '.json'),
                      'w') as f:
                json.dump(results, f)
        elif self.args['--output_type'] == 'csv':
            import csv
            # newline='' is required by the csv module; without it every
            # row is followed by a blank line on Windows
            with open(os.path.join(os.getcwd(),
                                   self.args['<output_filename>'] + '.csv'),
                      'w', newline='') as f:
                fields = extract_fieldnames(self.config)
                writer = csv.DictWriter(f, fieldnames=fields)
                writer.writeheader()
                writer.writerows(results['data'])
        print()
        print(Back.WHITE + Fore.RED + self.args['<output_filename>'],
              ".", self.args['--output_type'], " has been created"
              + Back.RESET + Fore.RESET, sep="")
def run(self):
    """Execute the configured scrape and persist the results.

    Loads the scraping URL, extracts every non-empty configured field,
    follows each 'next' entry when present, and finally writes the data
    out as JSON or CSV (chosen by ``--output_type``) into the current
    working directory.
    """
    selector_type = self.config['selector_type']
    # getattr on the module found via globals() — the original used
    # eval() on config-supplied text, which can run arbitrary code.
    selectorClass = getattr(globals()[selector_type],
                            selector_type.title() + 'Selector')
    results = {
        'project': self.args['<projectname>'],
        'data': [],
    }
    try:
        result = dict()
        print()
        print(Back.YELLOW + Fore.BLUE + "Loading page ",
              self.config['scraping']['url'] + Back.RESET + Fore.RESET)
        selector = selectorClass(self.config['scraping']['url'])
        for attribute in self.config['scraping']['data']:
            if attribute['field'] != "":
                print("\nExtracting", attribute['field'], "attribute", sep=' ')
                result[attribute['field']] = selector.extract_content(
                    attribute['selector'],
                    attribute['attr'],
                    attribute['default'])
        if not self.config['scraping'].get('next'):
            results['data'].append(result)
        else:
            # renamed from 'next' so the builtin next() is not shadowed
            for next_page in self.config['scraping']['next']:
                for r in traverse_next(selector, next_page, result):
                    results['data'].append(r)
    except KeyboardInterrupt:
        pass
    except Exception as e:
        # deliberate best-effort: log and fall through to write output
        print(e)
    finally:
        if self.args['--output_type'] == 'json':
            import json
            with open(os.path.join(os.getcwd(),
                                   self.args['<output_filename>'] + '.json'),
                      'w') as f:
                json.dump(results, f)
        elif self.args['--output_type'] == 'csv':
            import csv
            # newline='' per the csv module docs (avoids doubled newlines
            # on Windows)
            with open(os.path.join(os.getcwd(),
                                   self.args['<output_filename>'] + '.csv'),
                      'w', newline='') as f:
                fields = extract_fieldnames(self.config)
                writer = csv.DictWriter(f, fieldnames=fields)
                writer.writeheader()
                writer.writerows(results['data'])
        print()
        print(Back.WHITE + Fore.RED + self.args['<output_filename>'],
              ".", self.args['--output_type'], " has been created"
              + Back.RESET + Fore.RESET, sep="")
def run(self):
    """Run the scrape with verbosity control and tabular extraction.

    Extracts the configured scalar fields, then any table sections, then
    follows 'next' links; accumulates the discovered table headers (in
    first-seen order) so the CSV writer can emit them as extra columns.
    Output goes to ``<output_filename>.json`` / ``.csv``.
    """
    selector_type = self.config['selector_type']
    # globals() lookup on the module name instead of eval() — eval on
    # config-supplied text can execute arbitrary code.
    selectorClass = getattr(globals()[selector_type],
                            selector_type.title() + 'Selector')
    results = {
        'project': self.args['<projectname>'],
        'data': [],
    }
    try:
        result = dict()
        # header -> column position, preserving first-seen order
        tabular_data_headers = dict()
        verbosity = self.args['--verbosity']
        if verbosity > 0:
            print()
            print(Back.YELLOW + Fore.BLUE + "Loading page ",
                  self.config['scraping']['url'] + Back.RESET + Fore.RESET,
                  end='')
        selector = selectorClass(self.config['scraping']['url'])
        for attribute in self.config['scraping']['data']:
            if attribute['field'] != "":
                if verbosity > 1:
                    print("\nExtracting", attribute['field'], "attribute",
                          sep=' ', end='')
                result[attribute['field']] = selector.extract_content(
                    **attribute)
        # Default so result_list is always bound — the original raised
        # NameError when a 'table' section existed but every selector
        # was blank.
        result_list = [result]
        if self.config['scraping'].get('table'):
            for table in self.config['scraping']['table']:
                if table.get('selector', '').strip() != '':
                    table.update({
                        'result': result,
                        'verbosity': verbosity
                    })
                    table_headers, result_list = selector.extract_tabular(
                        **table)
                    for th in table_headers:
                        if th not in tabular_data_headers:
                            tabular_data_headers[th] = len(
                                tabular_data_headers)
        if not self.config['scraping'].get('next'):
            results['data'].extend(result_list)
        else:
            # 'next_page' avoids shadowing the builtin next()
            for next_page in self.config['scraping']['next']:
                for tdh, r in traverse_next(
                        selector, next_page, result, verbosity=verbosity):
                    results['data'].append(r)
                    for th in tdh:
                        if th not in tabular_data_headers:
                            tabular_data_headers[th] = len(
                                tabular_data_headers)
    except KeyboardInterrupt:
        pass
    except Exception as e:
        # best-effort: report, then still write whatever was collected
        print(e)
    finally:
        if self.args['--output_type'] == 'json':
            import json
            with open(os.path.join(os.getcwd(),
                                   self.args['<output_filename>'] + '.json'),
                      'w') as f:
                json.dump(results, f, indent=3)
        elif self.args['--output_type'] == 'csv':
            import csv
            # newline='' prevents blank rows on Windows (csv module docs)
            with open(os.path.join(os.getcwd(),
                                   self.args['<output_filename>'] + '.csv'),
                      'w', newline='') as f:
                fields = extract_fieldnames(self.config)
                # sort headers back into first-seen order
                data_headers = sorted(tabular_data_headers,
                                      key=tabular_data_headers.get)
                fields.extend(data_headers)
                writer = csv.DictWriter(f, fieldnames=fields)
                writer.writeheader()
                writer.writerows(results['data'])
        if self.args['--verbosity'] > 0:
            print()
            print(Back.WHITE + Fore.RED + self.args['<output_filename>'],
                  ".", self.args['--output_type'], " has been created"
                  + Back.RESET + Fore.RESET, sep="")
def run(self):
    """Run the scrape using an explicit selector-type mapping.

    Resolves the selector class from a whitelist mapping, extracts the
    configured fields and table sections, follows 'next' links, and
    writes JSON (indent=4) or CSV output.

    Raises:
        ValueError: if ``selector_type`` is not 'xpath' or 'css' — the
            original silently produced ``selectorClass = None`` and later
            printed a cryptic "'NoneType' object is not callable".
    """
    selectorClassMapping = {
        'xpath': XpathSelector,
        'css': CssSelector
    }
    selector_type = self.config['selector_type'].lower()
    try:
        selectorClass = selectorClassMapping[selector_type]
    except KeyError:
        raise ValueError(
            "Unknown selector_type '%s'; expected one of %s"
            % (selector_type, sorted(selectorClassMapping)))
    results = {
        'project': self.args['<projectname>'],
        'data': [],
    }
    try:
        result = dict()
        # header -> column position, preserving first-seen order
        tabular_data_headers = dict()
        verbosity = self.args['--verbosity']
        if verbosity > 0:
            print()
            print(Back.YELLOW + Fore.BLUE + "Loading page ",
                  self.config['scraping']['url'] + Back.RESET + Fore.RESET,
                  end='')
        selector = selectorClass(self.config['scraping']['url'])
        for attribute in self.config['scraping']['data']:
            if attribute['field'] != "":
                if verbosity > 1:
                    print("\nExtracting", attribute['field'], "attribute",
                          sep=' ', end='')
                result[attribute['field']] = selector.extract_content(
                    **attribute)
        # Bind result_list up front — the original hit NameError when a
        # 'table' section existed but every selector was blank.
        result_list = [result]
        if self.config['scraping'].get('table'):
            for table in self.config['scraping']['table']:
                if table.get('selector', '').strip() != '':
                    table.update({
                        'result': result,
                        'verbosity': verbosity
                    })
                    table_headers, result_list = selector.extract_tabular(
                        **table)
                    for th in table_headers:
                        if th not in tabular_data_headers:
                            tabular_data_headers[th] = len(
                                tabular_data_headers)
        if not self.config['scraping'].get('next'):
            results['data'].extend(result_list)
        else:
            for nextx in self.config['scraping']['next']:
                for tdh, r in traverse_next(selector, nextx, result,
                                            verbosity=verbosity):
                    results['data'].append(r)
                    for th in tdh:
                        if th not in tabular_data_headers:
                            tabular_data_headers[th] = len(
                                tabular_data_headers)
    except KeyboardInterrupt:
        pass
    except Exception as e:
        # best-effort: report, then still write whatever was collected
        print(e)
    finally:
        if self.args['--output_type'] == 'json':
            import json
            with open(os.path.join(os.getcwd(),
                                   self.args['<output_filename>'] + '.json'),
                      'w') as f:
                json.dump(results, f, indent=4)
        elif self.args['--output_type'] == 'csv':
            import csv
            # newline='' prevents blank rows on Windows (csv module docs)
            with open(os.path.join(os.getcwd(),
                                   self.args['<output_filename>'] + '.csv'),
                      'w', newline='') as f:
                fields = extract_fieldnames(self.config)
                # restore first-seen column order
                data_headers = sorted(tabular_data_headers,
                                      key=tabular_data_headers.get)
                fields.extend(data_headers)
                writer = csv.DictWriter(f, fieldnames=fields)
                writer.writeheader()
                writer.writerows(results['data'])
        if self.args['--verbosity'] > 0:
            print()
            print(Back.WHITE + Fore.RED + self.args['<output_filename>'],
                  ".", self.args['--output_type'], " has been created"
                  + Back.RESET + Fore.RESET, sep="")
def run(self):
    """Run the scrape, including table sections with explicit defaults.

    Extracts the configured scalar fields, then each table section (with
    per-key defaults for table_type/header/prefix/suffix/selector/attr/
    default), follows 'next' links, and writes JSON or CSV output.
    """
    selector_type = self.config['selector_type']
    # globals() lookup instead of eval() — eval on config-supplied text
    # can execute arbitrary code.
    selectorClass = getattr(globals()[selector_type],
                            selector_type.title() + 'Selector')
    results = {
        'project': self.args['<projectname>'],
        'data': [],
    }
    try:
        result = dict()
        # header -> column position, preserving first-seen order
        tabular_data_headers = dict()
        print()
        print(Back.YELLOW + Fore.BLUE + "Loading page ",
              self.config['scraping']['url'] + Back.RESET + Fore.RESET)
        selector = selectorClass(self.config['scraping']['url'])
        for attribute in self.config['scraping']['data']:
            if attribute['field'] != "":
                print("\nExtracting", attribute['field'], "attribute", sep=' ')
                result[attribute['field']] = selector.extract_content(
                    attribute['selector'],
                    attribute['attr'],
                    attribute['default'])
        if not self.config['scraping'].get('table'):
            result_list = [result]
        else:
            for table in self.config['scraping'].get('table'):
                table_headers, result_list = selector.extract_tabular(
                    result=result,
                    table_type=table.get('table_type', 'rows'),
                    header=table.get('header', []),
                    prefix=table.get('prefix', ''),
                    suffix=table.get('suffix', ''),
                    selector=table.get('selector', ''),
                    attr=table.get('attr', 'text'),
                    default=table.get('default', '')
                )
                for th in table_headers:
                    if th not in tabular_data_headers:
                        tabular_data_headers[th] = len(tabular_data_headers)
        if not self.config['scraping'].get('next'):
            results['data'].extend(result_list)
        else:
            # 'next_page' avoids shadowing the builtin next()
            for next_page in self.config['scraping']['next']:
                for tdh, r in traverse_next(selector, next_page, result):
                    results['data'].append(r)
                    for th in tdh:
                        if th not in tabular_data_headers:
                            tabular_data_headers[th] = len(
                                tabular_data_headers)
    except KeyboardInterrupt:
        pass
    except Exception as e:
        # best-effort: report, then still write whatever was collected
        print(e)
    finally:
        if self.args['--output_type'] == 'json':
            import json
            with open(os.path.join(os.getcwd(),
                                   self.args['<output_filename>'] + '.json'),
                      'w') as f:
                json.dump(results, f)
        elif self.args['--output_type'] == 'csv':
            import csv
            # newline='' prevents blank rows on Windows (csv module docs)
            with open(os.path.join(os.getcwd(),
                                   self.args['<output_filename>'] + '.csv'),
                      'w', newline='') as f:
                fields = extract_fieldnames(self.config)
                # restore first-seen column order
                data_headers = sorted(tabular_data_headers,
                                      key=tabular_data_headers.get)
                fields.extend(data_headers)
                writer = csv.DictWriter(f, fieldnames=fields)
                writer.writeheader()
                writer.writerows(results['data'])
        print()
        print(Back.WHITE + Fore.RED + self.args['<output_filename>'],
              ".", self.args['--output_type'], " has been created"
              + Back.RESET + Fore.RESET, sep="")