Esempio n. 1
0
 def run(self):
     """
     Scrape the configured start page and write the results to a file.

     The selector class is looked up from ``self.config['selector_type']``,
     every non-blank field listed under ``self.config['scraping']['data']``
     is extracted from the start URL, and any ``next`` entries are followed
     through traverse_next(). The collected data is written as JSON or CSV
     (depending on ``--output_type``) to ``<output_filename>`` resolved against
     the current working directory.

     The file is written in the ``finally`` clause, so whatever was gathered
     before a KeyboardInterrupt or an error is still saved.
     """
     # NOTE(review): eval() is applied to a value taken from the project
     # configuration. A crafted ``selector_type`` could run arbitrary code;
     # confirm the configuration is always trusted.
     selector_module = eval(self.config['selector_type'])
     selectorClass = getattr(
         selector_module,
         self.config['selector_type'].title() + 'Selector')
     results = {'project': self.args['<projectname>'], 'data': []}
     try:
         record = {}
         print()
         scraping = self.config['scraping']
         print(Back.YELLOW + Fore.BLUE + "Loading page ",
               scraping['url'] + Back.RESET + Fore.RESET)
         selector = selectorClass(scraping['url'])
         for attribute in scraping['data']:
             # Entries with a blank field name are skipped entirely.
             if attribute['field'] == "":
                 continue
             print("\nExtracting", attribute['field'], "attribute", sep=' ')
             record[attribute['field']] = selector.extract_content(
                 attribute['selector'],
                 attribute['attr'],
                 attribute['default'])
         if scraping.get('next'):
             for next_config in scraping['next']:
                 # Append one row at a time so rows gathered before an
                 # interruption are still present when the file is written.
                 for row in traverse_next(selector, next_config, record):
                     results['data'].append(row)
         else:
             results['data'].append(record)
     except KeyboardInterrupt:
         pass
     except Exception as e:
         print(e)
     finally:
         output_type = self.args['--output_type']
         if output_type == 'json':
             import json
             json_path = os.path.join(
                 os.getcwd(), self.args['<output_filename>'] + '.json')
             with open(json_path, 'w') as f:
                 json.dump(results, f)
         elif output_type == 'csv':
             import csv
             csv_path = os.path.join(
                 os.getcwd(), self.args['<output_filename>'] + '.csv')
             with open(csv_path, 'w') as f:
                 fields = extract_fieldnames(self.config)
                 writer = csv.DictWriter(f, fieldnames=fields)
                 writer.writeheader()
                 writer.writerows(results['data'])
         print()
         print(Back.WHITE + Fore.RED + self.args['<output_filename>'],
               ".", output_type, " has been created"
               + Back.RESET + Fore.RESET, sep="")
Esempio n. 2
0
 def run(self):
     """
     Run the scraper described by ``self.config`` and save its output.

     Steps, in order: resolve the selector class named by
     ``self.config['selector_type']``, load the start URL, extract each
     non-blank field in ``self.config['scraping']['data']``, then either keep
     that single record or expand it through traverse_next() for every
     ``next`` entry. The output file (JSON or CSV, chosen by
     ``--output_type``) is written from the ``finally`` clause, so it is
     produced even when scraping was interrupted or raised.
     """
     selector_type = self.config['selector_type']
     # NOTE(review): eval() on a configuration value can execute arbitrary
     # code if the configuration is not trusted - confirm the source.
     selectorClass = getattr(eval(selector_type),
                             selector_type.title() + 'Selector')
     results = dict(project=self.args['<projectname>'], data=[])
     try:
         extracted = dict()
         print()
         print(Back.YELLOW + Fore.BLUE + "Loading page ",
               self.config['scraping']['url'] + Back.RESET + Fore.RESET)
         selector = selectorClass(self.config['scraping']['url'])
         for spec in self.config['scraping']['data']:
             field_name = spec['field']
             if field_name != "":
                 print("\nExtracting", field_name, "attribute", sep=' ')
                 value = selector.extract_content(
                     spec['selector'], spec['attr'], spec['default'])
                 extracted[spec['field']] = value
         next_levels = self.config['scraping'].get('next')
         if not next_levels:
             results['data'].append(extracted)
         else:
             for level in self.config['scraping']['next']:
                 # Rows are appended individually so a partial crawl is
                 # still available to the writer below.
                 for row in traverse_next(selector, level, extracted):
                     results['data'].append(row)
     except KeyboardInterrupt:
         pass
     except Exception as e:
         print(e)
     finally:
         if self.args['--output_type'] == 'json':
             import json
             target = os.path.join(
                 os.getcwd(), self.args['<output_filename>'] + '.json')
             with open(target, 'w') as out:
                 json.dump(results, out)
         elif self.args['--output_type'] == 'csv':
             import csv
             target = os.path.join(
                 os.getcwd(), self.args['<output_filename>'] + '.csv')
             with open(target, 'w') as out:
                 fields = extract_fieldnames(self.config)
                 writer = csv.DictWriter(out, fieldnames=fields)
                 writer.writeheader()
                 writer.writerows(results['data'])
         print()
         print(Back.WHITE + Fore.RED + self.args['<output_filename>'],
               ".", self.args['--output_type'],
               " has been created" + Back.RESET + Fore.RESET, sep="")
Esempio n. 3
0
    def execute_command(self):
        """
        Generate a Python scraper script from the project configuration.

        The generate command uses `Jinja2 <http://jinja.pocoo.org/>`_ templates
        to create Python scripts, according to the specification in the
        configuration file. The predefined templates use the extract_content()
        method of the :ref:`selector classes <implementation-selectors>` to
        implement linear extractors and use recursive for loops to implement
        multiple levels of link crawlers. This implementation is effectively a
        representation of the traverse_next()
        :ref:`utility function <implementation-utils>`, using the loop depth to
        differentiate between levels of the crawler execution.

        According to the --output_type argument in the CLI input, the results
        are written into a JSON document or a CSV document.

        The Python script is written into <output_filename>.py - running this
        file is the equivalent of using the Scrapple
        :ref:`run command <command-run>`.

        """
        print(Back.GREEN + Fore.BLACK + "Scrapple Generate")
        print(Back.RESET + Fore.RESET)
        template_path = os.path.join(
            scrapple.__path__[0], 'templates', 'scripts', 'generate.txt')
        with open(template_path, 'r') as template_file:
            template_source = template_file.read()
        template = Template(template_source)
        # NOTE(review): every IOError raised below - including a failure to
        # write the generated script - is reported as a missing project
        # configuration. Confirm whether that is intended.
        try:
            project_name = self.args['<projectname>']
            with open(project_name + '.json', 'r') as config_file:
                config = json.load(config_file)
            output_type = self.args['--output_type']
            if output_type == 'csv':
                # The CSV header list is handed to the template as a string.
                from scrapple.utils.config import extract_fieldnames
                config['fields'] = str(extract_fieldnames(config))
            output_name = self.args['<output_filename>']
            config['output_file'] = output_name
            config['output_type'] = output_type
            script = template.render(config=config)
            with open(output_name + '.py', 'w') as script_file:
                script_file.write(script)
            print(Back.WHITE + Fore.RED + output_name,
                  ".py has been created" + Back.RESET + Fore.RESET, sep="")
        except IOError:
            print(Back.WHITE + Fore.RED + self.args['<projectname>'],
                  ".json does not ",
                  "exist. Use ``scrapple genconfig``." + Back.RESET + Fore.RESET,
                  sep="")
Esempio n. 4
0
    def execute_command(self):
        """
        Render ``<output_filename>.py`` from ``<projectname>.json``.

        The generate command uses `Jinja2 <http://jinja.pocoo.org/>`_ templates
        to create Python scripts, according to the specification in the
        configuration file. The predefined templates use the extract_content()
        method of the :ref:`selector classes <implementation-selectors>` to
        implement linear extractors and use recursive for loops to implement
        multiple levels of link crawlers. This implementation is effectively a
        representation of the traverse_next()
        :ref:`utility function <implementation-utils>`, using the loop depth to
        differentiate between levels of the crawler execution.

        According to the --output_type argument in the CLI input, the results
        are written into a JSON document or a CSV document.

        The Python script is written into <output_filename>.py - running this
        file is the equivalent of using the Scrapple
        :ref:`run command <command-run>`.

        """
        print(Back.GREEN + Fore.BLACK + "Scrapple Generate")
        print(Back.RESET + Fore.RESET)
        scripts_dir = os.path.join(scrapple.__path__[0], 'templates', 'scripts')
        with open(os.path.join(scripts_dir, 'generate.txt'), 'r') as source:
            raw_template = source.read()
        template = Template(raw_template)
        try:
            with open(self.args['<projectname>'] + '.json', 'r') as source:
                config = json.load(source)
            wants_csv = self.args['--output_type'] == 'csv'
            if wants_csv:
                # Imported lazily: only the CSV path needs the field names.
                from scrapple.utils.config import extract_fieldnames
                config['fields'] = str(extract_fieldnames(config))
            config['output_file'] = self.args['<output_filename>']
            config['output_type'] = self.args['--output_type']
            rendered_script = template.render(config=config)
            script_path = self.args['<output_filename>'] + '.py'
            with open(script_path, 'w') as target:
                target.write(rendered_script)
            print(Back.WHITE + Fore.RED + self.args['<output_filename>'],
                  ".py has been created" + Back.RESET + Fore.RESET, sep="")
        except IOError:
            # NOTE(review): this handler also catches an IOError raised while
            # writing the script, yet always blames the project configuration.
            print(Back.WHITE + Fore.RED + self.args['<projectname>'],
                  ".json does not ",
                  "exist. Use ``scrapple genconfig``." + Back.RESET + Fore.RESET,
                  sep="")
Esempio n. 5
0
 def run(self):
     """
     Scrape the configured start page and write the results to a file.

     The selector class is looked up from ``self.config['selector_type']``.
     Every non-blank field under ``self.config['scraping']['data']`` is
     extracted from the start URL, optional ``table`` entries are extracted
     through the selector's extract_tabular(), and optional ``next`` entries
     are followed through traverse_next(). Progress messages are printed
     according to ``--verbosity``.

     The output (JSON or CSV, chosen by ``--output_type``) is written from the
     ``finally`` clause to ``<output_filename>`` resolved against the current
     working directory, so data gathered before an interruption or error is
     still saved. For CSV, headers reported by tabular extraction are appended
     to the configured field names in first-seen order.
     """
     # NOTE(review): eval() is applied to a value taken from the project
     # configuration. A crafted ``selector_type`` could run arbitrary code;
     # confirm the configuration is always trusted.
     selectorClass = getattr(
         eval(self.config['selector_type']),
         self.config['selector_type'].title() + 'Selector')
     results = dict()
     results['project'] = self.args['<projectname>']
     results['data'] = list()
     # Bound before the ``try`` so the CSV writer in ``finally`` can always
     # read it, whatever happened during scraping.
     tabular_data_headers = dict()

     def register_headers(headers):
         # Map each new header to its first-seen position.
         for header in headers:
             tabular_data_headers.setdefault(
                 header, len(tabular_data_headers))

     try:
         verbosity = self.args['--verbosity']
         scraping = self.config['scraping']
         result = dict()
         if verbosity > 0:
             print()
             print(Back.YELLOW + Fore.BLUE + "Loading page ",
                   scraping['url'] + Back.RESET + Fore.RESET, end='')
         selector = selectorClass(scraping['url'])
         for attribute in scraping['data']:
             if attribute['field'] != "":
                 if verbosity > 1:
                     print("\nExtracting",
                           attribute['field'],
                           "attribute",
                           sep=' ',
                           end='')
                 result[attribute['field']] = selector.extract_content(
                     **attribute)
         # Default to the single linear record. Previously this name stayed
         # unbound when tables were configured but every one had a blank
         # selector, and the scraped record was lost.
         result_list = [result]
         for table in scraping.get('table') or []:
             if table.get('selector', '').strip() != '':
                 # Pass the extra arguments on a copy so the loaded
                 # configuration is not modified as a side effect.
                 table_args = dict(
                     table, result=result, verbosity=verbosity)
                 # NOTE(review): each table replaces result_list, so only
                 # the last extracted table's rows are kept - confirm.
                 table_headers, result_list = selector.extract_tabular(
                     **table_args)
                 register_headers(table_headers)
         if not scraping.get('next'):
             results['data'].extend(result_list)
         else:
             for next_config in scraping['next']:
                 for headers, row in traverse_next(
                         selector,
                         next_config,
                         result,
                         verbosity=verbosity):
                     results['data'].append(row)
                     register_headers(headers)
     except KeyboardInterrupt:
         pass
     except Exception as e:
         print(e)
     finally:
         if self.args['--output_type'] == 'json':
             import json
             with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.json'),
                       'w') as f:
                 json.dump(results, f, indent=3)
         elif self.args['--output_type'] == 'csv':
             import csv
             with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.csv'),
                       'w') as f:
                 fields = extract_fieldnames(self.config)
                 # Order the tabular headers by the position recorded above.
                 data_headers = sorted(
                     tabular_data_headers,
                     key=lambda x: tabular_data_headers[x])
                 fields.extend(data_headers)
                 writer = csv.DictWriter(f, fieldnames=fields)
                 writer.writeheader()
                 writer.writerows(results['data'])
         if self.args['--verbosity'] > 0:
             print()
             print(Back.WHITE + Fore.RED + self.args['<output_filename>'],
                   ".", self.args['--output_type'], " has been created"
                   + Back.RESET + Fore.RESET, sep="")
Esempio n. 6
0
 def run(self):
     """
     Scrape the configured start page and write the results to a file.

     ``self.config['selector_type']`` (case-insensitive) selects between
     XpathSelector and CssSelector. Every non-blank field under
     ``self.config['scraping']['data']`` is extracted from the start URL,
     optional ``table`` entries are extracted through the selector's
     extract_tabular(), and optional ``next`` entries are followed through
     traverse_next(). Progress messages are printed according to
     ``--verbosity``.

     The output (JSON or CSV, chosen by ``--output_type``) is written from the
     ``finally`` clause to ``<output_filename>`` resolved against the current
     working directory, so data gathered before an interruption or error is
     still saved. Errors raised while scraping, including an unsupported
     ``selector_type``, are printed rather than propagated.
     """
     selectorClassMapping = {
         'xpath': XpathSelector,
         'css': CssSelector
     }
     selectorClass = selectorClassMapping.get(self.config['selector_type'].lower())
     results = dict()
     results['project'] = self.args['<projectname>']
     results['data'] = list()
     # Bound before the ``try`` so the CSV writer in ``finally`` can always
     # read it, whatever happened during scraping.
     tabular_data_headers = dict()

     def register_headers(headers):
         # Map each new header to its first-seen position.
         for header in headers:
             tabular_data_headers.setdefault(header, len(tabular_data_headers))

     try:
         if selectorClass is None:
             # Raised inside the ``try`` so it is reported like any other
             # scraping error, instead of surfacing later as
             # "'NoneType' object is not callable".
             raise ValueError(
                 "Unsupported selector_type {0!r}; expected one of: {1}".format(
                     self.config['selector_type'],
                     ', '.join(sorted(selectorClassMapping))))
         verbosity = self.args['--verbosity']
         scraping = self.config['scraping']
         result = dict()
         if verbosity > 0:
             print()
             print(Back.YELLOW + Fore.BLUE + "Loading page ",
                   scraping['url'] + Back.RESET + Fore.RESET, end='')
         selector = selectorClass(scraping['url'])
         for attribute in scraping['data']:
             if attribute['field'] != "":
                 if verbosity > 1:
                     print("\nExtracting", attribute['field'], "attribute", sep=' ', end='')
                 result[attribute['field']] = selector.extract_content(**attribute)
         # Default to the single linear record. Previously this name stayed
         # unbound when tables were configured but every one had a blank
         # selector, and the scraped record was lost.
         result_list = [result]
         for table in scraping.get('table') or []:
             if table.get('selector', '').strip() != '':
                 # Pass the extra arguments on a copy so the loaded
                 # configuration is not modified as a side effect.
                 table_args = dict(table, result=result, verbosity=verbosity)
                 # NOTE(review): each table replaces result_list, so only
                 # the last extracted table's rows are kept - confirm.
                 table_headers, result_list = selector.extract_tabular(**table_args)
                 register_headers(table_headers)
         if not scraping.get('next'):
             results['data'].extend(result_list)
         else:
             for next_config in scraping['next']:
                 for headers, row in traverse_next(selector, next_config, result, verbosity=verbosity):
                     results['data'].append(row)
                     register_headers(headers)
     except KeyboardInterrupt:
         pass
     except Exception as e:
         print(e)
     finally:
         if self.args['--output_type'] == 'json':
             import json
             with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.json'),
                       'w') as f:
                 json.dump(results, f, indent=4)
         elif self.args['--output_type'] == 'csv':
             import csv
             with open(os.path.join(os.getcwd(), self.args['<output_filename>'] + '.csv'),
                       'w') as f:
                 fields = extract_fieldnames(self.config)
                 # Order the tabular headers by the position recorded above.
                 data_headers = sorted(tabular_data_headers, key=lambda x: tabular_data_headers[x])
                 fields.extend(data_headers)
                 writer = csv.DictWriter(f, fieldnames=fields)
                 writer.writeheader()
                 writer.writerows(results['data'])
         if self.args['--verbosity'] > 0:
             print()
             print(Back.WHITE + Fore.RED + self.args['<output_filename>'],
                   ".", self.args['--output_type'], " has been created"
                   + Back.RESET + Fore.RESET, sep="")
Esempio n. 7
0
 def run(self):
     """
     Scrape the configured start page and write the results to a file.

     The selector class is looked up from ``self.config['selector_type']``.
     Every non-blank field under ``self.config['scraping']['data']`` is
     extracted from the start URL, optional ``table`` entries are extracted
     through the selector's extract_tabular(), and optional ``next`` entries
     are followed through traverse_next().

     The output (JSON or CSV, chosen by ``--output_type``) is written from the
     ``finally`` clause, so data gathered before an interruption or error is
     still saved. For CSV, headers reported by tabular extraction are appended
     to the configured field names in first-seen order.
     """
     # NOTE(review): eval() is applied to a value taken from the project
     # configuration. A crafted ``selector_type`` could run arbitrary code;
     # confirm the configuration is always trusted.
     selector_module = eval(self.config['selector_type'])
     selectorClass = getattr(
         selector_module,
         self.config['selector_type'].title() + 'Selector')
     results = {'project': self.args['<projectname>'], 'data': []}
     try:
         record = {}
         tabular_data_headers = {}
         print()
         print(Back.YELLOW + Fore.BLUE + "Loading page ",
               self.config['scraping']['url'] + Back.RESET + Fore.RESET)
         selector = selectorClass(self.config['scraping']['url'])
         for attribute in self.config['scraping']['data']:
             # Entries with a blank field name are skipped entirely.
             if attribute['field'] == "":
                 continue
             print("\nExtracting", attribute['field'], "attribute", sep=' ')
             record[attribute['field']] = selector.extract_content(
                 attribute['selector'],
                 attribute['attr'],
                 attribute['default'])
         if self.config['scraping'].get('table'):
             for table in self.config['scraping'].get('table'):
                 # NOTE(review): each table replaces ``rows``, so only the
                 # last table's rows are kept - confirm this is intended.
                 table_headers, rows = selector.extract_tabular(
                     result=record,
                     table_type=table.get('table_type', 'rows'),
                     header=table.get('header', []),
                     prefix=table.get('prefix', ''),
                     suffix=table.get('suffix', ''),
                     selector=table.get('selector', ''),
                     attr=table.get('attr', 'text'),
                     default=table.get('default', ''))
                 for header in table_headers:
                     # Record the position at which a header is first seen.
                     tabular_data_headers.setdefault(
                         header, len(tabular_data_headers))
         else:
             rows = [record]
         if self.config['scraping'].get('next'):
             for next_config in self.config['scraping']['next']:
                 for headers, row in traverse_next(selector, next_config, record):
                     results['data'].append(row)
                     for header in headers:
                         tabular_data_headers.setdefault(
                             header, len(tabular_data_headers))
         else:
             results['data'].extend(rows)
     except KeyboardInterrupt:
         pass
     except Exception as e:
         print(e)
     finally:
         output_type = self.args['--output_type']
         if output_type == 'json':
             import json
             json_path = os.path.join(
                 os.getcwd(), self.args['<output_filename>'] + '.json')
             with open(json_path, 'w') as f:
                 json.dump(results, f)
         elif output_type == 'csv':
             import csv
             csv_path = os.path.join(
                 os.getcwd(), self.args['<output_filename>'] + '.csv')
             with open(csv_path, 'w') as f:
                 fields = extract_fieldnames(self.config)
                 # Order the tabular headers by their recorded position.
                 data_headers = sorted(
                     tabular_data_headers,
                     key=lambda name: tabular_data_headers[name])
                 fields.extend(data_headers)
                 writer = csv.DictWriter(f, fieldnames=fields)
                 writer.writeheader()
                 writer.writerows(results['data'])
         print()
         print(Back.WHITE + Fore.RED + self.args['<output_filename>'],
               ".", output_type, " has been created"
               + Back.RESET + Fore.RESET, sep="")