Example #1
 def write_to_csv(self, output_file):
     # Assumes `import csv` at module level; `time_display` is a project helper.
     with open(output_file, 'w', newline='') as csvfile:
         csvwriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
         # Summary section: one label/value row per counter.
         csvwriter.writerow(['Provider Success Total', self.provider_success_count])
         csvwriter.writerow(['Provider Uploaded Total', self.provider_uploaded_count])
         csvwriter.writerow(['Provider Failed Total', self.provider_failed_count])
         csvwriter.writerow(['Provider Total', self.provider_total_count])
         csvwriter.writerow(['Percent Provider Success', self.percent_provider_success])
         csvwriter.writerow(['Provider Running Total', self.provider_running_count])
         csvwriter.writerow(['Courses Total', self.course_total_count])
         csvwriter.writerow(['Time Spent', time_display(self.spent_time)])
         # Detail table: attribute names to read and the matching header labels.
         cols = ['provider_id', 'config_file_name', 'courses_total', 'execution_time_display',
                 'status', 's3_info', 'message']
         cols_meta = ['PROVIDER_ID', 'CONFIG FILE NAME', 'COURSES TOTAL', 'TIME EXECUTION',
                      'STATUS', 'UPLOADED_S3', 'MESSAGE']
         # Two blank rows separate the summary from the detail table.
         csvwriter.writerow([])
         csvwriter.writerow([])
         csvwriter.writerow(cols_meta)
         provider_total = self.provider_success + self.provider_failed + self.provider_running
         for item in provider_total:
             fields_value = []
             for col in cols:
                 if col == 's3_info':
                     # Report the S3 upload flag when present, 'No' otherwise.
                     if item.s3_info:
                         fields_value.append(item.s3_info.uploaded)
                     else:
                         fields_value.append('No')
                 else:
                     fields_value.append(getattr(item, col))
             csvwriter.writerow(fields_value)
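For reference, the same CSV layout (summary rows, two blank separator rows, then a header row and one detail row per provider) can be sketched standalone. The provider data and file name below are invented for illustration and do not come from the original class.

import csv

# Standalone sketch of the report layout written by write_to_csv above:
# summary rows, two blank separator rows, then a header row followed by
# one detail row per provider. All data here is invented for illustration.
providers = [
    {'provider_id': 'p1', 'status': 'SUCCESS', 'courses_total': 120, 'message': ''},
    {'provider_id': 'p2', 'status': 'FAILED', 'courses_total': 0, 'message': 'timeout'},
]

with open('report.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['Provider Total', len(providers)])     # summary section
    writer.writerow([])                                     # blank separator rows
    writer.writerow([])
    writer.writerow(['PROVIDER_ID', 'STATUS', 'COURSES TOTAL', 'MESSAGE'])
    for p in providers:
        writer.writerow([p['provider_id'], p['status'], p['courses_total'], p['message']])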
Example #2
 def finish(self):
     # Assumes `import json` and `import time` at module level; status constants and
     # PERCENT_FIELD_NULL_MAX come from the surrounding module.
     if self.spider:
         collector = self.spider._crawler.stats
         # Serialize the Scrapy stats collector, falling back to None for objects
         # without a __dict__.
         self.scrapy_collector = json.dumps(collector,
                                            default=lambda o: o.__dict__ if hasattr(o, '__dict__') else None)
         self.end_time = int(time.time())
         self.execution_time = self.end_time - self.start_time
         self.execution_time_display = time_display(self.execution_time)
         self.courses_total = collector._stats.get('item_scraped_count', 0)
         self.suspect_requests_count = len(self.spider.suspect_requests)
         # Compute the null ratio per field and the average across all fields.
         if self.courses_total:
             total_percent_null = 0
             for item in self.fields_collector:
                 percent_null = round(float(item.count_null) / item.count_total, 2)
                 item.percent_null = percent_null
                 total_percent_null += percent_null
             if total_percent_null != 0:
                 self.average_percent_null = round(total_percent_null / len(self.fields_collector), 2)
         # Decide the final status from the scraped-item count and the null-field ratio.
         if self.courses_total > 0 and self.average_percent_null <= PERCENT_FIELD_NULL_MAX:
             self.status = SUCCESS_STATUS
         elif self.average_percent_null > PERCENT_FIELD_NULL_MAX:
             self.status = FAILED_STATUS
             self.message = 'Average percentage of null fields is %s' % self.average_percent_null
         elif self.status == ITEM_SCRAPED_NOT_GROW_UP:
             self.status = FAILED_STATUS
             self.message = "The website is very slow or there are multiple incorrect requests"
         else:
             self.status = FAILED_STATUS
     else:
         raise Exception("Spider is not found")
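A minimal standalone sketch of the status decision used in finish(): a per-field null ratio, its average, and a threshold comparison. The threshold value, field names, and counts below are assumptions for illustration only.

# Standalone sketch of the status decision in finish() above: per-field null
# ratio, its average, and a threshold comparison. The threshold value, field
# names, and counts are invented for illustration.
PERCENT_FIELD_NULL_MAX = 0.30

fields = [
    {'name': 'title', 'count_null': 0, 'count_total': 200},
    {'name': 'price', 'count_null': 30, 'count_total': 200},
    {'name': 'duration', 'count_null': 90, 'count_total': 200},
]

per_field = [round(f['count_null'] / f['count_total'], 2) for f in fields]
average_percent_null = round(sum(per_field) / len(per_field), 2)

if average_percent_null <= PERCENT_FIELD_NULL_MAX:
    status = 'SUCCESS'
else:
    status = 'FAILED'

print(per_field, average_percent_null, status)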