def inspect(self, parsers=None, log_results=False):
    """Run every parser against every blog and print one result row per blog.

    For each blog in ``self.blog_list`` the method calls
    ``self.inspect_blog_mapper_pair(blog, parser)`` once per parser and
    prints a tab-separated row made of: the blog index, the value of
    ``self.count_blog_files(blog)``, one result per parser, the blog name,
    ``self.blog_path + blog`` and a ``YYYY/MM/DD HH:MM:SS`` timestamp.

    Args:
        parsers: Iterable of parser keys. When empty or ``None`` the keys of
            the module-level ``parser_registry`` are used.
        log_results: When true, the same rows are also written through the
            writer returned by ``self.init_csv_writer`` and the file URL it
            returned is printed at the end.

    Returns:
        None. All output goes to stdout and, optionally, the CSV writer.
    """
    # Materialise once: the parser collection is walked for the header and
    # again for every blog, so a one-shot iterable would otherwise be
    # exhausted after the first pass.
    parsers = list(parsers) if parsers else list(parser_registry.keys())

    csv_writer = None
    file_url = None
    if log_results:
        # init_csv_writer hands back a (filename, file_url, writer) triple.
        # No slug is passed here, so the helper's own default applies.
        (_filename, file_url, csv_writer) = self.init_csv_writer(
            header=['index', 'file_count'] + parsers + ['blog', 'filepath', 'timestamp'],
        )

    for (i, blog) in enumerate(self.blog_list):
        # Inspect with every parser first, then count files, matching the
        # original call order.
        results = [self.inspect_blog_mapper_pair(blog, p) for p in parsers]
        row = [i, self.count_blog_files(blog)] + results + [
            blog,
            self.blog_path + blog,
            datetime.datetime.now().strftime("%Y/%m/%d %H:%M:%S"),
        ]

        # Single-argument print() emits the same bytes on Python 2 and 3.
        print('\t'.join(str(cell) for cell in row))
        if log_results:
            csv_writer.writerow(row)

    if log_results:
        print('Output:\t%s' % (file_url,))
def inspect_multiple(self, parsers=None, shuffle=False, max_posts=20, verbose=False):
    """Check parsers and see which one seems to fit the best.

    Every parser is run against ``self.blog_url`` through
    ``self.inspect_blog_parser_pair``. For each parser a short report is
    printed: post count, rounded percent perfect, rounded percent acceptable
    and one ``self.calc_success_rate`` line per entry of the module-level
    ``field_keys``. Finally the parser with the highest acceptable score is
    printed together with that score.

    Args:
        parsers: Iterable of parser keys. When empty or ``None`` the keys of
            the module-level ``parser_registry`` are used.
        shuffle: Forwarded unchanged to ``inspect_blog_parser_pair``.
        max_posts: Forwarded unchanged to ``inspect_blog_parser_pair``.
        verbose: Accepted for interface compatibility; not read here.

    Returns:
        None. The report is written to stdout only.
    """
    # Materialise once so the parser order is stable and reusable below.
    parsers = list(parsers) if parsers else list(parser_registry.keys())

    acceptable_pct = {}
    for p in parsers:
        results = self.inspect_blog_parser_pair(self.blog_url, p, max_posts, shuffle)
        post_count = len(results)
        perfect_pct = self.calc_percent_perfect(results)
        acceptable_pct[p] = self.calc_percent_acceptable(results)

        # Each print() takes one pre-formatted string so the output bytes are
        # the same as the Python 2 print statements and also run on Python 3.
        print(p)
        print('\t%s \tPost count' % (post_count,))
        print('\t%s \t%% perfect' % (round(100 * perfect_pct),))
        print('\t%s \t%% acceptable' % (round(100 * acceptable_pct[p]),))
        print('')
        for f in field_keys:
            print('\t%s %s' % (self.calc_success_rate(results, f), f))
        print('')

    # "None" / -1 is the sentinel for "no parser produced an acceptable
    # result"; it also covers the case where there are no parsers at all,
    # which would otherwise make max() raise ValueError.
    best_parser, best_pct = "None", -1
    if acceptable_pct:
        # Iterating `parsers` (not the dict) breaks ties by parser order.
        candidate = max(parsers, key=acceptable_pct.get)
        if acceptable_pct[candidate] > 0:
            best_parser, best_pct = candidate, acceptable_pct[candidate]

    print('')
    print('%s : %s' % (best_parser, round(100 * best_pct)))
def inspect(self, parsers=None, log_results=False, log_summary=False, shuffle=False, max_posts=20, verbose=False):
    """Score every parser against every blog and report how many blogs match.

    For each blog in ``self.blog_list`` and each parser, the method calls
    ``self.inspect_blog_parser_pair(blog, parser, max_posts, shuffle)`` and
    derives the post count, ``self.calc_percent_perfect`` and
    ``self.calc_percent_acceptable`` from the result. The parser with the
    highest positive acceptable score is the blog's best parser. A blog
    counts as an acceptable match when that score equals 1 (the scores are
    presumably 0-1 fractions; confirm against the ``calc_percent_*``
    helpers).

    Args:
        parsers: Iterable of parser keys. When empty or ``None`` the keys of
            the module-level ``parser_registry`` are used.
        log_results: When true, write one CSV row per (blog, parser) pair,
            including one ``self.calc_success_rate`` column per entry of the
            module-level ``field_keys``.
        log_summary: When true, write and print one CSV row per blog with
            the best parser and the per-parser scores.
        shuffle: Forwarded unchanged to ``inspect_blog_parser_pair``.
        max_posts: Forwarded unchanged to ``inspect_blog_parser_pair``.
        verbose: Accepted for interface compatibility; not read here.

    Returns:
        None. Output goes to stdout and, optionally, the CSV writers.
    """
    # Materialise once: the parser collection is walked for the summary
    # header and again for every blog.
    parsers = list(parsers) if parsers else list(parser_registry.keys())
    timestamp_format = "%Y/%m/%d %H:%M:%S"

    bxp_csv = None
    bxp_file_url = None
    if log_results:
        # Blog-by-parser CSV: one row per (blog, parser) pair.
        header = (
            ['index', 'post_count', 'pct_perfect', 'pct_acceptable']
            + list(field_keys)
            + ['parser', 'blog', 'filepath', 'timestamp']
        )
        (_bxp_filename, bxp_file_url, bxp_csv) = self.init_csv_writer(
            slug="ParserInspector-BxP", header=header)

    summary_csv = None
    summary_file_url = None
    if log_summary:
        # Per-blog CSV: best parser plus every parser's scores side by side.
        header = (
            ['index', "best_parser", "best_pct"]
            + [p + "_post_count" for p in parsers]
            + [p + "_pct_perfect" for p in parsers]
            + [p + "_pct_acceptable" for p in parsers]
            + ['blog', 'filepath', 'timestamp']
        )
        (_summary_filename, summary_file_url, summary_csv) = self.init_csv_writer(
            slug="ParserInspector-summary", header=header)

    acceptable_matches = 0
    for (i, blog) in enumerate(self.blog_list):
        post_count = {}
        perfect_pct = {}
        acceptable_pct = {}

        for p in parsers:
            results = self.inspect_blog_parser_pair(blog, p, max_posts, shuffle)
            post_count[p] = len(results)
            perfect_pct[p] = self.calc_percent_perfect(results)
            acceptable_pct[p] = self.calc_percent_acceptable(results)

            if log_results:
                bxp_csv.writerow(
                    [i, post_count[p], perfect_pct[p], acceptable_pct[p]]
                    + [self.calc_success_rate(results, f) for f in field_keys]
                    + [
                        p,
                        blog,
                        self.blog_path + blog,
                        datetime.datetime.now().strftime(timestamp_format),
                    ]
                )

        # "None" / -1 is the sentinel for "no parser was acceptable"; it also
        # covers an empty parser list, where max() would raise ValueError.
        best_parser, best_pct = "None", -1
        if acceptable_pct:
            # Iterating `parsers` (not the dict) breaks ties by parser order.
            candidate = max(parsers, key=acceptable_pct.get)
            if acceptable_pct[candidate] > 0:
                best_parser, best_pct = candidate, acceptable_pct[candidate]

        # Tally for every blog so the final report does not depend on
        # whether the summary CSV was requested.
        if best_pct == 1:
            acceptable_matches += 1

        if log_summary:
            row = (
                [i, best_parser, best_pct]
                + [post_count[p] for p in parsers]
                + [perfect_pct[p] for p in parsers]
                + [acceptable_pct[p] for p in parsers]
                + [
                    blog,
                    self.blog_path + blog,
                    datetime.datetime.now().strftime(timestamp_format),
                ]
            )
            summary_csv.writerow(row)
            # Single-argument print() emits the same bytes on Python 2 and 3.
            print('\t'.join(str(cell) for cell in row))

    total_blogs = len(self.blog_list)
    # Guard the empty blog list, which used to raise ZeroDivisionError, and
    # scale to a real percentage so the value matches its label.
    if total_blogs:
        percent_success = 100.0 * acceptable_matches / total_blogs
    else:
        percent_success = 0.0

    print('=' * 80)
    print('%s acceptable matches' % (acceptable_matches,))
    print('%s total blogs checked' % (total_blogs,))
    print('%s percent success' % (percent_success,))
    print('')
    if log_results:
        print('Results file:\t%s' % (bxp_file_url,))
    if log_summary:
        print('Summary file:\t%s' % (summary_file_url,))