def reconcile_matches(fn, clear=True):
    '''
    Write the matches recorded in a results file back to the search sheet.

    The sheet comes from get_search_sheet(). Search terms are read from
    column B, match counts are written to column D and the comma-separated
    URLs of the matching files to column E, in every case from row 2 down
    to the sheet's last row.

    fn    -- path of a file holding one JSON object per line; each object
             must have a 'filepath' key and a 'positives' mapping whose
             keys are the terms that matched in that file.
    clear -- when true (the default), blank column E first and rewrite
             columns D and E for every row, so rows without a match end
             up with an empty URL list and a count of 0. When false, only
             rows that have at least one match are modified.
    '''
    sheet = get_search_sheet()
    last_row = sheet.row_count
    terms = sheet.range('B2:B%s' % last_row)
    # Map each term to its row number; if a term appears in several rows
    # the last one wins.
    line_numbers = dict(
        zip([x.value for x in terms], range(2, last_row + 1)))
    match_counts = sheet.range('D2:D%s' % last_row)
    output_values = sheet.range('E2:E%s' % last_row)

    if clear:
        for ov in output_values:
            ov.value = ''

    matches = defaultdict(list)
    # Context manager so the results file is closed even when a line fails
    # to parse or a key is missing.
    with codecs.open(fn) as results:
        for line in results:
            js = json.loads(line)
            url = filename_to_s3path(js['filepath'])
            for positive in js['positives']:
                # Re-escape before the lookup -- presumably the sheet
                # stores the regex-escaped form of each term (TODO confirm).
                positive = re.escape(positive)
                lineno = line_numbers.get(positive)
                if lineno is None:
                    print('no lineno for %s' % positive)
                    continue
                matches[lineno].append(url)

    for lineno in range(2, last_row + 1):
        urls = matches.get(lineno, [])
        if urls or clear:
            # The cell lists start at row 2, hence the offset of 2.
            output_values[lineno - 2].value = ', '.join(urls)
            match_counts[lineno - 2].value = len(urls)

    sheet.update_cells(output_values)
    sheet.update_cells(match_counts)
def reconcile_matches(fn, clear=True):
    '''
    Write the matches recorded in a results file back to the search sheet.

    The sheet comes from get_search_sheet(). Search terms are read from
    column B, match counts are written to column D and the comma-separated
    URLs of the matching files to column E, in every case from row 2 down
    to the sheet's last row.

    fn    -- path of a file holding one JSON object per line; each object
             must have a 'filepath' key and a 'positives' mapping whose
             keys are the terms that matched in that file.
    clear -- when true (the default), blank column E first and rewrite
             columns D and E for every row, so rows without a match end
             up with an empty URL list and a count of 0. When false, only
             rows that have at least one match are modified.
    '''
    sheet = get_search_sheet()
    last_row = sheet.row_count
    terms = sheet.range('B2:B%s' % last_row)
    # Map each term to its row number; if a term appears in several rows
    # the last one wins.
    line_numbers = dict(
        zip([x.value for x in terms], range(2, last_row + 1)))
    match_counts = sheet.range('D2:D%s' % last_row)
    output_values = sheet.range('E2:E%s' % last_row)

    if clear:
        for ov in output_values:
            ov.value = ''

    matches = defaultdict(list)
    # Context manager so the results file is closed even when a line fails
    # to parse or a key is missing.
    with codecs.open(fn) as results:
        for line in results:
            js = json.loads(line)
            url = filename_to_s3path(js['filepath'])
            for positive in js['positives']:
                # Re-escape before the lookup -- presumably the sheet
                # stores the regex-escaped form of each term (TODO confirm).
                positive = re.escape(positive)
                lineno = line_numbers.get(positive)
                if lineno is None:
                    print('no lineno for %s' % positive)
                    continue
                matches[lineno].append(url)

    for lineno in range(2, last_row + 1):
        urls = matches.get(lineno, [])
        if urls or clear:
            # The cell lists start at row 2, hence the offset of 2.
            output_values[lineno - 2].value = ', '.join(urls)
            match_counts[lineno - 2].value = len(urls)

    sheet.update_cells(output_values)
    sheet.update_cells(match_counts)
def reducer(self, _, filepaths):
    '''
    Yield a (None, url) pair for every path in filepaths.

    Each path is converted with filename_to_s3path(); the incoming key is
    ignored and every emitted pair uses None as its key.
    '''
    for path in filepaths:
        yield None, filename_to_s3path(path)