Example #1
import codecs
import json
import re
from collections import defaultdict

# get_search_sheet() and filename_to_s3path() are helpers defined elsewhere
# in the same module: the first returns the search spreadsheet, the second
# maps a local filepath to its public S3 URL.


def reconcile_matches(fn, clear=True):
    '''
    Read search results (one JSON object per line) from `fn` and write the
    matching document URLs (column E) and match counts (column D) back into
    the search spreadsheet. If `clear` is True, existing output cells are
    blanked first.
    '''
    sheet = get_search_sheet()

    # Map each search term in column B to its spreadsheet row number.
    terms = sheet.range('B2:B%s' % sheet.row_count)
    line_numbers = dict(
        zip([x.value for x in terms], range(2, sheet.row_count + 1)))

    match_counts = sheet.range('D2:D%s' % sheet.row_count)
    output_values = sheet.range('E2:E%s' % sheet.row_count)

    if clear:
        for ov in output_values:
            ov.value = ''

    # Collect the URL of every document that matched each term, keyed by row.
    matches = defaultdict(list)
    for line in codecs.open(fn):
        js = json.loads(line)
        url = filename_to_s3path(js['filepath'])
        #url = js['filepath'].replace('/data/sedar', 'https://sedar.openoil.net.s3.amazonaws.com')
        for positive, details in js['positives'].items():
            positive = re.escape(positive)  # terms are stored re.escape()d in the sheet
            lineno = line_numbers.get(positive, None)
            if lineno is None:
                print('no lineno for %s' % positive)
                continue

            matches[lineno].append(url)

    # Write the joined URL list and the match count into each row's cells,
    # then push both cell ranges back to the sheet in bulk.
    for lineno in range(2, sheet.row_count + 1):
        if matches[lineno] or clear:
            output_values[lineno - 2].value = ', '.join(matches[lineno])
            match_counts[lineno - 2].value = len(matches[lineno])
    sheet.update_cells(output_values)
    sheet.update_cells(match_counts)
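
Both examples rely on a filename_to_s3path() helper defined elsewhere in the module. Judging from the commented-out line above, it likely just swaps the local /data/sedar prefix for the public S3 bucket host; a minimal sketch under that assumption (not the module's actual implementation):

def filename_to_s3path(filepath):
    # Hypothetical reconstruction: replace the local data prefix with the
    # public S3 bucket URL, as in the commented-out line above.
    return filepath.replace('/data/sedar',
                            'https://sedar.openoil.net.s3.amazonaws.com')

With that in place, the reconciliation would be run as, e.g., reconcile_matches('matches.jsonl') (the filename here is only illustrative).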
Example #4
def reducer(self, _, filepaths):
    # Reducer step: convert each collected local filepath into its public
    # S3 URL and emit it.
    for filepath in filepaths:
        url = filename_to_s3path(filepath)
        yield None, url
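
The (self, key, values) generator signature matches the mrjob reducer convention, so this method presumably lives on a job class. A minimal sketch of such a class, assuming mrjob and a line-per-filepath input (the class name and mapper are assumptions, not from the source):

from mrjob.job import MRJob

class S3PathJob(MRJob):  # hypothetical job class, not from the source
    def mapper(self, _, line):
        # Assume each input line is a local filepath; use a single key so
        # the reducer receives the whole list.
        yield None, line.strip()

    def reducer(self, _, filepaths):
        # Same reducer as above: emit the public S3 URL for each filepath,
        # using the module's filename_to_s3path() helper.
        for filepath in filepaths:
            url = filename_to_s3path(filepath)
            yield None, url

if __name__ == '__main__':
    S3PathJob.run()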