def zip_results(filename: str, score_type: str, column_idx: int):
    """Copy the scores from ``results/<name>.csv`` back into the spreadsheet.

    The CSV is consumed in order: each CSV row is attached to the next sheet
    row whose cell in ``column_idx`` holds a value matching the e-mail/domain
    regex. The segment and the score are written to two new columns placed
    after the sheet's last used column, and the workbook is saved next to the
    input as ``<name>_with-results.<ext>``.

    Args:
        filename: Path of the ``.xlsx`` or ``.xls`` file to enrich.
        score_type: Not used by this function.
        column_idx: Column part of the cell reference; it is formatted
            directly in front of the row number (``'{}{}'``) to address the
            cell holding the e-mail/domain.

    Raises:
        SystemExit: With code 1 when the extension is unsupported or when a
            CSV row does not belong to the sheet row it is matched against.
        ValueError: When a CSV row has fewer than three columns or a score
            that is not an integer.
    """
    import csv
    import sys

    regex = re.compile(r'(?:@)?(?P<tld>[\w\-]+\.\w+)')
    print("Welcome to the results zipper! Wait for the xlsx to load.")
    if re.search(r'\.xlsx$', filename):
        workbook = load_workbook(filename=filename, keep_vba=False)
        result_filename = filename.replace(".xlsx", ".csv")
    elif re.search(r'\.xls$', filename):
        workbook = open_xls_as_xlsx(filename)
        result_filename = filename.replace(".xls", ".csv")
    else:
        print("Unsupported file format!")
        sys.exit(1)
    sheet = workbook.active
    print("File loaded. results/{} will be zipped to it.".format(
        result_filename))
    with open("results/" + result_filename, "r", encoding="utf-8",
              newline="") as result:
        # csv.reader copes with quoted signal columns that contain commas.
        reader = csv.reader(result)
        rows = sheet.max_row
        columns = sheet.max_column
        # Data starts at row 2; skip leading rows whose column A is empty.
        skip_empty = 2
        for row in sheet['A2:B256']:
            if not row[0].value:
                skip_empty += 1
            else:
                break
        # CSV row read but not yet attached to a sheet row.
        pending = None
        # max_row is 1-based and inclusive, hence the + 1.
        for line in range(skip_empty, rows + 1):
            if line % 100 == 0:
                print("Zipped {0:.3f}%".format(line / rows * 100.))
            if pending is None:
                pending = next(reader, None)
                if pending is None:
                    # The CSV is shorter than the sheet (some sheet rows were
                    # skipped while scoring): nothing left to zip.
                    break
            # Only the first three columns are needed; rows may carry a
            # fourth "signals" column.
            domain, segment, score = pending[:3]
            mail = sheet['{}{}'.format(column_idx, line)].value
            if not mail or not regex.search(mail):
                # Invalid sheet entry, skipped while scoring as well: keep
                # the pending CSV row for the next sheet row.
                continue
            if domain not in mail:
                print(
                    "ERROR! Zipping on weird data (trying to zip {} with {})".
                    format(mail, domain))
                sys.exit(1)
            sheet.cell(line, columns + 1).value = segment
            sheet.cell(line, columns + 2).value = int(score)
            pending = None
    # Only touch the extension, not every ".xls" occurring in the path.
    stem, _, extension = filename.rpartition(".")
    filename_with_results = "{}_with-results.{}".format(stem, extension)
    print("Now saving to {}, this might take several minutes...".format(
        filename_with_results))
    workbook.save(filename_with_results)
    print("You're good to go!")
def run_xls(filename: str, api_key: str, score_type: str, column_idx: int):
    """Score every e-mail/domain of a spreadsheet and append the results to a CSV.

    One CSV line is appended to ``results/<name>.csv`` for each sheet row
    whose cell in ``column_idx`` is non-empty and matches the e-mail/domain
    regex. When the CSV already has lines (a previous run was interrupted),
    that many valid rows are skipped before scoring resumes.

    Args:
        filename: Path of the ``.xlsx`` or ``.xls`` file to score.
        api_key: Key passed as the basic-auth user name to the scoring API.
        score_type: ``'domain'`` to score the domain extracted from the cell,
            ``'email'`` to score the whole cell value.
        column_idx: Column part of the cell reference; it is formatted
            directly in front of the row number (``'{}{}'``).

    Raises:
        SystemExit: Always. Code 0 on completion, code 1 when the extension is
            unsupported or when any exception interrupts the scoring loop.
    """
    import sys

    print("Welcome to the bulk persons searcher! Wait for the xlsx to load.")
    if re.search(r'\.xlsx$', filename):
        workbook = load_workbook(filename=filename, keep_vba=False)
        result_filename = filename.replace(".xlsx", ".csv")
    elif re.search(r'\.xls$', filename):
        workbook = open_xls_as_xlsx(filename)
        result_filename = filename.replace(".xls", ".csv")
    else:
        print("Unsupported file format!")
        sys.exit(1)
    sheet = workbook.active
    regex = re.compile(r'(?:@)?(?P<tld>[\w\-]+\.\w+)')
    # Per-run caches so a repeated domain/e-mail costs a single API call.
    domains_scored = {}
    emails_scored = {}
    print("File loaded. Results will be saved to results/{}.".format(
        result_filename))
    with open("results/" + result_filename, "a+") as result:
        result.seek(0)
        # Lines already present come from a previous, interrupted run.
        already_written = sum(1 for _ in result)
        # Data starts at row 2; skip leading rows whose column A is empty.
        skip_empty = 2
        for row in sheet['A2:B256']:
            if not row[0].value:
                skip_empty += 1
            else:
                break
        try:
            rows = sheet.max_row
            # max_row is 1-based and inclusive, hence the + 1.
            for line in range(skip_empty, rows + 1):
                if line % 100 == 0:
                    print("Currently at {}%".format(line / rows * 100.))
                email = sheet['{}{}'.format(column_idx, line)].value
                if not email:
                    continue
                search = regex.search(email)
                if not search:
                    continue
                if already_written > 0:
                    # Only valid rows produce a CSV line, so resuming has to
                    # count valid rows rather than raw sheet rows.
                    already_written -= 1
                    continue
                print("scoring: " + email)
                if score_type == 'domain':
                    domain = search.group('tld')
                    if domain not in domains_scored:
                        resp = requests.get(API_DOMAIN_URL,
                                            auth=(api_key, ''),
                                            params={"domain": domain})
                        domains_scored[domain] = resp.json(
                        )['properties']['customer_fit']
                    customer_fit = domains_scored[domain]
                    # NOTE(review): the previous code passed a formatted
                    # signals argument here that the three-placeholder
                    # template never rendered. Output stays at three columns
                    # because zip_results reads domain, segment and score.
                    result.write("{},{},{}\n".format(
                        domain, customer_fit['segment'],
                        customer_fit['score']))
                elif score_type == 'email':
                    if email not in emails_scored:
                        resp = requests.get(API_PERSON_URL,
                                            auth=(api_key, ''),
                                            params={"email": email})
                        emails_scored[email] = resp.json(
                        )['properties']['customer_fit']
                    customer_fit = emails_scored[email]
                    result.write("{},{},{},{}\n".format(
                        email, customer_fit['segment'],
                        customer_fit['score'],
                        '"' + format_signals(
                            customer_fit.get('top_signals', '')) + '"'))
        except Exception:
            # Persist what was scored so far so a relaunch can resume.
            result.flush()
            logger.exception("Exception met. Relaunch to resume!\n")
            sys.exit(1)
    sys.exit(0)
async def run_xls(filename: str, api_key: str, score_type: str, column_idx: int):
    """Score the values of a spreadsheet column concurrently, in batches of 100.

    Values found in ``column_idx`` (rows 2 to the last row) that do not
    already appear in ``results/<name>.csv`` are queued; every 100 queued
    values are scored together through ``get`` and the results are appended
    to the CSV.

    Args:
        filename: Path of the ``.xlsx`` or ``.xls`` file to score.
        api_key: Key forwarded to ``get``.
        score_type: Forwarded to ``get``; also the key read from each result
            to obtain the value written in the first CSV column.
        column_idx: Column part of the cell reference; it is formatted
            directly in front of the row number (``'{}{}'``).

    Raises:
        SystemExit: Always. Code 0 on completion, code 1 when the extension is
            unsupported or when any exception interrupts the scoring.
    """
    import sys

    print("Welcome to the bulk persons searcher! Wait for the xlsx to load.")
    if re.search(r'\.xlsx$', filename):
        workbook = load_workbook(filename=filename, keep_vba=False)
        result_filename = filename.replace(".xlsx", ".csv")
    elif re.search(r'\.xls$', filename):
        workbook = open_xls_as_xlsx(filename)
        result_filename = filename.replace(".xls", ".csv")
    else:
        print("Unsupported file format!")
        sys.exit(1)
    sheet = workbook.active
    values_to_score = []
    values_scored = {}

    async def write_to_file(values_to_score):
        """Score one batch concurrently and append one CSV row per usable result."""
        results = await asyncio.gather(*[
            get(score_type, api_key, value_to_score)
            for value_to_score in values_to_score
        ])
        for result in results:
            if result and 'properties' in result:
                customer_fit_result = result['properties']['customer_fit']
                values_scored[result[score_type]] = customer_fit_result
                if 'top_signals_formatted' in customer_fit_result:
                    # Quote the free-text signals column (doubling embedded
                    # quotes) so commas inside it do not split the row.
                    signals = str(
                        customer_fit_result['top_signals_formatted']
                    ).replace('"', '""')
                    new_row = "{},{},{},\"{}\"\n".format(
                        result[score_type],
                        customer_fit_result['segment'],
                        customer_fit_result['score'], signals)
                else:
                    new_row = "{},{},{}\n".format(
                        result[score_type],
                        customer_fit_result['segment'],
                        customer_fit_result['score'])
                readcsv.write(new_row)
        readcsv.flush()

    print("File loaded. Results will be saved to results/{}.".format(
        result_filename))
    with open("results/" + result_filename, "a+") as readcsv:
        readcsv.seek(0)
        # Content of a previous run, used to skip values already scored.
        full_csv = readcsv.read()
        try:
            rows = sheet.max_row
            # Values waiting in the current batch, to avoid scoring the same
            # value twice before values_scored is updated.
            queued = set()
            # max_row is 1-based and inclusive, hence the + 1.
            for line in range(2, rows + 1):
                email_or_domain = sheet['{}{}'.format(column_idx, line)].value
                if not email_or_domain:
                    # Empty cell: nothing to score.
                    continue
                # NOTE(review): this is a substring test against the whole
                # CSV text, so a value contained in a longer, already scored
                # one is treated as done.
                if email_or_domain not in full_csv:
                    print("scoring: " + email_or_domain)
                    if (email_or_domain not in values_scored
                            and email_or_domain not in queued):
                        queued.add(email_or_domain)
                        values_to_score.append(email_or_domain)
                if len(values_to_score) == 100:
                    await write_to_file(values_to_score)
                    values_to_score = []
                    queued.clear()
                    print("Currently at {}%".format(line / rows * 100.))
            if values_to_score:
                # Score the last, incomplete batch.
                await write_to_file(values_to_score)
        except Exception:
            # Persist what was scored so far so a relaunch can resume.
            readcsv.flush()
            logger.exception("Exception met. Relaunch to resume!\n")
            sys.exit(1)
    sys.exit(0)