def zip_results(filename: str, score_type: str, column_idx: int):
    """Copy segments and scores from ``results/<name>.csv`` into the workbook.

    The CSV is consumed in lockstep with the active sheet: every row whose
    cell in ``column_idx`` is non-empty and matches the e-mail/domain pattern
    uses up one CSV line; any other row is skipped without consuming a line.
    The segment and the integer score are written into the two columns to the
    right of the sheet's last used column, and the workbook is then saved
    beside the input file with ``_with-results`` inserted before the
    extension.

    Args:
        filename: Path of the ``.xlsx`` or ``.xls`` workbook.
        score_type: Not used by this function.
        column_idx: Column part of the cell reference that holds the e-mail
            or domain. NOTE(review): annotated ``int`` but interpolated into
            references such as ``"C12"``, so callers presumably pass a column
            letter -- confirm.

    Raises:
        SystemExit: Status 1 for an unsupported file extension, or when the
            first CSV field is not contained in the sheet value it is paired
            with.
        ValueError: If a CSV line has fewer than three fields or its score is
            not an integer.
    """
    regex = re.compile(r'(?:@)?(?P<tld>[\w\-]+\.\w+)')

    print("Welcome to the results zipper! Wait for the xlsx to load.")
    if filename.endswith(".xlsx"):
        workbook = load_workbook(filename=filename, keep_vba=False)
        result_filename = filename.replace(".xlsx", ".csv")
    elif filename.endswith(".xls"):
        workbook = open_xls_as_xlsx(filename)
        result_filename = filename.replace(".xls", ".csv")
    else:
        print("Unsupported file format!")
        # SystemExit directly: the ``exit`` helper only exists when the
        # ``site`` module has been loaded.
        raise SystemExit(1)
    sheet = workbook.active

    print("File loaded. results/{} will be zipped to it.".format(
        result_filename))
    with open("results/" + result_filename, "rb") as result:
        # Read the sheet dimensions before the range probe below touches
        # any cells.
        rows = sheet.max_row
        columns = sheet.max_column

        # Row 1 is treated as the header; also skip leading rows whose
        # column A is empty.
        skip_empty = 2
        for row in sheet['A2:B256']:
            if row[0].value:
                break
            skip_empty += 1

        # max_row is an inclusive 1-based index, hence the "+ 1".
        for line in range(skip_empty, rows + 1):
            if line % 100 == 0:
                print("Zipped {0:.3f}%".format(line / (rows * 1.) * 100.))

            mail = sheet['{}{}'.format(column_idx, line)].value
            if not mail or not regex.search(mail):
                # Invalid entries produced no CSV line when they were
                # scored, so they must not consume one here either.
                continue

            result_line = result.readline()
            if not result_line:
                # The CSV ran out before the sheet did (e.g. the scoring run
                # was interrupted): nothing left to zip.
                break

            # Only the first three fields matter. A fourth one (top signals)
            # may itself contain commas, so never split past it.
            fields = result_line.decode("utf-8").rstrip("\r\n").split(",", 3)
            domain, segment, score = fields[:3]
            if domain not in mail:
                print(
                    "ERROR! Zipping on weird data (trying to zip {} with {})".
                    format(mail, domain))
                raise SystemExit(1)

            sheet.cell(line, columns + 1).value = segment
            # The async writer in this file appends a stray '"' after the
            # score on 3-column lines; tolerate it.
            sheet.cell(line, columns + 2).value = int(score.strip().strip('"'))

    filename_with_results = filename.replace(".xls", "_with-results.xls")
    print("Now saving to {}, this might take several minutes...".format(
        filename_with_results))
    workbook.save(filename_with_results)
    print("You're good to go!")
def run_xls(filename: str, api_key: str, score_type: str, column_idx: int):
    """Score each e-mail/domain of a workbook column and append to a CSV.

    Walks the active sheet from the first row (after the header) whose
    column A is non-empty, queries the scoring API once per distinct domain
    (``score_type == 'domain'``) or e-mail (``score_type == 'email'``) and
    appends one line per valid row to ``results/<name>.csv``. Rows that are
    empty or do not match the e-mail/domain pattern produce no line. When the
    CSV already holds lines from an earlier run, the rows that produced them
    are skipped so a relaunch resumes where the previous run stopped.

    NOTE(review): an ``async def run_xls`` defined later in this file rebinds
    the name, so this synchronous version is only reachable if that one is
    removed or renamed.

    Args:
        filename: Path of the ``.xlsx`` or ``.xls`` workbook.
        api_key: Sent as the basic-auth user name on every API request.
        score_type: ``'domain'`` or ``'email'``; any other value writes
            nothing.
        column_idx: Column part of the cell reference that holds the value
            to score. NOTE(review): annotated ``int`` but interpolated into
            references such as ``"C12"``, so callers presumably pass a column
            letter -- confirm.

    Raises:
        SystemExit: Always. Status 0 after a complete run, status 1 for an
            unsupported extension or after any exception while scoring (the
            exception is logged first).
    """
    print("Welcome to the bulk persons searcher! Wait for the xlsx to load.")
    if filename.endswith(".xlsx"):
        workbook = load_workbook(filename=filename, keep_vba=False)
        result_filename = filename.replace(".xlsx", ".csv")
    elif filename.endswith(".xls"):
        workbook = open_xls_as_xlsx(filename)
        result_filename = filename.replace(".xls", ".csv")
    else:
        print("Unsupported file format!")
        # SystemExit directly: the ``exit`` helper only exists when the
        # ``site`` module has been loaded.
        raise SystemExit(1)

    sheet = workbook.active
    regex = re.compile(r'(?:@)?(?P<tld>[\w\-]+\.\w+)')
    # Read before the range probe below: touching cells of a short sheet may
    # materialise empty ones and inflate max_row.
    rows = sheet.max_row

    # Per-run caches so each distinct value hits the API only once.
    domains_scored = {}
    emails_scored = {}

    print("File loaded. Results will be saved to results/{}.".format(
        result_filename))
    with open("results/" + result_filename, "a+") as result:
        result.seek(0)
        lines_done = sum(1 for _ in result)

        # Row 1 is treated as the header; also skip leading rows whose
        # column A is empty.
        first_row = 2
        for row in sheet['A2:B256']:
            if row[0].value:
                break
            first_row += 1

        try:
            # Resume: only valid rows produced a CSV line, so step over as
            # many valid rows as there are lines instead of assuming one
            # line per sheet row.
            resume_row = first_row
            while lines_done > 0 and resume_row <= rows:
                seen = sheet['{}{}'.format(column_idx, resume_row)].value
                if seen and regex.search(seen):
                    lines_done -= 1
                resume_row += 1

            # max_row is an inclusive 1-based index, hence the "+ 1".
            for line in range(resume_row, rows + 1):
                if line % 100 == 0:
                    print("Currently at {}%".format(line / (rows * 1.) * 100.))
                email = sheet['{}{}'.format(column_idx, line)].value
                if not email:
                    continue

                search = regex.search(email)
                print("scoring: " + email)
                if not search:
                    continue

                if score_type == 'domain':
                    # Prefer the match that starts at the last '@' so a
                    # dotted local part ("bob.smith@foo.com") is not
                    # mistaken for the domain; fall back to the first match.
                    at_match = regex.search(email, max(email.rfind("@"), 0))
                    domain = (at_match or search).group('tld')

                    if domain not in domains_scored:
                        resp = requests.get(API_DOMAIN_URL,
                                            auth=(api_key, ''),
                                            params={"domain": domain})
                        domains_scored[domain] = resp.json(
                        )['properties']['customer_fit']

                    customer_fit = domains_scored[domain]
                    # NOTE(review): this branch has always emitted three
                    # columns (a fourth format argument was passed but never
                    # rendered); kept at three so existing CSVs stay
                    # consistent.
                    result.write("{},{},{}\n".format(
                        domain, customer_fit['segment'],
                        customer_fit['score']))
                elif score_type == 'email':
                    if email not in emails_scored:
                        resp = requests.get(API_PERSON_URL,
                                            auth=(api_key, ''),
                                            params={"email": email})
                        emails_scored[email] = resp.json(
                        )['properties']['customer_fit']

                    customer_fit = emails_scored[email]
                    result.write("{},{},{},{}\n".format(
                        email, customer_fit['segment'], customer_fit['score'],
                        '"' +
                        format_signals(customer_fit.get('top_signals', '')) +
                        '"'))
        except Exception:
            # Persist what was scored so far; the next launch resumes from
            # the number of lines already in the CSV.
            result.flush()
            logger.exception("Exception met. Relaunch to resume!\n")
            raise SystemExit(1)
    raise SystemExit(0)
async def run_xls(filename: str, api_key: str, score_type: str,
                  column_idx: int):
    """Score a workbook column concurrently and append the results to a CSV.

    Reads the values of ``column_idx`` from row 2 to the last row of the
    active sheet, skips empty cells and values whose exact text is already
    the first field of a line in ``results/<name>.csv``, and scores the rest
    in batches of 100 concurrent ``get`` calls. Each successful result is
    appended to the CSV and the file is flushed after every batch.

    Args:
        filename: Path of the ``.xlsx`` or ``.xls`` workbook.
        api_key: Forwarded unchanged to ``get``.
        score_type: Forwarded to ``get`` and used as the key under which each
            result carries the scored value.
        column_idx: Column part of the cell reference that holds the value
            to score. NOTE(review): annotated ``int`` but interpolated into
            references such as ``"C12"``, so callers presumably pass a column
            letter -- confirm.

    Raises:
        SystemExit: Always. Status 0 after a complete run, status 1 for an
            unsupported extension or after an exception while walking the
            sheet (the exception is logged first).
    """
    print("Welcome to the bulk persons searcher! Wait for the xlsx to load.")
    if filename.endswith(".xlsx"):
        workbook = load_workbook(filename=filename, keep_vba=False)
        result_filename = filename.replace(".xlsx", ".csv")
    elif filename.endswith(".xls"):
        workbook = open_xls_as_xlsx(filename)
        result_filename = filename.replace(".xls", ".csv")
    else:
        print("Unsupported file format!")
        # SystemExit directly: the ``exit`` helper only exists when the
        # ``site`` module has been loaded.
        raise SystemExit(1)

    sheet = workbook.active

    values_to_score = []
    queued = set()  # values already waiting in the current batch
    values_scored = {}

    async def write_to_file(batch):
        """Score ``batch`` concurrently and append the usable results."""
        results = await asyncio.gather(
            *[get(score_type, api_key, value) for value in batch])
        for result in results:
            # Results that are empty or lack 'properties' are dropped.
            if result and 'properties' in result:
                customer_fit_result = result['properties']['customer_fit']
                values_scored[result[score_type]] = customer_fit_result
                if 'top_signals_formatted' in customer_fit_result:
                    # NOTE(review): only a closing quote is written here;
                    # presumably 'top_signals_formatted' supplies the
                    # opening one -- confirm against ``get``.
                    new_row = "{},{},{},{}\"\n".format(
                        result[score_type], customer_fit_result['segment'],
                        customer_fit_result['score'],
                        customer_fit_result['top_signals_formatted'])
                else:
                    # No signals column, so no quote either: a trailing '"'
                    # after the score makes the line unparseable.
                    new_row = "{},{},{}\n".format(
                        result[score_type], customer_fit_result['segment'],
                        customer_fit_result['score'])
                readcsv.write(new_row)

        readcsv.flush()

    print("File loaded. Results will be saved to results/{}.".format(
        result_filename))
    with open("results/" + result_filename, "a+") as readcsv:
        readcsv.seek(0)
        # Exact first-column values of earlier runs. A substring search over
        # the whole file would treat "a.com" as done once "aa.com" is there.
        already_in_csv = {
            row.split(",", 1)[0]
            for row in readcsv.read().splitlines() if row
        }
        try:
            rows = sheet.max_row
            # max_row is an inclusive 1-based index, hence the "+ 1".
            for line in range(2, rows + 1):
                email_or_domain = sheet['{}{}'.format(column_idx, line)].value

                # Empty cells have nothing to score.
                if not email_or_domain or email_or_domain in already_in_csv:
                    continue

                print("scoring: {}".format(email_or_domain))

                if (email_or_domain not in values_scored
                        and email_or_domain not in queued):
                    values_to_score.append(email_or_domain)
                    queued.add(email_or_domain)

                if len(values_to_score) == 100:
                    await write_to_file(values_to_score)
                    values_to_score = []
                    queued.clear()
                    print("Currently at {}%".format(line / (rows * 1.) *
                                                    100.))

        except Exception:
            readcsv.flush()
            logger.exception("Exception met. Relaunch to resume!\n")
            raise SystemExit(1)

        # Score whatever is left in the last, partially filled batch.
        if values_to_score:
            await write_to_file(values_to_score)
    raise SystemExit(0)