Example #1
0
def generate_synthetic_names(n_synth):
    """Build a shuffled list of synthetic, brand-like names.

    About half of the requested names (capped at 700) are taken from the
    first token of Faker company names; the rest are Gibberish words.
    Casing, hyphen handling and word shape are varied at random.

    Args:
        n_synth: Total number of names to generate.

    Returns:
        A list of name strings in random order.
    """
    # Faker draws from a limited dataset, so cap its share to limit repeats.
    faker_count = min(round(n_synth / 2), 700)
    gibberish_count = n_synth - faker_count
    names = []

    company_source = Faker()
    for _ in range(faker_count):
        # Company names may chain several surnames; keep only the first
        # token, which can still contain a hyphen.
        company = company_source.company()
        first_token = company.replace(",", "").split()[0]
        # Lowercase about half of the names.
        if random.random() < 0.5:
            first_token = first_token.lower()
        # The second coin flip is only drawn when a hyphen is present.
        if "-" in first_token:
            if random.random() < 0.5:
                first_token = first_token.replace("-", " ")
        names.append(first_token)

    word_source = Gibberish()
    for _ in range(gibberish_count):
        # Gibberish words come back uncapitalized and, by default, start and
        # end with consonants; pick one of four shapes with equal odds.
        shape_roll = random.random()
        if shape_roll >= 0.75:
            word = word_source.generate_word(2, start_vowel=True, end_vowel=True)
        elif shape_roll >= 0.5:
            word = word_source.generate_word()
        elif shape_roll >= 0.25:
            word = word_source.generate_word(end_vowel=True)
        else:
            word = word_source.generate_word(start_vowel=True)
        # Roughly one name in ten gets a second word appended.
        if random.random() < 0.1:
            word = word + " " + word_source.generate_word()
        # Title-case about half of the names.
        if random.random() < 0.5:
            word = word.title()
        names.append(word)

    random.shuffle(names)
    return names