/
dummy_data.py
90 lines (62 loc) · 2.54 KB
/
dummy_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import pandas as pd
from faker import Factory
import random
import datetime
import locale
def main():
    """Build a table of fake records and save it as ``dummy_data.csv``.

    Steps:
      1. Set the process locale to ``en_US.UTF-8``.
      2. Download a Google Sheets workbook in XLSX format and read all of
         its sheets (one entry per field).
      3. Replace missing cells with 1 so that fields without weights give
         every class equal weight.
      4. Generate ``num_fake_records`` rows with ``fake_record`` and write
         them to CSV, printing a preview and the output path.

    Written so that it runs unchanged on Python 2 and Python 3.
    """
    # Set locale to the United States (the original author's note says this
    # is for US-formatted Social Security Numbers).
    locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

    # Create faker factory instance
    faker = Factory.create()

    # The number of rows in the resulting CSV
    num_fake_records = 1000

    # Download the field names from Google Sheets in XLSX format.
    # sheet_name=None asks pandas for every sheet; the result is used below
    # as a mapping of sheet name -> DataFrame.
    google_sheet = pd.read_excel(
        'https://docs.google.com/spreadsheets/d/' +
        '1bloqWcGFOmFLJ98iYFAEX5PDZWpqBI722-qZVyekZuU' +
        '/export?format=xlsx',
        sheet_name=None)

    # If a field doesn't have weights, add 1s as weights;
    # functionally, this will give each class equal weight.
    # Iterate over a snapshot of the keys because values are reassigned.
    for field in list(google_sheet):
        google_sheet[field] = google_sheet[field].fillna(1)

    # Create the dataframe with dummy data
    dummy_data = pd.DataFrame(
        [fake_record(google_sheet, faker) for _ in range(num_fake_records)])

    # Output and save data. Single-argument print(...) behaves the same on
    # Python 2 and Python 3.
    print(dummy_data.head())
    output_file = 'dummy_data.csv'
    dummy_data.to_csv(output_file, index=False)
    print('\nDummy data saved to: ' + output_file)
def fake_record(google_sheet, faker):
    """Return one row of fake data as a dict.

    Args:
        google_sheet: Mapping of field name -> DataFrame. Each DataFrame's
            rows are converted to lists and handed to ``weighted_choice``,
            which expects two-item (class, weight) rows.
        faker: Object providing ``city()``, ``state()`` and
            ``date_time_between_dates()``.

    Returns:
        dict with one key per spreadsheet field plus ``home_city``,
        ``home_state``, ``enlistment_date`` and ``separation_date``.
    """
    one_record = {}

    # Pick one from each field in the spreadsheet.
    # .items() (not .iteritems()) so this works on Python 2 and Python 3.
    for field, df in google_sheet.items():
        one_record[field] = weighted_choice(df.values.tolist())

    # Add Faker fields
    one_record['home_city'] = faker.city()
    one_record['home_state'] = faker.state()

    # Add dates
    # NOTE(review): the two dates are drawn independently from overlapping
    # ranges, so separation_date can precede enlistment_date - confirm
    # whether that is acceptable for the dummy data.
    one_record['enlistment_date'] = date_between(faker, 'jan01-1995', 'jan01-2013')
    one_record['separation_date'] = date_between(faker, 'jan01-1996', 'jan01-2014')

    return one_record
def date_between(faker, d1, d2):
    """Return a date that falls between two dates.

    ``d1`` and ``d2`` are strings in ``%b%d-%Y`` form (for example
    ``'jan01-1995'``). Both are parsed to datetimes and passed to
    ``faker.date_time_between_dates``; the date part of the datetime it
    returns is the result.
    """
    pattern = '%b%d-%Y'
    parse = datetime.datetime.strptime
    lower = parse(d1, pattern)
    upper = parse(d2, pattern)
    picked = faker.date_time_between_dates(lower, upper)
    return picked.date()
def weighted_choice(choices):
    """Return one class from (class, weight) pairs, taking weight into account.

    Weights can be percentages or plain numbers; they don't have to add up
    to anything.

    Args:
        choices: Iterable of two-item (class, weight) pairs.

    Returns:
        The class of the selected pair.

    Raises:
        ValueError: If ``choices`` is empty.
        AssertionError: If the scan finishes without selecting a pair,
            which is not expected for ordinary non-negative numeric weights.
    """
    # Materialize once: the pairs are traversed twice (sum, then scan).
    choices = list(choices)
    if not choices:
        raise ValueError(
            "weighted_choice() needs at least one (class, weight) pair")

    total = sum(w for _, w in choices)
    r = random.uniform(0, total)

    # Walk the cumulative weights and return the first class whose running
    # total reaches the random threshold.
    upto = 0
    for c, w in choices:
        upto += w
        if upto >= r:
            return c

    # Raised explicitly instead of ``assert False`` so the guard is not
    # stripped when Python runs with -O.
    raise AssertionError("The weighted_choice() function shouldn't get here")
# Run the generator only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()