def get_rows():
    """Yield one synthetic customer record (a dict) per row number 502..1000.

    Each record is built directly as a dict.  The previous implementation
    concatenated the values into a string and parsed it back with
    ``ast.literal_eval``; that text was not a valid literal (the
    'closed_account' key lacked its colon and string values were unquoted),
    so no row could actually be produced.

    Yields:
        dict: keys 'rownum', 'dunno', 'CC', 'Employer', 'Custemail', 'name',
        'occupation', 'address_street', 'DOB',
        'previous_address_city_state_zip', 'altcustomer_name',
        'altcustomer_occupation', 'altcustomer_dob', 'ssn', 'phone',
        'AccountID', 'PepFlag', 'altcustomerssn',
        'demarketed_customer_flag', 'SAR_flag', 'nolonger_a_customer',
        'closed_account', 'High_risk_flag' and 'Risk_rating'.
    """
    # One Faker instance is enough; it does not need rebuilding per row.
    fake = Faker()
    i = 501
    while i < 1000:
        # The counter is bumped before use, so the first rownum is 502.
        i = i + 1
        # Values are evaluated top to bottom, i.e. in the same order the
        # original string concatenation called the generators.
        yield {
            'rownum': i,
            'dunno': 10,
            'CC': gen_data.cc_number(),
            'Employer': gen_data.create_company_name(),
            'Custemail': gen_data.create_email(),
            'name': gen_data.create_name(),
            'occupation': gen_data.create_job_title(),
            'address_street': gen_data.create_city_state_zip(),
            'DOB': gen_data.create_birthday(min_age=2, max_age=85),
            'previous_address_city_state_zip': gen_data.create_city_state_zip(),
            'altcustomer_name': fake.name(),
            'altcustomer_occupation': gen_data.create_job_title(),
            'altcustomer_dob': gen_data.create_birthday(min_age=2, max_age=85),
            # SSN / phone are kept as 3-tuples of ints, as before.
            'ssn': (randrange(101, 1000, 1), randrange(10, 100, 1),
                    randrange(1000, 10000, 1)),
            'phone': (randrange(101, 1000, 1), randrange(101, 999, 1),
                      randrange(1000, 10000, 1)),
            'AccountID': randrange(100000, 100000000, 1),
            # max(r - 99, 0) is 1 only when r == 100, otherwise 0.
            'PepFlag': max(randrange(0, 101, 1) - 99, 0),
            'altcustomerssn': (randrange(101, 1000, 1), randrange(10, 100, 1),
                               randrange(1000, 10000, 1)),
            'demarketed_customer_flag': max(randrange(0, 101, 1) - 99, 0),
            'SAR_flag': max(randrange(0, 101, 1) - 99, 0),
            'nolonger_a_customer': max(randrange(0, 101, 1) - 99, 0),
            # NOTE(review): offset 90 (not 99) yields values 0..10 rather
            # than a 0/1 flag; kept as in the original - confirm intent.
            'closed_account': max(randrange(0, 101, 1) - 90, 0),
            'High_risk_flag': max(randrange(0, 101, 1) - 99, 0),
            'Risk_rating': max(randrange(0, 101, 1) - 99, 0),
        }
+ ["demarketed_customer_flag"] + ["SAR_flag"] + ["nolonger_a_customer"] + ["closed_account"] + ["High_risk_flag"] + ["Risk_rating"] ) while i < 50000000: # Pick an account number and store it in acct acct = randrange(100000, 100000000, 1) # if the account hasn't been already generated then generate a record with all fields if d.has_key(str(acct)) == False: row = ( [i] + [10] + [gen_data.cc_number()] + [gen_data.create_company_name()] + [gen_data.create_email()] + [gen_data.create_name()] + [gen_data.create_job_title()] + [gen_data.create_city_state_zip()] + [gen_data.create_birthday(min_age=2, max_age=85)] + [gen_data.create_city_state_zip()] + [fake.name()] + [gen_data.create_job_title()] + [gen_data.create_birthday(min_age=2, max_age=85)] + [(randrange(101, 1000, 1), randrange(10, 100, 1), randrange(1000, 10000, 1))] + [(randrange(101, 1000, 1), randrange(101, 999, 1), randrange(1000, 10000, 1))] + [acct] + [max((randrange(0, 101, 1) - 99), 0)] + [(randrange(101, 1000, 1), randrange(10, 100, 1), randrange(1000, 10000, 1))]
for j in range(No_CCs - 1): names.insert(j, fake.name()) tmp2 = gen_data.create_name() mdl.insert(j, tmp2[0]) ssn.insert( j, (str(randrange(101, 1000, 1)) + str(randrange(10, 100, 1)) + str(randrange(1000, 10000, 1)))) #Name and SSN is set to blank if less than 4 customers on an account for k in range(4 - No_CCs): names.insert(No_CCs + k, '') ssn.insert(No_CCs + k, '') mdl.insert(No_CCs, '') #Sets CC_NO to a random credit card number CC_NO = gen_data.cc_number() #Extract CC_Number from the tuple returned by CC_Number...Tuple contains CC Number and Type #while CC_list.count(CC_NO[1][0]) > 0: CC_TRANS = CC_NO[1][0] dt = str(datetime.now()) clean = re.sub('\W', '', dt) printCC = str(CC_TRANS[-4:]) + str(clean[-12:-3]) + str( randrange(1111, 9999, randrange(1, 10, 1))) #str(CC_TRANS[-4:])+str(clean[-12:-2])+str(randrange(1111,9999,randrange(1,10,1))) #Add CC_Number to control list to prevent duplicates #Add data elements to current csv row row.extend([names[0],mdl[0],ssn[0],names[1],mdl[1],ssn[1],names[2],mdl[2],ssn[2],printCC,CC_NO[0],gen_data.create_company_name()+' '+tmp[1],\ gen_data.create_email(),gen_data.create_job_title()])
#Middle Name to reduce name dups mdl=[] for j in range(No_CCs-1): names.insert(j,fake.name()) tmp2=gen_data.create_name() mdl.insert(j,tmp2[0]) ssn.insert(j,(str(randrange(101,1000,1))+str(randrange(10,100,1))+str(randrange(1000,10000,1)))) #Name and SSN is set to blank if less than 4 customers on an account for k in range(4-No_CCs): names.insert(No_CCs+k,'') ssn.insert(No_CCs+k,'') mdl.insert(No_CCs,'') #Sets CC_NO to a random credit card number CC_NO=gen_data.cc_number() #Extract CC_Number from the tuple returned by CC_Number...Tuple contains CC Number and Type #while CC_list.count(CC_NO[1][0]) > 0: CC_TRANS=CC_NO[1][0] dt = str(datetime.now()) clean=re.sub('\W','',dt) printCC=str(CC_TRANS[-4:])+str(clean[-12:-3])+str(randrange(1111,9999,randrange(1,10,1))) #str(CC_TRANS[-4:])+str(clean[-12:-2])+str(randrange(1111,9999,randrange(1,10,1))) #Add CC_Number to control list to prevent duplicates #Add data elements to current csv row row.extend([names[0],mdl[0],ssn[0],names[1],mdl[1],ssn[1],names[2],mdl[2],ssn[2],printCC,CC_NO[0],gen_data.create_company_name()+' '+tmp[1],\ gen_data.create_email(),gen_data.create_job_title()]) #Creates Current Address
def _rare(stop):
    """Return True when randrange(0, stop, 1) draws its largest value.

    Same outcome as the ``max(randrange(0, stop, 1) - (stop - 2), 0) == 1``
    test it replaces, and it consumes exactly one random draw.
    """
    return randrange(0, stop, 1) == stop - 1


def createCusts(N):
    """Build N synthetic customer/account rows and return them as a list.

    Reads the module-level ``liSSNMaster`` list (needs at least N entries,
    and more than one entry when an account has extra card holders) and the
    external generators ``Faker``, ``gen_data``, ``zips``, ``geo_data`` and
    ``NAICS``.

    Args:
        N: number of rows to generate; rows are numbered 0..N-1.

    Returns:
        list: one list of column values per customer, in generation order.
    """
    # ---- Weighted pick lists (repeating a value raises its probability) ----
    # Client whose net worth is over $500K
    HighNetWorth = ['Yes'] + ['No'] * 30
    # Type of related account
    Related_Type = ['Primary', 'Secondary', 'Joint']
    # How the account was opened
    Party_Type = ['Person', 'Non-Person']
    # Customer / non-customer relation
    Party_Relation = ['Customer', 'Non-Customer']
    # Random Yes/No flag
    Yes_No = ['Yes'] + ['No'] * 12
    # Random Yes/No consent
    Yes_No_Consent = ['Yes'] + ['No'] * 4
    # Official language
    Official_Lang = ['English'] * 3 + ['French']
    # Preferred method of communication
    Preffered_Channel = ['Direct Mail', 'Telemarketing', 'Email', 'SMS']
    # Segment model attributes; index k of each list describes the same model
    Seg_Model_Type = ['LOB Specific', 'Profitability', 'Geographical',
                      'Behavioral', 'Risk Tolerance']
    Model_ID = ['01', '02', '03', '04', '05']
    Seg_Model_Name = ['IRRI', 'CRS Risk Score', 'Geo Risk',
                      'Financial Behavior Risk', 'CM Risk']
    Seg_Model_Score = ['200', '300', '400', '100', '500']
    Seg_Model_Group = ['Group 1'] * 2 + ['Group 2', 'Group 3', 'Group 4']
    Seg_Model_Description = ['High Risk Tier', 'Mid Risk Tier',
                             'Low Risk Tier', 'Vertical Risk',
                             'Geographical Risk']
    # Shared distribution for every Yes/No/blank high-risk indicator
    # (arms manufacturer, auction, casino, embassy, ...): 1 Yes, 2 No,
    # 392 blank.
    risk_flag = ['Yes'] + ['No'] * 2 + [''] * 392
    # Client onboarding channel
    Channel_Onboarding = [
        'E-mail', 'In Person', 'In person - In Branch/Bank Office',
        'In person - Offsite/Client Location', 'Mail', 'Online', 'Phone',
        'Request for Proposal (RFP)',
    ] + ['Not Applicable'] * 10
    # Ongoing transaction channel
    Channel_Ongoing_Transactions = [
        'ATM', 'E-mail', 'Fax', 'Mail', 'Not Applicable',
        'OTC Communication System', 'Phone',
    ] + ['Online'] * 4 + ['In Person'] * 31
    # Products
    Products = [
        'Certificate of Deposit',
        'Checking Account',
        'Credit Card',
        'Custodial and Investment Agency - Institutional',
        'Custodial and Investment Agency - Personal',
        'Custodial/Trust Outsourcing Services (BTOS)',
        'Custody Accounts (PTIM)',
        'Custody Accounts (RSTC)',
        'DTF (BHFA)',
        'Investment Agency - Personal',
        'Investment Management Account (PTIM)',
        'Lease',
        'Loan / Letter of Credit',
        'Money Market',
        'Mortgage / Bond / Debentures',
        'None',
        'Savings Account',
        'Trust Administration - Irrevocable and Revocable (PTIM)',
        'Trust Administration - Irrevocable and Revocable Trusts (BDTC)',
    ] + ['Nondeposit Investment Products'] * 14 \
      + ['Investment Agency - Institutional'] * 5
    # Services
    Services = [
        'Benefit Payment Services',
        'Domestic Wires and Direct Deposit / ACH',
        'Family Office Services (FOS)',
        'Fiduciary Services',
        'International Wires and IAT',
        'Investment Advisory Services (IAS)',
        'Investment Services',
        'None',
        'Online / Mobile Banking',
        'Payroll',
        'Short Term Cash Management',
        'Trust Services',
        'Trustee Services',
        'Vault Cash Services',
    ] + ['Financial Planning'] * 6 + ['Retirement Plans'] * 19
    # SIC codes
    SIC_Code = [
        '6021 National Commercial Banks',
        '6211 Security Brokers Dealers and Flotation Companies',
        '6282 Investment Advice',
        '6311 Life Insurance',
        '6733 Trusts Except Educational Religious and Charitable',
        '8999 Services NEC',
    ] + ['6722 Management Investment Offices Open-End'] * 12
    # Stock market listing
    Stock_Market_Listing = [
        'Australian Stock Exchange',
        'Brussels Stock Exchange',
        'Montreal Stock Exchange',
        'Tiers 1 and 2 of the TSX Venture Exchange (also known as Tiers 1 and 2 of the Canadian Venture Exchange)',
        'Toronto Stock Exchange',
    ] + ['Not Found'] * 30
    # Selector for the low-net-worth band (0, 1 or 2)
    LowNet = [1, 2] + [0] * 5
    # Business vs consumer account
    Acct_Type = ['B'] + ['C'] * 5
    # Number of card holders per account
    Number_CC = [1] * 7 + [2] * 11 + [3] * 3 + [4]
    # Scenario / use-case number
    Use_Case = [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 39] * 4 \
        + [2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 32, 35, 38] * 7 \
        + [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36] * 65 \
        + [37] * 73 + [40, 41] * 2
    # High-risk rating distribution
    refrating = ['1', '1', '1', '2', '3', '4', '2', '4', '5', '5', '5', '5',
                 '5', '5', '5', '5', '5', '5', '5', '5']

    fake = Faker()
    # Account numbers increase monotonically from this seed.
    start = 10786147
    acct_list = []
    liCSV = []

    # range (not xrange) so the loop also runs on Python 3.
    for i in range(N):
        # ---- High-risk flags start cleared for every customer ----
        PEP = 'No'          # politically exposed person
        SAR = 'No'          # suspicious activity report filed
        Clsd = 'No'         # closed account
        high_risk = 'No'
        hr_rating = ''
        demarket = 'No'     # demarketed by the bank
        dem_date = ''

        # Closed-account flag (1 chance in 98)
        if _rare(98):
            Clsd = 'Yes'
        # Number of card holders on the account
        No_CCs = random.choice(Number_CC)
        # Next account number: previous + 2..10, so never repeated
        acct = start + 1 + randrange(1, 10, 1)
        start = acct
        # Customer name; tmp[0] is used as the middle name and tmp[1] as a
        # company-name suffix (presumably first/last name - verify gen_data).
        name = fake.name()
        tmp = gen_data.create_name()
        acct_list.append(acct)
        row = [i, acct, random.choice(Acct_Type), No_CCs, name, tmp[0],
               liSSNMaster[i]]

        # ---- Additional card holders (No_CCs - 1 of them) ----
        names = []
        ssn = []
        mdl = []
        for _ in range(No_CCs - 1):
            names.append(fake.name())
            mdl.append(gen_data.create_name()[0])
            # Pull an SSN from the master list, avoiding the primary's slot
            pick = randrange(1, len(liSSNMaster), 1)
            if pick == i:
                pick -= 1
            ssn.append(liSSNMaster[pick])
        # Pad with blanks so each list always has exactly three entries
        padding = [''] * (4 - No_CCs)
        names.extend(padding)
        ssn.extend(padding)
        mdl.extend(padding)

        # ---- Credit card ----
        CC_NO = gen_data.cc_number()
        # CC_NO[1][0] is the first generated number, CC_NO[0] is written as
        # the card-type column (assumes a (type, [numbers]) tuple - confirm).
        CC_TRANS = CC_NO[1][0]
        # Scramble with timestamp digits and a random suffix for uniqueness
        clean = re.sub(r'\W', '', str(datetime.now()))
        step = randrange(1, 10, 1)
        printCC = (str(CC_TRANS[-4:]) + str(clean[-12:-3])
                   + str(randrange(1111, 9999, step)))
        row.extend([names[0], mdl[0], ssn[0],
                    names[1], mdl[1], ssn[1],
                    names[2], mdl[2], ssn[2],
                    printCC, CC_NO[0],
                    gen_data.create_company_name() + ' ' + tmp[1],
                    gen_data.create_email(), gen_data.create_job_title()])

        # ---- Current and previous address ----
        # zip_code, not zip: avoids shadowing the builtin.
        zip_code = random.choice(zips.zip)
        addr = geo_data.create_city_state_zip[zip_code]
        zip_code2 = random.choice(zips.zip)
        addr2 = geo_data.create_city_state_zip[zip_code2]

        lrg_cash_ex = random.choice(Yes_No)

        # ---- SARs and demarketing only apply to closed accounts ----
        if Clsd == 'Yes':
            # Rarely demarketed without any SAR having been filed
            if _rare(101):
                demarket = 'Yes'
                dem_date = gen_data.create_date(past=True)
            # The draw happens before the demarket check, as in the original
            if _rare(11) and demarket == 'No':
                SAR = 'Yes'
                # Most closed accounts with a SAR are demarketed
                if not _rare(11):
                    demarket = 'Yes'
                    dem_date = gen_data.create_date(past=True)
        if _rare(101):
            PEP = 'Yes'

        row.extend([addr[0], addr[1], zip_code, 'US',
                    addr2[0], addr2[1], zip_code2, 'US',
                    gen_data.create_birthday(min_age=2, max_age=85),
                    PEP, SAR, Clsd])

        # ---- Related account ----
        # Only once more than 10,000 accounts exist, to avoid duplicating
        # accounts early on.
        rel = 0
        if i > 10000:
            candidate = int(random.choice(acct_list))
            if _rare(10001):
                rel = candidate
        if rel != 0:
            row.append(rel)
            row.append(random.choice(Related_Type))
        else:
            row.append('')
            row.append('')

        # ---- Party details ----
        party_start = gen_data.create_date(past=True)
        Consent_Share = random.choice(Yes_No_Consent)
        row.extend([random.choice(Party_Type), random.choice(Party_Relation),
                    party_start, gen_data.create_date(past=True),
                    lrg_cash_ex, demarket, dem_date,
                    randrange(0, 100, 1), random.choice(Official_Lang)])
        # Preferred contact channel only when consent was given
        if Consent_Share == 'Yes':
            row.extend(['Yes', random.choice(Preffered_Channel)])
        else:
            row.extend(['No', ''])
        row.extend([zip_code, randrange(0, 5, 1)])

        # ---- Segment model: one index selects every attribute ----
        seg = randrange(0, 5, 1)
        row.extend([Model_ID[seg], Seg_Model_Type[seg], Seg_Model_Name[seg],
                    Seg_Model_Group[seg], Seg_Model_Description[seg],
                    Seg_Model_Score[seg]])

        # ---- First group of high-risk indicators ----
        hr0 = random.choice(risk_flag)      # arms manufacturer
        hr01 = random.choice(risk_flag)     # auction
        hr02 = random.choice(risk_flag)     # cash-intensive business
        hr03 = random.choice(risk_flag)     # casino / gambling
        hr04 = random.choice(Channel_Onboarding)
        hr05 = random.choice(Channel_Ongoing_Transactions)
        row.extend([hr0, hr01, hr02, hr03, hr04, hr05])

        # ---- Net worth ----
        HighNetWorthFlag = random.choice(HighNetWorth)
        if HighNetWorthFlag == 'Yes':
            # Three draws in the original order: jackpot test, jackpot
            # amount, base amount; the larger amount is written.
            jackpot = _rare(101)
            jackpot_amount = randrange(1000000, 25000000, 1)
            base_amount = randrange(1000000, 5000000, 1)
            row.append(max(jackpot_amount if jackpot else 0, base_amount))
        else:
            band = random.choice(LowNet)
            if band == 0:
                row.append(randrange(-250000, 600000, 1))
            elif band == 1:
                row.append(randrange(149000, 151000, 1))
            else:
                row.append(randrange(40000, 50000, 1))

        # ---- Second group of high-risk indicators ----
        hr1 = random.choice(risk_flag)      # complex HI vehicle
        hr2 = random.choice(risk_flag)      # dealer in precious metals
        hr3 = random.choice(risk_flag)      # digital PM operator
        hr4 = random.choice(risk_flag)      # embassy / consulate
        hr5 = random.choice(risk_flag)      # currency exchange
        hr6 = random.choice(risk_flag)      # foreign financial institution
        hr7 = random.choice(risk_flag)      # foreign government
        hr8 = random.choice(risk_flag)      # foreign non-bank fin. institution
        hr9 = random.choice(risk_flag)      # internet gambling
        hr10 = random.choice(risk_flag)     # medical marijuana dispensary
        hr11 = random.choice(risk_flag)     # money service business
        hr12 = random.choice(NAICS.NAICS_Code)
        hr13 = random.choice(risk_flag)     # non-regulated fin. institution
        hr14 = random.choice(risk_flag)     # not-for-profit
        # (no hr15: occupation is already added earlier through gen_data)
        hr16 = random.choice(risk_flag)     # privately owned ATM operator
        hr17 = random.choice(Products)
        hr18 = random.choice(risk_flag)     # sales of used vehicles
        hr19 = random.choice(Services)
        hr20 = random.choice(SIC_Code)
        hr21 = random.choice(Stock_Market_Listing)
        hr22 = random.choice(risk_flag)     # third-party payment processor
        hr23 = random.choice(risk_flag)     # transacting provider

        # Any single 'Yes' indicator makes the customer high risk.
        indicators = (PEP, SAR, lrg_cash_ex, demarket,
                      hr0, hr01, hr02, hr03,
                      hr1, hr2, hr3, hr4, hr5, hr6, hr7, hr8, hr9, hr10,
                      hr11, hr13, hr14, hr16, hr17, hr18, hr22, hr23,
                      HighNetWorthFlag)
        if 'Yes' in indicators:
            high_risk = 'Yes'
            hr_rating = random.choice(refrating)
        # Up to three further small chances to be flagged high risk.  The
        # original also tested SAR == 'No' / PEP == 'No' here, but those are
        # implied whenever high_risk is still 'No'.
        for _ in range(3):
            if high_risk == 'No' and _rare(101):
                high_risk = 'Yes'
                hr_rating = random.choice(refrating)

        row.extend([hr1, hr2, hr3, hr4, hr5, hr6, hr7, hr8, hr9, hr10, hr11,
                    hr12, hr13, hr14, hr16, hr17, hr18, hr19, hr20, hr21,
                    hr22, hr23, HighNetWorthFlag, high_risk, hr_rating,
                    random.choice(Use_Case)])
        liCSV.append(row)
    return liCSV
# CSV writer over the already-open output file f1 (opened outside this section).
writer = csv.writer(f1, delimiter=',', lineterminator='\n')
# Header row; column order matches the row built in the loop below.
writer.writerow([
    'rownum', 'dunno', 'CC', 'Employer', 'Custemail', 'name',
    'occupation', 'address_street', 'DOB', 'previous address_city_state_zip',
    'altcustomer_name', 'altcustomer_occupation', 'altcustomer_dob',
    'ssn', 'phone', 'AccountID', 'PepFlag', 'altcustomerssn',
    'demarketed_customer_flag', 'SAR_flag', 'nolonger_a_customer',
    'closed_account', 'High_risk_flag', 'Risk_rating',
])
# i (row counter) and d (account numbers already used) are defined outside
# this section.
while i < 50000000:
    # Pick a candidate account number
    acct = randrange(100000, 100000000, 1)
    # Only generate a record when the account number has not been used yet.
    # `not in` replaces dict.has_key(), which no longer exists on Python 3.
    if str(acct) not in d:
        row = [
            i,
            10,
            gen_data.cc_number(),
            gen_data.create_company_name(),
            gen_data.create_email(),
            gen_data.create_name(),
            gen_data.create_job_title(),
            gen_data.create_city_state_zip(),
            gen_data.create_birthday(min_age=2, max_age=85),
            gen_data.create_city_state_zip(),
            fake.name(),
            gen_data.create_job_title(),
            gen_data.create_birthday(min_age=2, max_age=85),
            # ssn and phone are written as 3-tuples of ints
            (randrange(101, 1000, 1), randrange(10, 100, 1), randrange(1000, 10000, 1)),
            (randrange(101, 1000, 1), randrange(101, 999, 1), randrange(1000, 10000, 1)),
            acct,
            # max(r - 99, 0) is 1 only when r == 100, otherwise 0
            max((randrange(0, 101, 1) - 99), 0),
            (randrange(101, 1000, 1), randrange(10, 100, 1), randrange(1000, 10000, 1)),
            max((randrange(0, 101, 1) - 99), 0),
            max((randrange(0, 101, 1) - 99), 0),
            max((randrange(0, 101, 1) - 99), 0),
            # closed_account uses offset 90, giving values 0..10
            max((randrange(0, 101, 1) - 90), 0),
            max((randrange(0, 101, 1) - 99), 0),
            max((randrange(0, 101, 1) - 99), 0),
        ]
        # Remember the account number, then count and emit the row
        d[str(acct)] = acct
        i = i + 1
        writer.writerow(row)
from barnum import gen_data
import csv

# Write 50,000,000 synthetic rows to large.csv in the current directory.
with open('large.csv', 'w') as f1:
    writer = csv.writer(f1, delimiter=',', lineterminator='\n')
    # Header: a blank cell followed by 0..9.  list(...) is required because
    # range() is not a list on Python 3 and cannot be concatenated to one.
    writer.writerow([''] + list(range(10)))
    for i in range(50000000):
        # Row number, a constant 10, then one value from each barnum generator
        row = [
            i,
            10,
            gen_data.cc_number(),
            gen_data.create_company_name(),
            gen_data.create_email(),
            gen_data.create_name(),
            gen_data.create_job_title(),
            gen_data.create_city_state_zip(),
            gen_data.create_birthday(min_age=2, max_age=85),
        ]
        writer.writerow(row)
def createCusts(N): #List for client whose net worth is over $500K HighNetWorth = ['Yes'] + ['No'] * 30 #List for type of account Related_Type = ['Primary', 'Secondary', 'Joint'] #List for how the account was opened Party_Type = ['Person', 'Non-Person'] #List for a BMO customer Party_Relation = ['Customer', 'Non-Customer'] #List for random Yes/No Flag Yes_No = ['Yes'] + ['No'] * 12 #List for random Yes/No Consent Yes_No_Consent = ['Yes'] + ['No'] * 4 #List for equal Yes/No Flag Yes_No_50 = ['Yes', 'No'] #List for official language Official_Lang = ['English'] * 3 + ['French'] #List for method of communication Preffered_Channel = ['Direct Mail', 'Telemarketing', 'Email', 'SMS'] #List for status of customer #Customer_Status = ['Prospect','Inactive Customer','Past Customer'] + ['Active Customer'] * 56 #List for LOB Segment Type Seg_Model_Type = [ 'LOB Specific', 'Profitability', 'Geographical', 'Behavioral', 'Risk Tolerance' ] #List for Model ID Model_ID = ['01', '02', '03', '04', '05'] #List for Model Name Seg_Model_Name = [ 'IRRI', 'CRS Risk Score', 'Geo Risk', 'Financial Behavior Risk', 'CM Risk' ] #List for Model Score Seg_Model_Score = ['200', '300', '400', '100', '500'] #List for Model Group Seg_Model_Group = ['Group 1'] * 2 + ['Group 2', 'Group 3', 'Group 4'] #List for Model Description Seg_Model_Description = [ 'High Risk Tier', 'Mid Risk Tier', 'Low Risk Tier', 'Vertical Risk', 'Geographical Risk' ] #List for random Arms Dealer flag Arms_Manufacturer = ['Yes'] + ['No'] * 2 + [''] * 392 #List for random auction flag Auction = ['Yes'] + ['No'] * 2 + [''] * 392 #List for random Cash Intensive flag CashIntensive_Business = ['Yes'] + ['No'] * 2 + [''] * 392 #List for random Casino?Gaming flag Casino_Gambling = ['Yes'] + ['No'] * 2 + [''] * 392 #List for random Client Onboarding flag Channel_Onboarding = [ 'E-mail', 'In Person', 'In person - In Branch/Bank Office', 'In person - Offsite/Client Location', 'Mail', 'Online', 'Phone', 'Request for Proposal (RFP)' ] + 
['Not Applicable'] * 10 #List for random Transaction flag Channel_Ongoing_Transactions = [ 'ATM', 'E-mail', 'Fax', 'Mail', 'Not Applicable', 'OTC Communication System', 'Phone' ] + ['Online'] * 4 + ['In Person'] * 31 #List for random HI_Vehicle flag Complex_HI_Vehicle = ['Yes'] + ['No'] * 2 + [''] * 392 #List for random Metals flag Dealer_Precious_Metal = ['Yes'] + ['No'] * 2 + [''] * 392 #List for random Arms Dealer flag Digital_PM_Operator = ['Yes'] + ['No'] * 2 + [''] * 392 #List for random Embassy flag Embassy_Consulate = ['Yes'] + ['No'] * 2 + [''] * 392 #Sets variable to Embassy flag Exchange_Currency = Embassy_Consulate #Sets variable to Embassy flag Foreign_Financial_Institution = Embassy_Consulate #Sets variable to Embassy flag Foreign_Government = Embassy_Consulate #Sets variable to Embassy flag Foreign_NonBank_Financial_Institution = Embassy_Consulate #Sets variable to Embassy flag Internet_Gambling = Embassy_Consulate #Sets variable to Embassy flag Medical_Marijuana_Dispensary = Embassy_Consulate #Sets variable to Embassy flag Money_Service_Business = Embassy_Consulate #Sets variable to Embassy flag NonRegulated_Financial_Institution = Embassy_Consulate #Sets variable to Embassy flag Not_Profit = Embassy_Consulate #List for random occupation Occupation=['11-1011 Chief Executives',\ '11-3011 Administrative Services Managers',\ '11-3031 Financial Managers',\ '11-3061 Purchasing Managers',\ '13-1011 Agents and Business Managers of Artists Performers and Athletes',\ '13-1031 Claims Adjusters Examiners, and Investigators',\ '13-1199 Business Operations Specialists, All Other',\ '13-2099 Financial Specialists All Other',\ '17-1011 Architects Except Landscape and Naval',\ '23-1011 Lawyers',\ '23-1023 Judges, Magistrate Judges and Magistrates',\ '25-2012 Kindergarten Teachers Except Special Education',\ '25-2021 Elementary School Teachers Except Special Education',\ '29-1041 Optometrists',\ '29-2054 Respiratory Therapy Technicians',\ '33-2011 Firefighters',\ 
'37-1012 First-Line Supervisors of Landscaping Lawn Service and Groundskeeping Workers',\ '39-1011 Gaming Supervisors',\ '39-2011 Animal Trainers',\ '41-1011 First-Line Supervisors of Retail Sales Workers',\ '41-1012 First-Line Supervisors of Non-Retail Sales Workers',\ '41-2011 Cashiers',\ '41-2031 Retail Salespersons',\ '43-3021 Billing and Posting Clerks',\ '45-1011 First-Line Supervisors of Farming, Fishing, and Forestry Workers',\ '49-2011 Computer Automated Teller and Office Machine Repairers',\ '53-3021 Bus Drivers Transit and Intercity',\ '53-4031 Railroad Conductors and Yardmasters',\ '55-1011 Air Crew Officers',\ '55-1012 Aircraft Launch and Recovery Officers',\ '55-1013 Armored Assault Vehicle Officers',\ ] #Sets variable to Embassy flag Privately_ATM_Operator = Embassy_Consulate #List for random products Products=['Certificate of Deposit',\ 'Checking Account',\ 'Credit Card',\ 'Custodial and Investment Agency - Institutional',\ 'Custodial and Investment Agency - Personal',\ 'Custodial/Trust Outsourcing Services (BTOS)',\ 'Custody Accounts (PTIM)',\ 'Custody Accounts (RSTC)',\ 'DTF (BHFA)',\ 'Investment Agency - Personal',\ 'Investment Management Account (PTIM)',\ 'Lease',\ 'Loan / Letter of Credit',\ 'Money Market',\ 'Mortgage / Bond / Debentures',\ 'None',\ 'Savings Account',\ 'Trust Administration - Irrevocable and Revocable (PTIM)',\ 'Trust Administration - Irrevocable and Revocable Trusts (BDTC)',\ ] + ['Nondeposit Investment Products'] * 14 + ['Investment Agency - Institutional'] * 5 #Sets variable to Embassy flag Sales_Used_Vehicles = Embassy_Consulate #Dictionary for random Services Services=['Benefit Payment Services',\ 'Domestic Wires and Direct Deposit / ACH',\ 'Family Office Services (FOS)',\ 'Fiduciary Services',\ 'International Wires and IAT',\ 'Investment Advisory Services (IAS)',\ 'Investment Services',\ 'None',\ 'Online / Mobile Banking',\ 'Payroll',\ 'Short Term Cash Management',\ 'Trust Services',\ 'Trustee Services',\ 'Vault Cash 
Services',\ ] + ['Financial Planning'] * 6 + ['Retirement Plans'] * 19 #Dictionary for random SIC_Code SIC_Code=['6021 National Commercial Banks',\ '6211 Security Brokers Dealers and Flotation Companies',\ '6282 Investment Advice',\ '6311 Life Insurance',\ '6733 Trusts Except Educational Religious and Charitable',\ '8999 Services NEC',\ ] + ['6722 Management Investment Offices Open-End'] * 12 #Dictionary for random Market Listing Stock_Market_Listing=['Australian Stock Exchange',\ 'Brussels Stock Exchange',\ 'Montreal Stock Exchange',\ 'Tiers 1 and 2 of the TSX Venture Exchange (also known as Tiers 1 and 2 of the Canadian Venture Exchange)',\ 'Toronto Stock Exchange',\ ] + ['Not Found'] * 30 #Sets variable to Embassy flag Third_Party_Payment_Processor = Embassy_Consulate #Sets variable to Embassy flag Transacting_Provider = Embassy_Consulate #Dictionary for random Low Net Worth LowNet = [1, 2] + [0] * 5 #Dictionary for Consumer vs Business Acct_Type = ['B'] + ['C'] * 5 #Dictionary for random number of credits cards per account Number_CC = [1] * 7 + [2] * 11 + [3] * 3 + [4] #Dictionary for Account list set to blank acct_list = [] #Dictionary for CreditCard list set to blank CC_list = [] #Dictionary for random Wolfsberg scenario Use_Case = [1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 39] * 4 + [ 2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 32, 35, 38 ] * 7 + [3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36 ] * 65 + [37] * 73 + [40, 41] * 2 refrating = [ '1', '1', '1', '2', '3', '4', '2', '4', '5', '5', '5', '5', '5', '5', '5', '5', '5', '5', '5', '5' ] fake = Faker() global liSSNMaster start = 10786147 acct_list = [] liCSV = [] for i in xrange(N): #Initiate High Risk Flags #Politically Exposed Person PEP = 'No' #Customer with a Suspicous Activity Report SAR = 'No' #Customer with a closed account Clsd = 'No' #High risk customer flag high_risk = 'No' #High Risk Rating hr_rating = '' #Customer that was demarketed by the bank demarket = 'No' dem_date = '' #generate closed acct flag 
if (max((randrange(0, 98, 1) - 96), 0) == 1): Clsd = 'Yes' #Random choice for number of credit card users per account number No_CCs = random.choice(Number_CC) #Generate account number acct = start + 1 + randrange(1, 10, 1) start = acct #Randomly generate customer name + middle name in tmp name = fake.name() tmp = gen_data.create_name() #Adds account number to account dictionary acct_list.extend([acct]) #Creates a new row and adds data elements row = [i] + [acct] + [random.choice(Acct_Type)] + [No_CCs] + [name] + [ tmp[0] ] + [liSSNMaster[i]] #Dictionary for names list set to blank names = [] #Dictionary for Social Security Number list set to blank ssn = [] #Middle Name to reduce name dups mdl = [] for j in range(No_CCs - 1): names.insert(j, fake.name()) tmp2 = gen_data.create_name() mdl.insert(j, tmp2[0]) ##Pull from SSN Master list randInt = randrange(1, len(liSSNMaster), 1) if randInt != i: ssn.insert(j, liSSNMaster[randInt]) else: ssn.insert(j, liSSNMaster[randInt - 1]) #Name and SSN is set to blank if less than 4 customers on an account for k in range(4 - No_CCs): names.insert(No_CCs + k, '') ssn.insert(No_CCs + k, '') mdl.insert(No_CCs, '') #Sets CC_NO to a random credit card number CC_NO = gen_data.cc_number() #Extract CC_Number from the tuple returned by CC_Number then scramble to ensure uniqueness...Tuple contains CC Number and Type CC_TRANS = CC_NO[1][0] dt = str(datetime.now()) clean = re.sub('\W', '', dt) printCC = str(CC_TRANS[-4:]) + str(clean[-12:-3]) + str( randrange(1111, 9999, randrange(1, 10, 1))) #Add data elements to current csv row row.extend([names[0],mdl[0],ssn[0],names[1],mdl[1],ssn[1],names[2],mdl[2],ssn[2],printCC,CC_NO[0],gen_data.create_company_name()+' '+tmp[1],\ gen_data.create_email(),gen_data.create_job_title()]) #Create Current Address zip = random.choice(zips.zip) addr = geo_data.create_city_state_zip[zip] #Create Previous address zip2 = random.choice(zips.zip) addr2 = geo_data.create_city_state_zip[zip2] #Add additional data 
elements to current csv row lrg_cash_ex = random.choice(Yes_No) #Condition for SARs and Demarketed Clients if (Clsd == 'Yes'): #1% of closed accounts are demarketed but never had a SAR filed if (max((randrange(0, 101, 1) - 99), 0) == 1 and SAR == 'No'): demarket = 'Yes' dem_date = gen_data.create_date(past=True) if (max((randrange(0, 11, 1) - 9), 0) == 1 and demarket == 'No'): #10% of closed accounts have SARs SAR = 'Yes' #90% of closed accounts with SARs are demarketed if (max((randrange(0, 11, 1) - 9), 0) == 0): demarket = 'Yes' dem_date = gen_data.create_date(past=True) if (max((randrange(0, 101, 1) - 99), 0) == 1): PEP = 'Yes' row.extend([ addr[0], addr[1], zip, 'US', addr2[0], addr2[1], zip2, 'US', gen_data.create_birthday(min_age=2, max_age=85), PEP, SAR, Clsd ]) #Start Generating related accounts from account list once 10,000 accounts are generated - to avoid duplicating accounts in the beginning if i > 10000: rel = int(random.choice(acct_list)) * max( (randrange(0, 10001, 1) - 9999), 0) if rel <> 0: row.append(rel) row.append(random.choice(Related_Type)) else: row.append('') row.append('') else: row.append('') row.append('') #Randomly generates account start date party_start = gen_data.create_date(past=True) #Randomly selects consent option for sharing info Consent_Share = random.choice(Yes_No_Consent) #Add additional data elements to current csv row row.extend([random.choice(Party_Type),random.choice(Party_Relation),party_start,gen_data.create_date(past=True),\ lrg_cash_ex,demarket,dem_date,randrange(0,100,1),random.choice(Official_Lang)]) #Add data element preferred methond of contact for yes to share info...if not then blank to current row if Consent_Share == 'Yes': row.extend(['Yes', random.choice(Preffered_Channel)]) else: row.extend(['No', '']) row.extend([zip, randrange(0, 5, 1)]) #Generate Segment ID then add additional Segment data based on the selection to the current csv row Segment_ID = randrange(0, 5, 1) % 5 if Segment_ID == 0: row.extend([ 
Model_ID[0], Seg_Model_Type[0], Seg_Model_Name[0], Seg_Model_Group[0], Seg_Model_Description[0], Seg_Model_Score[0] ]) if Segment_ID == 1: row.extend([ Model_ID[1], Seg_Model_Type[1], Seg_Model_Name[1], Seg_Model_Group[1], Seg_Model_Description[1], Seg_Model_Score[1] ]) if Segment_ID == 2: row.extend([ Model_ID[2], Seg_Model_Type[2], Seg_Model_Name[2], Seg_Model_Group[2], Seg_Model_Description[2], Seg_Model_Score[2] ]) if Segment_ID == 3: row.extend([ Model_ID[3], Seg_Model_Type[3], Seg_Model_Name[3], Seg_Model_Group[3], Seg_Model_Description[3], Seg_Model_Score[3] ]) if Segment_ID == 4: row.extend([ Model_ID[4], Seg_Model_Type[4], Seg_Model_Name[4], Seg_Model_Group[4], Seg_Model_Description[4], Seg_Model_Score[4] ]) #Add additional data elements to current csv row hr0 = random.choice(Arms_Manufacturer) hr01 = random.choice(Auction) hr02 = random.choice(CashIntensive_Business) hr03 = random.choice(Casino_Gambling) hr04 = random.choice(Channel_Onboarding) hr05 = random.choice(Channel_Ongoing_Transactions) row.extend([hr0, hr01, hr02, hr03, hr04, hr05]) #Randomly select whether customer has a High Net Worth HighNetWorthFlag = random.choice(HighNetWorth) #Randomly Generate customer net worth based on the above flag if HighNetWorthFlag == 'Yes': row.append( max( max((randrange(0, 101, 1) - 99), 0) * randrange(1000000, 25000000, 1), randrange(1000000, 5000000, 1))) else: flag = random.choice(LowNet) if flag == 0: row.append(randrange(-250000, 600000, 1)) else: if flag == 1: row.append(randrange(149000, 151000, 1)) else: row.append(randrange(40000, 50000, 1)) #Add data elements to current csv row hr1 = random.choice(Complex_HI_Vehicle) hr2 = random.choice(Dealer_Precious_Metal) hr3 = random.choice(Digital_PM_Operator) hr4 = random.choice(Embassy_Consulate) hr5 = random.choice(Exchange_Currency) hr6 = random.choice(Foreign_Financial_Institution) hr7 = random.choice(Foreign_Government) hr8 = random.choice(Foreign_NonBank_Financial_Institution) hr9 = 
random.choice(Internet_Gambling) hr10 = random.choice(Medical_Marijuana_Dispensary) hr11 = random.choice(Money_Service_Business) hr12 = random.choice(NAICS.NAICS_Code) hr13 = random.choice(NonRegulated_Financial_Institution) hr14 = random.choice(Not_Profit) #hr15=random.choice(Occupation) - added before through gen_data hr16 = random.choice(Privately_ATM_Operator) hr17 = random.choice(Products) hr18 = random.choice(Sales_Used_Vehicles) hr19 = random.choice(Services) hr20 = random.choice(SIC_Code) hr21 = random.choice(Stock_Market_Listing) hr22 = random.choice(Third_Party_Payment_Processor) hr23 = random.choice(Transacting_Provider) if (PEP == 'Yes' or SAR == 'Yes' or lrg_cash_ex == 'Yes' or demarket == 'Yes' or hr0 == 'Yes' or hr01 == 'Yes' or hr02 == 'Yes' or hr03 == 'Yes' or hr1 == 'Yes' or hr2 == 'Yes' or hr3 == 'Yes' or hr4 == 'Yes' or hr5 == 'Yes' or hr6 == 'Yes' or hr7 == 'Yes' or hr8 == 'Yes' or hr9 == 'Yes' or hr10 == 'Yes' or hr11 == 'Yes' or hr13 == 'Yes' or hr14 == 'Yes' or hr16 == 'Yes' or hr17 == 'Yes' or hr18 == 'Yes' or hr22 == 'Yes' or hr23 == 'Yes' or HighNetWorthFlag == 'Yes'): high_risk = 'Yes' hr_rating = random.choice(refrating) if (SAR == 'No' and high_risk == 'No'): if (max((randrange(0, 101, 1) - 99), 0) == 1): high_risk = 'Yes' hr_rating = random.choice(refrating) if (PEP == 'No' and high_risk == 'No'): if (max((randrange(0, 101, 1) - 99), 0) == 1): high_risk = 'Yes' hr_rating = random.choice(refrating) if (high_risk == 'No'): if (max((randrange(0, 101, 1) - 99), 0) == 1): high_risk = 'Yes' hr_rating = random.choice(refrating) row.extend([ hr1, hr2, hr3, hr4, hr5, hr6, hr7, hr8, hr9, hr10, hr11, hr12, hr13, hr14, hr16, hr17, hr18, hr19, hr20, hr21, hr22, hr23, HighNetWorthFlag, high_risk, hr_rating, random.choice(Use_Case) ]) liCSV.append(row) return liCSV
from random import randrange
from random import random
from random import shuffle
from faker import Faker
from barnum import gen_data
import csv

fake = Faker()

# Number of data rows written after the header row.
ROW_COUNT = 50000000

# Column titles, listed in the same order as the values built by _make_row().
COLUMNS = [
    'rownum',
    'dunno',
    'CC',
    'Employer',
    'Custemail',
    'name',
    'occupation',
    'address_street',
    'DOB',
    'previous address_city_state_zip',
    'altcustomer_name',
    'altcustomer_occupation',
    'altcustomer_dob',
    'ssn',
    'phone',
    'AccountID',
    'PepFlag',
    'altcustomerssn',
    'demarketed_customer_flag',
    'SAR_flag',
    'nolonger_a_customer',
    'closed_account',
    'High_risk_flag',
    'Risk_rating',
]


def _three_part(mid_start, mid_stop):
    """Return a 3-tuple of random ints used for the ssn/phone columns.

    The first part is drawn from [101, 1000), the middle part from
    [mid_start, mid_stop) and the last part from [1000, 10000).
    """
    return (
        randrange(101, 1000),
        randrange(mid_start, mid_stop),
        randrange(1000, 10000),
    )


def _capped(offset):
    """Draw an int from [0, 100], subtract ``offset`` and clamp at zero.

    With ``offset=99`` the result is 1 only when the draw is 100 and 0
    otherwise; with ``offset=90`` the result ranges over 0..10.
    """
    return max(randrange(0, 101) - offset, 0)


def _make_row(rownum):
    """Build one CSV record (a list of 24 values) for row ``rownum``.

    Elements are evaluated left to right, so the generator calls happen
    in the same order as the columns appear in COLUMNS.
    """
    return [
        rownum,
        10,                                              # 'dunno' is always 10
        gen_data.cc_number(),
        gen_data.create_company_name(),
        gen_data.create_email(),
        gen_data.create_name(),
        gen_data.create_job_title(),
        gen_data.create_city_state_zip(),
        gen_data.create_birthday(min_age=2, max_age=85),
        gen_data.create_city_state_zip(),
        fake.name(),
        gen_data.create_job_title(),
        gen_data.create_birthday(min_age=2, max_age=85),
        _three_part(10, 100),                            # ssn
        _three_part(101, 999),                           # phone
        randrange(100000, 100000000),                    # AccountID
        _capped(99),                                     # PepFlag
        _three_part(10, 100),                            # altcustomerssn
        _capped(99),                                     # demarketed_customer_flag
        _capped(99),                                     # SAR_flag
        _capped(99),                                     # nolonger_a_customer
        _capped(90),                                     # closed_account
        _capped(99),                                     # High_risk_flag
        _capped(99),                                     # Risk_rating
    ]


# Write the header followed by ROW_COUNT generated records to large.csv.
with open('large.csv', 'w') as out_file:
    sink = csv.writer(out_file, delimiter=',', lineterminator='\n')
    sink.writerow(COLUMNS)
    for row_index in range(ROW_COUNT):
        sink.writerow(_make_row(row_index))