def __init__(self, datapath=None, dataFile=None, countryFilter=None, **kwargs):
    '''
    Load trade records from a gzip-compressed CSV file.

    The first row of the file is taken as the header, so each record
    is stored as a dict keyed by column name.

    'datapath' is the directory containing 'dataFile'.  If it is not
    given, it defaults to the 'data' directory beneath the location of
    this module.

    'dataFile' is the name of the gzip-compressed CSV file.

    'countryFilter', when given, restricts the loaded records to rows
    whose 'country' value satisfies ``value in countryFilter``.

    Remaining keyword arguments are passed to DictElement.__init__.
    '''
    DictElement.__init__(self, **kwargs)

    if datapath is None:
        datapath = os.path.join(os.path.dirname(__file__), 'data')
    dataFile = os.path.join(datapath, dataFile)

    # The context manager closes the file even when parsing fails
    # part-way through.
    with gzip.open(dataFile, 'rt') as f:
        # DictReader uses the first row as the header by default.
        reader = csv.DictReader(f, delimiter=',', quotechar='"')
        if countryFilter is None:
            trades = list(reader)
        else:
            trades = [d for d in reader if d['country'] in countryFilter]

    # Kept for compatibility: this attribute has always been None.
    self.fields = None
    self.trades = trades
def __init__(self, datapath=None, dataFile=None, fields=[], **kwargs):
    '''
    Load address records from a gzip-compressed, '|'-delimited text file.

    Each line is stripped, split on '|' and turned into a dict that
    maps the names in 'fields' (in order) to the column values.  Lines
    whose column count differs from len(fields) are reported with
    print() and skipped.

    'datapath' is the directory containing 'dataFile'.  If it is not
    given, it defaults to the 'data' directory beneath the location of
    this module.

    'dataFile' is the name of the gzip-compressed, UTF-8 encoded file.

    'fields' is the ordered list of column names.

    Remaining keyword arguments are passed to DictElement.__init__.
    '''
    DictElement.__init__(self, **kwargs)

    if datapath is None:
        datapath = os.path.join(os.path.dirname(__file__), 'data')
    dataFile = os.path.join(datapath, dataFile)

    # Work on a private copy: the default argument is a single list
    # object shared by every call, and it must not end up on self where
    # one instance could modify it for all the others.
    fields = list(fields)
    nfields = len(fields)

    addrs = []
    # The context manager closes the file even when reading fails.
    with gzip.open(dataFile, 'rt', encoding='utf-8') as f:
        for line in f:
            cols = line.strip().split('|')
            if len(cols) != nfields:
                print("Invalid number of fields in address: " + line)
                continue
            addrs.append(dict(zip(fields, cols)))

    self.fields = fields
    self.addrs = addrs
def create(self, **kwargs):
    '''
    Return one randomly selected database row as a dict.

    A key is drawn uniformly from 1..self.maxRange and self.query is
    executed with that key as its only parameter.  If no row exists for
    the key, another key is drawn.  The row is zipped with self.columns
    into a dict, passed to DictElement.addChildren together with any
    keyword arguments, and returned.

    Raises LookupError if no row is found after a bounded number of
    attempts, so that a very sparse or empty key range cannot spin
    forever.
    '''
    max_attempts = 1000
    for _ in range(max_attempts):
        key = int(random.random() * self.maxRange) + 1
        row = self.db.execute(self.query, (key,)).fetchone()
        # fetchone() returns None when the key has no row; draw again.
        if row is not None:
            break
    else:
        raise LookupError(
            'no row found after {0:d} attempts'.format(max_attempts))

    record = dict(zip(self.columns, row))
    DictElement.addChildren(self, record, **kwargs)
    return record
def create(self, gender=None, **kwargs):
    '''
    Generate a random personal name.

    'gender' may be 'm'/'male' or 'f'/'female' (case-insensitive).  If
    self.params holds a 'gender' entry, it is treated as a path that is
    resolved with self.root.getValueByPath, and the resolved value
    replaces the explicit argument.  When the gender is None, male or
    female is chosen with equal probability.

    Suffixes are only drawn for male names, and only when a suffix
    source was configured; otherwise the suffix generator yields "".

    The result of self.buildName(last, first, middle, suffix) is passed
    to DictElement.addChildren together with any keyword arguments and
    then returned.

    Raises ValueError for an unrecognised gender string.
    '''
    # A gender path (as a parameter) takes precedence over an explicitly
    # given gender.
    if self.params is not None:
        gender_path = self.params.get('gender')
        if gender_path is not None:
            gender = self.root.getValueByPath(gender_path)

    if gender is None:
        # Fair coin flip.  Testing the bare truthiness of
        # random.random() is true for virtually every draw.
        is_male = random.random() < 0.5
    elif gender.lower() in ('m', 'male'):
        is_male = True
    elif gender.lower() in ('f', 'female'):
        is_male = False
    else:
        raise ValueError('Invalid gender: {0:s}'.format(gender))

    def no_suffix():
        return ""

    if is_male:
        fn_gen = self.male.getValue
        if self.suffix is not None:
            suffix_gen = self.suffix.getValue
        else:
            suffix_gen = no_suffix
    else:
        fn_gen = self.female.getValue
        suffix_gen = no_suffix

    first = fn_gen()
    if self.pctFirstInitial > random.random():
        first = first[0]

    last = self.surname.getValue()

    middle = None
    if random.random() < self.pctMidName:
        # NOTE: the middle name may coincide with the first name;
        # duplicates are not filtered out.
        middle = fn_gen()
        if random.random() < self.pctMidInitial:
            middle = middle[0]

    suffix = None
    if random.random() < self.pctSuffix:
        suffix = suffix_gen()

    r = self.buildName(last, first, middle, suffix)
    DictElement.addChildren(self, r, **kwargs)
    return r
def create(self, **kwargs):
    '''
    Fetch one randomly chosen record.

    A record number is drawn between self.min_recno and self.max_recno
    (both inclusive) and used as the parameter of self.query_sql.  The
    row-number column is removed from the fetched record unless it is
    the builtin 'rowid'.  The record is then passed to
    DictElement.addChildren with any keyword arguments and returned.
    '''
    span = self.max_recno - self.min_recno + 1
    recno = int(random.random() * span) + self.min_recno

    result = self.cursor.execute(self.query_sql, (recno, ))
    record = result.fetchone()

    # Only an explicit row-number column is stripped from the output.
    if self.rownum_col != 'rowid':
        del record[self.rownum_col]

    DictElement.addChildren(self, record, **kwargs)
    return record
def __init__(self,
             datapath=None,
             male="male.dat.gz",
             female="female.dat.gz",
             surname="surname.dat.gz",
             suffix=None,
             order=None,
             pctMidName=1.0,
             pctMidInitial=0.0,
             pctFirstInitial=0.0,
             pctSuffix=0,
             **kwargs):
    '''
    Set up the name sources and the name-shaping probabilities.

    'datapath' is the directory holding the data files.  If it is not
    given, it defaults to the 'data' directory beneath the location of
    this module.

    'male', 'female', 'surname' and 'suffix' are file names inside
    'datapath'; each is loaded into a CDF.  'suffix' is optional, and
    self.suffix is None when it is omitted.

    'order' is the output order of the full name (one of None, LFM, or
    FML).

    'pctMidName' is the fraction (0..1) of names that have a middle
    name.

    'pctMidInitial' is the fraction (0..1) of names with a middle name
    that give only the middle initial.

    'pctFirstInitial' is the fraction (0..1) of first names reduced to
    a first initial.

    'pctSuffix' is the fraction (0..1) of names that include a suffix.

    Remaining keyword arguments are passed to DictElement.__init__.
    '''
    DictElement.__init__(self, **kwargs)

    base_dir = datapath
    if base_dir is None:
        base_dir = os.path.join(os.path.dirname(__file__), 'data')

    # Resolve every path before any file is loaded.
    male_file = os.path.join(base_dir, male)
    female_file = os.path.join(base_dir, female)
    surname_file = os.path.join(base_dir, surname)
    suffix_file = None
    if suffix is not None:
        suffix_file = os.path.join(base_dir, suffix)

    self.male = CDF(male_file, delimiter="|")
    self.female = CDF(female_file, delimiter="|")
    self.surname = CDF(surname_file, isCumulative=True, delimiter="|")
    self.suffix = None if suffix_file is None else CDF(suffix_file)

    self.order = order
    self.pctMidName = pctMidName
    self.pctMidInitial = pctMidInitial
    self.pctFirstInitial = pctFirstInitial
    self.pctSuffix = pctSuffix
def __init__(self, datapath=None, dbFile=None, rownum_col='rowid',
             table_name=None, **kwargs):
    '''
    Open a SQLite database and prepare a random-record lookup.

    'datapath' is the directory holding 'dbFile'.  If it is not given,
    it defaults to the 'data' directory beneath the location of this
    module.

    'dbFile' is the name of the SQLite database file.

    'rownum_col' is the column used to address records; its minimum
    and maximum values are read once and kept in self.min_recno and
    self.max_recno.

    'table_name' is the table to query.

    CAUTION: 'rownum_col' and 'table_name' are formatted directly into
    SQL text, so SQL injection is possible if they come from an
    untrusted source.

    Remaining keyword arguments are passed to DictElement.__init__.
    '''
    DictElement.__init__(self, **kwargs)

    if datapath is None:
        datapath = os.path.join(os.path.dirname(__file__), 'data')
    conn = sqlite3.connect(os.path.join(datapath, dbFile))
    cur = conn.cursor()

    def first_column(template):
        # careful -- sql injection possible here
        cur.execute(template.format(rownum_col, table_name))
        return cur.fetchone()[0]

    self.min_recno = first_column('select min({0:s}) from {1:s}')
    self.max_recno = first_column('select max({0:s}) from {1:s}')

    # The row factory is installed only after the min/max lookups,
    # which index their result row by position.
    cur.row_factory = AddressDB.sqlite_dict_factory

    # careful -- sql injection possible here
    lookup = 'select * from {0:s} where {1:s} = ?'.format(
        table_name, rownum_col)

    self.rownum_col = rownum_col
    self.table_name = table_name
    self.query_sql = lookup
    self.cursor = cur
    self.db = conn
def create(self, **kwargs):
    '''
    Build a random card record with a generated account number.

    A random entry of self.trades is copied.  An issuer number (IIN) is
    drawn from the entry's iin_start..iin_end range (inclusive); an
    empty 'iin_end' means the range is the single value 'iin_start'.
    The account number is the IIN, followed by random digits, followed
    by the digit returned by self.luhn_checksum, and is stored under
    'account_no'.

    The total length is the entry's 'number_length'; when that is
    empty it defaults to 15 for scheme 'AMEX' and 16 otherwise.

    The keys 'iin_start', 'iin_end', 'bank_logo', 'number_length',
    'bank_url' and 'bank_city' are removed from the returned record.
    The record is passed to DictElement.addChildren with any keyword
    arguments and returned.
    '''
    idx = int(random.random() * len(self.trades))
    d = self.trades[idx].copy()

    iin_start = int(d['iin_start'])
    # Compare by value: `is not ''` tests object identity, which is not
    # guaranteed for equal strings.
    iin_end = int(d['iin_end']) if d['iin_end'] != '' else iin_start

    if iin_start == iin_end:
        offset = 0
    else:
        offset = int(random.random() * (iin_end - iin_start + 1))
    iin = str(iin_start + offset)

    if d['number_length'] != "":
        acct_len = int(d['number_length'])
    else:
        acct_len = 15 if d['scheme'] == 'AMEX' else 16

    # lots of fields... don't need them all.
    for key in ('iin_start', 'iin_end', 'bank_logo', 'number_length',
                'bank_url', 'bank_city'):
        del d[key]

    # Random filler between the IIN and the Luhn check digit.  Sizing
    # it from the actual IIN length keeps the total at acct_len for
    # IINs that are not exactly six digits long.
    n = acct_len - len(iin) - 1
    acct = iin + str(int(random.random() * (10**n))).zfill(n)
    acct += str(self.luhn_checksum(acct))
    d['account_no'] = acct

    DictElement.addChildren(self, d, **kwargs)
    return d
def __init__(self, dataPath=None, dataFile=None, columns=None,
             tableName=None, keyCol='rowid', **kwargs):
    '''
    Source data for a DictElement from a SQLite database.

    'dataFile' names the SQLite database file and MUST be provided.

    'dataPath' is the directory in which 'dataFile' is found.  If it is
    not given, it defaults to the 'data' directory immediately beneath
    the location of the package files.

    'columns' lists the columns to include in the output.  If it is not
    given, every column reported by the table_info pragma except
    'keyCol' is used.

    'tableName' names the table to be queried and MUST be provided.

    'keyCol' names the column holding the primary key.  It must be a
    unique integer; rows are addressed by a key in 1..max(keyCol), so
    all values in that range are expected to be present.

    CAUTION: do not allow external sources to provide 'tableName' or
    'keyCol'.  Both are formatted into SQL statements, so arbitrary
    values could be used to introduce SQL injection attacks.

    Raises ValueError if 'dataFile' or 'tableName' is missing.
    '''
    DictElement.__init__(self, **kwargs)

    if dataFile is None:
        raise ValueError("dataFile must be provided.")
    if tableName is None:
        raise ValueError("tableName must be provided.")

    # Fall back to the package directory + '/data'.
    if dataPath is None:
        dataPath = os.path.join(os.path.dirname(__file__), 'data')

    conn = sqlite3.connect(os.path.join(dataPath, dataFile))
    self.db = conn
    self.keyCol = keyCol
    self.tableName = tableName

    if columns is None:
        # Default to every column except the key; the column name is
        # the second item of each table_info row.
        table_info = conn.execute(
            "pragma table_info('{0:s}')".format(tableName))
        columns = [info[1] for info in table_info if info[1] != keyCol]
    self.columns = columns

    # Query used to select a single row by its key.
    self.query = 'select {0:s} from {1:s} where {2:s} = ?'.format(
        ', '.join(columns), tableName, keyCol)

    # The highest key bounds the range that rows are drawn from.
    highest = conn.execute(
        'select max({0:s}) from {1:s}'.format(keyCol, tableName))
    self.maxRange = highest.fetchone()[0]
def __init__(self, useFormat=True, pctPresent=1.0, **kwargs):
    '''
    Build the table of per-state number generators.

    self.states maps a two-letter state code to a zero-argument
    callable that returns a freshly generated value for that state.
    self.allStates lists the supported codes in table order.

    'useFormat' is stored but currently ignored (TODO).

    'pctPresent' is stored on the instance.

    Remaining keyword arguments are passed to DictElement.__init__.

    The generators are built from self.dn(n) and self.an(n), which are
    defined elsewhere in the class -- presumably n random digits and n
    random letters respectively; confirm against their definitions.
    '''
    DictElement.__init__(self, **kwargs)
    self.useFormat = useFormat  # TODO: ignored for now...
    self.pctPresent = pctPresent

    # Small factories for the most common shapes.  Each returns a
    # callable that resolves self.dn / self.an at call time.
    def dn_only(width):
        return lambda: self.dn(width)

    def an_dn(n_an, n_dn):
        return lambda: self.an(n_an) + self.dn(n_dn)

    def literal_dn(prefix, width):
        return lambda: prefix + self.dn(width)

    # States that are commented out have formats that are not
    # implemented here.  Entry order is significant: it determines the
    # order of self.allStates.
    self.states = {
        "AK": dn_only(7),
        "AL": dn_only(7),
        "AR": literal_dn("9", 8),
        "AZ": an_dn(1, 8),
        "CA": an_dn(1, 7),
        "CO": lambda: "-".join([self.dn(2), self.dn(3), self.dn(4)]),
        "CT": self.ct,
        "DE": dn_only(7),
        #"FL": it's complicated...
        "GA": dn_only(9),
        "HI": literal_dn('H', 8),
        "IA": lambda: "".join([self.dn(3), self.an(2), self.dn(4)]),
        "ID": lambda: self.an(2) + self.dn(6) + self.an(1),
        #"IL": first letter of LN + 11
        "IN": lambda: '-'.join([self.dn(4), self.dn(2), self.dn(4)]),
        "KS": lambda: '-'.join([self.dn(2), self.dn(2), self.dn(4)]),
        "KY": lambda: '-'.join(
            [self.an(1) + self.dn(2), self.dn(3), self.dn(3)]),
        "LA": literal_dn("00", 7),
        "MA": literal_dn('S', 8),
        #"MD": it's complicated
        "ME": dn_only(7),
        #"MI": it's complicated
        #"MN": it's complicated
        "MO": lambda: self.an(1) + self.dn(random.choice([6, 7, 8, 9])),
        "MS": dn_only(9),
        #"MT": it's complicated
        "NC": dn_only(12),
        #"ND": it's complicated
        "NE": an_dn(1, 8),
        #"NH": it's complicated
        #"NJ": it's complicated
        "NM": dn_only(9),
        "NV": dn_only(10),
        "NY": lambda: ' '.join([self.dn(3), self.dn(3), self.dn(3)]),
        "OK": an_dn(1, 8),
        "OH": an_dn(2, 6),
        "OR": dn_only(7),
        "PA": lambda: ' '.join([self.dn(2), self.dn(3), self.dn(3)]),
        "RI": dn_only(7),
        "SC": dn_only(9),
        "SD": dn_only(8),
        "TN": lambda: self.dn(random.choice([8, 9])),
        "TX": dn_only(8),
        "UT": dn_only(9),
        "VA": lambda: '-'.join(
            [self.an(1) + self.dn(2), self.dn(2), self.dn(4)]),
        "VT": dn_only(8),
        #"WA": it's complicated
        #"WI": it's complicated
        "WV": an_dn(1, 6),
        "WY": lambda: '-'.join([self.dn(6), self.dn(3)]),
    }
    self.allStates = list(self.states)
def create(self, **kwargs):
    '''
    Return a copy of one randomly chosen address record.

    An entry of self.addrs is selected uniformly at random, copied,
    passed to DictElement.addChildren with any keyword arguments, and
    returned.
    '''
    idx = int(random.random() * len(self.addrs))
    # Copy so that neither addChildren nor the caller can modify the
    # record kept in self.addrs, which is reused by later calls.
    d = self.addrs[idx].copy()
    DictElement.addChildren(self, d, **kwargs)
    return d