Esempi in Python per DictElement, esempi in Python per datagen.entitygenerator.DictElement

Esempio n. 1

0

Mostra file

    def __init__(self,
                 datapath=None,
                 dataFile=None,
                 countryFilter=None,
                 **kwargs):
        """Load a gzip-compressed CSV file into ``self.trades``.

        The first CSV row is used as the header, so every record is a dict
        keyed by column name.

        Args:
            datapath: Directory containing ``dataFile``.  Defaults to the
                ``data`` directory next to this module.
            dataFile: Name of the gzip-compressed CSV file to read.
            countryFilter: Optional container of accepted values for the
                ``country`` column; rows whose country is not in it are
                skipped.  ``None`` keeps every row.
            **kwargs: Forwarded unchanged to ``DictElement.__init__``.
        """
        DictElement.__init__(self, **kwargs)

        if datapath is None:
            datapath = os.path.join(os.path.dirname(__file__), 'data')

        dataFile = os.path.join(datapath, dataFile)

        # The context manager closes the file even when a malformed row (or a
        # missing 'country' column) makes the read fail part-way through.
        with gzip.open(dataFile, 'rt') as f:
            # DictReader uses the first row as the header by default.
            reader = csv.DictReader(f, delimiter=',', quotechar='"')
            trades = [
                row for row in reader
                if countryFilter is None or row['country'] in countryFilter
            ]

        # No field list is derived from the file; the attribute is kept (as
        # None) because it has always been set by this constructor.
        self.fields = None
        self.trades = trades

Esempio n. 2

0

Mostra file

    def __init__(self, datapath=None, dataFile=None, fields=None, **kwargs):
        """Load a gzip-compressed, pipe-delimited file into ``self.addrs``.

        Each line is stripped and split on ``|``; the resulting columns are
        paired positionally with ``fields`` to build one dict per line.
        Lines whose column count differs from ``len(fields)`` are reported
        on stdout and skipped.

        Args:
            datapath: Directory containing ``dataFile``.  Defaults to the
                ``data`` directory next to this module.
            dataFile: Name of the gzip-compressed (UTF-8) file to read.
            fields: Ordered list of column names.  Defaults to an empty
                list.
            **kwargs: Forwarded unchanged to ``DictElement.__init__``.
        """
        DictElement.__init__(self, **kwargs)

        # A fresh list per instance: a mutable default would be shared by
        # every instance because it is stored on self.fields below.
        if fields is None:
            fields = []

        if datapath is None:
            datapath = os.path.join(os.path.dirname(__file__), 'data')

        dataFile = os.path.join(datapath, dataFile)

        addrs = []
        nfields = len(fields)

        # The context manager closes the file even if decoding fails.
        with gzip.open(dataFile, 'rt', encoding='utf-8') as f:
            for line in f:
                cols = line.strip().split('|')
                if len(cols) != nfields:
                    print("Invalid number of fields in address: " + line)
                    continue

                # Lengths are equal here, so zip pairs every field.
                addrs.append(dict(zip(fields, cols)))

        self.fields = fields
        self.addrs = addrs

Esempio n. 3

0

Mostra file

    def create(self, **kwargs):
        """Return a randomly selected database row as a dict.

        A key is drawn uniformly from ``1..self.maxRange`` and looked up with
        ``self.query``.  When no row exists for the drawn key, another key is
        drawn; previously a missing key crashed with ``TypeError`` because
        ``fetchone()`` returned ``None``.

        NOTE(review): the retry loop assumes at least one key in
        ``1..self.maxRange`` exists -- confirm against how the table is
        populated.

        Args:
            **kwargs: Forwarded unchanged to ``DictElement.addChildren``.

        Returns:
            dict mapping each name in ``self.columns`` to the row's value.
        """
        row = None
        while row is None:
            key = int(random.random() * self.maxRange) + 1
            row = self.db.execute(self.query, (key,)).fetchone()

        r = dict(zip(self.columns, row))
        DictElement.addChildren(self, r, **kwargs)
        return r

Esempio n. 4

0

Mostra file

File: namegen.py Progetto: bballamudi/datagen

    def create(self, gender=None, **kwargs):
        """Generate a random name and return the result of ``buildName``.

        Args:
            gender: ``'m'``/``'male'`` or ``'f'``/``'female'`` (case
                insensitive), or ``None`` to pick one at random.  Ignored
                when ``self.params`` supplies a ``'gender'`` path, in which
                case the value found at that path is used instead.
            **kwargs: Forwarded unchanged to ``DictElement.addChildren``.

        Returns:
            Whatever ``self.buildName(last, first, middle, suffix)`` returns.

        Raises:
            ValueError: If the gender is none of the accepted values.
        """
        # A configured path to the gender wins over an explicit argument.
        if self.params is not None:
            gender_path = self.params.get('gender')
            if gender_path is not None:
                gender = self.root.getValueByPath(gender_path)

        if gender is None:
            # random.random() is truthy for every value except exactly 0.0,
            # so it must be compared against 0.5 to get an even split.
            is_male = random.random() < 0.5
        elif gender.lower() in ('m', 'male'):
            is_male = True
        elif gender.lower() in ('f', 'female'):
            is_male = False
        else:
            raise ValueError('Invalid gender: {0:s}'.format(gender))

        # Only male names draw from the suffix source; everything else gets
        # an empty-string suffix generator.
        if is_male:
            fn_gen = self.male.getValue
        else:
            fn_gen = self.female.getValue

        if is_male and self.suffix is not None:
            suffix_gen = self.suffix.getValue
        else:
            suffix_gen = lambda: ""

        first = fn_gen()
        if self.pctFirstInitial > random.random():
            first = first[0]

        last = self.surname.getValue()

        if random.random() < self.pctMidName:
            # NOTE: the middle name is drawn independently, so it may equal
            # the first name; duplicates are not filtered out.
            middle = fn_gen()
            if random.random() < self.pctMidInitial:
                middle = middle[0]
        else:
            middle = None

        suffix = None
        if random.random() < self.pctSuffix:
            suffix = suffix_gen()

        r = self.buildName(last, first, middle, suffix)
        DictElement.addChildren(self, r, **kwargs)
        return r

Esempio n. 5

0

Mostra file

    def create(self, **kwargs):
        """Fetch one randomly chosen record and return it.

        The record number is drawn from ``self.min_recno`` to
        ``self.max_recno`` (both inclusive) and looked up with
        ``self.query_sql``.  Keyword arguments are forwarded to
        ``DictElement.addChildren``.
        """
        span = self.max_recno - self.min_recno + 1
        recno = int(random.random() * span) + self.min_recno

        result = self.cursor.execute(self.query_sql, (recno, ))
        record = result.fetchone()

        # The builtin 'rowid' is not part of the fetched record; any other
        # row-number column is, and is stripped from the output.
        uses_builtin_rowid = self.rownum_col == 'rowid'
        if not uses_builtin_rowid:
            del record[self.rownum_col]

        DictElement.addChildren(self, record, **kwargs)
        return record

Esempio n. 6

0

Mostra file

File: namegen.py Progetto: bballamudi/datagen

    def __init__(self,
                 datapath=None,
                 male="male.dat.gz",
                 female="female.dat.gz",
                 surname="surname.dat.gz",
                 suffix=None,
                 order=None,
                 pctMidName=1.0,
                 pctMidInitial=0.0,
                 pctFirstInitial=0.0,
                 pctSuffix=0,
                 **kwargs):
        """Set up the name sources and the output probabilities.

        Args:
            datapath: Directory holding the data files.  Defaults to the
                ``data`` directory next to this module.
            male, female, surname: File names of the name sources, each
                loaded as a ``|``-delimited ``CDF``.
            suffix: Optional file name of the suffix source; ``None``
                disables it.
            order: Output order of the full name (one of None, LFM, or FML).
            pctMidName: Fraction (0..1) of names that have a middle name.
            pctMidInitial: Of the names with a middle name, the fraction
                (0..1) that give only an initial.
            pctFirstInitial: Fraction of first names reduced to an initial.
            pctSuffix: Fraction of names that include a suffix.
            **kwargs: Forwarded unchanged to ``DictElement.__init__``.
        """
        DictElement.__init__(self, **kwargs)

        if datapath is None:
            datapath = os.path.join(os.path.dirname(__file__), 'data')

        # Resolve every path before any source is loaded.
        male_path, female_path, surname_path = [
            os.path.join(datapath, name) for name in (male, female, surname)
        ]
        suffix_path = None
        if suffix is not None:
            suffix_path = os.path.join(datapath, suffix)

        self.male = CDF(male_path, delimiter="|")
        self.female = CDF(female_path, delimiter="|")
        self.surname = CDF(surname_path, isCumulative=True, delimiter="|")
        self.suffix = CDF(suffix_path) if suffix_path is not None else None

        self.order = order
        self.pctMidName = pctMidName
        self.pctMidInitial = pctMidInitial
        self.pctFirstInitial = pctFirstInitial
        self.pctSuffix = pctSuffix

Esempio n. 7

0

Mostra file

    def __init__(self,
                 datapath=None,
                 dbFile=None,
                 rownum_col='rowid',
                 table_name=None,
                 **kwargs):
        """Open the SQLite file and record the range of row numbers.

        Args:
            datapath: Directory containing ``dbFile``.  Defaults to the
                ``data`` directory next to this module.
            dbFile: Name of the SQLite database file.
            rownum_col: Column used as the row number.
            table_name: Table to read from.
            **kwargs: Forwarded unchanged to ``DictElement.__init__``.

        CAUTION: ``rownum_col`` and ``table_name`` are interpolated directly
        into SQL text, so they must never come from untrusted input.
        """
        DictElement.__init__(self, **kwargs)

        if datapath is None:
            datapath = os.path.join(os.path.dirname(__file__), 'data')
        dbFile = os.path.join(datapath, dbFile)

        connection = sqlite3.connect(dbFile)
        cur = connection.cursor()

        # Lowest and highest row numbers present in the table.  These run
        # before the row factory is installed because they index the row
        # positionally.
        lowest_sql = 'select min({0:s}) from {1:s}'.format(rownum_col, table_name)
        cur.execute(lowest_sql)
        lowest = cur.fetchone()[0]
        self.min_recno = lowest

        highest_sql = 'select max({0:s}) from {1:s}'.format(rownum_col, table_name)
        cur.execute(highest_sql)
        highest = cur.fetchone()[0]
        self.max_recno = highest

        # Every later fetch on this cursor goes through the dict factory.
        cur.row_factory = AddressDB.sqlite_dict_factory

        self.rownum_col = rownum_col
        self.table_name = table_name
        self.query_sql = 'select * from {0:s} where {1:s} = ?'.format(
            table_name, rownum_col)
        self.cursor = cur
        self.db = connection

Esempio n. 8

0

Mostra file

    def create(self, **kwargs):
        """Build a random account number from a randomly chosen record.

        A copy of one entry of ``self.trades`` is taken, an IIN is drawn
        from its ``iin_start``..``iin_end`` range (inclusive), random digits
        are appended, and a check digit from ``self.luhn_checksum`` closes
        the number.  The result is stored under ``'account_no'``.

        Args:
            **kwargs: Forwarded unchanged to ``DictElement.addChildren``.

        Returns:
            dict: the copied record without the ``iin_start``, ``iin_end``,
            ``bank_logo``, ``number_length``, ``bank_url`` and ``bank_city``
            keys, plus ``'account_no'``.
        """
        r = int(random.random() * len(self.trades))
        d = self.trades[r].copy()

        iin_start = int(d['iin_start'])
        iin_end = d['iin_end']
        # Compare by value: `is not ''` tests object identity, which is
        # implementation-dependent and a SyntaxWarning on Python 3.8+.
        iin_end = int(iin_end) if iin_end != '' else iin_start

        if iin_start == iin_end:
            offset = 0
        else:
            offset = int(random.random() * (iin_end - iin_start + 1))

        iin = str(iin_start + offset)

        if d['number_length'] != '':
            acct_len = int(d['number_length'])
        elif d['scheme'] == 'AMEX':
            acct_len = 15
        else:
            acct_len = 16

        # lots of fields... don't need them all.
        for key in ('iin_start', 'iin_end', 'bank_logo', 'number_length',
                    'bank_url', 'bank_city'):
            del d[key]

        # Random filler between the IIN and the check digit.  The IIN's
        # real length is used (it is not always six digits) so the finished
        # number comes out at acct_len digits.
        n = acct_len - len(iin) - 1
        filler = str(int(random.random() * (10**n))).zfill(n) if n > 0 else ''
        acct = iin + filler
        acct += str(self.luhn_checksum(acct))
        d['account_no'] = acct

        DictElement.addChildren(self, d, **kwargs)
        return d

Esempio n. 9

0

Mostra file

    def __init__(self,
                 dataPath=None,
                 dataFile=None,
                 columns=None,
                 tableName=None,
                 keyCol='rowid',
                 **kwargs):
        '''
        Back a DictElement with rows read from a SQLite database.

        'dataFile' names the SQLite database file and is REQUIRED.

        'dataPath' is the directory holding 'dataFile'.  When omitted, the
        'data' directory directly under the package location is used.

        'columns' lists the columns to output.  When omitted, every column
        of the table except keyCol is output.

        'tableName' names the table to query and is REQUIRED.

        'keyCol' names the primary-key column.  It must hold unique
        integers, and every value in 1..max(keyCol) is expected to exist.
        Gaps are meant to be tolerated by selecting a different row, but
        many gaps may degrade performance.

        CAUTION:  'tableName' and 'keyCol' are pasted into SQL text.  Never
                  accept them from an external source, or the statements
                  become open to SQL injection.

        Raises ValueError when 'dataFile' or 'tableName' is missing.
        '''

        DictElement.__init__(self, **kwargs)

        if dataFile is None:
            raise ValueError("dataFile must be provided.")

        if tableName is None:
            raise ValueError("tableName must be provided.")

        # fall back to <package directory>/data
        if dataPath is None:
            package_dir = os.path.dirname(__file__)
            dataPath = os.path.join(package_dir, 'data')

        self.db = sqlite3.connect(os.path.join(dataPath, dataFile))
        self.keyCol = keyCol
        self.tableName = tableName

        if columns is None:
            # Item 1 of each table_info row is the column name; take them
            # all except the key column.
            pragma = "pragma table_info('{0:s}')".format(tableName)
            columns = [
                info[1] for info in self.db.execute(pragma)
                if info[1] != keyCol
            ]

        self.columns = columns

        # Statement used to look up a single row by key.
        self.query = 'select {0:s} from {1:s} where {2:s} = ?'.format(
            ', '.join(columns), tableName, keyCol)

        # The largest key value bounds the random selection.
        max_key_sql = 'select max({0:s}) from {1:s}'.format(keyCol, tableName)
        max_key_row = self.db.execute(max_key_sql).fetchone()
        self.maxRange = max_key_row[0]

Esempio n. 10

0

Mostra file

    def __init__(self, useFormat=True, pctPresent=1.0, **kwargs):
        """Register one generator callable per supported state code.

        ``self.states`` maps a two-letter state code to a zero-argument
        callable built from ``self.dn`` / ``self.an`` (and ``self.ct`` for
        CT).  ``self.allStates`` lists the supported codes in definition
        order.

        Args:
            useFormat: Stored on the instance; currently not acted upon.
            pctPresent: Stored on the instance.
            **kwargs: Forwarded unchanged to ``DictElement.__init__``.
        """
        DictElement.__init__(self, **kwargs)
        self.useFormat = useFormat  # TODO: ignored for now...
        self.pctPresent = pctPresent

        # States with no entry because their format is complicated:
        # FL, MD, MI, MN, MT, ND, NH, NJ, WA, WI -- and IL (first letter of
        # last name + 11).
        self.states = {
            "AK": lambda: self.dn(7),
            "AL": lambda: self.dn(7),
            "AR": lambda: "9" + self.dn(8),
            "AZ": lambda: self.an(1) + self.dn(8),
            "CA": lambda: self.an(1) + self.dn(7),
            "CO": lambda: "-".join((self.dn(2), self.dn(3), self.dn(4))),
            "CT": self.ct,
            "DE": lambda: self.dn(7),
            "GA": lambda: self.dn(9),
            "HI": lambda: 'H' + self.dn(8),
            "IA": lambda: "".join((self.dn(3), self.an(2), self.dn(4))),
            "ID": lambda: self.an(2) + self.dn(6) + self.an(1),
            "IN": lambda: '-'.join((self.dn(4), self.dn(2), self.dn(4))),
            "KS": lambda: '-'.join((self.dn(2), self.dn(2), self.dn(4))),
            "KY": lambda: '-'.join(
                (self.an(1) + self.dn(2), self.dn(3), self.dn(3))),
            "LA": lambda: "00" + self.dn(7),
            "MA": lambda: 'S' + self.dn(8),
            "ME": lambda: self.dn(7),
            "MO": lambda: self.an(1) + self.dn(random.choice([6, 7, 8, 9])),
            "MS": lambda: self.dn(9),
            "NC": lambda: self.dn(12),
            "NE": lambda: self.an(1) + self.dn(8),
            "NM": lambda: self.dn(9),
            "NV": lambda: self.dn(10),
            "NY": lambda: ' '.join((self.dn(3), self.dn(3), self.dn(3))),
            "OK": lambda: self.an(1) + self.dn(8),
            "OH": lambda: self.an(2) + self.dn(6),
            "OR": lambda: self.dn(7),
            "PA": lambda: ' '.join((self.dn(2), self.dn(3), self.dn(3))),
            "RI": lambda: self.dn(7),
            "SC": lambda: self.dn(9),
            "SD": lambda: self.dn(8),
            "TN": lambda: self.dn(random.choice([8, 9])),
            "TX": lambda: self.dn(8),
            "UT": lambda: self.dn(9),
            "VA": lambda: '-'.join(
                (self.an(1) + self.dn(2), self.dn(2), self.dn(4))),
            "VT": lambda: self.dn(8),
            "WV": lambda: self.an(1) + self.dn(6),
            "WY": lambda: '-'.join((self.dn(6), self.dn(3))),
        }

        # Iterating a dict yields its keys in insertion order.
        self.allStates = list(self.states)

Esempio n. 11

0

Mostra file

    def create(self, **kwargs):
        """Return a copy of a randomly chosen record from ``self.addrs``.

        The record is copied before it is handed to
        ``DictElement.addChildren`` and to the caller, so changes made to
        the returned dict cannot alter the records held in ``self.addrs``.

        Args:
            **kwargs: Forwarded unchanged to ``DictElement.addChildren``.

        Returns:
            dict: a shallow copy of the selected record.
        """
        r = int(random.random() * len(self.addrs))
        d = self.addrs[r].copy()

        DictElement.addChildren(self, d, **kwargs)
        return d