Exemple #1
0
 def load_table_schema_from_census_xlsx_file(self, *, filename, prefix=""):
     """Read multiple tables from an xlsx file.

     Only worksheets whose title starts with *prefix* are ingested
     (all worksheets when prefix is "", the default); only the first
     5 columns of each row are considered.

     :param filename: path of the .xlsx workbook to read.
     :param prefix: optional worksheet-title prefix filter.
     :raises RuntimeError: if an ingested worksheet does not look like
         a US Census Bureau XLSX specification file.
     """
     from openpyxl import load_workbook
     from openpyxl.cell.read_only import EmptyCell
     # BUG FIX: read_only expects a bool; the string 'True' only worked
     # because any non-empty string is truthy.
     wb = load_workbook(filename=filename, read_only=True)
     for ws in wb.worksheets:
         print("Processing worksheet {}".format(ws.title))
         # BUG FIX: the prefix filter promised by the docstring was never
         # applied; honor it (the default "" preserves old behavior).
         if prefix and not ws.title.startswith(prefix):
             continue
         if not self.verify_census_worksheet(ws):
             logging.error(
                 "This does not appear to be a US Census Bureau XLSX specification file"
             )
             raise RuntimeError(
                 "This does not appear to be a US Census Bureau XLSX specification file"
             )
         column = 0
         table = Table(name=ws.title)
         self.add_table(table)
         header_skipped = False
         # BUG FIX: v was referenced before assignment when a legal-values
         # continuation row appeared before the first variable row.
         v = None
         for row in ws.iter_rows():
             # Skip empty rows
             if isinstance(row[0], EmptyCell):
                 continue
             # skip the header
             if not header_skipped:
                 header_skipped = True
                 continue
             # Only the first five columns are considered
             values = []
             for i in range(5):
                 if isinstance(row[i], EmptyCell):
                     values.append("")
                 elif isinstance(row[i].value, str):
                     values.append(row[i].value.strip())
                 else:
                     values.append(row[i].value)
             (column_id, column_name, oracle_datatype, desc,
              legal_values) = values
             if (not column_id) and not (legal_values):
                 # Ignore the blank line
                 continue
             if column_id:
                 if not isinstance(column_id, int):
                     logging.info("Column ID is invalid; skipping")
                     continue
                 v = Variable(position=column_id,
                              name=column_name,
                              desc=desc,
                              vtype=oracle_datatype,
                              column=column)
                 # advance to the next column
                 column += v.width
                 table.add_variable(v)
             # Legal values may continue on rows without a column id;
             # attach them to the most recently created variable.
             if v and legal_values:
                 for possible_legal_value in legal_values.split("\n"):
                     r = Range.extract_range_and_desc(possible_legal_value)
                     if r:
                         v.add_range(r)
Exemple #2
0
    def load_schema_from_ipums_sas_file(*, schema, filename):
        """Populate *schema* with tables parsed from an IPUMS SAS layout file.

        A small line-oriented state machine: an INPUT statement defines the
        variables of the current record type's table, a LABEL statement
        collects variable descriptions, and ';' terminates either statement.
        Collected labels are applied to all parsed variables at the end.

        :param schema: target schema object; gains one Table per rectype.
        :param filename: SAS file path; must pass is_ipums_sas_file().
        :raises RuntimeError: on a non-IPUMS file, nested INPUT statements,
            or an unapproved record type.
        """
        if not IPUMS_SASParser.is_ipums_sas_file(filename):
            raise RuntimeError("{} is not an IPUMS SAS file".format(filename))

        # Only process H and P record types
        rectypes = ['H', 'P']

        state = None     # statement being parsed: INPUT, LABEL, or None
        rectype = None   # most recently seen record type ('H' or 'P')
        table = None     # table being built by the current INPUT statement
        labels = dict()  # variable name -> description, from LABEL statements
        for line in dopen(filename):
            line = line.strip()
            # BUG FIX (idiom): compare to None with 'is not', not '!='.
            if line == INPUT and rectype is not None:
                if state == INPUT:
                    raise RuntimeError(
                        "{}: INPUT within INPUT???".format(filename))
                state = INPUT
                table = Table(name=rectype)
                continue
            if line == ';':
                # ';' ends the current SAS statement; a completed INPUT
                # statement's table is committed to the schema.
                if state == INPUT:
                    if rectype:
                        schema.add_table(table)
                state = None  # end of SAS statement
                continue
            if line == LABEL:
                state = LABEL
                continue
            if state == INPUT:
                IPUMS_SASParser.process_layout_line(table=table, line=line)
            if state == LABEL:
                m = label_re.search(line)
                if m:
                    labels[m.group(1)] = m.group(2)

            # A RECTYPE marker may appear anywhere; it selects the name
            # used by the next INPUT statement's table.
            m = rectype_re.search(line)
            if m:
                rectype = m.group(1)
                if rectype not in rectypes:
                    raise RuntimeError(
                        "Record type '{}' not in list of approved rectypes ({})"
                        .format(rectype, str(rectypes)))

        # Now use the labels to set the description for all of the variables we have learned
        for table in schema.tables():
            for v in table.vars():
                if v.name in labels:
                    v.desc = labels[v.name]
def process_file(fname, schemafile):
    """Parse the text specification *fname* into a Table and emit its scanner.

    :param fname: path to the specification text file.
    :param schemafile: open, writable file object that receives a
        ``#define SPEC_FILE`` line followed by the generated SQL scanner.
    :raises RuntimeError: if no table title line is found in the file.
    """
    table = None
    # BUG FIX: the file handle from open() was never closed.
    with open(fname) as infile:
        for line in infile:
            # The first title line names the table.
            if not table:
                m = title_re.search(line)
                if m:
                    table = Table(name=m.group(1))
            # The first version line seen after the title sets the version.
            if table and not table.version:
                m = version_re.search(line)
                if m:
                    table.version = m.group(1)
            m = variable_re.search(line)
            # BUG FIX: a variable line appearing before the title line used
            # to crash with AttributeError (table is None); require a table.
            if m and table:
                v = Variable()
                (position, name, desc, vtype, column) = m.group(1, 2, 3, 4, 5)
                # Deduplicate repeated variable names: NAME, NAME2, NAME3, ...
                oname = name
                count = 2
                existing = [var.name for var in table.vars]
                while name in existing:
                    name = "{}{}".format(oname, count)
                    count += 1
                v.define_from_row([position, name, desc, vtype])
                # A "lo-hi" column range becomes a [lo, hi] pair of ints.
                if "-" in column:
                    v.column = [int(x) for x in column.split("-")]
                table.add_variable(v)

    # BUG FIX: give a clear error instead of AttributeError on None.
    if table is None:
        raise RuntimeError("{}: no table title found".format(fname))
    schemafile.write('#define SPEC_FILE "{}"\n'.format(fname))
    table.write_sql_scanner(schemafile)
Exemple #4
0
    def load_table_from_docx_table(docx_table):
        """Build and return a Table parsed from a docx table.

        The first row's first cell supplies the table name (spaces and dots
        become underscores).  Each later row either starts a new variable
        (position, name, desc, vtype[, allowable values]) or continues the
        previous variable's allowable values in column 2 or 4.

        :param docx_table: sequence of rows, each a sequence of cell strings.
        :raises AssertionError: on a row that fits neither pattern.
        """
        tableName = docx_table[0][0].replace(" ", "_").replace(".", "_")
        if CensusSpec.debug:
            print("DEBUG: Reading table '{}'".format(tableName))
        table = Table(name=tableName)
        v = None  # current variable
        for row in docx_table[1:]:  # don't process the first line
            cols = [x.replace("\n", " ").strip()
                    for x in row]  # change all newlines to spaces
            if sum([len(x) for x in cols]) == 0:  # blank row
                continue
            if CensusSpec.debug:
                print("DEBUG:  cols: {}".format(cols))
            if CensusSpec.is_variable_start(cols):
                if CensusSpec.debug:
                    print("DEBUG:    defining variable {}".format(cols[0]))
                v = Variable(position=cols[0],
                             name=cols[1],
                             desc=cols[2],
                             vtype=cols[3])

                # if there is a fifth column, it may be allowable values
                if len(cols) == 5:
                    v.add_valid_data_description(cols[4])
                table.add_variable(v)
                continue

            # If defining a variable and we have an extra cols, it may have an allowable value
            # in the cols[2] or cols[4]
            if v:
                if len(cols) > 2 and len(cols[2]):
                    v.add_valid_data_description(cols[2])
                    # BUG FIX: was misspelled 'contine', which raised a
                    # NameError whenever this branch executed.
                    continue
                if len(cols) > 4 and len(cols[4]):
                    v.add_valid_data_description(cols[4])
                    continue
                print("Not sure what to do with this:", cols)
                # BUG FIX: 'assert False' is stripped under python -O and
                # the loop would silently continue; raise the same
                # exception type explicitly.
                raise AssertionError(
                    "unrecognized docx table row: {}".format(cols))
        return table
Exemple #5
0
    def load_table_schema_from_census_txt_spec(self, *, filename, prefix=""):
        """Read a single table from a txt file.

        Scans the file for a title line, an optional version line, and
        variable lines; gives up after self.MAXLINES lines if no table with
        variables has materialized.  A successfully parsed table is added
        via self.add_table(); returns None when nothing usable was found.
        """
        table = None
        for (ll, line) in enumerate(dopen(filename), 1):
            # Bail out early if the first MAXLINES produced nothing.
            if ll > self.MAXLINES:
                if (table is None) or len(table.vars()) == 0:
                    logging.info(
                        "{} is not a Census text specification".format(
                            filename))
                    return None

            # Get a table name if we do not have one
            if not table:
                m = TXT_TITLE_RE.search(line)
                if m:
                    table = Table(name=m.group(1))
                    table.add_comment("Parsed from {}".format(filename))
                    continue
            # Get the table version if we do not have one
            if table and not table.version:
                m = TXT_VERSION_RE.search(line)
                if m:
                    table.version = m.group(1)
                    continue
            # Is this a variable name within the table?
            m = VARIABLE_RE.search(line)
            # Guard on table: a variable line before the title would
            # otherwise crash on table.vars() below.
            if m and table:
                (position, name, desc, vtype, column) = m.group(1, 2, 3, 4, 5)
                # Deduplicate repeated variable names: NAME, NAME2, NAME3, ...
                oname = name
                count = 2
                existing = [var.name for var in table.vars()]
                while name in existing:
                    name = "{}{}".format(oname, count)
                    count += 1
                # BUG FIX: the Variable was built from an undefined 'row'
                # (NameError) and with the original, possibly duplicated,
                # name; build it from the parsed fields after dedup.
                v = Variable(position=position,
                             name=name,
                             desc=desc,
                             vtype=vtype)
                # A "lo-hi" column range becomes a [lo, hi] pair of ints.
                if "-" in column:
                    v.column = [int(x) for x in column.split("-")]
                table.add_variable(v)
        # BUG FIX: table may still be None here; avoid AttributeError.
        if table is None or len(table.vars()) == 0:
            return None
        self.add_table(table)
Exemple #6
0
    def explore_database(self, schema=None):
        """Reflect every table of *schema* into a dict of Table objects.

        :param schema: database/schema name; defaults to the URL's database.
        :returns: dict mapping table name -> Table populated with columns.
        :raises exc.ArgumentError: if a table name is reported twice.
        """
        if schema is None:
            schema = self.url.db
        discovered = dict()
        for name in self.get_table_names(schema):
            if name in discovered:
                raise exc.ArgumentError('Table "%s" is already defined'
                                        ' in this database' % name)
            tbl = Table(name, None)
            for col in self.get_columns(name, schema):
                tbl.add_column(col)
            discovered[name] = tbl

        return discovered
Exemple #7
0
def json2schema(data):
    """Rebuild a Schema object from its dict (JSON-decoded) representation.

    :param data: mapping of table name -> table definition dict with
        "include", "primary_key", "relationships", and "columns" entries.
    :returns: a populated Schema.
    """
    schema = Schema()
    for table_name, tdef in data.items():
        table = schema.tables[table_name] = Table(table_name)
        table.include = tdef["include"]
        table.primary_key = tdef["primary_key"]

        # Relationships are keyed by the local column name.
        for col, rdef in tdef["relationships"].items():
            rel = Relationship(rdef["name"], col,
                               rdef["remote_table"],
                               rdef["remote_column"])
            table.relationships[col] = rel
            rel.action = rdef["action"]

        # Columns carry their MySQL type and option string.
        for col_name, cdef in tdef["columns"].items():
            table.columns[col_name] = Column(col_name,
                                             cdef["mysql_type"],
                                             cdef["options"])
    return schema
Exemple #8
0
def process_file(fname, dbfile):
    """Copy a SAS dataset into a new SQLite table.

    Reads *fname* with pandas.read_sas, infers a schema from the first
    one-row chunk, creates the table in *dbfile*, then streams every row
    across in CHUNKSIZE batches with periodic progress logging.

    :param fname: path to the SAS dataset file.
    :param dbfile: path to the SQLite database file to create/extend.
    """
    data = pandas.read_sas(fname, chunksize=1)
    frame = next(data)

    # Make a table named after the file (basename without extension)
    table = Table(name=os.path.splitext(os.path.split(fname)[1])[0])
    logging.info("Creating table {}".format(table.name))
    for col in frame.columns:
        v = Variable()
        v.set_name(col)
        # Infer the SQL type from the first value's numpy type.
        # (.iloc[0] is positional; plain [0] relied on the default index.)
        v.set_vtype(schema.vtype_for_numpy_type(type(frame[col].iloc[0])))
        table.add_variable(v)

    # BUG FIX: the connection was never committed or closed; ensure
    # cleanup even if table creation or an insert fails.
    conn = sqlite3.connect(dbfile)
    try:
        c = conn.cursor()
        cmd = table.sql_schema()
        c.execute(cmd)

        t0 = time.time()
        logging.info("Transferring data...")
        istmt = table.sql_insert()
        print(istmt)
        lines = 0
        for frame in pandas.read_sas(fname, chunksize=CHUNKSIZE):
            # One explicit transaction per chunk keeps inserts fast.
            c.execute("BEGIN TRANSACTION;")
            for row in frame.itertuples(index=False):
                c.execute(istmt, row)
                lines += 1
                if lines % 10000 == 0:
                    t = int(time.time() - t0)
                    s = t % 60
                    m = (t % 3600) // 60
                    h = t // 3600
                    logging.info("time: {}:{:02}:{:02} lines {:,}".format(
                        h, m, s, lines))
            c.execute("END TRANSACTION;")
        conn.commit()
    finally:
        conn.close()
Exemple #9
0
        # Matches KEY/index definition lines in MySQL SHOW CREATE TABLE
        # output, e.g. '  UNIQUE KEY `name` (`col`) USING BTREE,'.
        # NOTE(review): (?P<column>\.+) matches one-or-more LITERAL dots,
        # which can never match a real column list like `a`,`b` — this
        # looks like it should be an unescaped .+ ; confirm against the
        # DDL text this regex is actually applied to.
        self._re_key = re.compile(
            r'\s+(?:(?P<type>\w+) )?KEY'
            r'(?:\s+`(?P<name>\w+)`)?'
            r'(?:\s+USING\s+(?P<using>BTREE|HASH))?'
            r'\s+\((?P<column>\.+)\)'
            r'(?:\s+USING\s+(?P<using_post>\w+))?'
            r'(?:\s+KEY_BLOCK_SIZE\s*[ =]? *(?P<keyblock>\w+))?'
            r'(?:\s+WITH PARSER\s+(?P<parser>\w+))?'
            r',?$', re.I | re.UNICODE)

        # Matches FOREIGN KEY constraint lines, capturing the constraint
        # name, local and foreign column lists, referenced table, and the
        # optional MATCH / ON DELETE / ON UPDATE clauses.
        self._re_constraint = re.compile(
            r'\s+CONSTRAINT'
            r'\s+`(?P<name>\w+)`'
            r'\s+FOREIGN KEY'
            r'\s+\((?P<local>[^\)]+?)\) REFERENCES'
            r'\s+`(?P<table>\w+)`'
            r'\s+\((?P<foreign>[^\)]+?)\)'
            r'(?:\s+(?P<match>MATCH \w+))?'
            r'(?:\s+ON DELETE\s+(?P<ondelete>RESTRICT|CASCADE|SET NULL|NOACTION))?'
            r'(?:\s+ON UPDATE\s+(?P<onupdate>RESTRICT|CASCADE|SET NULL|NOACTION))?',
            re.I | re.UNICODE)


if __name__ == '__main__':
    # Demo: build a SELECT over the 'posts' table and print the query.
    posts = Table('posts', None)
    # BUG FIX: 04 and similar leading-zero integer literals are a
    # SyntaxError in Python 3; use plain 4 and 24.
    result = Select(Column('id', int, posts)).From(posts).Where(
        Column('created_at', datetime.datetime, posts) >= datetime.datetime(
            2014, 4, 24))
    # print(result.sql_tree._root.next_node.next_node.next_node)
    print(result.dump_query())
Exemple #10
0
    #     - visible to: 'vt' sum of all photos in shared viewpoint (includes 'sb'). to get the
    #       real count of photos shared with this user but not shared by him, compute 'vt - sb'
    #
    # 'op_ids' holds a list of previously-applied operation IDs. This is an attempt to
    # make increments idempotent with replays. The list is a comma-separated string of
    # operation ids (sometimes suffixed with a viewpoint ID), in the order in which they were
    # applied. We keep a maximum of Accounting._MAX_APPLIED_OP_IDS.
    #
    # Currently, all columns are used by each accounting category.
# Accounting table definition: rows are keyed by (hash_key, sort_key) and
# carry photo-count/size counters plus the applied-operation-id list
# described in the comments above. 'S'/'N' presumably denote string and
# numeric attribute types — TODO confirm against the column classes.
# NOTE(review): the trailing '),' implies this Table(...) is one element
# of a sequence whose opening bracket lies outside this excerpt; as a
# standalone statement it would build a discarded 1-tuple — confirm.
Table(ACCOUNTING,
      'at',
      read_units=100,
      write_units=10,
      columns=[
          HashKeyColumn('hash_key', 'hk', 'S'),
          RangeKeyColumn('sort_key', 'sk', 'S'),
          Column('num_photos', 'np', 'N'),
          Column('tn_size', 'ts', 'N'),
          Column('med_size', 'ms', 'N'),
          Column('full_size', 'fs', 'N'),
          Column('orig_size', 'os', 'N'),
          Column('op_ids', 'oi', 'S')
      ]),

    # Activities are associated with a viewpoint and contain a record of
    # all high-level operations which have modified the structure of the
    # viewpoint in some way. For more details, see activity.py. The
    # activity_id attribute is a composite of information gleaned from
    # current operation: (reverse timestamp, user_id, op_id). The content
    # of the activity is a JSON-encoded ACTIVITY structure, as defined in
    # json_schema.py. 'update_seq' is set to the value of the viewpoint's
    # 'update_seq' attribute after it was incremented during creation of