def load_table_schema_from_census_xlsx_file(self, *, filename, prefix=""):
    """Read multiple tables from a xlsx file. Only ingest the tables that
    have the prefix. We only look at the first 5 columns."""
    from openpyxl import load_workbook
    from openpyxl.cell.read_only import EmptyCell
    wb = load_workbook(filename=filename, read_only=True)
    for ws in wb.worksheets:
        print("Processing worksheet {}".format(ws.title))
        # Only ingest the worksheets that have the prefix
        if prefix and not ws.title.startswith(prefix):
            continue
        if not self.verify_census_worksheet(ws):
            logging.error("This does not appear to be a US Census Bureau XLSX specification file")
            raise RuntimeError("This does not appear to be a US Census Bureau XLSX specification file")
        column = 0
        table = Table(name=ws.title)
        self.add_table(table)
        header_skipped = False
        v = None                # current variable; legal values may continue on later rows
        for row in ws.iter_rows():
            # Skip empty rows
            if isinstance(row[0], EmptyCell):
                continue
            # Skip the header
            if not header_skipped:
                header_skipped = True
                continue
            # Only the first five columns are considered
            values = []
            for i in range(5):
                if isinstance(row[i], EmptyCell):
                    values.append("")
                elif isinstance(row[i].value, str):
                    values.append(row[i].value.strip())
                else:
                    values.append(row[i].value)
            (column_id, column_name, oracle_datatype, desc, legal_values) = values
            if (not column_id) and (not legal_values):
                # Ignore the blank line
                continue
            if column_id:
                if not isinstance(column_id, int):
                    logging.info("Column ID is invalid; skipping")
                    continue
                v = Variable(position=column_id, name=column_name, desc=desc,
                             vtype=oracle_datatype, column=column)
                # Advance to the next column
                column += v.width
                table.add_variable(v)
            if v and legal_values:
                for possible_legal_value in legal_values.split("\n"):
                    r = Range.extract_range_and_desc(possible_legal_value)
                    if r:
                        v.add_range(r)
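# Hypothetical usage sketch (not from the source): assumes this method lives on a
# Schema-like class that provides add_table()/tables(); the filename and prefix
# are placeholders.
schema = Schema()
schema.load_table_schema_from_census_xlsx_file(filename="spec.xlsx", prefix="MAF")
for t in schema.tables():
    print(t.name, len(t.vars()))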
def load_schema_from_ipums_sas_file(*, schema, filename):
    if not IPUMS_SASParser.is_ipums_sas_file(filename):
        raise RuntimeError("{} is not an IPUMS SAS file".format(filename))
    # Only process H and P record types
    rectypes = ['H', 'P']
    state = None
    rectype = None
    table = None
    labels = dict()
    for line in dopen(filename):
        line = line.strip()
        if line == INPUT and rectype is not None:
            if state == INPUT:
                raise RuntimeError("{}: INPUT within INPUT???".format(filename))
            state = INPUT
            table = Table(name=rectype)
            continue
        if line == ';':
            if state == INPUT and rectype:
                schema.add_table(table)
            state = None        # end of SAS statement
            continue
        if line == LABEL:
            state = LABEL
            continue
        if state == INPUT:
            IPUMS_SASParser.process_layout_line(table=table, line=line)
        if state == LABEL:
            m = label_re.search(line)
            if m:
                labels[m.group(1)] = m.group(2)
        m = rectype_re.search(line)
        if m:
            rectype = m.group(1)
            if rectype not in rectypes:
                raise RuntimeError("Record type '{}' not in list of approved rectypes ({})"
                                   .format(rectype, str(rectypes)))
    # Now use the labels to set the description for all of the variables we have learned
    for table in schema.tables():
        for v in table.vars():
            if v.name in labels:
                v.desc = labels[v.name]
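# Hypothetical usage sketch (not from the source): the filename is a placeholder.
# One table is created per approved record type ('H' and 'P').
schema = Schema()
load_schema_from_ipums_sas_file(schema=schema, filename="usa_00001.sas")
for t in schema.tables():
    print(t.name, [v.name for v in t.vars()])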
def process_file(fname, schemafile):
    table = None
    for line in open(fname):
        if not table:
            m = title_re.search(line)
            if m:
                table = Table(name=m.group(1))
        if table and not table.version:
            m = version_re.search(line)
            if m:
                table.version = m.group(1)
        m = variable_re.search(line)
        if m and table:
            v = Variable()
            (position, name, desc, vtype, column) = m.group(1, 2, 3, 4, 5)
            # Rename duplicates: NAME, NAME2, NAME3, ...
            oname = name
            count = 2
            while name in [var.name for var in table.vars]:
                name = "{}{}".format(oname, count)
                count += 1
            v.define_from_row([position, name, desc, vtype])
            if "-" in column:
                v.column = [int(x) for x in column.split("-")]
            table.add_variable(v)
    schemafile.write('#define SPEC_FILE "{}"\n'.format(fname))
    table.write_sql_scanner(schemafile)
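# Hypothetical usage sketch (not from the source): process_file() appends a
# #define plus a generated SQL scanner to the open schemafile; names are placeholders.
with open("spec_scanner.h", "w") as schemafile:
    process_file("layout.txt", schemafile)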
def load_table_from_docx_table(docx_table):
    """Return the Table parsed from a docx table (its name and variables)."""
    tableName = docx_table[0][0].replace(" ", "_").replace(".", "_")
    if CensusSpec.debug:
        print("DEBUG: Reading table '{}'".format(tableName))
    table = Table(name=tableName)
    v = None                    # current variable
    for row in docx_table[1:]:  # don't process the first line
        cols = [x.replace("\n", " ").strip() for x in row]  # change all newlines to spaces
        if sum([len(x) for x in cols]) == 0:
            # blank row
            continue
        if CensusSpec.debug:
            print("DEBUG: cols: {}".format(cols))
        if CensusSpec.is_variable_start(cols):
            if CensusSpec.debug:
                print("DEBUG: defining variable {}".format(cols[0]))
            v = Variable(position=cols[0], name=cols[1], desc=cols[2], vtype=cols[3])
            # If there is a fifth column, it may be allowable values
            if len(cols) == 5:
                v.add_valid_data_description(cols[4])
            table.add_variable(v)
            continue
        # If we are defining a variable and have extra cols, an allowable value
        # may be in cols[2] or cols[4]
        if v:
            if len(cols) > 2 and len(cols[2]):
                v.add_valid_data_description(cols[2])
                continue
            if len(cols) > 4 and len(cols[4]):
                v.add_valid_data_description(cols[4])
                continue
        print("Not sure what to do with this:", cols)
        assert False
    return table
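# Hypothetical usage sketch (not from the source): assumes docx_table is a list of
# rows of cell strings, e.g. extracted with python-docx; the filename is a placeholder.
from docx import Document
doc = Document("spec.docx")
for dt in doc.tables:
    rows = [[cell.text for cell in row.cells] for row in dt.rows]
    print(load_table_from_docx_table(rows).name)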
def load_table_schema_from_census_txt_spec(self, *, filename, prefix=""):
    """Read a single table from a txt file."""
    table = None
    for (ll, line) in enumerate(dopen(filename), 1):
        if ll > self.MAXLINES:
            if (table is None) or len(table.vars()) == 0:
                logging.info("{} is not a Census text specification".format(filename))
                return None
        # Get a table name if we do not have one
        if not table:
            m = TXT_TITLE_RE.search(line)
            if m:
                table = Table(name=m.group(1))
                table.add_comment("Parsed from {}".format(filename))
            continue
        # Get the table version if we do not have one
        if not table.version:
            m = TXT_VERSION_RE.search(line)
            if m:
                table.version = m.group(1)
                continue
        # Is this a variable name within the table?
        m = VARIABLE_RE.search(line)
        if m:
            (position, name, desc, vtype, column) = m.group(1, 2, 3, 4, 5)
            # Rename duplicates: NAME, NAME2, NAME3, ...
            oname = name
            count = 2
            while name in [var.name for var in table.vars()]:
                name = "{}{}".format(oname, count)
                count += 1
            v = Variable(position=position, name=name, desc=desc, vtype=vtype)
            if "-" in column:
                v.column = [int(x) for x in column.split("-")]
            table.add_variable(v)
    if (table is None) or len(table.vars()) == 0:
        return None
    self.add_table(table)
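# Hypothetical usage sketch (not from the source): returns None when the file does
# not look like a Census text specification; the filename is a placeholder.
schema = Schema()
schema.load_table_schema_from_census_txt_spec(filename="table_spec.txt")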
def explore_database(self, schema=None):
    tables_collection = dict()
    if schema is None:
        schema = self.url.db
    tables = self.get_table_names(schema)
    for table in tables:
        if table in tables_collection:
            raise exc.ArgumentError('Table "%s" is already defined'
                                    ' in this database' % table)
        tables_collection[table] = Table(table, None)
        for column in self.get_columns(table, schema):
            tables_collection[table].add_column(column)
    return tables_collection
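# Hypothetical usage sketch (not from the source): reflects every table of the named
# schema into a dict of Table objects; the engine and schema name are placeholders.
tables = engine.explore_database(schema="mydb")
for name, t in tables.items():
    print(name, t)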
def json2schema(data):
    schema = Schema()
    for name, t in data.items():
        table = schema.tables[name] = Table(name)
        table.include = t["include"]
        table.primary_key = t["primary_key"]
        for c, r in t["relationships"].items():
            relationship = table.relationships[c] = \
                Relationship(r["name"], c, r["remote_table"], r["remote_column"])
            relationship.action = r["action"]
        for cname, c in t["columns"].items():
            column = table.columns[cname] = \
                Column(cname, c["mysql_type"], c["options"])
    return schema
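# Illustrative input for json2schema() (not from the source): the key names follow
# the lookups above; the table and column values are made up.
example = {
    "posts": {
        "include": True,
        "primary_key": "id",
        "relationships": {
            "user_id": {"name": "fk_posts_user", "remote_table": "users",
                        "remote_column": "id", "action": "CASCADE"},
        },
        "columns": {
            "id": {"mysql_type": "INT", "options": "NOT NULL"},
            "user_id": {"mysql_type": "INT", "options": "NOT NULL"},
        },
    },
}
schema = json2schema(example)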
def process_file(fname, dbfile):
    data = pandas.read_sas(fname, chunksize=1)
    frame = next(data)
    # Make a table named after the file's basename
    table = Table(name=os.path.splitext(os.path.split(fname)[1])[0])
    logging.info("Creating table {}".format(table.name))
    for col in frame.columns:
        v = Variable()
        v.set_name(col)
        v.set_vtype(schema.vtype_for_numpy_type(type(frame[col].iloc[0])))
        table.add_variable(v)
    conn = sqlite3.connect(dbfile)
    c = conn.cursor()
    cmd = table.sql_schema()
    c.execute(cmd)
    t0 = time.time()
    logging.info("Transferring data...")
    istmt = table.sql_insert()
    print(istmt)
    lines = 0
    for frame in pandas.read_sas(fname, chunksize=CHUNKSIZE):
        c.execute("BEGIN TRANSACTION;")
        for row in frame.itertuples(index=False):
            c.execute(istmt, row)
            lines += 1
            if lines % 10000 == 0:
                t = int(time.time() - t0)
                s = t % 60
                m = (t % 3600) // 60
                h = t // 3600
                logging.info("time: {}:{:02}:{:02} lines {:,}".format(h, m, s, lines))
        c.execute("END TRANSACTION;")
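# Hypothetical usage sketch (not from the source): converts one SAS dataset into a
# SQLite table of the same base name; the paths and logging setup are placeholders.
logging.basicConfig(level=logging.INFO)
process_file("survey.sas7bdat", "survey.db")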
self._re_key = re.compile(
    r'\s+(?:(?P<type>\w+) )?KEY'
    r'(?:\s+`(?P<name>\w+)`)?'
    r'(?:\s+USING\s+(?P<using>BTREE|HASH))?'
    r'\s+\((?P<column>.+?)\)'
    r'(?:\s+USING\s+(?P<using_post>\w+))?'
    r'(?:\s+KEY_BLOCK_SIZE\s*[ =]? *(?P<keyblock>\w+))?'
    r'(?:\s+WITH PARSER\s+(?P<parser>\w+))?'
    r',?$', re.I | re.UNICODE)

self._re_constraint = re.compile(
    r'\s+CONSTRAINT'
    r'\s+`(?P<name>\w+)`'
    r'\s+FOREIGN KEY'
    r'\s+\((?P<local>[^\)]+?)\) REFERENCES'
    r'\s+`(?P<table>\w+)`'
    r'\s+\((?P<foreign>[^\)]+?)\)'
    r'(?:\s+(?P<match>MATCH \w+))?'
    r'(?:\s+ON DELETE\s+(?P<ondelete>RESTRICT|CASCADE|SET NULL|NO ACTION))?'
    r'(?:\s+ON UPDATE\s+(?P<onupdate>RESTRICT|CASCADE|SET NULL|NO ACTION))?',
    re.I | re.UNICODE)

if __name__ == '__main__':
    posts = Table('posts', None)
    result = Select(Column('id', int, posts)).From(posts).Where(
        Column('created_at', datetime.datetime, posts) >=
        datetime.datetime(2014, 4, 24))
    # print(result.sql_tree._root.next_node.next_node.next_node)
    print(result.dump_query())
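# Illustrative (not from the source): sample SHOW CREATE TABLE lines that these
# patterns are meant to match, with made-up identifiers:
#
#   KEY `idx_user` (`user_id`) USING BTREE,
#   CONSTRAINT `fk_post_user` FOREIGN KEY (`user_id`) REFERENCES `users` (`id`)
#     ON DELETE CASCADE ON UPDATE RESTRICT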
# - visible to: 'vt' sum of all photos in shared viewpoint (includes 'sb'). to get the
#   real count of photos shared with this user but not shared by him, compute 'vt - sb'
#
# 'op_ids' holds a list of previously-applied operation IDs. This is an attempt to
# make increments idempotent with replays. The list is a comma-separated string of
# operation ids (sometimes suffixed with a viewpoint ID), in the order in which they were
# applied. We keep a maximum of Accounting._MAX_APPLIED_OP_IDS.
#
# Currently, all columns are used by each accounting category.
Table(ACCOUNTING, 'at', read_units=100, write_units=10,
      columns=[HashKeyColumn('hash_key', 'hk', 'S'),
               RangeKeyColumn('sort_key', 'sk', 'S'),
               Column('num_photos', 'np', 'N'),
               Column('tn_size', 'ts', 'N'),
               Column('med_size', 'ms', 'N'),
               Column('full_size', 'fs', 'N'),
               Column('orig_size', 'os', 'N'),
               Column('op_ids', 'oi', 'S')]),

# Activities are associated with a viewpoint and contain a record of
# all high-level operations which have modified the structure of the
# viewpoint in some way. For more details, see activity.py. The
# activity_id attribute is a composite of information gleaned from
# current operation: (reverse timestamp, user_id, op_id). The content
# of the activity is a JSON-encoded ACTIVITY structure, as defined in
# json_schema.py. 'update_seq' is set to the value of the viewpoint's
# 'update_seq' attribute after it was incremented during creation of