def get_pandas_file_reader(self, filename, chunksize=schema.PANDAS_CHUNKSIZE):
    """Open ``filename`` with the pandas reader that matches its extension.

    The extension is compared against ``schema.SAS7BDAT_EXT``,
    ``schema.CSV_EXT`` and ``schema.TXT_EXT``. For text files the first line
    is inspected to guess the field separator: pipe if it holds more than two
    ``|``, tab if it holds more than two tabs, comma otherwise.

    Args:
        filename: Name of the file to read; opened through ``dopen``.
        chunksize: Passed straight through to the pandas reader.

    Returns:
        Whatever ``pandas.read_sas`` / ``pandas.read_csv`` returns for the
        given ``chunksize``.

    Raises:
        RuntimeError: If the extension is not one of the three known ones.
    """
    import pandas

    ext = os.path.splitext(filename)[1]

    if ext == schema.SAS7BDAT_EXT:
        # pandas can only infer the SAS format from a string path. Because an
        # already-open handle is passed here, the format must be explicit or
        # read_sas() raises ValueError.
        # NOTE(review): the sas7bdat reader consumes bytes -- confirm that
        # dopen() yields a binary stream for this extension.
        return pandas.read_sas(dopen(filename),
                               format='sas7bdat',
                               chunksize=chunksize,
                               encoding='latin1')

    if ext == schema.CSV_EXT:
        return pandas.read_csv(dopen(filename),
                               chunksize=chunksize,
                               encoding='latin1')

    if ext == schema.TXT_EXT:
        # Read the first line and work out the separator from it.
        with dopen(filename) as f:
            line = f.readline()
        if line.count("|") > 2:
            sep = '|'
        elif line.count("\t") > 2:
            sep = '\t'
        else:
            sep = ','
        logging.info('sep={}'.format(sep))
        return pandas.read_csv(dopen(filename),
                               chunksize=chunksize,
                               sep=sep,
                               encoding='latin1')

    message = "get_pandas_file_reader: unknown extension: {}".format(ext)
    logging.error(message)
    raise RuntimeError(message)
def load_schema_from_ipums_sas_file(*, schema, filename):
    """Populate ``schema`` with the tables described by an IPUMS SAS file.

    The file is scanned line by line with a small state machine:

    * a line matching ``rectype_re`` sets the current record type,
    * an ``INPUT`` line (once a record type is known) starts a new table,
    * lines inside an ``INPUT`` statement are handed to
      ``IPUMS_SASParser.process_layout_line``,
    * lines inside a ``LABEL`` statement that match ``label_re`` are
      collected as variable descriptions,
    * a lone ``;`` ends the current statement and, for ``INPUT``, adds the
      table to ``schema``.

    After the scan, every variable whose name has a collected label gets
    that label as its ``desc``.

    Args:
        schema: Object receiving the tables (``add_table`` / ``tables``).
            Note that this keyword shadows any module named ``schema``
            inside this function.
        filename: Name of the IPUMS SAS file; opened through ``dopen``.

    Raises:
        RuntimeError: If the file is not an IPUMS SAS file, if an ``INPUT``
            statement starts inside another one, or if a record type other
            than ``H`` or ``P`` is encountered.
    """
    if not IPUMS_SASParser.is_ipums_sas_file(filename):
        raise RuntimeError("{} is not an IPUMS SAS file".format(filename))

    # Only process H and P record types
    rectypes = ['H', 'P']

    state = None
    rectype = None
    table = None
    labels = dict()

    # Use a context manager so the handle is closed even when a
    # RuntimeError aborts the scan part-way through.
    with dopen(filename) as f:
        for line in f:
            line = line.strip()

            if line == INPUT and rectype is not None:
                if state == INPUT:
                    raise RuntimeError(
                        "{}: INPUT within INPUT???".format(filename))
                state = INPUT
                table = Table(name=rectype)
                continue

            if line == ';':
                if state == INPUT:
                    if rectype:
                        schema.add_table(table)
                state = None  # end of SAS statement
                continue

            if line == LABEL:
                state = LABEL
                continue

            if state == INPUT:
                IPUMS_SASParser.process_layout_line(table=table, line=line)

            if state == LABEL:
                m = label_re.search(line)
                if m:
                    labels[m.group(1)] = m.group(2)

            # Record-type markers are looked for on every remaining line,
            # whatever the current state is.
            m = rectype_re.search(line)
            if m:
                rectype = m.group(1)
                if rectype not in rectypes:
                    raise RuntimeError(
                        "Record type '{}' not in list of approved rectypes ({})"
                        .format(rectype, str(rectypes)))

    # Now use the labels to set the description for all of the variables
    # we have learned.
    for table in schema.tables():
        for v in table.vars():
            if v.name in labels:
                v.desc = labels[v.name]
def load_table_schema_from_census_txt_spec(self, *, filename, prefix=""):
    """Read a single table from a Census text specification file.

    The file is scanned for a title line (``TXT_TITLE_RE``), then a version
    line (``TXT_VERSION_RE``), and then variable lines (``VARIABLE_RE``).
    Each variable is added to the table. A variable whose name is already
    present gets a numeric suffix (``NAME2``, ``NAME3``, ...) so that names
    stay unique within the table.

    Args:
        filename: Name of the specification file; opened through ``dopen``.
        prefix: Accepted for interface compatibility; not used here.

    Returns:
        None. When at least one variable was parsed the table is registered
        with ``self.add_table``. Otherwise nothing is added.
    """
    table = None

    # Use a context manager so the early returns below do not leak the handle.
    with dopen(filename) as f:
        for (ll, line) in enumerate(f, 1):
            # Give up once MAXLINES have been read without finding a
            # table that has variables.
            if ll > self.MAXLINES:
                if (table is None) or len(table.vars()) == 0:
                    logging.info(
                        "{} is not a Census text specification".format(
                            filename))
                    return None

            # Get a table name if we do not have one
            if not table:
                m = TXT_TITLE_RE.search(line)
                if m:
                    table = Table(name=m.group(1))
                    table.add_comment("Parsed from {}".format(filename))
                    continue

            # Get the table version if we do not have one
            if table and not table.version:
                m = TXT_VERSION_RE.search(line)
                if m:
                    table.version = m.group(1)
                    continue

            # Is this a variable name within the table?
            m = VARIABLE_RE.search(line)
            if not m:
                continue
            if table is None:
                # A variable-looking line before any title has no table to
                # belong to, so skip it instead of crashing on None.
                continue

            (position, name, desc, vtype, column) = m.group(1, 2, 3, 4, 5)

            # Pick a name that is not already taken in this table.
            taken = {existing.name for existing in table.vars()}
            oname = name
            count = 2
            while name in taken:
                name = "{}{}".format(oname, count)
                count += 1

            # Build the variable only after de-duplication so that the
            # unique name is the one that gets stored.
            v = Variable(position=position, name=name, desc=desc, vtype=vtype)
            if "-" in column:
                v.column = [int(x) for x in column.split("-")]
            table.add_variable(v)

    if table is None or len(table.vars()) == 0:
        return None
    self.add_table(table)
def validate(self,fname):
    """Validate file ``fname`` with ``self.vo``.

    This is a single-threaded validation routine that prints every line
    that does not parse or does not validate.

    Lines starting with ``#`` are treated as comments and skipped. Every
    other line is parsed with ``self.vo`` (position- or pipe-delimited,
    depending on the flags on ``self``) and then validated. Failing lines
    are reported on stdout and passed to ``self.badline``. Scanning stops
    once more than ``self.maxerrors`` errors have been counted.

    Sets ``self.records``, ``self.errors``, ``self.t0`` and ``self.t1``.

    Args:
        fname: Name of the file to validate; opened through ``dopen``.

    Returns:
        True when no errors were found, False otherwise.
    """
    self.records = 0
    self.errors = 0
    self.t0 = time.time()
    with dopen(fname,"r") as f:
        for lines, line in enumerate(f, 1):
            line = line.rstrip()
            # startswith() is safe on an empty string, whereas line[0]
            # raises IndexError on a blank line. Blank lines fall through
            # to the parser, which decides whether they are acceptable.
            if line.startswith('#'):
                continue        # comment line
            if self.errors > self.maxerrors:
                print("too many errors",file=sys.stderr)
                break
            self.records += 1
            try:
                if self.position_delimited:
                    self.vo.parse_position_delimited(line)
                elif self.pipe_delimited:
                    self.vo.parse_pipe_delimited(line)
            except ValueError as e:
                print(f"{fname}:{lines} will not parse: {e}")
                self.badline(line)
                self.errors += 1
                continue
            # Validate the same object that just parsed the line. A bare
            # ``vo`` is not defined in this scope.
            if not self.vo.validate():
                print(f"{fname}:{lines} {self.vo.validate_reason()}")
                self.badline(line)
                self.errors += 1
    self.t1 = time.time()
    if self.errors==0:
        print(f"{fname} validated {self.records} records in {int(self.t1-self.t0)} seconds")
    else:
        print(f"{fname} had at least {self.errors} errors")
    return self.errors==0
def infosas(path):
    """Load a schema from ``path``, call its ``dump``, and exit with status 0.

    A fresh ``Schema`` is created, fed the handle returned by
    ``dconfig.dopen(path)`` through ``load_schema_from_file``, and then
    ``dump(path)`` is called on it. The function never returns: it ends the
    process via ``exit(0)``.
    """
    loaded = Schema()
    print(f"Loading schema from {path}")
    handle = dconfig.dopen(path)
    loaded.load_schema_from_file(handle)
    loaded.dump(path)
    exit(0)
def is_ipums_sas_file(filename):
    """Return True if ``filename`` looks like an IPUMS SAS file.

    Only the first 1 MiB of the file is examined. The file qualifies when
    that data starts with ``/*``, contains ``ipums_directory`` and, ignoring
    surrounding whitespace, its last 100 characters end with ``run;``.

    Args:
        filename: Name of the file to inspect; opened through ``dopen``.
    """
    # Read inside a context manager so the handle is closed straight away
    # instead of waiting for garbage collection.
    with dopen(filename) as f:
        data = f.read(1024 * 1024)  # read at most 1MiB

    return ((data[0:2] == '/*')
            and ('ipums_directory' in data)
            and data[-100:].strip().endswith("run;"))