Esempio n. 1
0
 def get_pandas_file_reader(self,
                            filename,
                            chunksize=schema.PANDAS_CHUNKSIZE):
     """Open `filename` with the pandas reader that matches its extension.

     The extension is compared against schema.SAS7BDAT_EXT, schema.CSV_EXT
     and schema.TXT_EXT.  Every reader is given `chunksize` and the
     'latin1' encoding.  For the TXT extension the first line of the file
     is inspected to pick a separator: '|' if it occurs more than twice,
     otherwise a tab if that occurs more than twice, otherwise ','.

     :param filename: name of the file to read; opened through dopen().
     :param chunksize: passed through to the pandas reader.
     :return: whatever pandas.read_sas / pandas.read_csv returns for the
              given chunksize.
     :raises RuntimeError: if the extension is not one of the three above.
     """
     extension = os.path.splitext(filename)[1]
     import pandas

     # Options shared by every reader below.
     common = dict(chunksize=chunksize, encoding='latin1')

     if extension == schema.SAS7BDAT_EXT:
         return pandas.read_sas(dopen(filename), **common)

     if extension == schema.CSV_EXT:
         return pandas.read_csv(dopen(filename), **common)

     if extension == schema.TXT_EXT:
         # Peek at the first line to guess the field separator.
         with dopen(filename) as sample:
             first_line = sample.readline()
         for candidate in ('|', '\t'):
             if first_line.count(candidate) > 2:
                 delimiter = candidate
                 break
         else:
             delimiter = ','
         logging.info('sep={}'.format(delimiter))
         return pandas.read_csv(dopen(filename), sep=delimiter, **common)

     # Nothing matched: report the problem in the log and to the caller.
     message = "get_pandas_file_reader: unknown extension: {}".format(
         extension)
     logging.error(message)
     raise RuntimeError(message)
Esempio n. 2
0
    def load_schema_from_ipums_sas_file(*, schema, filename):
        """Populate `schema` with the tables described by an IPUMS SAS file.

        The file is scanned line by line with a small state machine:

        * a line matching rectype_re sets the current record type, which
          must be 'H' or 'P';
        * an INPUT line (once a record type is known) starts a new Table
          named after that record type, and the following lines are handed
          to IPUMS_SASParser.process_layout_line();
        * a LABEL line starts a section whose lines are matched against
          label_re to collect name -> description pairs;
        * a lone ';' ends the current statement; if it ends an INPUT
          section the table is added to `schema`.

        After the scan, every variable in `schema` whose name has a
        collected label gets that label as its `desc`.

        :param schema: object receiving the tables (uses add_table() and
                       tables()).
        :param filename: name of the SAS file; opened through dopen().
        :raises RuntimeError: if the file is not an IPUMS SAS file, if an
                              INPUT appears inside an INPUT section, or if
                              a record type other than 'H'/'P' is found.
        """
        if not IPUMS_SASParser.is_ipums_sas_file(filename):
            raise RuntimeError("{} is not an IPUMS SAS file".format(filename))

        # Only process H and P record types
        rectypes = ['H', 'P']

        state = None
        rectype = None
        table = None
        labels = dict()
        # Use a context manager so the file is closed even when one of the
        # RuntimeErrors below aborts the scan.
        with dopen(filename) as f:
            for line in f:
                line = line.strip()
                if line == INPUT and rectype is not None:
                    if state == INPUT:
                        raise RuntimeError(
                            "{}: INPUT within INPUT???".format(filename))
                    state = INPUT
                    table = Table(name=rectype)
                    continue
                if line == ';':
                    if state == INPUT:
                        if rectype:
                            schema.add_table(table)
                    state = None  # end of SAS statement
                    continue
                if line == LABEL:
                    state = LABEL
                    continue
                if state == INPUT:
                    IPUMS_SASParser.process_layout_line(table=table,
                                                        line=line)
                if state == LABEL:
                    m = label_re.search(line)
                    if m:
                        labels[m.group(1)] = m.group(2)

                # Any line may announce the record type for what follows.
                m = rectype_re.search(line)
                if m:
                    rectype = m.group(1)
                    if rectype not in rectypes:
                        raise RuntimeError(
                            "Record type '{}' not in list of approved rectypes ({})"
                            .format(rectype, str(rectypes)))

        # Now use the labels to set the description for all of the variables we have learned
        for table in schema.tables():
            for v in table.vars():
                if v.name in labels:
                    v.desc = labels[v.name]
Esempio n. 3
0
    def load_table_schema_from_census_txt_spec(self, *, filename, prefix=""):
        """Read a single table from a txt file and add it to this schema.

        The file is scanned line by line.  The first line matching
        TXT_TITLE_RE names the table, the first line matching
        TXT_VERSION_RE after that sets its version, and every line matching
        VARIABLE_RE adds a variable.  A variable whose name is already used
        in the table gets a numeric suffix (name2, name3, ...).

        If more than self.MAXLINES lines have been read and no table with
        at least one variable has been found, the file is treated as not
        being a Census text specification.

        :param filename: name of the specification; opened through dopen().
        :param prefix: accepted for interface compatibility; not used here.
        :return: None.  The table, when one with variables was found, is
                 registered through self.add_table().
        """
        table = None
        # Context manager so the file is closed on every exit path.
        with dopen(filename) as f:
            for (ll, line) in enumerate(f, 1):
                if ll > self.MAXLINES:
                    if (table is None) or len(table.vars()) == 0:
                        logging.info(
                            "{} is not a Census text specification".format(
                                filename))
                        return None

                # Get a table name if we do not have one
                if not table:
                    m = TXT_TITLE_RE.search(line)
                    if m:
                        table = Table(name=m.group(1))
                        table.add_comment("Parsed from {}".format(filename))
                        continue
                # Get the table version if we do not have one
                if table and not table.version:
                    m = TXT_VERSION_RE.search(line)
                    if m:
                        table.version = m.group(1)
                        continue
                # Is this a variable name within the table?
                m = VARIABLE_RE.search(line)
                if m:
                    if table is None:
                        # A variable-looking line before any table title
                        # has no table to belong to; skip it.
                        continue
                    (position, name, desc, vtype,
                     column) = m.group(1, 2, 3, 4, 5)
                    # Make the name unique within the table *before*
                    # building the variable, so the suffix is applied.
                    taken = {var.name for var in table.vars()}
                    oname = name
                    count = 2
                    while name in taken:
                        name = "{}{}".format(oname, count)
                        count += 1
                    v = Variable(position=position,
                                 name=name,
                                 desc=desc,
                                 vtype=vtype)
                    if "-" in column:
                        v.column = [int(x) for x in column.split("-")]
                    table.add_variable(v)
        # No title found, or a title without variables: nothing to add.
        if table is None or len(table.vars()) == 0:
            return None
        self.add_table(table)
Esempio n. 4
0
 def validate(self,fname):
     """
     Validate file fname with self.vo.

     This is a single-threaded validation routine.  Each non-comment line
     is parsed with self.vo (position-delimited or pipe-delimited,
     depending on self.position_delimited / self.pipe_delimited) and then
     checked with self.vo.validate().  Lines that fail to parse or to
     validate are reported on stdout, passed to self.badline() and counted
     in self.errors.  Processing stops once self.errors exceeds
     self.maxerrors.

     Sets self.records, self.errors, self.t0 and self.t1 as side effects.

     :param fname: name of the file to validate; opened through dopen().
     :return: True if no errors were found, False otherwise.
     """
     self.records = 0
     self.errors = 0
     self.t0 = time.time()
     with dopen(fname,"r") as f:
         for (lineno, line) in enumerate(f, 1):
             line = line.rstrip()
             # startswith() rather than line[0] so that a blank line does
             # not raise IndexError; blank lines go on to the parser.
             if line.startswith('#'):
                 continue    # comment line
             if self.errors > self.maxerrors:
                 print("too many errors",file=sys.stderr)
                 break
             self.records += 1
             try:
                 if self.position_delimited:
                     self.vo.parse_position_delimited(line)
                 elif self.pipe_delimited:
                     self.vo.parse_pipe_delimited(line)
             except ValueError as e:
                 print(f"{fname}:{lineno} will not parse: {e}")
                 self.badline(line)
                 self.errors += 1
                 continue
             # Validate the object that was just parsed into (self.vo).
             if not self.vo.validate():
                 print(f"{fname}:{lineno} {self.vo.validate_reason()}")
                 self.badline(line)
                 self.errors += 1
     self.t1 = time.time()
     if self.errors==0:
         print(f"{fname} validated {self.records} records in {int(self.t1-self.t0)} seconds")
     else:
         print(f"{fname} had at least {self.errors} errors")
     return self.errors==0
def infosas(path):
    """Load a schema from `path`, dump it, and terminate with status 0.

    :param path: location handed to dconfig.dopen() and to Schema.dump().
    :raises SystemExit: always, with exit status 0, after the dump.
    """
    s = Schema()
    print("Loading schema from {}".format(path))
    s.load_schema_from_file(dconfig.dopen(path))
    s.dump(path)
    # Raise SystemExit directly: the bare exit() helper is injected by the
    # site module for interactive use and is not guaranteed to exist.
    raise SystemExit(0)
Esempio n. 6
0
 def is_ipums_sas_file(filename):
     """Return True if `filename` looks like an IPUMS SAS file.

     At most the first 1 MiB of the file is examined.  The data must start
     with '/*', contain 'ipums_directory', and its last 100 characters
     (stripped of whitespace) must end with 'run;'.  For a file longer
     than 1 MiB the 'run;' check therefore applies to the end of the data
     that was read, not to the end of the file.

     :param filename: name of the file to test; opened through dopen().
     """
     # Context manager so the handle is closed as soon as the data is read.
     with dopen(filename) as f:
         data = f.read(1024 * 1024)  # read at most 1MiB
     return (data[0:2] == '/*'
             and 'ipums_directory' in data
             and data[-100:].strip().endswith("run;"))