Example #1
0
File: dmd.py Project: glfharris/dmd
 def __init__(self, path):
     """Initialise an instance from the path of a file.

     The file name (not the full path) must fully match
     ``DMD.file_pattern``. Capture group 1 is stored as ``self.version``
     and capture group 2 is passed through ``parsedate`` and stored as
     ``self.publish_date``. The new instance is appended to the
     class-level ``DMD._instances`` list.

     Args:
         path: Anything accepted by ``pathlib.Path``.

     Raises:
         AssertionError: If the file name does not match
             ``DMD.file_pattern``.
     """
     self.path = Path(path)
     # Per-file directory under the class-level local directory, named
     # after the file name without its final suffix.
     self.dir = DMD._local_directory / self.path.stem
     match = DMD.file_pattern.fullmatch(self.path.name)
     if not match:
         # Raised explicitly instead of via ``assert`` so the check is not
         # stripped under ``python -O``; AssertionError is kept so callers
         # see the same exception type as before.
         raise AssertionError(
             "file name %r does not match DMD.file_pattern" % self.path.name
         )
     self.version = match.group(1)
     self.publish_date = parsedate(match.group(2))
     DMD._instances.append(self)
Example #2
0
def analyze_records(reader, fiscal_year, datefield, fields):
    """Compute per-month, per-field summary statistics for one fiscal year.

    The fiscal year runs October through September: months 10-12 belong to
    calendar year ``fiscal_year - 1`` and months 1-9 to ``fiscal_year``.

    Records with a blank date are skipped with a message on stderr; records
    dated outside the fiscal year are skipped with a message on stdout.

    Args:
        reader: Iterable of records supporting ``record[key]`` lookup for
            ``datefield`` and for every name in ``fields``.
        fiscal_year: Integer fiscal year.
        datefield: Key of the date value in each record; it is parsed with
            ``parsedate`` using ``settings.DATE_FORMATS``.
        fields: Re-iterable collection of record keys to analyse.

    Returns:
        A dict keyed by the first day of each fiscal month. Each value is a
        dict keyed by field name whose value is a dict with the keys
        ``field_name``, ``value_count``, ``value_sum``, ``mean``,
        ``median``, ``skew`` and ``digits``. Months without accepted
        observations are still present; their statistics are computed over
        an empty array.
    """
    fy_months = [date(fiscal_year - (1 if month >= 10 else 0), month, 1)
                 for month in range(1, 13)]
    # Loop-invariant bounds, used only in the "out of range" message.
    fy_begin = min(fy_months)
    fy_end = max(fy_months)

    # Every (month, field) pair needs its OWN list. dict.fromkeys(fields, [])
    # would hand the same list object to every field of a month and pool
    # their values together.
    observations = {month: {field: [] for field in fields}
                    for month in fy_months}
    digits = {month: {field: [] for field in fields}
              for month in fy_months}

    for record in reader:
        dtstr = record[datefield]
        if dtstr is None or not dtstr.strip():
            # Written with sys.stderr.write / print(...) so the function
            # parses under both Python 2 and Python 3.
            sys.stderr.write("Skipping record with blank date field.\n")
            continue
        # NOTE(review): parsedate is assumed to return an object exposing
        # .year and .month -- confirm against its definition.
        dt = parsedate(dtstr, settings.DATE_FORMATS)
        dt1 = date(dt.year, dt.month, 1)
        if dt1 not in observations:
            print("Skipping %s-%s because it's not in %s-%s - %s-%s" % (
                dt1.year, dt1.month,
                fy_begin.year, fy_begin.month,
                fy_end.year, fy_end.month))
            continue

        month_obs = observations[dt1]
        month_digs = digits[dt1]
        for field in fields:
            (value, digit) = benford_filter(record[field])
            if value is not None:
                month_obs[field].append(value)
            if digit is not None:
                month_digs[field].append(digit)

    results = {}
    for month in fy_months:
        month_results = {}
        for field in fields:
            obs = observations[month][field]
            obs_array = numpy.array(obs, dtype=float)
            # A fresh dict per field: a shared one (dict.fromkeys(fields, {}))
            # would leave every field showing the last field's statistics.
            month_results[field] = {
                'field_name': field,
                'value_count': len(obs),
                'value_sum': numpy.sum(obs_array),
                'mean': numpy.mean(obs_array),
                'median': numpy.median(obs_array),
                'skew': stats.skew(obs_array),
                'digits': benford_difference(digits[month][field]),
            }
        results[month] = month_results

    return results