Example #1
def prepareUniqueResults(clustered_dupes, fpath):
    cluster_membership = {}
    for (cluster_id, cluster) in enumerate(clustered_dupes):
        for record_id, score in zip(*cluster):
            cluster_membership[record_id] = cluster_id

    unique_rows = []
    with open('{0}-converted.csv'.format(fpath), 'rb') as f:
        reader = UnicodeCSVReader(f)
 
        rows = [reader.next()]
        seen_clusters = set()
        for row_id, row in enumerate(reader):
            if row_id in cluster_membership: 
                cluster_id = cluster_membership[row_id]
                if cluster_id not in seen_clusters:
                    rows.append(row)
                    seen_clusters.add(cluster_id)
            else:
                rows.append(row)
        for row in rows:
            d = OrderedDict()
            for k,v in zip(rows[0], row):
                d[k] = v
            unique_rows.append(d)
    return unique_rows
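
One way to exercise prepareUniqueResults, assuming the dedupe-style layout of clustered_dupes as (record_ids, scores) pairs; the cluster contents and the 'example' path below are invented for illustration:

# Hypothetical input: two clusters covering rows 0-3 of 'example-converted.csv'.
clustered_dupes = [
    ((0, 2), (0.97, 0.95)),  # rows 0 and 2 are duplicates of one another
    ((1, 3), (0.88, 0.90)),  # rows 1 and 3 are duplicates of one another
]

# Returns OrderedDicts keyed by the CSV header; the first entry maps the header to itself.
unique_rows = prepareUniqueResults(clustered_dupes, 'example')
for row in unique_rows:
    print row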
Example #2
def select_geo():
    if not session.get('file'):
        return redirect(url_for('views.index'))
    context = {}
    if request.method == 'POST':
        inp = StringIO(session['file'])
        reader = UnicodeCSVReader(inp)
        header = reader.next()
        fields = {}
        geo_type = None
        valid = True
        if not request.form:
            valid = False
            context['errors'] = [
                'Select a field that contains a geography type'
            ]
        if valid:
            for k, v in request.form.items():
                if k.startswith("geotype"):
                    geo_type = v
                    index = int(k.split('_')[1])
                    fields[header[index]] = {
                        'geo_type': v,
                        'column_index': index
                    }
            mancer_data = get_data_sources(geo_type)
            session.update({'fields': fields, 'mancer_data': mancer_data})
            return redirect(url_for('views.select_tables'))
    return render_template('select_geo.html', **context)
Example #3
def iter_column(idx, f):
    """

    :param idx: index of column
    :param f: gzip file object of CSV dataset
    :return: col_type, null_values
             where col_type is inferred type from typeinference.py
             and null_values is whether null values were found and normalized.
    """
    f.seek(0)
    reader = UnicodeCSVReader(f)

    # Discard the header
    reader.next()

    col = []
    for row in reader:
        if row:
            try:
                col.append(row[idx])
            except IndexError:
                # Bad data. Maybe we can fill with nulls?
                pass
    col_type, null_values = normalize_column_type(col)
    return col_type, null_values
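
The docstring above expects a gzip file object, so a hypothetical caller might look like the following (the path and column index are invented):

import gzip

with gzip.open('dataset.csv.gz', 'rb') as f:
    # Inspect the first column; iter_column rewinds the file itself via f.seek(0).
    col_type, has_nulls = iter_column(0, f)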
Example #4
def select_geo():
    if not session.get('file'):
        return redirect(url_for('views.index'))
    context = {}
    if request.method == 'POST':
        inp = StringIO(session['file'])
        reader = UnicodeCSVReader(inp)
        header = reader.next()
        fields = {}
        geo_type = None
        valid = True
        if not request.form:
            valid = False
            context['errors'] = ['Select a field that contains a geography type']
        if valid:
            for k,v in request.form.items():
                if k.startswith("geotype"):
                    geo_type = v
                    index = int(k.split('_')[1])
                    fields[header[index]] = {
                        'geo_type': v,
                        'column_index': index
                    }
            mancer_data = get_data_sources(geo_type)
            session.update({'fields': fields, 'mancer_data': mancer_data})
            return redirect(url_for('views.select_tables'))
    return render_template('select_geo.html', **context)
Example #5
def get_context_for_new_dataset(url):
    dataset_info = {}
    errors = []
    socrata_source = False
    if url:
        url = url.strip(' \t\n\r') # strip whitespace, tabs, etc
        four_by_four = re.findall(r'/([a-z0-9]{4}-[a-z0-9]{4})', url)
        errors = True
        if four_by_four:
            parsed = urlparse(url)
            host = 'https://%s' % parsed.netloc
            path = 'api/views'

            dataset_info, errors, status_code = get_socrata_data_info(host, path, four_by_four[-1])
            if not errors:
                socrata_source = True
                dataset_info['submitted_url'] = url
        if errors:
            errors = []
            try:
                r = requests.get(url, stream=True)
                status_code = r.status_code
            except requests.exceptions.InvalidURL:
                errors.append('Invalid URL')
            except requests.exceptions.ConnectionError:
                errors.append('URL can not be reached')
            if status_code != 200:
                errors.append('URL returns a %s status code' % status_code)
            if not errors:
                dataset_info['submitted_url'] = url
                dataset_info['name'] = urlparse(url).path.split('/')[-1]
                inp = StringIO()
                line_no = 0
                lines = []
                for line in r.iter_lines():
                    try:
                        inp.write(line + '\n')
                        line_no += 1
                        if line_no > 1000:
                            raise StopIteration
                    except StopIteration:
                        break
                inp.seek(0)
                reader = UnicodeCSVReader(inp)
                header = reader.next()
                col_types = []
                inp.seek(0)
                for col in range(len(header)):
                    col_types.append(iter_column(col, inp))
                dataset_info['columns'] = []
                for idx, col in enumerate(col_types):
                    d = {
                        'human_name': header[idx],
                        'data_type': col.__visit_name__.lower()
                    }
                    dataset_info['columns'].append(d)
    else:
        errors.append('Need a URL')
    #print "get_context_for_new_dataset(): returning ", dataset_info, errors, socrata_source
    return (dataset_info, errors, socrata_source)
Example #6
def scrape_by_election():
    id = 1
    blank = 0
    all_cands = []
    header = None
    last = False
    while not last:
        cand_info = fetch_data(id)
        if not cand_info \
            or 'Unexpected errors occurred trying to populate page.' in cand_info:
            blank += 1
            if blank > 20:
                last = True
        else:
            inp = StringIO(cand_info)
            reader = UnicodeCSVReader(inp)
            header = reader.next()
            all_cands.extend(list(reader))
            blank = 0
        id += 1
    all_cands.sort()
    no_dup_cands = []
    header.extend(['FullName', 'FullAddress'])
    for cand in all_cands:
        if cand not in no_dup_cands and cand != header:
            cand.insert(-2, '%s %s' % (cand[4], cand[3]))
            cand.insert(-1, '%s %s %s, %s %s' % \
                (cand[7], cand[8], cand[9], cand[10], cand[11]))
            no_dup_cands.append(cand)
    return header, no_dup_cands
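
The header and deduplicated candidate rows returned by scrape_by_election can be written out with the same UnicodeCSVWriter used elsewhere in these examples; the output filename here is made up:

header, cands = scrape_by_election()
with open('candidates.csv', 'wb') as outp:
    writer = UnicodeCSVWriter(outp)
    writer.writerow(header)
    writer.writerows(cands)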
Example #7
def select_geo():
    if not session.get('file'):
        return redirect(url_for('views.index'))
    context = {}
    if request.method == 'POST':
        inp = StringIO(session['file'])
        reader = UnicodeCSVReader(inp)
        header = reader.next()
        fields = {}
        valid = True
        geotype_val = None
        if not request.form:
            valid = False
            context['errors'] = [
                'Select a field that contains a geography type'
            ]
        else:
            geotypes = []
            indexes = []
            for k, v in request.form.items():
                if k.startswith("geotype"):
                    geotypes.append(v)
                    indexes.append(k.split('_')[1])
            if len(indexes) > 2:
                valid = False
                context['errors'] = [
                    'We can only merge geographic information from 2 columns'
                ]
            else:
                fields_key = ';'.join([header[int(i)] for i in indexes])
                geotype_val = ';'.join([g for g in geotypes])
                if not check_combos(geotype_val):
                    valid = False
                    types = [t.title() for t in geotype_val.split(';')]
                    context['errors'] = [
                        'The geographic combination of {0} and {1} does not work'
                        .format(*types)
                    ]
                else:
                    fields[fields_key] = {
                        'geo_type': geotype_val,
                        'column_index': ';'.join(indexes)
                    }

            # found_geo_type = get_geo_types(geo_type)[0]['info']
            # sample_list = session['sample_data'][index][2]
            # valid, message = found_geo_type.validate(sample_list)
            # context['errors'] = [message]
        if valid:
            try:
                geo_type = SENSICAL_TYPES[geotype_val]
            except KeyError:
                geo_type = geotype_val
            mancer_data, errors = get_data_sources(geo_type=geo_type)
            session['fields'] = fields
            session['mancer_data'] = mancer_data
            for error in errors:
                flash(error)
            return redirect(url_for('views.select_tables'))
    return render_template('select_geo.html', **context)
Example #8
 def _transform(self):
     reader = UnicodeCSVReader(self.station_raw_info)
     header = ['wban_code', 'station_name', 'country', 
               'state', 'call_sign', 'location', 'elevation', 
               'begin', 'end']
     reader.next()
     self.clean_station_info = StringIO()
     all_rows = []
     wbans = []
     for row in reader:
         if row[1] == '99999':
             continue
         elif row[1] in wbans:
             continue
         elif row[5] and row[6]:
             row.pop(0)
             row.pop(3)
             lat = row[5].replace('+', '')
             lon = row[6].replace('+', '')
             elev = row[7].replace('+', '')
             begin = parser.parse(row[8]).isoformat()
             end = parser.parse(row[9]).isoformat()
             row[5] = 'SRID=4326;POINT(%s %s)' % ((float(lon) / 1000), (float(lat) / 1000))
             row[6] = float(elev) / 10
             row[7] = begin
             row[8] = end
             row.pop()
             wbans.append(row[0])
             all_rows.append(row)
     writer = UnicodeCSVWriter(self.clean_station_info)
     writer.writerow(header)
     writer.writerows(all_rows)
     self.clean_station_info.seek(0)
Example #9
def iter_column(idx, f):
    """

    :param idx: index of column
    :param f: gzip file object of CSV dataset
    :return: col_type, null_values
             where col_type is inferred type from typeinference.py
             and null_values is whether null values were found and normalized.
    """
    f.seek(0)
    reader = UnicodeCSVReader(f)

    # Discard the header
    reader.next()

    col = []
    for row in reader:
        if row:
            try:
                col.append(row[idx])
            except IndexError:
                # Bad data. Maybe we can fill with nulls?
                pass
    col_type, null_values = normalize_column_type(col)
    return col_type, null_values
Example #10
 def _transform(self):
     reader = UnicodeCSVReader(self.station_raw_info)
     header = [
         'wban_code', 'station_name', 'country', 'state', 'call_sign',
         'location', 'elevation', 'begin', 'end'
     ]
     reader.next()
     self.clean_station_info = StringIO()
     all_rows = []
     wbans = []
     for row in reader:
         if row[1] == '99999':
             continue
         elif row[1] in wbans:
             continue
         elif row[5] and row[6]:
             row.pop(0)
             row.pop(3)
             lat = row[5].replace('+', '')
             lon = row[6].replace('+', '')
             elev = row[7].replace('+', '')
             begin = parser.parse(row[8]).isoformat()
             end = parser.parse(row[9]).isoformat()
             row[5] = 'SRID=4326;POINT(%s %s)' % ((float(lon) / 1000),
                                                  (float(lat) / 1000))
             row[6] = float(elev) / 10
             row[7] = begin
             row[8] = end
             row.pop()
             wbans.append(row[0])
             all_rows.append(row)
     writer = UnicodeCSVWriter(self.clean_station_info)
     writer.writerow(header)
     writer.writerows(all_rows)
     self.clean_station_info.seek(0)
Example #11
def clean(f):
    reader = UnicodeCSVReader(f)
    good = []
    bad = []
    header = reader.next()
    for row in reader:
        try:
            row[0] = int(row[0])
            row[3] = int(row[3])
            row[5] = int(row[5])
            row[7] = int(row[7])
            row[4] = row[4].replace(',', '')
            if len(row) == 12:
                good.append(row)
            else:
                bad.append(row)
        except (TypeError, ValueError):
            bad.append(row)
    goodf = open('data/trips_cleaned.csv', 'wb')
    badf = open('data/trips_dirty.csv', 'wb')
    goodwriter = UnicodeCSVWriter(goodf)
    goodwriter.writerow(header)
    goodwriter.writerows(good)
    badwriter = UnicodeCSVWriter(badf)
    badwriter.writerow(header)
    badwriter.writerows(bad)
    goodf.close()
    badf.close()
Example #12
 def test_dump_entity_map(self):
     with open(join(fixtures_path, 'csv_example_messy_input.csv'),
               'rb') as inp:
         with open(join('/tmp/{0}_raw.csv'.format(self.dd_sess.id)),
                   'wb') as outp:
             outp.write(inp.read())
     initializeSession(self.dd_sess.id)
     initializeModel(self.dd_sess.id)
     dedupeRaw(self.dd_sess.id)
     with self.app.test_request_context():
         self.login()
         with self.client as c:
             c.get('/mark-all-clusters/?session_id={0}'.format(
                 self.dd_sess.id))
             rv = c.get('/dump-entity-map/?session_id=' + self.dd_sess.id)
             row_count = ''' 
                 SELECT count(*) 
                 FROM "raw_{0}" AS r
                 JOIN "entity_{0}" AS e
                   ON r.record_id = e.record_id
                 WHERE e.clustered = TRUE
             '''.format(self.dd_sess.id)
             with self.engine.begin() as conn:
                 row_count = list(conn.execute(row_count))
             row_count = row_count[0][0]
             s = StringIO(rv.data)
             reader = UnicodeCSVReader(s)
             reader.next()
             assert len([r for r in list(reader) if r[0]]) == row_count
Example #13
def makeRawTable(contents):
    inp = StringIO(contents)
    reader = UnicodeCSVReader(inp)
    header = reader.next()
    header = [slugify(h) for h in header]
    outp = StringIO()
    writer = UnicodeCSVWriter(outp)
    writer.writerow(header)
    writer.writerows([[preProcess(unicode(i)) for i in r] for r in reader])
    outp.seek(0)
    conn = sqlite3.connect(':memory:')
    t = Table.from_csv(outp, 
                       name='raw_table', 
                       blanks_as_nulls=False, 
                       infer_types=False)
    sql_table = make_table(t)
    create_st = make_create_table_statement(sql_table)
    parts = create_st.split('raw_table (')
    create_st = '{0} raw_table ( record_id INTEGER PRIMARY KEY,{1}'.format(*parts)
    insert = sql_table.insert()
    curs = conn.cursor()
    curs.execute(create_st)
    rows = [dict(zip(header, row)) for row in t.to_rows()]
    for row in rows:
        curs.execute(str(insert), row)
    dump = StringIO()
    for line in conn.iterdump():
        dump.write(unidecode(line))
    dump.seek(0)
    return dump.getvalue(), header
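
Because conn.iterdump emits plain SQL, the dump string returned by makeRawTable can be replayed into a fresh database with sqlite3's executescript. A small sketch, assuming contents already holds the raw CSV text:

import sqlite3

dump_sql, header = makeRawTable(contents)
conn = sqlite3.connect(':memory:')
conn.executescript(dump_sql)
row_count = conn.execute('SELECT count(*) FROM raw_table').fetchone()[0]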
Example #14
def select_geo():
    if not session.get('file'):
        return redirect(url_for('views.index'))
    context = {}
    if request.method == 'POST':
        inp = StringIO(session['file'])
        reader = UnicodeCSVReader(inp)
        header = reader.next()
        fields = {}
        geo_type = None
        valid = True
        if not request.form:
            valid = False
            context['errors'] = ['Select a field that contains a geography type']
        else:
            for k,v in request.form.items():
                if k.startswith("geotype"):
                    geo_type = v
                    index = int(k.split('_')[1])
                    fields[header[index]] = {
                        'geo_type': v,
                        'column_index': index
                    }

            found_geo_type = get_geo_types(geo_type)[0]['info']
            sample_as_list = session['sample_data'][index][2].split(', ')
            valid = validate_geo_type(found_geo_type, sample_as_list)
            context['errors'] = ['The column you selected must be formatted like "%s" to match on %s geographies. Please pick another column or change the format of your data.' % (found_geo_type.formatting_example, found_geo_type.human_name)]
        if valid:
            mancer_data = get_data_sources(geo_type)
            session.update({'fields': fields, 'mancer_data': mancer_data})
            return redirect(url_for('views.select_tables'))
    return render_template('select_geo.html', **context)
Example #15
def upload():
    context = {}
    if request.method == 'POST':
        big_file = False
        try:
            files = request.files
        except RequestEntityTooLarge, e:
            files = None
            big_file = True
            current_app.logger.info(e)
        if files:
            f = files['input_file']
            if allowed_file(f.filename):
                inp = StringIO(f.read())
                file_format = convert.guess_format(f.filename)
                try:
                    converted = convert.convert(inp, file_format)
                except UnicodeDecodeError:
                    context['errors'] = [
                        'We had a problem with reading your file. \
                        This could have to do with the file encoding or format'
                    ]
                    converted = None
                f.seek(0)
                if converted:
                    outp = StringIO(converted)
                    reader = UnicodeCSVReader(outp)
                    session['header_row'] = reader.next()
                    rows = []
                    columns = [[] for c in session['header_row']]
                    column_ids = range(len(session['header_row']))
                    for row in range(100):
                        try:
                            rows.append(reader.next())
                        except StopIteration:
                            break
                    for i, row in enumerate(rows):
                        for j, d in enumerate(row):
                            columns[j].append(row[column_ids[j]])
                    sample_data = []
                    guesses = {}
                    for index, header_val in enumerate(session['header_row']):
                        guesses[index] = guess_geotype(header_val,
                                                       columns[index])
                        sample_data.append((index, header_val, columns[index]))
                    session['sample_data'] = sample_data
                    session['guesses'] = json.dumps(guesses)
                    outp.seek(0)
                    session['file'] = outp.getvalue()
                    session['filename'] = f.filename
                    return redirect(url_for('views.select_geo'))
            else:
                context['errors'] = [
                    'Only .xls or .xlsx and .csv files are allowed.'
                ]
        else:
            context['errors'] = ['You must provide a file to upload.']
            if big_file:
                context['errors'] = ['Uploaded file must be 10mb or less.']
Example #16
def do_the_work(file_contents, field_defs, filename):
    """
      field_defs looks like:
      {
        10: {
          'type': 'city_state', 
          'append_columns': ['total_population', 'median_age']
        }
      }

      file_contents is a string containing the contents of the uploaded file.
    """
    contents = StringIO(file_contents)
    reader = UnicodeCSVReader(contents)
    header = reader.next()
    result = None
    geo_ids = set()
    mancer_mapper = {}
    for mancer in MANCERS:
        m = import_class(mancer)()
        mancer_cols = [k['table_id'] for k in m.column_info()]
        for k, v in field_defs.items():
            field_cols = v['append_columns']
            for f in field_cols:
                if f in mancer_cols:
                    mancer_mapper[f] = {
                        'mancer': m,
                        'geo_id_map': {},
                        'geo_ids': set(),
                        'geo_type': v['type']
                    }
    for row_idx, row in enumerate(reader):
        col_idxs = [int(k) for k in field_defs.keys()]
        for idx in col_idxs:
            val = row[idx]
            geo_type = field_defs[idx]['type']
            for column in field_defs[idx]['append_columns']:
                mancer = mancer_mapper[column]['mancer']
                try:
                    if val:
                        geoid_search = mancer.geo_lookup(val,
                                                         geo_type=geo_type)
                    else:
                        continue
                except MancerError, e:
                    return 'Error message: %s, Body: %s' % (e.message, e.body)
                row_geoid = geoid_search['geoid']
                if row_geoid:
                    mancer_mapper[column]['geo_ids'].add(row_geoid)
                    try:
                        mancer_mapper[column]['geo_id_map'][row_geoid].append(
                            row_idx)
                    except KeyError:
                        mancer_mapper[column]['geo_id_map'][row_geoid] = [
                            row_idx
                        ]
Example #17
def do_the_work(file_contents, field_defs, filename):
    """
      field_defs looks like:
      {
        10: {
          'type': 'city_state', 
          'append_columns': ['total_population', 'median_age']
        }
      }

      or like this:

      {
        10;2: {
          'type': 'city;state', 
          'append_columns': ['total_population', 'median_age']
        }
      }

      where the semicolon separated values represent a multicolumn geography

      file_contents is a string containing the contents of the uploaded file.
    """
    contents = StringIO(file_contents)
    reader = UnicodeCSVReader(contents)
    header = reader.next()
    result = None
    geo_ids = set()
    mancer_mapper = {}
    fields_key = field_defs.keys()[0]
    errors = []

    geo_type, col_idxs, val_fmt = find_geo_type(field_defs[fields_key]['type'], 
                                       fields_key)
    geo_name = get_geo_types(geo_type=geo_type)[0][0]['info'].human_name
    for mancer in MANCERS:
        m = import_class(mancer)
        api_key = MANCER_KEYS.get(m.machine_name)
        try:
            m = m(api_key=api_key)
        except ImportError, e:
            errors.append(e.message)
            continue
        mancer_cols = [c['table_id'] for c in m.get_metadata()]
        for k, v in field_defs.items():
            field_cols = v['append_columns']
            for f in field_cols:
                if f in mancer_cols:
                    mancer_mapper[f] = {
                        'mancer': m,
                        'geo_id_map': {},
                        'geo_ids': set(),
                        'geo_type': geo_type,
                    }
Example #18
def upload():
    context = {}
    if request.method == 'POST':
        f = request.files['input_file']
        if f:
            if allowed_file(f.filename):
                inp = StringIO(f.read())
                if sys.getsizeof(inp.getvalue()) <= MAX_CONTENT_LENGTH:
                    inp.seek(0)
                    file_format = convert.guess_format(f.filename)
                    try:
                        converted = convert.convert(inp, file_format)
                    except UnicodeDecodeError:
                        context['errors'] = [
                            'We had a problem with reading your file. \
                            This could have to do with the file encoding or format'
                        ]
                        converted = None
                    f.seek(0)
                    if converted:
                        outp = StringIO(converted)
                        reader = UnicodeCSVReader(outp)
                        session['header_row'] = reader.next()
                        rows = []
                        columns = [[] for c in session['header_row']]
                        column_ids = range(len(session['header_row']))
                        for row in range(10):
                            try:
                                rows.append(reader.next())
                            except StopIteration:
                                break
                        for i, row in enumerate(rows):
                            for j, d in enumerate(row):
                                columns[j].append(row[column_ids[j]])
                        columns = [', '.join(c) for c in columns]
                        sample_data = []
                        for index, _ in enumerate(session['header_row']):
                            sample_data.append(
                                (index, session['header_row'][index],
                                 columns[index]))
                        session['sample_data'] = sample_data
                        outp.seek(0)
                        session['file'] = outp.getvalue()
                        session['filename'] = f.filename
                        return redirect(url_for('views.select_geo'))
                else:
                    context['errors'] = ['Uploaded file must be 10mb or less.']
            else:
                context['errors'] = [
                    'Only .xls or .xlsx and .csv files are allowed.'
                ]
        else:
            context['errors'] = ['You must provide a file to upload.']
    return render_template('upload.html', **context)
Example #19
def upload():
    context = {}
    if request.method == 'POST':
        big_file = False
        try:
            files = request.files
        except RequestEntityTooLarge, e:
            files = None
            big_file = True
            current_app.logger.info(e)
        if files:
            f = files['input_file']
            if allowed_file(f.filename):
                inp = StringIO(f.read())
                file_format = convert.guess_format(f.filename)
                try:
                    converted = convert.convert(inp, file_format)
                except UnicodeDecodeError:
                    context['errors'] = ['We had a problem with reading your file. \
                        This could have to do with the file encoding or format']
                    converted = None
                f.seek(0)
                if converted:
                    outp = StringIO(converted)
                    reader = UnicodeCSVReader(outp)
                    session['header_row'] = reader.next()
                    rows = []
                    columns = [[] for c in session['header_row']]
                    column_ids = range(len(session['header_row']))
                    for row in range(100):
                        try:
                            rows.append(reader.next())
                        except StopIteration:
                            break
                    for i, row in enumerate(rows):
                        for j,d in enumerate(row):
                            columns[j].append(row[column_ids[j]])
                    sample_data = []
                    guesses = {}
                    for index, header_val in enumerate(session['header_row']):
                        guesses[index] = guess_geotype(header_val, columns[index])
                        sample_data.append((index, header_val, columns[index]))
                    session['sample_data'] = sample_data
                    session['guesses'] = json.dumps(guesses)
                    outp.seek(0)
                    session['file'] = outp.getvalue()
                    session['filename'] = f.filename
                    return redirect(url_for('views.select_geo'))
            else:
                context['errors'] = ['Only .xls or .xlsx and .csv files are allowed.']
        else:
            context['errors'] = ['You must provide a file to upload.']
            if big_file:
                context['errors'] = ['Uploaded file must be 10mb or less.'] 
Example #20
    def _transform_daily(self,
                         raw_weather,
                         file_type,
                         start_line=0,
                         end_line=None):
        raw_weather.seek(0)
        reader = UnicodeCSVReader(raw_weather)
        header = reader.next()
        header = [x.strip() for x in header]

        self.clean_observations_daily = StringIO()
        writer = UnicodeCSVWriter(self.clean_observations_daily)
        out_header = [
            "wban_code", "date", "temp_max", "temp_min", "temp_avg",
            "departure_from_normal", "dewpoint_avg", "wetbulb_avg",
            "weather_types", "snowice_depth", "snowice_waterequiv", "snowfall",
            "precip_total", "station_pressure", "sealevel_pressure",
            "resultant_windspeed", "resultant_winddirection",
            "resultant_winddirection_cardinal", "avg_windspeed",
            "max5_windspeed", "max5_winddirection",
            "max5_winddirection_cardinal", "max2_windspeed",
            "max2_winddirection", "max2_winddirection_cardinal"
        ]
        writer.writerow(out_header)

        row_count = 0
        for row in reader:
            self.current_row = row
            if (row_count % 100 == 0):
                if (self.debug == True):
                    self.debug_outfile.write(
                        "\rdaily parsing: row_count=%06d" % row_count)
                    self.debug_outfile.flush()

            if (start_line > row_count):
                row_count += 1
                continue
            if ((end_line is not None) and (row_count > end_line)):
                break

            row_count += 1
            #print len(header)
            #print len(row)
            #print zip(header,row)

            if (len(row) == 0):
                continue

            row_vals = getattr(self, '_parse_%s_row_daily' % file_type)(row,
                                                                        header)

            writer.writerow(row_vals)
        return self.clean_observations_daily
Example #21
def select_geo():
    if not session.get('file'):
        return redirect(url_for('views.index'))
    context = {}
    if request.method == 'POST':
        inp = StringIO(session['file'])
        reader = UnicodeCSVReader(inp)
        header = reader.next()
        fields = {}
        valid = True
        geotype_val = None
        if not request.form:
            valid = False
            context['errors'] = ['Select a field that contains a geography type']
        else:
            geotypes = []
            indexes = []
            for k,v in request.form.items():
                if k.startswith("geotype"):
                    geotypes.append(v)
                    indexes.append(k.split('_')[1])
            if len(indexes) > 2:
                valid = False
                context['errors'] = ['We can only merge geographic information from 2 columns']
            else:
                fields_key = ';'.join([header[int(i)] for i in indexes])
                geotype_val = ';'.join([g for g in geotypes])
                if not check_combos(geotype_val):
                    valid = False
                    types = [t.title() for t in geotype_val.split(';')]
                    context['errors'] = ['The geographic combination of {0} and {1} does not work'.format(*types)]
                else:
                    fields[fields_key] = {
                        'geo_type': geotype_val,
                        'column_index': ';'.join(indexes)
                    }

            # found_geo_type = get_geo_types(geo_type)[0]['info']
            # sample_list = session['sample_data'][index][2]
            # valid, message = found_geo_type.validate(sample_list)
            # context['errors'] = [message]
        if valid:
            try:
                geo_type = SENSICAL_TYPES[geotype_val]
            except KeyError:
                geo_type = geotype_val
            mancer_data, errors = get_data_sources(geo_type=geo_type)
            session['fields'] = fields
            session['mancer_data'] = mancer_data
            for error in errors:
                flash(error)
            return redirect(url_for('views.select_tables'))
    return render_template('select_geo.html', **context)
Example #22
def do_the_work(file_contents, field_defs, filename):
    """
      field_defs looks like:
      {
        10: {
          'type': 'city_state', 
          'append_columns': ['total_population', 'median_age']
        }
      }

      file_contents is a string containing the contents of the uploaded file.
    """
    contents = StringIO(file_contents)
    reader = UnicodeCSVReader(contents)
    header = reader.next()
    result = None
    geo_ids = set()
    mancer_mapper = {}
    for mancer in MANCERS:
        m = import_class(mancer)()
        mancer_cols = [k['table_id'] for k in m.column_info()]
        for k, v in field_defs.items():
            field_cols = v['append_columns']
            for f in field_cols:
                if f in mancer_cols:
                    mancer_mapper[f] = {
                        'mancer': m,
                        'geo_id_map': {},
                        'geo_ids': set(),
                        'geo_type': v['type']
                    }
    for row_idx, row in enumerate(reader):
        col_idxs = [int(k) for k in field_defs.keys()]
        for idx in col_idxs:
            val = row[idx]
            geo_type = field_defs[idx]['type']
            for column in field_defs[idx]['append_columns']:
                mancer = mancer_mapper[column]['mancer']
                try:
                    if val:
                        geoid_search = mancer.geo_lookup(val, geo_type=geo_type)
                    else:
                        continue
                except MancerError, e:
                    return 'Error message: %s, Body: %s' % (e.message, e.body)
                row_geoid = geoid_search['geoid']
                if row_geoid:
                    mancer_mapper[column]['geo_ids'].add(row_geoid)
                    try:
                        mancer_mapper[column]['geo_id_map'][row_geoid].append(row_idx)
                    except KeyError:
                        mancer_mapper[column]['geo_id_map'][row_geoid] = [row_idx]
Example #23
    def _from_inference(f):
        """
        Generate columns by scanning source CSV and inferring column types.
        """
        reader = UnicodeCSVReader(f)
        # Always create columns with slugified names
        header = map(slugify, reader.next())

        cols = []
        for col_idx, col_name in enumerate(header):
            col_type, nullable = iter_column(col_idx, f)
            cols.append(_make_col(col_name, col_type, nullable))
        return cols
Example #24
    def _from_inference(f):
        """
        Generate columns by scanning source CSV and inferring column types.
        """
        reader = UnicodeCSVReader(f)
        # Always create columns with slugified names
        header = map(slugify, reader.next())

        cols = []
        for col_idx, col_name in enumerate(header):
            col_type, nullable = iter_column(col_idx, f)
            cols.append(_make_col(col_name, col_type, nullable))
        return cols
Example #25
def iter_column(idx, f):
    f.seek(0)
    reader = UnicodeCSVReader(f)
    header = reader.next()
    col = []
    for row in reader:
        if row:
            try:
                col.append(row[idx])
            except IndexError:
                # Bad data. Maybe we can fill with nulls?
                pass
    col_type, null_values = normalize_column_type(col)
    return col_type, null_values
Example #26
    def _transform_hourly(self,
                          raw_weather,
                          file_type,
                          start_line=0,
                          end_line=None):
        raw_weather.seek(0)
        reader = UnicodeCSVReader(raw_weather)
        header = reader.next()
        # strip leading and trailing whitespace from header (e.g. from tarfiles)
        header = [x.strip() for x in header]

        self.clean_observations_hourly = StringIO()
        writer = UnicodeCSVWriter(self.clean_observations_hourly)
        out_header = ["wban_code","datetime","old_station_type","station_type", \
                      "sky_condition","sky_condition_top","visibility",\
                      "weather_types","drybulb_fahrenheit","wetbulb_fahrenheit",\
                      "dewpoint_fahrenheit","relative_humidity",\
                      "wind_speed","wind_direction","wind_direction_cardinal",\
                      "station_pressure","sealevel_pressure","report_type",\
                      "hourly_precip"]
        writer.writerow(out_header)

        row_count = 0
        for row in reader:
            if (row_count % 1000 == 0):
                if (self.debug == True):
                    self.debug_outfile.write("\rparsing: row_count=%06d" %
                                             row_count)
                    self.debug_outfile.flush()

            if (start_line > row_count):
                row_count += 1
                continue
            if ((end_line is not None) and (row_count > end_line)):
                break

            row_count += 1

            if (len(row) == 0):
                continue

            # this calls either self._parse_zipfile_row_hourly
            # or self._parse_tarfile_row_hourly
            row_vals = getattr(self,
                               '_parse_%s_row_hourly' % file_type)(row, header)
            if (not row_vals):
                continue

            writer.writerow(row_vals)
        return self.clean_observations_hourly
Example #27
def iter_column(idx, f):
    f.seek(0)
    reader = UnicodeCSVReader(f)
    header = reader.next()
    col = []
    for row in reader:
        if row:
            try:
                col.append(row[idx])
            except IndexError:
                # Bad data. Maybe we can fill with nulls?
                pass
    col_type, null_values = normalize_column_type(col)
    return col_type, null_values
Example #28
def infer_csv_columns(inp):
    """

    :param inp: File handle to a CSV dataset
                that we can throw into a UnicodeCSVReader
    :return: List of `ColumnInfo`s
    """
    reader = UnicodeCSVReader(inp)
    header = reader.next()
    inp.seek(0)
    iter_output = [iter_column(col_idx, inp)
                   for col_idx in range(len(header))]

    return [ColumnInfo(name, type_, has_nulls)
            for name, (type_, has_nulls) in zip(header, iter_output)]
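
A hypothetical invocation, assuming ColumnInfo is a simple record of (name, type_, has_nulls) as the list comprehension above suggests, and that the CSV lives at data.csv:

with open('data.csv', 'rb') as inp:
    for column in infer_csv_columns(inp):
        print column.name, column.type_, column.has_nulls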
Example #29
def infer_csv_columns(inp):
    """

    :param inp: File handle to a CSV dataset
                that we can throw into a UnicodeCSVReader
    :return: List of `ColumnInfo`s
    """
    reader = UnicodeCSVReader(inp)
    header = reader.next()
    inp.seek(0)
    iter_output = [iter_column(col_idx, inp)
                   for col_idx in range(len(header))]

    return [ColumnInfo(name, type_, has_nulls)
            for name, (type_, has_nulls) in zip(header, iter_output)]
Example #30
def upload():
    context = {}
    if request.method == 'POST':
        f = request.files['input_file']
        if f:
            if allowed_file(f.filename):
                inp = StringIO(f.read())
                if sys.getsizeof(inp.getvalue()) <= MAX_CONTENT_LENGTH:
                    inp.seek(0)
                    file_format = convert.guess_format(f.filename)
                    try:
                        converted = convert.convert(inp, file_format)
                    except UnicodeDecodeError:
                        context['errors'] = ['We had a problem with reading your file. \
                            This could have to do with the file encoding or format']
                        converted = None
                    f.seek(0)
                    if converted:
                        outp = StringIO(converted)
                        reader = UnicodeCSVReader(outp)
                        session['header_row'] = reader.next()
                        rows = []
                        columns = [[] for c in session['header_row']]
                        column_ids = range(len(session['header_row']))
                        for row in range(10):
                            try:
                                rows.append(reader.next())
                            except StopIteration:
                                break
                        for i, row in enumerate(rows):
                            for j,d in enumerate(row):
                                columns[j].append(row[column_ids[j]])
                        columns = [', '.join(c) for c in columns]
                        sample_data = []
                        for index,_ in enumerate(session['header_row']):
                            sample_data.append((index, session['header_row'][index], columns[index]))
                        session['sample_data'] = sample_data
                        outp.seek(0)
                        session['file'] = outp.getvalue()
                        session['filename'] = f.filename
                        return redirect(url_for('views.select_geo'))
                else:
                    context['errors'] = ['Uploaded file must be 10mb or less.']
            else:
                context['errors'] = ['Only .xls or .xlsx and .csv files are allowed.']
        else:
            context['errors'] = ['You must provide a file to upload.']
    return render_template('upload.html', **context)
Example #31
    def _transform_daily(self, raw_weather, file_type, start_line=0, end_line=None):
        raw_weather.seek(0)
        reader = UnicodeCSVReader(raw_weather)
        header = reader.next()
        header = [x.strip() for x in header]

        self.clean_observations_daily = StringIO()
        writer = UnicodeCSVWriter(self.clean_observations_daily)
        out_header = ["wban_code","date","temp_max","temp_min",
                      "temp_avg","departure_from_normal",
                      "dewpoint_avg", "wetbulb_avg","weather_types",
                      "snowice_depth", "snowice_waterequiv",
                      "snowfall","precip_total", "station_pressure",
                      "sealevel_pressure", 
                      "resultant_windspeed", "resultant_winddirection", "resultant_winddirection_cardinal",
                      "avg_windspeed",
                      "max5_windspeed", "max5_winddirection","max5_winddirection_cardinal",
                      "max2_windspeed", "max2_winddirection","max2_winddirection_cardinal"]
        writer.writerow(out_header)

        row_count = 0
        for row in reader:
            self.current_row = row
            if (row_count % 100 == 0):
                if (self.debug == True):
                    self.debug_outfile.write("\rdaily parsing: row_count=%06d" % row_count)
                    self.debug_outfile.flush()

            if (start_line > row_count):
                row_count +=1
                continue
            if ((end_line is not None) and (row_count > end_line)):
                break

            row_count += 1
            #print len(header)
            #print len(row)
            #print zip(header,row)

            if (len(row) == 0):
                continue

            row_vals = getattr(self, '_parse_%s_row_daily' % file_type)(row, header)

            writer.writerow(row_vals)
        return self.clean_observations_daily
Example #32
    def _transform_hourly(self, raw_weather, file_type, start_line=0, end_line=None):
        raw_weather.seek(0)
        reader = UnicodeCSVReader(raw_weather)
        header = reader.next()
        # strip leading and trailing whitespace from header (e.g. from tarfiles)
        header = [x.strip() for x in header]

        self.clean_observations_hourly = StringIO()
        writer = UnicodeCSVWriter(self.clean_observations_hourly)
        out_header = ["wban_code","datetime","old_station_type","station_type", \
                      "sky_condition","sky_condition_top","visibility",\
                      "weather_types","drybulb_fahrenheit","wetbulb_fahrenheit",\
                      "dewpoint_fahrenheit","relative_humidity",\
                      "wind_speed","wind_direction","wind_direction_cardinal",\
                      "station_pressure","sealevel_pressure","report_type",\
                      "hourly_precip"]
        writer.writerow(out_header)

        row_count = 0
        for row in reader:
            if (row_count % 1000 == 0):
                if (self.debug==True):
                    self.debug_outfile.write( "\rparsing: row_count=%06d" % row_count)
                    self.debug_outfile.flush()

            if (start_line > row_count):
                row_count +=1
                continue
            if ((end_line is not None) and (row_count > end_line)):
                break

            row_count += 1

            if (len(row) == 0):
                continue

            # this calls either self._parse_zipfile_row_hourly
            # or self._parse_tarfile_row_hourly
            row_vals = getattr(self, '_parse_%s_row_hourly' % file_type)(row, header)
            if (not row_vals):
                continue

            writer.writerow(row_vals)
        return self.clean_observations_hourly
Example #33
def select_geo():
    if not session.get('file'):
        return redirect(url_for('views.index'))
    context = {}
    if request.method == 'POST':
        inp = StringIO(session['file'])
        reader = UnicodeCSVReader(inp)
        header = reader.next()
        fields = {}
        geo_type = None
        valid = True
        if not request.form:
            valid = False
            context['errors'] = [
                'Select a field that contains a geography type'
            ]
        else:
            for k, v in request.form.items():
                if k.startswith("geotype"):
                    geo_type = v
                    index = int(k.split('_')[1])
                    fields[header[index]] = {
                        'geo_type': v,
                        'column_index': index
                    }

            found_geo_type = get_geo_types(geo_type)[0]['info']
            sample_as_list = session['sample_data'][index][2].split(', ')
            valid = validate_geo_type(found_geo_type, sample_as_list)
            context['errors'] = [
                'The column you selected must be formatted like "%s" to match on %s geographies. Please pick another column or change the format of your data.'
                %
                (found_geo_type.formatting_example, found_geo_type.human_name)
            ]
        if valid:
            mancer_data = get_data_sources(geo_type)
            session.update({'fields': fields, 'mancer_data': mancer_data})
            return redirect(url_for('views.select_tables'))
    return render_template('select_geo.html', **context)
Example #34
def prepareResults(clustered_dupes, fpath):
    """ 
    Prepare deduplicated file for writing to various formats with
    duplicates clustered. 
    """
    cluster_membership = {}
    cluster_id = None
    for cluster_id, cluster in enumerate(clustered_dupes):
        for record_id, score in zip(*cluster):
            cluster_membership[record_id] = cluster_id
    
    unique_record_id = cluster_id + 1
    clustered_rows = []
    with open('{0}-converted.csv'.format(fpath), 'rb') as f:
        reader = UnicodeCSVReader(f)
 
        heading_row = reader.next()
        heading_row.insert(0, 'Group ID')
 
        rows = []
 
        for row_id, row in enumerate(reader):
            if row_id in cluster_membership:
                cluster_id = cluster_membership[row_id]
            else:
                cluster_id = unique_record_id
                unique_record_id += 1
            row.insert(0, cluster_id)
            rows.append(row)
        rows = sorted(rows, key=itemgetter(0))
        rows.insert(0, heading_row)
        for row in rows:
            d = OrderedDict()
            for k,v in zip(heading_row, row):
                d[k] = v
            clustered_rows.append(d)
    return unique_record_id, clustered_rows
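
The OrderedDicts returned by prepareResults already carry 'Group ID' as the first key, so they can be written back out with the standard library's csv.DictWriter. A minimal sketch; the output filename is invented, and the first entry of clustered_rows (the header mapped to itself) is skipped:

import csv

unique_record_id, clustered_rows = prepareResults(clustered_dupes, fpath)
with open('{0}-deduped.csv'.format(fpath), 'wb') as out:
    writer = csv.DictWriter(out, fieldnames=list(clustered_rows[0].keys()))
    writer.writeheader()
    writer.writerows(clustered_rows[1:])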
Example #35
def add_dataset():
    dataset_info = {}
    errors = []
    socrata_source = False
    if request.method == 'POST':
        url = request.form.get('dataset_url')
        if url:
            four_by_four = re.findall(r'/([a-z0-9]{4}-[a-z0-9]{4})', url)
            errors = True
            if four_by_four:
                parsed = urlparse(url)
                host = 'https://%s' % parsed.netloc
                path = 'api/views'
                view_url = '%s/%s/%s' % (host, path, four_by_four[-1])
                dataset_info, errors, status_code = get_socrata_data_info(
                    view_url)
                if not errors:
                    socrata_source = True
                    dataset_info['submitted_url'] = url
            if errors:
                errors = []
                try:
                    r = requests.get(url, stream=True)
                    status_code = r.status_code
                except requests.exceptions.InvalidURL:
                    errors.append('Invalid URL')
                except requests.exceptions.ConnectionError:
                    errors.append('URL can not be reached')
                if status_code != 200:
                    errors.append('URL returns a %s status code' % status_code)
                if not errors:
                    dataset_info['submitted_url'] = url
                    dataset_info['name'] = urlparse(url).path.split('/')[-1]
                    inp = StringIO()
                    line_no = 0
                    lines = []
                    for line in r.iter_lines():
                        try:
                            inp.write(line + '\n')
                            line_no += 1
                            if line_no > 1000:
                                raise StopIteration
                        except StopIteration:
                            break
                    inp.seek(0)
                    reader = UnicodeCSVReader(inp)
                    header = reader.next()
                    col_types = []
                    inp.seek(0)
                    for col in range(len(header)):
                        col_types.append(iter_column(col, inp))
                    dataset_info['columns'] = []
                    for idx, col in enumerate(col_types):
                        d = {
                            'human_name': header[idx],
                            'data_type': col.__visit_name__.lower()
                        }
                        dataset_info['columns'].append(d)
        else:
            errors.append('Need a URL')
    context = {
        'dataset_info': dataset_info,
        'errors': errors,
        'socrata_source': socrata_source
    }
    return render_template('add-dataset.html', **context)
Example #36
if FILENAME == 'FAKE':
    for geography in collection.find({}, fields=['geoid', 'xwalk']):
        if 'xwalk' not in geography:
            geography['xwalk'] = {} 

        geography['xwalk'][geography['geoid']] = {
            'POPPCT00': 1.0,
            'HUPCT00': 1.0
        }

        collection.update({ '_id': objectid.ObjectId(geography['_id']) }, { '$set': { 'xwalk': geography['xwalk'] } }, safe=True) 
        row_count += 1
        inserts += 1
else:
    with open(FILENAME) as f:
        rows = UnicodeCSVReader(f)
        headers = rows.next()

        for row in rows:
            row_count += 1
            row_dict = dict(zip(headers, row))

            if row_dict['STATE10'] != STATE_FIPS:
                continue
            
            geography = collection.find_one({ 'geoid': row_dict['GEOID10'] }, fields=['xwalk'])

            if not geography:
                continue

            pop_pct_2000 = float(row_dict['POPPCT00']) / 100
Example #37
            row[7] = res[0]
            row[6] = res[1]
        yield row

def make_db(fname, tblname):
    conn = sqlite3.connect(':memory:')
    t = Table.from_csv(open(fname, 'rb'), name=tblname)
    sql_table = make_table(t)
    create_st = make_create_table_statement(sql_table)
    print create_st
    insert = sql_table.insert()
    curs = conn.cursor()
    curs.execute(create_st)
    headers = t.headers()
    print headers
    rows = [dict(zip(headers, row)) for row in t.to_rows()]
    for row in rows:
        curs.execute(str(insert), row)
    return curs

if __name__ == '__main__':
    curs = make_db('macoupin-budget-update/moucoupin-budget-department-desc.csv', 'description')
    outp = open('macoupin-budget-update/macoupin-budget-2014-update.csv', 'wb')
    writer = UnicodeCSVWriter(outp)
    with open('macoupin-budget-update/macoupin-budget.csv', 'rb') as f:
        reader = UnicodeCSVReader(f)
        headers = reader.next()
        headers.insert(1, 'Fund ID')
        writer.writerow(headers)
        writer.writerows(add_attrs(reader, curs))
Example #38
def get_context_for_new_dataset(url):
    dataset_info = {}
    errors = []
    socrata_source = False
    if url:
        url = url.strip(' \t\n\r')  # strip whitespace, tabs, etc
        four_by_four = re.findall(r'/([a-z0-9]{4}-[a-z0-9]{4})', url)
        errors = True
        if four_by_four:
            parsed = urlparse(url)
            host = 'https://%s' % parsed.netloc
            path = 'api/views'

            dataset_info, errors, status_code = get_socrata_data_info(
                host, path, four_by_four[-1])
            if not errors:
                socrata_source = True
                dataset_info['submitted_url'] = url
        if errors:
            errors = []
            try:
                r = requests.get(url, stream=True)
                status_code = r.status_code
            except requests.exceptions.InvalidURL:
                errors.append('Invalid URL')
            except requests.exceptions.ConnectionError:
                errors.append('URL can not be reached')
            if status_code != 200:
                errors.append('URL returns a %s status code' % status_code)
            if not errors:
                dataset_info['submitted_url'] = url
                dataset_info['name'] = urlparse(url).path.split('/')[-1]
                inp = StringIO()
                line_no = 0
                lines = []
                for line in r.iter_lines():
                    try:
                        inp.write(line + '\n')
                        line_no += 1
                        if line_no > 1000:
                            raise StopIteration
                    except StopIteration:
                        break
                inp.seek(0)
                reader = UnicodeCSVReader(inp)
                header = reader.next()
                col_types = []
                inp.seek(0)
                for col in range(len(header)):
                    col_types.append(iter_column(col, inp)[0])
                dataset_info['columns'] = []
                for idx, col in enumerate(col_types):
                    d = {
                        'human_name': header[idx],
                        'data_type': col.__visit_name__.lower()
                    }
                    dataset_info['columns'].append(d)
    else:
        errors.append('Need a URL')
    #print "get_context_for_new_dataset(): returning ", dataset_info, errors, socrata_source
    return (dataset_info, errors, socrata_source)
Example #39
        'indent': indent,
        'labels': row[3:9],
        'continuation': continuation
    }


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit(
            'You must provide the filename of a CSV as an argument to this script.'
        )

    FILENAME = sys.argv[1]

    with open(FILENAME) as f:
        rows = UnicodeCSVReader(f, encoding='latin-1')
        headers = rows.next()

        inserts = 0
        row_count = 0
        skipped = 0

        table = None
        tables = {}
        hierarchy = []
        last_key = ''
        last_indent = 0

        for row in rows:
            row_count += 1
            if not row: continue
Example #40
def make_db(fname, tblname):
    conn = sqlite3.connect(':memory:')
    t = Table.from_csv(open(fname, 'rb'), name=tblname)
    sql_table = make_table(t)
    create_st = make_create_table_statement(sql_table)
    print create_st
    insert = sql_table.insert()
    curs = conn.cursor()
    curs.execute(create_st)
    headers = t.headers()
    print headers
    rows = [dict(zip(headers, row)) for row in t.to_rows()]
    for row in rows:
        curs.execute(str(insert), row)
    return curs


if __name__ == '__main__':
    curs = make_db(
        'macoupin-budget-update/moucoupin-budget-department-desc.csv',
        'description')
    outp = open('macoupin-budget-update/macoupin-budget-2014-update.csv', 'wb')
    writer = UnicodeCSVWriter(outp)
    with open('macoupin-budget-update/macoupin-budget.csv', 'rb') as f:
        reader = UnicodeCSVReader(f)
        headers = reader.next()
        headers.insert(1, 'Fund ID')
        writer.writerow(headers)
        writer.writerows(add_attrs(reader, curs))
Example #41
    return { 
        'table_id': table_id,
        'line': line,
        'indent': indent,
        'labels': row[3:9],
        'continuation': continuation
    }

if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('You must provide the filename of a CSV as an argument to this script.')

    FILENAME = sys.argv[1]

    with open(FILENAME) as f:
        rows = UnicodeCSVReader(f, encoding='latin-1')
        headers = rows.next()

        inserts = 0
        row_count = 0
        skipped = 0

        table = None 
        tables = {}
        hierarchy = []
        last_key = ''
        last_indent = 0

        for row in rows:
            row_count += 1
            if not row: continue