def prepareUniqueResults(clustered_dupes, fpath):
    cluster_membership = {}
    for cluster_id, cluster in enumerate(clustered_dupes):
        for record_id, score in zip(*cluster):
            cluster_membership[record_id] = cluster_id
    unique_rows = []
    with open('{0}-converted.csv'.format(fpath), 'rb') as f:
        reader = UnicodeCSVReader(f)
        rows = [reader.next()]
        seen_clusters = set()
        for row_id, row in enumerate(reader):
            if row_id in cluster_membership:
                cluster_id = cluster_membership[row_id]
                if cluster_id not in seen_clusters:
                    rows.append(row)
                    seen_clusters.add(cluster_id)
            else:
                rows.append(row)
    for row in rows:
        d = OrderedDict()
        for k, v in zip(rows[0], row):
            d[k] = v
        unique_rows.append(d)
    return unique_rows

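# A minimal, hedged sketch of the input shape prepareUniqueResults expects,
# assuming clusters in the dedupe library's format: each cluster is a pair of
# (record_ids, confidence_scores), which is what makes zip(*cluster) work
# above. The 'example-converted.csv' path and the cluster values here are
# hypothetical.
fake_clusters = [
    ((0, 2), (0.97, 0.97)),  # rows 0 and 2 are duplicates of each other
    ((1, 3), (0.88, 0.88)),  # rows 1 and 3 are duplicates of each other
]
for record in prepareUniqueResults(fake_clusters, 'example'):
    print record
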
def select_geo():
    if not session.get('file'):
        return redirect(url_for('views.index'))
    context = {}
    if request.method == 'POST':
        inp = StringIO(session['file'])
        reader = UnicodeCSVReader(inp)
        header = reader.next()
        fields = {}
        geo_type = None
        valid = True
        if not request.form:
            valid = False
            context['errors'] = ['Select a field that contains a geography type']
        if valid:
            for k, v in request.form.items():
                if k.startswith('geotype'):
                    geo_type = v
                    index = int(k.split('_')[1])
                    fields[header[index]] = {
                        'geo_type': v,
                        'column_index': index,
                    }
            mancer_data = get_data_sources(geo_type)
            session.update({'fields': fields, 'mancer_data': mancer_data})
            return redirect(url_for('views.select_tables'))
    return render_template('select_geo.html', **context)

def iter_column(idx, f):
    """
    :param idx: index of column
    :param f: gzip file object of CSV dataset
    :return: col_type, null_values where col_type is the inferred type from
        typeinference.py and null_values is whether null values were found
        and normalized.
    """
    f.seek(0)
    reader = UnicodeCSVReader(f)
    # Discard the header
    reader.next()
    col = []
    for row in reader:
        if row:
            try:
                col.append(row[idx])
            except IndexError:
                # Bad data. Maybe we can fill with nulls?
                pass
    col_type, null_values = normalize_column_type(col)
    return col_type, null_values

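# A short usage sketch for iter_column, assuming normalize_column_type comes
# from this project's typeinference module. An in-memory StringIO stands in
# for the gzip file object named in the docstring; the sample rows are made up.
from StringIO import StringIO

sample = StringIO('id,name\n1,Alice\n2,\n')
col_type, null_values = iter_column(1, sample)  # inspect the "name" column
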
def get_context_for_new_dataset(url):
    dataset_info = {}
    errors = []
    socrata_source = False
    if url:
        url = url.strip(' \t\n\r')  # strip whitespace, tabs, etc.
        four_by_four = re.findall(r'/([a-z0-9]{4}-[a-z0-9]{4})', url)
        errors = True
        if four_by_four:
            parsed = urlparse(url)
            host = 'https://%s' % parsed.netloc
            path = 'api/views'
            dataset_info, errors, status_code = get_socrata_data_info(
                host, path, four_by_four[-1])
            if not errors:
                socrata_source = True
                dataset_info['submitted_url'] = url
        if errors:
            errors = []
            status_code = None
            try:
                r = requests.get(url, stream=True)
                status_code = r.status_code
            except requests.exceptions.InvalidURL:
                errors.append('Invalid URL')
            except requests.exceptions.ConnectionError:
                errors.append('URL can not be reached')
            if status_code is not None and status_code != 200:
                errors.append('URL returns a %s status code' % status_code)
            if not errors:
                dataset_info['submitted_url'] = url
                dataset_info['name'] = urlparse(url).path.split('/')[-1]
                inp = StringIO()
                line_no = 0
                for line in r.iter_lines():
                    inp.write(line + '\n')
                    line_no += 1
                    if line_no > 1000:
                        break
                inp.seek(0)
                reader = UnicodeCSVReader(inp)
                header = reader.next()
                col_types = []
                inp.seek(0)
                for col in range(len(header)):
                    # iter_column returns (col_type, null_values); we only
                    # need the type here
                    col_types.append(iter_column(col, inp)[0])
                dataset_info['columns'] = []
                for idx, col in enumerate(col_types):
                    d = {
                        'human_name': header[idx],
                        'data_type': col.__visit_name__.lower()
                    }
                    dataset_info['columns'].append(d)
    else:
        errors.append('Need a URL')
    return dataset_info, errors, socrata_source

def scrape_by_election():
    id = 1
    blank = 0
    all_cands = []
    header = None
    last = False
    while not last:
        cand_info = fetch_data(id)
        if not cand_info \
                or 'Unexpected errors occurred trying to populate page.' in cand_info:
            blank += 1
            if blank > 20:
                last = True
        else:
            inp = StringIO(cand_info)
            reader = UnicodeCSVReader(inp)
            header = reader.next()
            all_cands.extend(list(reader))
            blank = 0
        id += 1
    all_cands.sort()
    no_dup_cands = []
    header.extend(['FullName', 'FullAddress'])
    for cand in all_cands:
        if cand not in no_dup_cands and cand != header:
            cand.insert(-2, '%s %s' % (cand[4], cand[3]))
            cand.insert(-1, '%s %s %s, %s %s' % (
                cand[7], cand[8], cand[9], cand[10], cand[11]))
            no_dup_cands.append(cand)
    return header, no_dup_cands

def select_geo():
    if not session.get('file'):
        return redirect(url_for('views.index'))
    context = {}
    if request.method == 'POST':
        inp = StringIO(session['file'])
        reader = UnicodeCSVReader(inp)
        header = reader.next()
        fields = {}
        valid = True
        geotype_val = None
        if not request.form:
            valid = False
            context['errors'] = ['Select a field that contains a geography type']
        else:
            geotypes = []
            indexes = []
            for k, v in request.form.items():
                if k.startswith('geotype'):
                    geotypes.append(v)
                    indexes.append(k.split('_')[1])
            if len(indexes) > 2:
                valid = False
                context['errors'] = [
                    'We can only merge geographic information from 2 columns'
                ]
            else:
                fields_key = ';'.join([header[int(i)] for i in indexes])
                geotype_val = ';'.join(geotypes)
                if not check_combos(geotype_val):
                    valid = False
                    types = [t.title() for t in geotype_val.split(';')]
                    context['errors'] = [
                        'The geographic combination of {0} and {1} does not work'
                        .format(*types)
                    ]
                else:
                    fields[fields_key] = {
                        'geo_type': geotype_val,
                        'column_index': ';'.join(indexes),
                    }
        if valid:
            try:
                geo_type = SENSICAL_TYPES[geotype_val]
            except KeyError:
                geo_type = geotype_val
            mancer_data, errors = get_data_sources(geo_type=geo_type)
            session['fields'] = fields
            session['mancer_data'] = mancer_data
            for error in errors:
                flash(error)
            return redirect(url_for('views.select_tables'))
    return render_template('select_geo.html', **context)

def _transform(self):
    reader = UnicodeCSVReader(self.station_raw_info)
    header = ['wban_code', 'station_name', 'country', 'state',
              'call_sign', 'location', 'elevation', 'begin', 'end']
    reader.next()
    self.clean_station_info = StringIO()
    all_rows = []
    wbans = []
    for row in reader:
        if row[1] == '99999':
            continue
        elif row[1] in wbans:
            continue
        elif row[5] and row[6]:
            row.pop(0)
            row.pop(3)
            lat = row[5].replace('+', '')
            lon = row[6].replace('+', '')
            elev = row[7].replace('+', '')
            begin = parser.parse(row[8]).isoformat()
            end = parser.parse(row[9]).isoformat()
            row[5] = 'SRID=4326;POINT(%s %s)' % (
                (float(lon) / 1000), (float(lat) / 1000))
            row[6] = float(elev) / 10
            row[7] = begin
            row[8] = end
            row.pop()
            wbans.append(row[0])
            all_rows.append(row)
    writer = UnicodeCSVWriter(self.clean_station_info)
    writer.writerow(header)
    writer.writerows(all_rows)
    self.clean_station_info.seek(0)

def clean(f):
    reader = UnicodeCSVReader(f)
    good = []
    bad = []
    header = reader.next()
    for row in reader:
        try:
            row[0] = int(row[0])
            row[3] = int(row[3])
            row[5] = int(row[5])
            row[7] = int(row[7])
            row[4] = row[4].replace(',', '')
            if len(row) == 12:
                good.append(row)
            else:
                bad.append(row)
        except (TypeError, ValueError):
            bad.append(row)
    goodf = open('data/trips_cleaned.csv', 'wb')
    badf = open('data/trips_dirty.csv', 'wb')
    goodwriter = UnicodeCSVWriter(goodf)
    goodwriter.writerow(header)
    goodwriter.writerows(good)
    badwriter = UnicodeCSVWriter(badf)
    badwriter.writerow(header)
    badwriter.writerows(bad)
    goodf.close()
    badf.close()

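# Hypothetical invocation of clean(): the input path is an assumption, but the
# function itself hard-codes where the good and bad rows land.
with open('data/trips_raw.csv', 'rb') as f:
    clean(f)
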
def test_dump_entity_map(self):
    with open(join(fixtures_path, 'csv_example_messy_input.csv'), 'rb') as inp:
        with open(join('/tmp/{0}_raw.csv'.format(self.dd_sess.id)), 'wb') as outp:
            outp.write(inp.read())
    initializeSession(self.dd_sess.id)
    initializeModel(self.dd_sess.id)
    dedupeRaw(self.dd_sess.id)
    with self.app.test_request_context():
        self.login()
        with self.client as c:
            c.get('/mark-all-clusters/?session_id={0}'.format(self.dd_sess.id))
            rv = c.get('/dump-entity-map/?session_id=' + self.dd_sess.id)
            row_count = '''
                SELECT count(*)
                FROM "raw_{0}" AS r
                JOIN "entity_{0}" AS e
                  ON r.record_id = e.record_id
                WHERE e.clustered = TRUE
            '''.format(self.dd_sess.id)
            with self.engine.begin() as conn:
                row_count = list(conn.execute(row_count))
            row_count = row_count[0][0]
            s = StringIO(rv.data)
            reader = UnicodeCSVReader(s)
            reader.next()
            assert len([r for r in list(reader) if r[0]]) == row_count

def makeRawTable(contents):
    inp = StringIO(contents)
    reader = UnicodeCSVReader(inp)
    header = reader.next()
    header = [slugify(h) for h in header]
    outp = StringIO()
    writer = UnicodeCSVWriter(outp)
    writer.writerow(header)
    writer.writerows([[preProcess(unicode(i)) for i in r] for r in reader])
    outp.seek(0)
    conn = sqlite3.connect(':memory:')
    t = Table.from_csv(outp, name='raw_table',
                       blanks_as_nulls=False, infer_types=False)
    sql_table = make_table(t)
    create_st = make_create_table_statement(sql_table)
    parts = create_st.split('raw_table (')
    create_st = '{0} raw_table ( record_id INTEGER PRIMARY KEY,{1}'.format(*parts)
    insert = sql_table.insert()
    curs = conn.cursor()
    curs.execute(create_st)
    rows = [dict(zip(header, row)) for row in t.to_rows()]
    for row in rows:
        curs.execute(str(insert), row)
    dump = StringIO()
    for line in conn.iterdump():
        dump.write(unidecode(line))
    dump.seek(0)
    return dump.getvalue(), header

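# A hedged example of makeRawTable: contents is raw CSV text, and the returned
# dump is a SQL script (from sqlite's iterdump) that recreates raw_table,
# including the added record_id primary key, in any SQLite database. The
# sample CSV is made up.
csv_text = 'Name,Phone\nAlice,555-1234\nBob,555-5678\n'
dump_sql, header = makeRawTable(csv_text)
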
def select_geo():
    if not session.get('file'):
        return redirect(url_for('views.index'))
    context = {}
    if request.method == 'POST':
        inp = StringIO(session['file'])
        reader = UnicodeCSVReader(inp)
        header = reader.next()
        fields = {}
        geo_type = None
        valid = True
        if not request.form:
            valid = False
            context['errors'] = ['Select a field that contains a geography type']
        else:
            for k, v in request.form.items():
                if k.startswith('geotype'):
                    geo_type = v
                    index = int(k.split('_')[1])
                    fields[header[index]] = {
                        'geo_type': v,
                        'column_index': index,
                    }
                    found_geo_type = get_geo_types(geo_type)[0]['info']
                    sample_as_list = session['sample_data'][index][2].split(', ')
                    valid = validate_geo_type(found_geo_type, sample_as_list)
                    if not valid:
                        context['errors'] = [
                            'The column you selected must be formatted like "%s" '
                            'to match on %s geographies. Please pick another '
                            'column or change the format of your data.' % (
                                found_geo_type.formatting_example,
                                found_geo_type.human_name)
                        ]
        if valid:
            mancer_data = get_data_sources(geo_type)
            session.update({'fields': fields, 'mancer_data': mancer_data})
            return redirect(url_for('views.select_tables'))
    return render_template('select_geo.html', **context)

def upload():
    context = {}
    if request.method == 'POST':
        big_file = False
        try:
            files = request.files
        except RequestEntityTooLarge, e:
            files = None
            big_file = True
            current_app.logger.info(e)
        if files:
            f = files['input_file']
            if allowed_file(f.filename):
                inp = StringIO(f.read())
                file_format = convert.guess_format(f.filename)
                try:
                    converted = convert.convert(inp, file_format)
                except UnicodeDecodeError:
                    context['errors'] = [
                        'We had a problem with reading your file. '
                        'This could have to do with the file encoding or format'
                    ]
                    converted = None
                f.seek(0)
                if converted:
                    outp = StringIO(converted)
                    reader = UnicodeCSVReader(outp)
                    session['header_row'] = reader.next()
                    rows = []
                    columns = [[] for c in session['header_row']]
                    column_ids = range(len(session['header_row']))
                    for row in range(100):
                        try:
                            rows.append(reader.next())
                        except StopIteration:
                            break
                    for i, row in enumerate(rows):
                        for j, d in enumerate(row):
                            columns[j].append(row[column_ids[j]])
                    sample_data = []
                    guesses = {}
                    for index, header_val in enumerate(session['header_row']):
                        guesses[index] = guess_geotype(header_val, columns[index])
                        sample_data.append((index, header_val, columns[index]))
                    session['sample_data'] = sample_data
                    session['guesses'] = json.dumps(guesses)
                    outp.seek(0)
                    session['file'] = outp.getvalue()
                    session['filename'] = f.filename
                    return redirect(url_for('views.select_geo'))
            else:
                context['errors'] = ['Only .xls or .xlsx and .csv files are allowed.']
        else:
            context['errors'] = ['You must provide a file to upload.']
        if big_file:
            context['errors'] = ['Uploaded file must be 10mb or less.']
    return render_template('upload.html', **context)

def do_the_work(file_contents, field_defs, filename):
    """
    field_defs looks like:
    {
        10: {
            'type': 'city_state',
            'append_columns': ['total_population', 'median_age']
        }
    }

    file_contents is a string containing the contents of the uploaded file.
    """
    contents = StringIO(file_contents)
    reader = UnicodeCSVReader(contents)
    header = reader.next()
    result = None
    geo_ids = set()
    mancer_mapper = {}
    for mancer in MANCERS:
        m = import_class(mancer)()
        mancer_cols = [k['table_id'] for k in m.column_info()]
        for k, v in field_defs.items():
            field_cols = v['append_columns']
            for f in field_cols:
                if f in mancer_cols:
                    mancer_mapper[f] = {
                        'mancer': m,
                        'geo_id_map': {},
                        'geo_ids': set(),
                        'geo_type': v['type'],
                    }
    for row_idx, row in enumerate(reader):
        col_idxs = [int(k) for k in field_defs.keys()]
        for idx in col_idxs:
            val = row[idx]
            geo_type = field_defs[idx]['type']
            for column in field_defs[idx]['append_columns']:
                mancer = mancer_mapper[column]['mancer']
                try:
                    if val:
                        geoid_search = mancer.geo_lookup(val, geo_type=geo_type)
                    else:
                        continue
                except MancerError, e:
                    return 'Error message: %s, Body: %s' % (e.message, e.body)
                row_geoid = geoid_search['geoid']
                if row_geoid:
                    mancer_mapper[column]['geo_ids'].add(row_geoid)
                    try:
                        mancer_mapper[column]['geo_id_map'][row_geoid].append(row_idx)
                    except KeyError:
                        mancer_mapper[column]['geo_id_map'][row_geoid] = [row_idx]

def do_the_work(file_contents, field_defs, filename):
    """
    field_defs looks like:
    {
        10: {
            'type': 'city_state',
            'append_columns': ['total_population', 'median_age']
        }
    }

    or like this:

    {
        10;2: {
            'type': 'city;state',
            'append_columns': ['total_population', 'median_age']
        }
    }

    where the semicolon-separated values represent a multicolumn geography.

    file_contents is a string containing the contents of the uploaded file.
    """
    contents = StringIO(file_contents)
    reader = UnicodeCSVReader(contents)
    header = reader.next()
    result = None
    geo_ids = set()
    mancer_mapper = {}
    fields_key = field_defs.keys()[0]
    errors = []
    geo_type, col_idxs, val_fmt = find_geo_type(
        field_defs[fields_key]['type'], fields_key)
    geo_name = get_geo_types(geo_type=geo_type)[0][0]['info'].human_name
    for mancer in MANCERS:
        m = import_class(mancer)
        api_key = MANCER_KEYS.get(m.machine_name)
        try:
            m = m(api_key=api_key)
        except ImportError, e:
            errors.append(e.message)
            continue
        mancer_cols = [c['table_id'] for c in m.get_metadata()]
        for k, v in field_defs.items():
            field_cols = v['append_columns']
            for f in field_cols:
                if f in mancer_cols:
                    mancer_mapper[f] = {
                        'mancer': m,
                        'geo_id_map': {},
                        'geo_ids': set(),
                        'geo_type': geo_type,
                    }

def upload():
    context = {}
    if request.method == 'POST':
        f = request.files['input_file']
        if f:
            if allowed_file(f.filename):
                inp = StringIO(f.read())
                if sys.getsizeof(inp.getvalue()) <= MAX_CONTENT_LENGTH:
                    inp.seek(0)
                    file_format = convert.guess_format(f.filename)
                    try:
                        converted = convert.convert(inp, file_format)
                    except UnicodeDecodeError:
                        context['errors'] = [
                            'We had a problem with reading your file. '
                            'This could have to do with the file encoding or format'
                        ]
                        converted = None
                    f.seek(0)
                    if converted:
                        outp = StringIO(converted)
                        reader = UnicodeCSVReader(outp)
                        session['header_row'] = reader.next()
                        rows = []
                        columns = [[] for c in session['header_row']]
                        column_ids = range(len(session['header_row']))
                        for row in range(10):
                            try:
                                rows.append(reader.next())
                            except StopIteration:
                                break
                        for i, row in enumerate(rows):
                            for j, d in enumerate(row):
                                columns[j].append(row[column_ids[j]])
                        columns = [', '.join(c) for c in columns]
                        sample_data = []
                        for index, _ in enumerate(session['header_row']):
                            sample_data.append(
                                (index, session['header_row'][index], columns[index]))
                        session['sample_data'] = sample_data
                        outp.seek(0)
                        session['file'] = outp.getvalue()
                        session['filename'] = f.filename
                        return redirect(url_for('views.select_geo'))
                else:
                    context['errors'] = ['Uploaded file must be 10mb or less.']
            else:
                context['errors'] = ['Only .xls or .xlsx and .csv files are allowed.']
        else:
            context['errors'] = ['You must provide a file to upload.']
    return render_template('upload.html', **context)

def _transform_daily(self, raw_weather, file_type, start_line=0, end_line=None):
    raw_weather.seek(0)
    reader = UnicodeCSVReader(raw_weather)
    header = reader.next()
    header = [x.strip() for x in header]
    self.clean_observations_daily = StringIO()
    writer = UnicodeCSVWriter(self.clean_observations_daily)
    out_header = [
        "wban_code", "date", "temp_max", "temp_min", "temp_avg",
        "departure_from_normal", "dewpoint_avg", "wetbulb_avg",
        "weather_types", "snowice_depth", "snowice_waterequiv",
        "snowfall", "precip_total", "station_pressure",
        "sealevel_pressure", "resultant_windspeed",
        "resultant_winddirection", "resultant_winddirection_cardinal",
        "avg_windspeed", "max5_windspeed", "max5_winddirection",
        "max5_winddirection_cardinal", "max2_windspeed",
        "max2_winddirection", "max2_winddirection_cardinal"
    ]
    writer.writerow(out_header)
    row_count = 0
    for row in reader:
        self.current_row = row
        if row_count % 100 == 0:
            if self.debug:
                self.debug_outfile.write(
                    "\rdaily parsing: row_count=%06d" % row_count)
                self.debug_outfile.flush()
        if start_line > row_count:
            row_count += 1
            continue
        if end_line is not None and row_count > end_line:
            break
        row_count += 1
        if len(row) == 0:
            continue
        row_vals = getattr(self, '_parse_%s_row_daily' % file_type)(row, header)
        writer.writerow(row_vals)
    return self.clean_observations_daily

def _from_inference(f):
    """
    Generate columns by scanning source CSV and inferring column types.
    """
    reader = UnicodeCSVReader(f)
    # Always create columns with slugified names
    header = map(slugify, reader.next())
    cols = []
    for col_idx, col_name in enumerate(header):
        col_type, nullable = iter_column(col_idx, f)
        cols.append(_make_col(col_name, col_type, nullable))
    return cols

def _transform_hourly(self, raw_weather, file_type, start_line=0, end_line=None):
    raw_weather.seek(0)
    reader = UnicodeCSVReader(raw_weather)
    header = reader.next()
    # strip leading and trailing whitespace from header (e.g. from tarfiles)
    header = [x.strip() for x in header]
    self.clean_observations_hourly = StringIO()
    writer = UnicodeCSVWriter(self.clean_observations_hourly)
    out_header = [
        "wban_code", "datetime", "old_station_type", "station_type",
        "sky_condition", "sky_condition_top", "visibility",
        "weather_types", "drybulb_fahrenheit", "wetbulb_fahrenheit",
        "dewpoint_fahrenheit", "relative_humidity",
        "wind_speed", "wind_direction", "wind_direction_cardinal",
        "station_pressure", "sealevel_pressure", "report_type",
        "hourly_precip"
    ]
    writer.writerow(out_header)
    row_count = 0
    for row in reader:
        if row_count % 1000 == 0:
            if self.debug:
                self.debug_outfile.write("\rparsing: row_count=%06d" % row_count)
                self.debug_outfile.flush()
        if start_line > row_count:
            row_count += 1
            continue
        if end_line is not None and row_count > end_line:
            break
        row_count += 1
        if len(row) == 0:
            continue
        # this calls either self._parse_zipfile_row_hourly
        # or self._parse_tarfile_row_hourly
        row_vals = getattr(self, '_parse_%s_row_hourly' % file_type)(row, header)
        if not row_vals:
            continue
        writer.writerow(row_vals)
    return self.clean_observations_hourly

def infer_csv_columns(inp):
    """
    :param inp: File handle to a CSV dataset that we can throw into a
        UnicodeCSVReader
    :return: List of `ColumnInfo`s
    """
    reader = UnicodeCSVReader(inp)
    header = reader.next()
    inp.seek(0)
    iter_output = [iter_column(col_idx, inp) for col_idx in range(len(header))]
    return [ColumnInfo(name, type_, has_nulls)
            for name, (type_, has_nulls) in zip(header, iter_output)]

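# Hedged usage sketch for infer_csv_columns, assuming ColumnInfo and
# iter_column from this module are in scope; the sample dataset is invented.
from StringIO import StringIO

inp = StringIO('city,population\nChicago,2695598\nAurora,197899\n')
for column_info in infer_csv_columns(inp):
    print column_info
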
def prepareResults(clustered_dupes, fpath):
    """
    Prepare deduplicated file for writing to various formats with
    duplicates clustered.
    """
    cluster_membership = {}
    cluster_id = None
    for cluster_id, cluster in enumerate(clustered_dupes):
        for record_id, score in zip(*cluster):
            cluster_membership[record_id] = cluster_id
    unique_record_id = cluster_id + 1
    clustered_rows = []
    with open('{0}-converted.csv'.format(fpath), 'rb') as f:
        reader = UnicodeCSVReader(f)
        heading_row = reader.next()
        heading_row.insert(0, 'Group ID')
        rows = []
        for row_id, row in enumerate(reader):
            if row_id in cluster_membership:
                cluster_id = cluster_membership[row_id]
            else:
                cluster_id = unique_record_id
                unique_record_id += 1
            row.insert(0, cluster_id)
            rows.append(row)
        rows = sorted(rows, key=itemgetter(0))
        rows.insert(0, heading_row)
        for row in rows:
            d = OrderedDict()
            for k, v in zip(heading_row, row):
                d[k] = v
            clustered_rows.append(d)
    return unique_record_id, clustered_rows

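# Sketch of consuming prepareResults output, under the assumption that
# my_clusters follows the dedupe cluster format shown earlier. Each clustered
# row is an OrderedDict keyed by the augmented heading row, so the rows drop
# straight into csv.DictWriter; the first dict maps each heading to itself and
# therefore doubles as the header row. The file names are hypothetical.
import csv

unique_record_id, clustered_rows = prepareResults(my_clusters, 'example')
with open('example-deduped.csv', 'wb') as outp:
    writer = csv.DictWriter(outp, fieldnames=clustered_rows[0].keys())
    writer.writerows(clustered_rows)
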
def add_dataset():
    dataset_info = {}
    errors = []
    socrata_source = False
    if request.method == 'POST':
        url = request.form.get('dataset_url')
        if url:
            four_by_four = re.findall(r'/([a-z0-9]{4}-[a-z0-9]{4})', url)
            errors = True
            if four_by_four:
                parsed = urlparse(url)
                host = 'https://%s' % parsed.netloc
                path = 'api/views'
                view_url = '%s/%s/%s' % (host, path, four_by_four[-1])
                dataset_info, errors, status_code = get_socrata_data_info(view_url)
                if not errors:
                    socrata_source = True
                    dataset_info['submitted_url'] = url
            if errors:
                errors = []
                status_code = None
                try:
                    r = requests.get(url, stream=True)
                    status_code = r.status_code
                except requests.exceptions.InvalidURL:
                    errors.append('Invalid URL')
                except requests.exceptions.ConnectionError:
                    errors.append('URL can not be reached')
                if status_code is not None and status_code != 200:
                    errors.append('URL returns a %s status code' % status_code)
                if not errors:
                    dataset_info['submitted_url'] = url
                    dataset_info['name'] = urlparse(url).path.split('/')[-1]
                    inp = StringIO()
                    line_no = 0
                    for line in r.iter_lines():
                        inp.write(line + '\n')
                        line_no += 1
                        if line_no > 1000:
                            break
                    inp.seek(0)
                    reader = UnicodeCSVReader(inp)
                    header = reader.next()
                    col_types = []
                    inp.seek(0)
                    for col in range(len(header)):
                        # iter_column returns (col_type, null_values); we only
                        # need the type here
                        col_types.append(iter_column(col, inp)[0])
                    dataset_info['columns'] = []
                    for idx, col in enumerate(col_types):
                        d = {
                            'human_name': header[idx],
                            'data_type': col.__visit_name__.lower()
                        }
                        dataset_info['columns'].append(d)
        else:
            errors.append('Need a URL')
    context = {
        'dataset_info': dataset_info,
        'errors': errors,
        'socrata_source': socrata_source
    }
    return render_template('add-dataset.html', **context)

if FILENAME == 'FAKE':
    for geography in collection.find({}, fields=['geoid', 'xwalk']):
        if 'xwalk' not in geography:
            geography['xwalk'] = {}
        geography['xwalk'][geography['geoid']] = {
            'POPPCT00': 1.0,
            'HUPCT00': 1.0
        }
        collection.update(
            {'_id': objectid.ObjectId(geography['_id'])},
            {'$set': {'xwalk': geography['xwalk']}},
            safe=True)
        row_count += 1
        inserts += 1
else:
    with open(FILENAME) as f:
        rows = UnicodeCSVReader(f)
        headers = rows.next()
        for row in rows:
            row_count += 1
            row_dict = dict(zip(headers, row))
            if row_dict['STATE10'] != STATE_FIPS:
                continue
            geography = collection.find_one(
                {'geoid': row_dict['GEOID10']},
                fields=['xwalk'])
            if not geography:
                continue
            pop_pct_2000 = float(row_dict['POPPCT00']) / 100

        row[7] = res[0]
        row[6] = res[1]
        yield row


def make_db(fname, tblname):
    conn = sqlite3.connect(':memory:')
    t = Table.from_csv(open(fname, 'rb'), name=tblname)
    sql_table = make_table(t)
    create_st = make_create_table_statement(sql_table)
    print create_st
    insert = sql_table.insert()
    curs = conn.cursor()
    curs.execute(create_st)
    headers = t.headers()
    print headers
    rows = [dict(zip(headers, row)) for row in t.to_rows()]
    for row in rows:
        curs.execute(str(insert), row)
    return curs


if __name__ == '__main__':
    curs = make_db('macoupin-budget-update/moucoupin-budget-department-desc.csv',
                   'description')
    outp = open('macoupin-budget-update/macoupin-budget-2014-update.csv', 'wb')
    writer = UnicodeCSVWriter(outp)
    with open('macoupin-budget-update/macoupin-budget.csv', 'rb') as f:
        reader = UnicodeCSVReader(f)
        headers = reader.next()
        headers.insert(1, 'Fund ID')
        writer.writerow(headers)
        writer.writerows(add_attrs(reader, curs))

    return {
        'table_id': table_id,
        'line': line,
        'indent': indent,
        'labels': row[3:9],
        'continuation': continuation
    }


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('You must provide the filename of a CSV as an argument to this script.')
    FILENAME = sys.argv[1]
    with open(FILENAME) as f:
        rows = UnicodeCSVReader(f, encoding='latin-1')
        headers = rows.next()
        inserts = 0
        row_count = 0
        skipped = 0
        table = None
        tables = {}
        hierarchy = []
        last_key = ''
        last_indent = 0
        for row in rows:
            row_count += 1
            if not row:
                continue