def makeRawTable(contents):
    """Load raw CSV text into an in-memory SQLite table and dump it back out.

    The CSV header is slugified, every cell is run through preProcess, and
    the data is loaded into a table named ``raw_table`` with an added
    ``record_id INTEGER PRIMARY KEY`` column.

    Returns a tuple ``(dump_sql, header)`` where ``dump_sql`` is the
    ASCII-transliterated output of ``iterdump`` and ``header`` is the list
    of slugified column names.
    """
    source = StringIO(contents)
    reader = UnicodeCSVReader(source)
    header = [slugify(col) for col in reader.next()]
    # Re-write the CSV with the cleaned header and pre-processed cells.
    cleaned = StringIO()
    writer = UnicodeCSVWriter(cleaned)
    writer.writerow(header)
    writer.writerows([[preProcess(unicode(cell)) for cell in row] for row in reader])
    cleaned.seek(0)
    conn = sqlite3.connect(':memory:')
    t = Table.from_csv(cleaned, name='raw_table', blanks_as_nulls=False, infer_types=False)
    sql_table = make_table(t)
    create_st = make_create_table_statement(sql_table)
    # Splice a surrogate primary key into the generated CREATE TABLE.
    parts = create_st.split('raw_table (')
    create_st = '{0} raw_table ( record_id INTEGER PRIMARY KEY,{1}'.format(*parts)
    insert = sql_table.insert()
    curs = conn.cursor()
    curs.execute(create_st)
    for row in t.to_rows():
        curs.execute(str(insert), dict(zip(header, row)))
    dump = StringIO()
    for line in conn.iterdump():
        dump.write(unidecode(line))
    dump.seek(0)
    return dump.getvalue(), header
def clean(f):
    """Split trip rows read from the open file *f* into clean and dirty CSVs.

    A row is "good" when columns 0, 3, 5 and 7 parse as integers and the row
    has exactly 12 columns; column 4 additionally has thousands separators
    stripped. Good rows go to data/trips_cleaned.csv, everything else to
    data/trips_dirty.csv (both written with the original header).
    """
    reader = UnicodeCSVReader(f)
    good = []
    bad = []
    header = reader.next()
    for row in reader:
        try:
            row[0] = int(row[0])
            row[3] = int(row[3])
            row[5] = int(row[5])
            row[7] = int(row[7])
            row[4] = row[4].replace(',', '')
            if len(row) == 12:
                good.append(row)
            else:
                bad.append(row)
        # IndexError added: rows shorter than 8 columns previously escaped
        # this handler and crashed the whole run; they are dirty rows too.
        except (TypeError, ValueError, IndexError):
            bad.append(row)
    # Context managers guarantee both outputs are flushed and closed even if
    # a write fails part-way through.
    with open('data/trips_cleaned.csv', 'wb') as goodf:
        goodwriter = UnicodeCSVWriter(goodf)
        goodwriter.writerow(header)
        goodwriter.writerows(good)
    with open('data/trips_dirty.csv', 'wb') as badf:
        badwriter = UnicodeCSVWriter(badf)
        badwriter.writerow(header)
        badwriter.writerows(bad)
def _transform(self):
    """Normalize raw weather-station records into ``self.clean_station_info``.

    Reads CSV rows from ``self.station_raw_info``, skips the raw header,
    filters and reshapes each row, then writes a fresh CSV (with the header
    below) into a new StringIO stored on ``self.clean_station_info``,
    rewound to position 0.
    """
    reader = UnicodeCSVReader(self.station_raw_info)
    header = ['wban_code', 'station_name', 'country', 'state', 'call_sign',
              'location', 'elevation', 'begin', 'end']
    reader.next()  # discard the raw file's own header row
    self.clean_station_info = StringIO()
    all_rows = []
    wbans = []  # station ids already emitted; first occurrence wins
    for row in reader:
        if row[1] == '99999':
            # presumably the "no WBAN id" sentinel — TODO confirm
            continue
        elif row[1] in wbans:
            continue
        elif row[5] and row[6]:
            row.pop(0)
            row.pop(3)
            # NOTE(review): the indices below address the row AFTER the two
            # pops, while the row[5]/row[6] guard above used pre-pop
            # positions — verify both line up with the raw column layout.
            lat = row[5].replace('+', '')
            lon = row[6].replace('+', '')
            elev = row[7].replace('+', '')
            begin = parser.parse(row[8]).isoformat()
            end = parser.parse(row[9]).isoformat()
            # raw lat/lon appear to be in thousandths of a degree,
            # elevation in tenths of a unit — TODO confirm against source
            row[5] = 'SRID=4326;POINT(%s %s)' % ((float(lon) / 1000), (float(lat) / 1000))
            row[6] = float(elev) / 10
            row[7] = begin
            row[8] = end
            row.pop()  # drop the now-redundant trailing column
            wbans.append(row[0])  # post-pop row[0] == pre-pop row[1]
            all_rows.append(row)
    writer = UnicodeCSVWriter(self.clean_station_info)
    writer.writerow(header)
    writer.writerows(all_rows)
    self.clean_station_info.seek(0)
def _transform(self):
    """Clean raw station metadata into an in-memory CSV.

    Consumes ``self.station_raw_info`` (CSV, header discarded), keeps one
    row per station id, reshapes location/elevation/date columns, and
    leaves the result as a rewound StringIO on ``self.clean_station_info``.
    """
    reader = UnicodeCSVReader(self.station_raw_info)
    header = [
        'wban_code', 'station_name', 'country', 'state', 'call_sign',
        'location', 'elevation', 'begin', 'end'
    ]
    reader.next()  # skip the raw header row
    self.clean_station_info = StringIO()
    all_rows = []
    wbans = []  # ids seen so far; duplicates are dropped
    for row in reader:
        if row[1] == '99999':
            # presumably a "missing id" sentinel value — TODO confirm
            continue
        elif row[1] in wbans:
            continue
        elif row[5] and row[6]:
            row.pop(0)
            row.pop(3)
            # NOTE(review): indices below are post-pop, while the
            # row[5]/row[6] guard used pre-pop positions — verify this
            # matches the raw file's column order.
            lat = row[5].replace('+', '')
            lon = row[6].replace('+', '')
            elev = row[7].replace('+', '')
            begin = parser.parse(row[8]).isoformat()
            end = parser.parse(row[9]).isoformat()
            # lat/lon divided by 1000, elevation by 10 — units assumed,
            # confirm against the upstream data dictionary
            row[5] = 'SRID=4326;POINT(%s %s)' % ((float(lon) / 1000), (float(lat) / 1000))
            row[6] = float(elev) / 10
            row[7] = begin
            row[8] = end
            row.pop()  # trim the leftover trailing column
            wbans.append(row[0])  # post-pop row[0] == pre-pop row[1]
            all_rows.append(row)
    writer = UnicodeCSVWriter(self.clean_station_info)
    writer.writerow(header)
    writer.writerows(all_rows)
    self.clean_station_info.seek(0)
def writeCSV(fpath, output):
    """Write the iterable of rows *output* as CSV to the file at *fpath*."""
    with open(fpath, 'wb') as outfile:
        UnicodeCSVWriter(outfile).writerows(output)
def writeBlockingMap(session_id, block_data, canonical=False):
    """Build the blocking tables for a dedupe session in Postgres.

    Bulk-loads ``block_data`` (rows of (block_key, record_id)) into
    "block_<sid>" via CSV COPY, then derives:

      * "plural_key_<sid>"   -- block keys shared by more than one record
      * "plural_block_<sid>" -- (block_id, record_id) pairs for those keys
      * "covered_<sid>"      -- per record, comma-joined sorted block ids
      * "small_cov_<sid>"    -- per (record, block), the ids sorting earlier

    When ``canonical`` is True, table names use a "<sid>_cr" suffix and
    record_id is a String column instead of Integer.
    """
    pk_type = Integer
    if canonical:
        session_id = '{0}_cr'.format(session_id)
        pk_type = String
    metadata = MetaData()
    engine = worker_session.bind
    bkm = Table('block_{0}'.format(session_id), metadata,
        Column('block_key', Text),
        Column('record_id', pk_type)
    )
    # recreate the block table from scratch on every run
    bkm.drop(engine, checkfirst=True)
    bkm.create(engine)
    # stage the block data as a temp CSV so Postgres COPY can bulk-load it
    with open('/tmp/{0}.csv'.format(session_id), 'wb') as s:
        writer = UnicodeCSVWriter(s)
        writer.writerows(block_data)
    conn = engine.raw_connection()
    cur = conn.cursor()
    with open('/tmp/{0}.csv'.format(session_id), 'rb') as s:
        cur.copy_expert('COPY "block_{0}" FROM STDIN CSV'.format(session_id), s)
    conn.commit()
    os.remove('/tmp/{0}.csv'.format(session_id))
    block_key_idx = Index('bk_{0}_idx'.format(session_id), bkm.c.block_key)
    block_key_idx.create(engine)
    plural_key = Table('plural_key_{0}'.format(session_id), metadata,
        Column('block_key', Text),
        Column('block_id', Integer, primary_key=True)
    )
    plural_key.drop(engine, checkfirst=True)
    plural_key.create(engine)
    # block keys that appear more than once get a surrogate block_id
    bkm_sel = select([bkm.c.block_key], from_obj=bkm)\
        .group_by(bkm.c.block_key)\
        .having(func.count(bkm.c.block_key) > 1)
    pl_ins = plural_key.insert()\
        .from_select([plural_key.c.block_key], bkm_sel)
    with engine.begin() as c:
        c.execute(pl_ins)
    pl_key_idx = Index('pk_{0}_idx'.format(session_id), plural_key.c.block_key)
    pl_key_idx.create(engine)
    with engine.begin() as c:
        c.execute('DROP TABLE IF EXISTS "plural_block_{0}"'.format(session_id))
    # map each record to the block_ids of its shared (plural) keys
    pl_bk_stmt = ''' CREATE TABLE "plural_block_{0}" AS ( SELECT p.block_id, b.record_id FROM "block_{0}" AS b INNER JOIN "plural_key_{0}" AS p USING (block_key) )'''.format(session_id)
    with engine.begin() as c:
        c.execute(pl_bk_stmt)
    with engine.begin() as c:
        c.execute(''' CREATE INDEX "pl_bk_idx_{0}" ON "plural_block_{0}" (record_id)'''.format(session_id)
        )
    with engine.begin() as c:
        c.execute('DROP INDEX IF EXISTS "pl_bk_id_idx_{0}"'.format(session_id))
    with engine.begin() as c:
        c.execute(''' CREATE UNIQUE INDEX 
 "pl_bk_id_idx_{0}" on "plural_block_{0}" (block_id, record_id) '''.format(session_id)
        )
    # per record: all of its block_ids, sorted and comma-joined
    cov_bks_stmt = ''' CREATE TABLE "covered_{0}" AS ( SELECT record_id, string_agg(CAST(block_id AS TEXT), ',' ORDER BY block_id) AS sorted_ids FROM "plural_block_{0}" GROUP BY record_id ) '''.format(session_id)
    with engine.begin() as c:
        c.execute('DROP TABLE IF EXISTS "covered_{0}"'.format(session_id))
    with engine.begin() as c:
        c.execute(cov_bks_stmt)
    with engine.begin() as c:
        c.execute(''' CREATE UNIQUE INDEX "cov_bks_id_idx_{0}" ON "covered_{0}" (record_id) '''.format(session_id)
        )
    with engine.begin() as c:
        c.execute('DROP TABLE IF EXISTS "small_cov_{0}"'.format(session_id))
    # per (record, block): the block ids that sort before this block —
    # used downstream to avoid emitting duplicate comparison pairs
    small_cov = ''' CREATE TABLE "small_cov_{0}" AS ( SELECT record_id, block_id, TRIM(',' FROM split_part(sorted_ids, CAST(block_id AS TEXT), 1)) AS smaller_ids FROM "plural_block_{0}" INNER JOIN "covered_{0}" USING (record_id) ) '''.format(session_id)
    with engine.begin() as c:
        c.execute(small_cov)
    with engine.begin() as c:
        c.execute(''' CREATE INDEX "sc_idx_{0}" ON "small_cov_{0}" (record_id)'''.format(session_id)
        )
    with engine.begin() as c:
        c.execute(''' CREATE INDEX "sc_bk_idx_{0}" ON "small_cov_{0}" (block_id)'''.format(session_id)
        )
# --- Scrape candidates for each committee id and publish the result ---
# NOTE(review): `inp` (open CSV of committees), `c` (DB cursor), `conn`
# (DB connection) and `k` (S3 key object) are defined earlier in the
# script, outside this excerpt.
reader = UnicodeCSVDictReader(inp)
comm_ids = [i['id'] for i in list(reader)]
candidate_pattern = '/CommitteeDetailCandidates.aspx?id=%s'
cand_scraper = CandidateScraper(url_pattern=candidate_pattern)
# reuse previously fetched pages from the on-disk cache
cand_scraper.cache_storage = scrapelib.cache.FileCache('/cache/cache')
cand_scraper.cache_write_only = False
for comm_id in comm_ids:
    for cand in cand_scraper.scrape_one(comm_id):
        if cand:
            cand['CommitteeID'] = comm_id
            insert = 'insert into candidates("ID", "FullName", "FullAddress", \
"PartyName", "OfficeName", "CommitteeID") values (:ID, :FullName, :FullAddress, \
:PartyName, :OfficeName, :CommitteeID)'
            c.execute(insert, cand)
            # commit per candidate so a crash mid-run keeps earlier rows
            conn.commit()
        else:
            # scraper yields a falsy value when the detail page 500s
            print 'Got a 500 for %s' % comm_id
# dump the whole candidates table to CSV and upload it publicly
c.execute('select * from candidates')
header = list(map(lambda x: x[0], c.description))
cands = c.fetchall()
outp = StringIO()
writer = UnicodeCSVWriter(outp)
writer.writerow(header)
writer.writerows(cands)
outp.seek(0)
k.key = 'Candidates.csv'
k.set_contents_from_file(outp)
k.make_public()
def getMatchingReady(session_id): addRowHash(session_id) cleanupTables(session_id) engine = worker_session.bind with engine.begin() as conn: conn.execute('DROP TABLE IF EXISTS "match_blocks_{0}"'\ .format(session_id)) conn.execute(''' CREATE TABLE "match_blocks_{0}" ( block_key VARCHAR, record_id BIGINT ) '''.format(session_id)) sess = worker_session.query(DedupeSession).get(session_id) field_defs = json.loads(sess.field_defs) # Save Gazetteer settings d = dedupe.Gazetteer(field_defs) # Disabling canopy based predicates for now for definition in d.data_model.primary_fields: for idx, predicate in enumerate(definition.predicates): if predicate.type == 'TfidfPredicate': definition.predicates.pop(idx) d.readTraining(StringIO(sess.training_data)) d.train() g_settings = StringIO() d.writeSettings(g_settings) g_settings.seek(0) sess.gaz_settings_file = g_settings.getvalue() worker_session.add(sess) worker_session.commit() # Write match_block table model_fields = list(set([f['field'] for f in field_defs])) fields = ', '.join(['p.{0}'.format(f) for f in model_fields]) sel = ''' SELECT p.record_id, {0} FROM "processed_{1}" AS p LEFT JOIN "exact_match_{1}" AS e ON p.record_id = e.match WHERE e.record_id IS NULL; '''.format(fields, session_id) conn = engine.connect() rows = conn.execute(sel) data = ((getattr(row, 'record_id'), dict(zip(model_fields, row[1:]))) \ for row in rows) block_gen = d.blocker(data) s = StringIO() writer = UnicodeCSVWriter(s) writer.writerows(block_gen) conn.close() s.seek(0) conn = engine.raw_connection() curs = conn.cursor() try: curs.copy_expert('COPY "match_blocks_{0}" FROM STDIN CSV'\ .format(session_id), s) conn.commit() except Exception, e: # pragma: no cover conn.rollback() raise e
def make_db(fname, tblname): conn = sqlite3.connect(':memory:') t = Table.from_csv(open(fname, 'rb'), name=tblname) sql_table = make_table(t) create_st = make_create_table_statement(sql_table) print create_st insert = sql_table.insert() curs = conn.cursor() curs.execute(create_st) headers = t.headers() print headers rows = [dict(zip(headers, row)) for row in t.to_rows()] for row in rows: curs.execute(str(insert), row) return curs if __name__ == '__main__': curs = make_db( 'macoupin-budget-update/moucoupin-budget-department-desc.csv', 'description') outp = open('macoupin-budget-update/macoupin-budget-2014-update.csv', 'wb') writer = UnicodeCSVWriter(outp) with open('macoupin-budget-update/macoupin-budget.csv', 'rb') as f: reader = UnicodeCSVReader(f) headers = reader.next() headers.insert(1, 'Fund ID') writer.writerow(headers) writer.writerows(add_attrs(reader, curs))
# NOTE(review): the three statements below are the tail of a generator
# (apparently the ``add_attrs`` used in __main__ further down) whose
# ``def`` line falls outside this excerpt; their indentation here is
# reconstructed and should be confirmed against the full source.
        row[7] = res[0]
        row[6] = res[1]
        yield row


def make_db(fname, tblname):
    """Load the CSV at *fname* into an in-memory SQLite table *tblname*.

    Returns a cursor on the populated database; the cursor keeps the
    connection alive. NOTE(review): the file handle opened here is never
    closed.
    """
    conn = sqlite3.connect(':memory:')
    t = Table.from_csv(open(fname, 'rb'), name=tblname)
    sql_table = make_table(t)
    create_st = make_create_table_statement(sql_table)
    print create_st  # debug output
    insert = sql_table.insert()
    curs = conn.cursor()
    curs.execute(create_st)
    headers = t.headers()
    print headers  # debug output
    rows = [dict(zip(headers, row)) for row in t.to_rows()]
    for row in rows:
        curs.execute(str(insert), row)
    return curs


if __name__ == '__main__':
    # NOTE(review): 'moucoupin' looks like a typo for 'macoupin' but may
    # match the actual file on disk — verify before renaming.
    curs = make_db('macoupin-budget-update/moucoupin-budget-department-desc.csv', 'description')
    # NOTE(review): outp is never closed; buffered rows may be lost.
    outp = open('macoupin-budget-update/macoupin-budget-2014-update.csv', 'wb')
    writer = UnicodeCSVWriter(outp)
    with open('macoupin-budget-update/macoupin-budget.csv', 'rb') as f:
        reader = UnicodeCSVReader(f)
        headers = reader.next()
        headers.insert(1, 'Fund ID')
        writer.writerow(headers)
        writer.writerows(add_attrs(reader, curs))