def main(self):
    """Generate DDL for each input CSV file and, when --db is given, create
    and optionally populate the corresponding tables.

    Fixes: option validation is loop-invariant, so it now runs once before the
    file loop; the --no-create error message was missing the word "when"; the
    backend-missing message had a doubled period.
    """
    # Ensure we're handling a list, even if it's just one file
    if not isinstance(self.args.files, list):
        self.args.files = [self.args.files]
    else:
        if self.args.table_name:
            self.argparser.error('The --table argument is only allowed when specifying a single file.')

    # These checks do not depend on the current file; validate once up front.
    if self.args.dialect and self.args.connection_string:
        self.argparser.error('The --dialect option is only valid when --db is not specified.')
    if self.args.insert and not self.args.connection_string:
        self.argparser.error('The --insert option is only valid when --db is also specified.')
    if self.args.no_create and not self.args.insert:
        self.argparser.error('The --no-create option is only valid when --insert is also specified.')

    for f in self.args.files:
        # Pick a table name: explicit flag, else the input filename's stem.
        if self.args.table_name:
            table_name = self.args.table_name
        elif f != sys.stdin:
            # Use filename as table name
            table_name = os.path.splitext(os.path.split(f.name)[1])[0]
        else:
            self.argparser.error('The --table argument is required when providing data over STDIN.')

        csv_table = table.Table.from_csv(f, name=table_name, snifflimit=self.args.snifflimit, blanks_as_nulls=(not self.args.blanks), **self.reader_kwargs)
        f.close()

        # Direct connections to database
        if self.args.connection_string:
            try:
                engine, metadata = sql.get_connection(self.args.connection_string)
            except ImportError:
                raise ImportError('You don\'t appear to have the necessary database backend installed for connection string you\'re trying to use. Available backends include:\n\nPostgresql:\tpip install psycopg2\nMySQL:\t\tpip install MySQL-python\n\nFor details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at: \n\nhttp://www.sqlalchemy.org/docs/dialects/\n\n')

            sql_table = sql.make_table(csv_table, table_name, self.args.no_constraints, metadata)

            if not self.args.no_create:
                sql_table.create()

            if self.args.insert:
                # Insert all rows inside a single transaction.
                insert = sql_table.insert()
                headers = csv_table.headers()
                conn = engine.connect()
                trans = conn.begin()
                for row in csv_table.to_rows():
                    conn.execute(insert, [dict(zip(headers, row)), ])
                trans.commit()
                conn.close()
        # Writing to file
        else:
            sql_table = sql.make_table(csv_table, table_name, self.args.no_constraints)
            self.output_file.write((u'%s\n' % sql.make_create_table_statement(sql_table, dialect=self.args.dialect)).encode('utf-8'))
def main(self):
    """Generate a CREATE TABLE statement (or create/populate a table via --db)
    from a single CSV input.

    Bug fix: `loosey` was only assigned when the flag was set, so the default
    path raised NameError at the sql.make_table() call. It is now always
    defined.
    """
    if self.args.table_name:
        table_name = self.args.table_name
    elif self.args.file != sys.stdin:
        # Use filename as table name
        table_name = os.path.splitext(
            os.path.split(self.args.file.name)[1])[0]
    else:
        self.argparser.error(
            'The --table argument is required when providing data over STDIN.'
        )

    # Always defined; previously only set when the flag was truthy.
    loosey = bool(self.args.loosey)

    if self.args.dialect and self.args.connection_string:
        self.argparser.error(
            'The --dialect option is only valid when --db is not specified.'
        )
    if self.args.insert and not self.args.connection_string:
        self.argparser.error(
            'The --insert option is only valid when --db is also specified.'
        )

    csv_table = table.Table.from_csv(self.args.file,
                                     name=table_name,
                                     snifflimit=self.args.snifflimit,
                                     **self.reader_kwargs)

    # Direct connections to database
    if self.args.connection_string:
        try:
            engine, metadata = sql.get_connection(
                self.args.connection_string)
        except ImportError:
            raise ImportError(
                'You don\'t appear to have the necessary database backend installed for connection string you\'re trying to use.. Available backends include:\n\nPostgresql:\tpip install psycopg2\nMySQL:\t\tpip install MySQL-python\n\nFor details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at: \n\nhttp://www.sqlalchemy.org/docs/dialects/\n\n'
            )
        sql_table = sql.make_table(csv_table, table_name, loosey, metadata)
        sql_table.create()
        if self.args.insert:
            insert = sql_table.insert()
            headers = csv_table.headers()
            for row in csv_table.to_rows(serialize_dates=True):
                engine.execute(insert, [dict(zip(headers, row)), ])
    # Writing to file
    else:
        sql_table = sql.make_table(csv_table, table_name, loosey)
        self.output_file.write((u'%s\n' % sql.make_create_table_statement(
            sql_table, dialect=self.args.dialect)).encode('utf-8'))
def make_insert_statement(self):
    """Verify the rendered INSERT statement for the first serialized fixture row."""
    built_table = sql.make_table(self.csv_table, "csvsql")
    first_row = self.csv_table._prepare_rows_for_serialization()[0]
    rendered = sql.make_insert_statement(built_table, first_row)
    expected = u"INSERT INTO test_table (text, integer, datetime, empty_column) VALUES ('Chicago Reader', 40, '2008-01-01T04:40:00', NULL);"
    self.assertEqual(rendered, expected)
def makeRawTable(contents):
    """Round-trip a CSV string into an in-memory SQLite table named
    'raw_table' and return (sql_dump_string, slugified_header_list).

    All values are kept as raw text (no type inference), and an
    autoincrementing record_id primary key is spliced into the DDL.
    """
    inp = StringIO(contents)
    reader = UnicodeCSVReader(inp)
    header = reader.next()
    # Normalize header names so they are safe to use as column names.
    header = [slugify(h) for h in header]
    outp = StringIO()
    writer = UnicodeCSVWriter(outp)
    writer.writerow(header)
    # Pre-process every cell before handing the data to csvkit.
    writer.writerows([[preProcess(unicode(i)) for i in r] for r in reader])
    outp.seek(0)
    conn = sqlite3.connect(':memory:')
    # infer_types=False / blanks_as_nulls=False: keep everything verbatim text.
    t = Table.from_csv(outp, name='raw_table', blanks_as_nulls=False, infer_types=False)
    sql_table = make_table(t)
    create_st = make_create_table_statement(sql_table)
    # Splice a record_id INTEGER PRIMARY KEY column into the generated DDL by
    # splitting on the table-name marker and reassembling.
    parts = create_st.split('raw_table (')
    create_st = '{0} raw_table ( record_id INTEGER PRIMARY KEY,{1}'.format(*parts)
    insert = sql_table.insert()
    curs = conn.cursor()
    curs.execute(create_st)
    rows = [dict(zip(header, row)) for row in t.to_rows()]
    for row in rows:
        curs.execute(str(insert), row)
    dump = StringIO()
    for line in conn.iterdump():
        # unidecode: force the SQL dump down to ASCII-safe text.
        dump.write(unidecode(line))
    dump.seek(0)
    return dump.getvalue(), header
def load_file(self):
    """
    (1) Use csv file to Generate csvkit Table and SQL table objects

    Bug fix: the file handle was opened but never closed; a context manager
    now releases it deterministically (from_csv consumes the file eagerly).
    """
    with open(self.fname, 'rb') as fh:
        self.csv_table = table.Table.from_csv(fh, self.table_name)
    self.sql_table = sql.make_table(self.csv_table, self.table_name)
def _get_column_types(self):
    """Infer a datatype, raw SQL type string, and length for each column of
    the CSV at self.path, storing them on self.columns.

    Fix: the regexes were recompiled on every loop iteration via
    re.sub(re.compile(...), ...); they are now compiled once and reused.
    """
    self.tracker.forward('Inferring datatype of columns')

    # Compile once, outside the loop.
    strip_length_re = re.compile(r'\(\w+\)')
    parse_length_re = re.compile(r'\((\w+)\)')

    # Load the csv and use csvkit's sql.make_table utility
    # to infer the datatypes of the columns.
    with open(self.path, 'r') as f:
        csv_table = table.Table.from_csv(f, delimiter=',')
    sql_table = sql.make_table(csv_table)

    for i, column in enumerate(sql_table.columns):
        # Clean the type and name values
        raw_type = str(column.type)
        # clean_type is deliberately derived from the ORIGINAL raw_type,
        # before the overrides below.
        clean_type = strip_length_re.sub('', raw_type)

        # Temporary fix for issue #19
        if raw_type == 'BOOLEAN':
            raw_type = 'VARCHAR(10)'

        if raw_type == 'DATETIME':
            # Dumb guess at the maximum length of a datetime field. Find a
            # better way!
            raw_type = 'VARCHAR(100)'

        # Parse the length out of the (possibly overridden) raw type.
        parsed_length = parse_length_re.search(raw_type)
        if parsed_length:
            clean_length = int(parsed_length.group(1))
        else:
            clean_length = None

        self.columns[i]['datatype'] = clean_type.lower()
        self.columns[i]['raw_type'] = raw_type
        self.columns[i]['length'] = clean_length
def load_file(file_name, db_engine): # Pass file name and SQLAlchemy db engine # Opens file, makes a file object, creates table # Builds and runs a COPY FROM statement print '-----------------' print ' Opening {} ({})...'.format(file_name, datetime.now() - start_time) f = codecs.open('data/{}'.format(file_name), 'rU') print ' Sniffing file dialect ({})...'.format(datetime.now() - start_time) dialect = sniffer.sniff_dialect(f.read()) # Return to top of file f.seek(0) print ' Making csv Table object ({})...'.format(datetime.now() - start_time) from_file = Table.from_csv(f, name = file_name.rstrip('.csv'), encoding = 'utf-8') # Here be some overhead print ' Making SQLAlchemy Table object ({})...'.format(datetime.now() - start_time) sql_table = sql.make_table(from_file) print ' Creating db table ({})...'.format(datetime.now() - start_time) sql_table.create(engine, checkfirst=True) print ' Loading {} ({})...'.format(file_name, datetime.now() - start_time) copy_from_sql = '''COPY "{table_name}" FROM '{file_w_path}' DELIMITER '{delimiter}' QUOTE '{quote_character}' ENCODING 'UTF8' CSV HEADER;'''.format( table_name = file_name.rstrip('.csv') , file_w_path = os.getcwd() + '/data/' + file_name , delimiter = dialect.delimiter , quote_character = dialect.quotechar , escape_character = '' if dialect.escapechar is None else dialect.escapechar ) conn = db_engine.connect() t = conn.begin() try: conn.execute(copy_from_sql) t.commit() except: t.rollback() print copy_from_sql.replace('\t', '') print "Failed to commit." conn.close()
def make_insert_statement(self):
    """The INSERT rendered for the first fixture row matches the expected SQL."""
    expected = (u"INSERT INTO test_table (text, integer, datetime, empty_column) "
                u"VALUES ('Chicago Reader', 40, '2008-01-01T04:40:00', NULL);")
    generated_table = sql.make_table(self.csv_table, 'csvsql')
    row = self.csv_table._prepare_rows_for_serialization()[0]
    self.assertEqual(sql.make_insert_statement(generated_table, row), expected)
def test_make_create_table_statement(self):
    """The generated CREATE TABLE DDL matches the expected statement exactly."""
    expected = u"""CREATE TABLE test_table (
\ttext VARCHAR(17) NOT NULL,
\tinteger INTEGER,
\tdatetime DATETIME,
\tempty_column VARCHAR(32)
);"""
    generated = sql.make_create_table_statement(
        sql.make_table(self.csv_table, 'csvsql'))
    self.assertEqual(generated, expected)
def test_make_create_table_statement(self):
    """CREATE TABLE output for the fixture table is rendered exactly as expected."""
    inferred = sql.make_table(self.csv_table, 'csvsql')
    ddl = sql.make_create_table_statement(inferred)
    wanted = u"""CREATE TABLE test_table (
\ttext VARCHAR(17) NOT NULL,
\tinteger INTEGER,
\tdatetime DATETIME,
\tempty_column VARCHAR(32)
);"""
    self.assertEqual(ddl, wanted)
def test_make_create_table_statement_no_constraints(self):
    """With no_constraints=True the DDL drops lengths and NOT NULL qualifiers."""
    unconstrained = sql.make_table(self.csv_table, 'csvsql', True)
    ddl = sql.make_create_table_statement(unconstrained)
    wanted = u"""CREATE TABLE test_table (
\ttext VARCHAR,
\tinteger INTEGER,
\tdatetime DATETIME,
\tempty_column VARCHAR
);"""
    self.assertEqual(ddl, wanted)
def main(self):
    """Generate a CREATE TABLE statement (or create/populate a table via --db)
    from a single CSV input.

    Bug fix: `loosey` was only bound inside `if self.args.loosey:` so the
    default path hit NameError at sql.make_table(); it is now always defined.
    """
    if self.args.table_name:
        table_name = self.args.table_name
    elif self.args.file != sys.stdin:
        # Use filename as table name
        table_name = os.path.splitext(os.path.split(self.args.file.name)[1])[0]
    else:
        self.argparser.error('The --table argument is required when providing data over STDIN.')

    # Always defined; previously unassigned when the flag was falsy.
    loosey = bool(self.args.loosey)

    if self.args.dialect and self.args.connection_string:
        self.argparser.error('The --dialect option is only valid when --db is not specified.')
    if self.args.insert and not self.args.connection_string:
        self.argparser.error('The --insert option is only valid when --db is also specified.')

    csv_table = table.Table.from_csv(self.args.file, name=table_name, snifflimit=self.args.snifflimit, **self.reader_kwargs)

    # Direct connections to database
    if self.args.connection_string:
        try:
            engine, metadata = sql.get_connection(self.args.connection_string)
        except ImportError:
            raise ImportError('You don\'t appear to have the necessary database backend installed for connection string you\'re trying to use.. Available backends include:\n\nPostgresql:\tpip install psycopg2\nMySQL:\t\tpip install MySQL-python\n\nFor details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at: \n\nhttp://www.sqlalchemy.org/docs/dialects/\n\n')

        sql_table = sql.make_table(csv_table, table_name, loosey, metadata)
        sql_table.create()

        if self.args.insert:
            insert = sql_table.insert()
            headers = csv_table.headers()
            for row in csv_table.to_rows(serialize_dates=True):
                engine.execute(insert, [dict(zip(headers, row)), ])
    # Writing to file
    else:
        sql_table = sql.make_table(csv_table, table_name, loosey)
        self.output_file.write((u'%s\n' % sql.make_create_table_statement(sql_table, dialect=self.args.dialect)).encode('utf-8'))
def make_db(fname, tblname): conn = sqlite3.connect(':memory:') t = Table.from_csv(open(fname, 'rb'), name=tblname) sql_table = make_table(t) create_st = make_create_table_statement(sql_table) print create_st insert = sql_table.insert() curs = conn.cursor() curs.execute(create_st) headers = t.headers() print headers rows = [dict(zip(headers, row)) for row in t.to_rows()] for row in rows: curs.execute(str(insert), row) return curs
def main(self):
    """Load the input CSV into an in-memory SQLite table, execute the user's
    query against it, and write the result rows as CSV (no header row is
    emitted for the results).
    """
    # NOTE(review): reaches into the lazy file wrapper's private _lazy_args
    # for the original path -- confirm against the csvkit version in use.
    tabname = os.path.splitext(
        os.path.basename(self.args.file._lazy_args[0]))[0]
    tab = table.Table.from_csv(self.args.file,
                               name=tabname,
                               **self.reader_kwargs)
    stmt = make_create_table_statement(make_table(tab), dialect='sqlite')
    conn = sqlite3.connect(':memory:')
    c = conn.cursor()
    c.execute(stmt)
    # Insert each row with a parameterized statement; the placeholder list is
    # rebuilt per row, so ragged rows still bind correctly.
    for row in tab.to_rows():
        vals = ','.join(['?'] * len(row))
        prepared = "INSERT INTO %s VALUES(%s)" % (tab.name, vals)
        c.execute(prepared, row)
    output = CSVKitWriter(self.output_file, **self.writer_kwargs)
    for row in c.execute(self.args.query):
        output.writerow(row)
def fetch(self):
    """Load this node's CSV input into a database table exactly once.

    Returns True immediately if already loaded; otherwise reads the first
    input file, creates the table, bulk-inserts the rows inside a
    transaction, and returns the loaded flag.
    """
    if self.isLoaded:
        tool.VERBOSE('use csv {}', self.name)
        return True
    tool.VERBOSE('fetching csv {}', self.name)
    with tool.INDENT():
        # NOTE(review): the file handle opened inline here is never closed.
        csv_table = table.Table.from_csv(
            open(self.inputs[0], 'rb'),
            name=self.name,
            # sniff_limit = 0,
            blanks_as_nulls=True,
            infer_types=True,
            no_header_row=False)
        tool.VERBOSE('read {} row(s) from {}', csv_table.count_rows(),
                     self.inputs[0])
        connection = self.db.engine.connect()
        transaction = connection.begin()
        sql_table = sql.make_table(csv_table, self.name, False, None,
                                   self.db.metadata)
        sql_table.create()
        if csv_table.count_rows() > 0:
            # Single executemany-style call with all rows keyed by header.
            insert = sql_table.insert()
            headers = csv_table.headers()
            connection.execute(
                insert,
                [dict(zip(headers, row)) for row in csv_table.to_rows()])
        self._count = csv_table.count_rows()
        tool.VERBOSE('wrote {} row(s) to table {}', self.count, self.name)
        # Mark loaded before committing; commit/close happen inside INDENT.
        self._loaded = True
        transaction.commit()
        connection.close()
    return self._loaded
def csv_to_table():
    """Debug helper: read the t4_out CSV with csvkit, print each inferred
    column, and print the PostgreSQL CREATE TABLE statement and the rows.
    """
    # --------------------------------------
    msgt('(2) csvkit to table')
    # --------------------------------------
    fh = open(t4_out, 'rb')
    # csv_args is currently unused -- the **csv_args line below is commented out.
    csv_args = dict(delimiter='\t', quotechar='"')
    print 'QUOTE_NONE', QUOTE_NONE
    csv_table = table.Table.from_csv(f=fh,
                                     name='tname',
                                     snifflimit=None,
                                     # quoting=QUOTE_NONNUMERIC,
                                     # **csv_args
                                     )
    # Show the name and inferred type of every column.
    for col in csv_table:
        msg('%s, %s' % (col.name, col.type))
    sql_table = csvkit_sql.make_table(csv_table, 'new_table')
    create_table_sql = csvkit_sql.make_create_table_statement(sql_table, dialect="postgresql")
    msg('create_table_sql: %s' % create_table_sql)
    msg(csv_table.to_rows())
def csv2sql(file=None, db_schema=None, tablename=None, encoding='utf-8', snifflimit=512*1024):
    """Load an uploaded CSV (a file object with a .stream) into a new table
    named `tablename`, inside a single transaction.

    Returns json.dumps(tablename) on success, or the exception on failure.

    Bug fix: `trans` and `conn` were referenced in the except/finally blocks
    even when engine.connect() itself raised, producing a NameError that
    masked the real error. Both are now initialized to None and guarded.
    """
    conn = None
    trans = None
    try:
        conn = engine.connect()
        trans = conn.begin()
        csv_table = table.Table.from_csv(
            file.stream,
            name=tablename,
            snifflimit=snifflimit,
            blanks_as_nulls=True,
            infer_types=True,
            no_header_row=False,
            encoding=encoding
        )
        sql_table = sql.make_table(
            csv_table,
            tablename,
            False,       # no_constraints
            db_schema,   # db_schema
            metadata
        )
        sql_table.create()
        insert = sql_table.insert()
        headers = csv_table.headers()
        conn.execute(insert,
                     [dict(zip(headers, row)) for row in csv_table.to_rows()])
        trans.commit()
    except Exception as e:
        if trans is not None:
            trans.rollback()
        print(e)
        return e
    finally:
        if conn is not None:
            conn.close()
        file.close()
    return json.dumps(tablename)
def main(self):
    """Emit a CREATE TABLE statement for the input CSV, plus per-row INSERT
    statements when --inserts is given.
    """
    source = self.args.file
    if source.name != "<stdin>":
        # Use filename as table name
        base_name = os.path.split(source.name)[1]
        table_name = os.path.splitext(base_name)[0]
    else:
        table_name = "csvsql_table"

    csv_table = table.Table.from_csv(
        source,
        name=table_name,
        snifflimit=self.args.snifflimit,
        **self.reader_kwargs
    )
    sql_table = sql.make_table(csv_table)

    ddl = sql.make_create_table_statement(sql_table, dialect=self.args.dialect)
    self.output_file.write((u"%s\n" % ddl).encode("utf-8"))

    if self.args.inserts:
        self.output_file.write("\n")
        for row in csv_table.to_rows(serialize_dates=True):
            stmt = sql.make_insert_statement(sql_table, row, dialect=self.args.dialect)
            self.output_file.write((u"%s\n" % stmt).encode("utf-8"))
def create_table(self, cursor, schema, table, virtual_table):
    """Infer a PostgreSQL CREATE TABLE statement from virtual_table and execute it."""
    inferred = csv_sql.make_table(virtual_table, db_schema=schema)
    ddl = csv_sql.make_create_table_statement(inferred, dialect='postgresql')
    print("Creating table {}.{}".format(schema, table))
    cursor.execute(ddl)
def create_table(self, cursor, schema, table, virtual_table):
    """Infer PostgreSQL DDL from virtual_table, log it, and execute it."""
    inferred = csv_sql.make_table(virtual_table, db_schema=schema)
    ddl = csv_sql.make_create_table_statement(inferred, dialect='postgresql')
    logger.info("Creating table {}.{}".format(schema, table))
    cursor.execute(ddl)
def main(self):
    """csvsql entry point: read each input CSV, emit DDL or create/populate
    database tables, then optionally run --query and write its result as CSV.
    """
    connection_string = self.args.connection_string
    do_insert = self.args.insert
    query = self.args.query

    self.input_files = []
    for path in self.args.input_paths:
        self.input_files.append(self._open_input_file(path))

    # Comma-separated --tables list; consumed one name per input file below.
    if self.args.table_names:
        table_names = self.args.table_names.split(',')
    else:
        table_names = []

    # Create an SQLite database in memory if no connection string is specified
    if query and not connection_string:
        connection_string = "sqlite:///:memory:"
        do_insert = True

    if self.args.dialect and connection_string:
        self.argparser.error('The --dialect option is only valid when --db is not specified.')
    if do_insert and not connection_string:
        self.argparser.error('The --insert option is only valid when --db is also specified.')
    if self.args.no_create and not do_insert:
        # NOTE(review): message is missing the word "when".
        self.argparser.error('The --no-create option is only valid --insert is also specified.')

    # Establish database validity before reading CSV files
    if connection_string:
        try:
            engine, metadata = sql.get_connection(connection_string)
        except ImportError:
            raise ImportError('You don\'t appear to have the necessary database backend installed for connection string you\'re trying to use. Available backends include:\n\nPostgresql:\tpip install psycopg2\nMySQL:\t\tpip install MySQL-python\n\nFor details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at: \n\nhttp://www.sqlalchemy.org/docs/dialects/\n\n')
        conn = engine.connect()
        trans = conn.begin()

    for f in self.input_files:
        try:
            # Try to use name specified via --table
            table_name = table_names.pop(0)
        except IndexError:
            if f == sys.stdin:
                table_name = "stdin"
            else:
                # Use filename as table name
                table_name = os.path.splitext(os.path.split(f.name)[1])[0]

        csv_table = table.Table.from_csv(
            f,
            name=table_name,
            sniff_limit=self.args.sniff_limit,
            blanks_as_nulls=(not self.args.blanks),
            infer_types=(not self.args.no_inference),
            no_header_row=self.args.no_header_row,
            **self.reader_kwargs
        )
        f.close()

        # Skip empty inputs entirely.
        if csv_table:
            if connection_string:
                sql_table = sql.make_table(
                    csv_table,
                    table_name,
                    self.args.no_constraints,
                    self.args.db_schema,
                    metadata
                )

                # Create table
                if not self.args.no_create:
                    sql_table.create()

                # Insert data
                if do_insert and csv_table.count_rows() > 0:
                    insert = sql_table.insert()
                    headers = csv_table.headers()
                    conn.execute(insert, [dict(zip(headers, row)) for row in csv_table.to_rows()])
            # Output SQL statements
            else:
                sql_table = sql.make_table(csv_table, table_name, self.args.no_constraints)
                self.output_file.write('%s\n' % sql.make_create_table_statement(sql_table, dialect=self.args.dialect))

    if connection_string:
        if query:
            # Execute specified SQL queries
            queries = query.split(';')
            rows = None
            for q in queries:
                if q:
                    rows = conn.execute(q)

            # Output result of last query as CSV
            try:
                output = agate.csv.writer(self.output_file, **self.writer_kwargs)
                if not self.args.no_header_row:
                    output.writerow(rows._metadata.keys)
                for row in rows:
                    output.writerow(row)
            except AttributeError:
                # A query with no result set (e.g. DDL) has no rows to write.
                pass

        trans.commit()
        conn.close()
def main(self):
    """Generate DDL (and optionally create/populate tables via --db) for each
    input CSV file.
    """
    # Ensure we're handling a list, even if it's just one file
    if not isinstance(self.args.files, list):
        self.args.files = [self.args.files]
    else:
        if self.args.table_name:
            self.argparser.error(
                'The --table argument is only allowed when specifying a single file.'
            )
    for f in self.args.files:
        # Pick a table name: explicit flag, else the input filename's stem.
        if self.args.table_name:
            table_name = self.args.table_name
        elif f != sys.stdin:
            # Use filename as table name
            table_name = os.path.splitext(os.path.split(f.name)[1])[0]
        else:
            self.argparser.error(
                'The --table argument is required when providing data over STDIN.'
            )
        # NOTE(review): these three checks are loop-invariant and could run
        # once before the loop; the --no-create message is missing "when".
        if self.args.dialect and self.args.connection_string:
            self.argparser.error(
                'The --dialect option is only valid when --db is not specified.'
            )
        if self.args.insert and not self.args.connection_string:
            self.argparser.error(
                'The --insert option is only valid when --db is also specified.'
            )
        if self.args.no_create and not self.args.insert:
            self.argparser.error(
                'The --no-create option is only valid --insert is also specified.'
            )
        csv_table = table.Table.from_csv(
            f,
            name=table_name,
            snifflimit=self.args.snifflimit,
            blanks_as_nulls=(not self.args.blanks),
            **self.reader_kwargs)
        f.close()
        # Direct connections to database
        if self.args.connection_string:
            try:
                engine, metadata = sql.get_connection(
                    self.args.connection_string)
            except ImportError:
                raise ImportError(
                    'You don\'t appear to have the necessary database backend installed for connection string you\'re trying to use.. Available backends include:\n\nPostgresql:\tpip install psycopg2\nMySQL:\t\tpip install MySQL-python\n\nFor details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at: \n\nhttp://www.sqlalchemy.org/docs/dialects/\n\n'
                )
            sql_table = sql.make_table(csv_table, table_name,
                                       self.args.no_constraints, metadata)
            if not self.args.no_create:
                sql_table.create()
            if self.args.insert:
                # Insert all rows inside a single transaction.
                insert = sql_table.insert()
                headers = csv_table.headers()
                conn = engine.connect()
                trans = conn.begin()
                for row in csv_table.to_rows():
                    conn.execute(insert, [
                        dict(zip(headers, row)),
                    ])
                trans.commit()
                conn.close()
        # Writing to file
        else:
            sql_table = sql.make_table(csv_table, table_name,
                                       self.args.no_constraints)
            self.output_file.write(
                (u'%s\n' % sql.make_create_table_statement(
                    sql_table, dialect=self.args.dialect)).encode('utf-8'))
def test_make_create_table_statement_with_dialects(self):
    """make_create_table_statement renders without error for every supported dialect."""
    for sql_dialect in sql.DIALECTS:
        schema_table = sql.make_table(self.csv_table, 'csvsql',
                                      db_schema='test_schema')
        statement = sql.make_create_table_statement(schema_table, sql_dialect)
def main(self):
    """csvsql entry point: gather input files (including implicit stdin),
    emit DDL or create/populate database tables, then optionally run --query
    and write its result as CSV.
    """
    connection_string = self.args.connection_string
    do_insert = self.args.insert
    query = self.args.query

    self.input_files = []
    for path in self.args.input_paths:
        self.input_files.append(self._open_input_file(path))

    # Comma-separated --tables list; consumed one name per input file below.
    if self.args.table_names:
        table_names = self.args.table_names.split(',')
    else:
        table_names = []

    # If one or more filenames are specified, we need to add stdin ourselves (if available)
    if sys.stdin not in self.input_files:
        try:
            if not sys.stdin.isatty():
                self.input_files.insert(0, sys.stdin)
        except:
            # isatty() can fail when stdin has been replaced; ignore.
            pass

    # Create an SQLite database in memory if no connection string is specified
    if query and not connection_string:
        connection_string = "sqlite:///:memory:"
        do_insert = True

    if self.args.dialect and connection_string:
        self.argparser.error('The --dialect option is only valid when --db is not specified.')
    if do_insert and not connection_string:
        self.argparser.error('The --insert option is only valid when --db is also specified.')
    if self.args.no_create and not do_insert:
        # NOTE(review): message is missing the word "when".
        self.argparser.error('The --no-create option is only valid --insert is also specified.')

    # Establish database validity before reading CSV files
    if connection_string:
        try:
            engine, metadata = sql.get_connection(connection_string)
        except ImportError:
            raise ImportError('You don\'t appear to have the necessary database backend installed for connection string you\'re trying to use. Available backends include:\n\nPostgresql:\tpip install psycopg2\nMySQL:\t\tpip install MySQL-python\n\nFor details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at: \n\nhttp://www.sqlalchemy.org/docs/dialects/\n\n')
        conn = engine.connect()
        trans = conn.begin()

    for f in self.input_files:
        try:
            # Try to use name specified via --table
            table_name = table_names.pop(0)
        except IndexError:
            if f == sys.stdin:
                table_name = "stdin"
            else:
                # Use filename as table name
                table_name = os.path.splitext(os.path.split(f.name)[1])[0]

        csv_table = table.Table.from_csv(
            f,
            name=table_name,
            snifflimit=self.args.snifflimit,
            blanks_as_nulls=(not self.args.blanks),
            infer_types=(not self.args.no_inference),
            no_header_row=self.args.no_header_row,
            **self.reader_kwargs
        )
        f.close()

        if connection_string:
            sql_table = sql.make_table(
                csv_table,
                table_name,
                self.args.no_constraints,
                self.args.db_schema,
                metadata
            )

            # Create table
            if not self.args.no_create:
                sql_table.create()

            # Insert data
            if do_insert and csv_table.count_rows() > 0:
                insert = sql_table.insert()
                headers = csv_table.headers()
                conn.execute(insert, [dict(zip(headers, row)) for row in csv_table.to_rows()])
        # Output SQL statements
        else:
            sql_table = sql.make_table(csv_table, table_name, self.args.no_constraints)
            self.output_file.write('%s\n' % sql.make_create_table_statement(sql_table, dialect=self.args.dialect))

    if connection_string:
        if query:
            # Execute specified SQL queries
            queries = query.split(';')
            rows = None
            for q in queries:
                if q:
                    rows = conn.execute(q)

            # Output result of last query as CSV
            try:
                output = CSVKitWriter(self.output_file, **self.writer_kwargs)
                if not self.args.no_header_row:
                    output.writerow(rows._metadata.keys)
                for row in rows:
                    output.writerow(row)
            except AttributeError:
                # A query with no result set (e.g. DDL) has no rows to write.
                pass

        trans.commit()
        conn.close()
# Scrape committee reports and persist any unseen ones into the local SQLite
# 'reports' table, creating the table on first use.
report_scraper = ReportScraper(url_pattern=report_pattern)
report_scraper.cache_storage = scrapelib.cache.FileCache(cache_dir)
report_scraper.cache_write_only = False
conn = sqlite3.connect(DB_NAME)
c = conn.cursor()
for comm_url in comm_urls:
    for report_data in report_scraper.scrape_one(comm_url):
        # Tag each report with the committee id taken from the URL query string.
        comm_id = parse_qs(urlparse(comm_url).query)['id'][0]
        report_data['committee_id'] = comm_id
        # Round-trip the dict through CSV so csvkit can infer column types.
        outp = StringIO()
        writer = UnicodeCSVDictWriter(outp, fieldnames=report_data.keys())
        writer.writeheader()
        writer.writerow(report_data)
        outp.seek(0)
        t = Table.from_csv(outp, name='reports')
        sql_table = make_table(t)
        # Create the reports table lazily: probe it and build it on failure.
        try:
            c.execute('select * from reports limit 1')
        except sqlite3.OperationalError:
            create_st = make_create_table_statement(sql_table)
            c.execute(create_st)
            conn.commit()
        # Only insert reports we have not stored yet.
        c.execute('select * from reports where id = ?', (int(report_data['id']),))
        existing = c.fetchall()
        if not existing:
            insert = sql_table.insert()
            headers = t.headers()
            rows = [dict(zip(headers, row)) for row in t.to_rows()]
            for row in rows:
                c.execute(str(insert), row)
            conn.commit()
def process_csv_file(absolute_base_file, table_name_temp, new_table,
                     geom_table_name, geom_table_id, geom_table_columns,
                     geom_table_geom):
    """Import a CSV, stage it in a temporary PostgreSQL table, and join it
    onto an existing geometry table to produce `new_table` (with a serial
    primary key and btree/GiST indexes).

    Returns a tuple (errormsgs_val, status_code): ('', 200) on success, or an
    error message with status '400' / a raw exception string otherwise.
    """
    # Create table based on CSV
    import csv
    # NOTE(review): this handle is never closed explicitly.
    f = open(absolute_base_file, 'rb')
    no_header_row = False
    with open(absolute_base_file, 'rb') as csvfile:
        # get the type of delimiter
        dialect = csv.Sniffer().sniff(csvfile.read())
    try:
        csv_table = table.Table.from_csv(f,
                                         name=table_name_temp,
                                         no_header_row=no_header_row,
                                         delimiter=dialect.delimiter)
    except:
        status_code = '400'
        errormsgs_val = "Failed to create the table from CSV."
        return errormsgs_val, status_code
    for idx, column in enumerate(csv_table):
        # Normalize each header into a safe lowercase SQL identifier.
        column.name = slugify(unicode(column.name)).replace('-', '_')
        # Check if the selected value from the dropdown menu matches the first value of the CSV header
        if idx == 0:
            print("column.name.strip()", column.name.strip())
            print("geom_table_id.strip()", geom_table_id.strip())
            if column.name.strip() != geom_table_id.strip():
                errormsgs_val = "The selected value of Layer Type doesn't match the one of the imported layer."
                status_code = '400'
                return errormsgs_val, status_code
    # Check if there are added columns in the CSV
    # NOTE(review): `idx` is the index of the LAST column from the loop above;
    # idx < 2 means the CSV has no columns beyond the two base ones.
    if idx < 2:
        errormsgs_val = "The CSV has no added columns. Please add extra columns."
        status_code = '400'
        return errormsgs_val, status_code
    else:
        try:
            sql_table = sql.make_table(csv_table, table_name_temp)
            create_table_sql = sql.make_create_table_statement(
                sql_table, dialect="postgresql")
            # Cap every inferred VARCHAR length at 254.
            create_table_sql = re.sub(r'VARCHAR\([0-9]*\)', 'VARCHAR(254)',
                                      create_table_sql)
        except:
            return None, str(sys.exc_info()[0])
    constr = "dbname='{dbname}' user='******' host='{host}' password='******'".format(
        **{
            'dbname': settings.DATABASES['uploaded']['NAME'],
            'user': settings.DATABASES['uploaded']['USER'],
            'host': settings.DATABASES['uploaded']['HOST'],
            'password': settings.DATABASES['uploaded']['PASSWORD']
        })
    conn = psycopg2.connect(constr)
    try:
        # Check if there is already a table with the same name
        cur = conn.cursor()
        sqlstr = "SELECT EXISTS(SELECT * FROM information_schema.tables WHERE table_name='{new_table_name}');".format(
            **{'new_table_name': new_table})
        cur.execute(sqlstr)
        exists = cur.fetchone()[0]
        if exists:
            errormsgs_val = "There is already a layer with this name. Please choose another title."
            status_code = '400'
            return errormsgs_val, status_code
        # If temporary table exists then drop it - the create it and add primary key
        cur.execute('DROP TABLE IF EXISTS %s CASCADE;' % table_name_temp)
        cur.execute(create_table_sql)
        conn.commit()
        sqlstr = "ALTER TABLE IF EXISTS {temp_table} ADD COLUMN fid SERIAL PRIMARY KEY;".format(
            **{'temp_table': table_name_temp})
        cur.execute(sqlstr)
        conn.commit()
    except Exception as e:
        # NOTE(review): failure here is only logged; execution continues.
        logger.error("Error Creating Temporary table %s:%s", table_name_temp,
                     str(e))
    # Copy data to table
    connection_string = "postgresql://%s:%s@%s:%s/%s" % (
        settings.DATABASES['uploaded']['USER'],
        settings.DATABASES['uploaded']['PASSWORD'],
        settings.DATABASES['uploaded']['HOST'],
        settings.DATABASES['uploaded']['PORT'],
        settings.DATABASES['uploaded']['NAME'])
    try:
        engine, metadata = sql.get_connection(connection_string)
    except ImportError:
        return None, str(sys.exc_info()[0])
    conn_eng = engine.connect()
    trans = conn_eng.begin()
    if csv_table.count_rows() > 0:
        insert = sql_table.insert()
        headers = csv_table.headers()
        try:
            conn_eng.execute(
                insert,
                [dict(zip(headers, row)) for row in csv_table.to_rows()])
        except:
            return None, str(sys.exc_info()[0])
    trans.commit()
    conn_eng.close()
    # Create joined table - drop table_name_temp
    # NOTE(review): `headers` is only bound when count_rows() > 0 above --
    # an empty CSV would raise NameError here; confirm inputs are non-empty.
    new_clmns = []
    for idx, item in enumerate(headers):
        if (
                idx > 1
        ):  # The downloaded layer contains two columns from the global table, which do not include them again
            new_column = "{table_name}.{item}".format(**{
                'table_name': table_name_temp,
                'item': item
            })
            new_clmns.append(new_column)
    added_columns = ', '.join(new_clmns)
    try:
        # Joined table
        sqlstr = "CREATE TABLE {new_table_name} AS (SELECT {geom_table_columns}, {added_columns} FROM {geom_table} INNER JOIN {temp_table} ON (g.{id} = {temp_table}.{id}));".format(
            **{
                'new_table_name': new_table,
                'geom_table': geom_table_name,
                'geom_table_columns': geom_table_columns,
                'temp_table': table_name_temp,
                'id': geom_table_id,
                'added_columns': added_columns
            })
        cur.execute(sqlstr)
        conn.commit()
        sqlstr = "ALTER TABLE IF EXISTS {new_table_name} ADD COLUMN fid SERIAL PRIMARY KEY;".format(
            **{'new_table_name': new_table})
        cur.execute(sqlstr)
        conn.commit()
        sqlstr = "CREATE INDEX indx_{new_table_name} ON {new_table_name} USING btree({id});".format(
            **{
                'new_table_name': new_table,
                'id': geom_table_id,
            })
        cur.execute(sqlstr)
        conn.commit()
        sqlstr = "CREATE INDEX indx_geom_{new_table_name} ON {new_table_name} USING GIST({geom});".format(
            **{
                'new_table_name': new_table,
                'geom': geom_table_geom,
            })
        cur.execute(sqlstr)
        conn.commit()
    except:
        print "Failed to create joined table."
        logger.error("Failed to create joined table.")
    # Always attempt to clean up the staging table.
    try:
        sqlstr = "DROP TABLE IF EXISTS {temp_table} CASCADE;".format(
            **{'temp_table': table_name_temp})
        cur.execute(sqlstr)
        conn.commit()
    except:
        logger.error("Failed to drop temporary table.")
    conn.close()
    status_code = 200
    errormsgs_val = ''
    return errormsgs_val, status_code
def process_csv_file(data_table, is_dataverse_db, delimiter=",",
                     no_header_row=False, force_char_column=None):
    """
    Transform a csv file and add it to the postgres DataStore.

    :param data_table: DataTable instance whose uploaded_file is read,
        described (DataTableAttribute rows), and copied into postgres.
    :param is_dataverse_db: passed through to the datastore connection
        helpers to select the target database.
    :param delimiter: field delimiter used to parse the csv file.
    :param no_header_row: if True, the csv file has no header row.
    :param force_char_column: optional column to coerce to a character
        type (see force_csv_column_tochar).
    :return: success: (data_table, "")
             err:     (None, error message) -- on failure the DataTable
             (and, via cascade, its DataTableAttribute rows) is deleted.
    """
    assert isinstance(data_table, DataTable), \
        "instance must be a DataTable object"

    # Full path to the uploaded csv file
    csv_filename = data_table.uploaded_file.path

    # -----------------------------------------------------
    # Standardize table_name for the DataTable
    # -----------------------------------------------------
    if data_table.id is not None:
        # This DataTable was already saved and has a unique name
        table_name = data_table.table_name
    else:
        # Derive a unique table name from the csv file name
        table_name = os.path.splitext(os.path.basename(csv_filename))[0]
        table_name = get_unique_tablename(table_name)
        data_table.table_name = table_name
        data_table.save()

    # -----------------------------------------------------
    # Transform csv file to csvkit Table
    # -----------------------------------------------------
    csv_file_handle = open(csv_filename, 'rb')
    try:
        csv_table = table.Table.from_csv(csv_file_handle,
                                         name=table_name,
                                         no_header_row=no_header_row,
                                         delimiter=delimiter)
    except Exception:
        # FIX: close the handle on this path too (original leaked it)
        csv_file_handle.close()
        data_table.delete()
        err_msg = str(sys.exc_info()[0])
        LOGGER.error('Failed to convert csv file to table. Error: %s',
                     err_msg)
        return None, err_msg
    csv_file_handle.close()

    # -----------------------------------------------------
    # If needed, force a column to be character
    # -----------------------------------------------------
    csv_table = force_csv_column_tochar(csv_table, force_char_column)

    # -----------------------------------------------------
    # Create DataTableAttribute objects, one per header column
    # -----------------------------------------------------
    try:
        for column in csv_table:
            column.name = standardize_column_name(column.name)

            # Columns csvkit could not name are kept but hidden
            is_visible = (column.name != '_unnamed')

            attribute, created = DataTableAttribute.objects.get_or_create(
                datatable=data_table,
                attribute=column.name,
                attribute_label=column.name,
                attribute_type=column.type.__name__,
                display_order=column.order,
                visible=is_visible)
    except Exception:
        # Deleting DataTable also deletes related DataTableAttribute objects
        data_table.delete()
        err_msg = 'Failed to convert csv file to table. Error: %s' % str(
            sys.exc_info()[0])
        LOGGER.error(err_msg)
        return None, err_msg

    msg('process_csv_file 3')

    # -----------------------------------------------------
    # Generate SQL to create table from csv file
    # -----------------------------------------------------
    try:
        sql_table = sql.make_table(csv_table, table_name)
        create_table_sql = sql.make_create_table_statement(
            sql_table, dialect="postgresql")
        data_table.create_table_sql = create_table_sql
        data_table.save()
    except Exception:
        data_table.delete()
        err_msg = 'Failed to generate SQL to create table from csv file.' \
                  ' Error: %s' % str(sys.exc_info()[0])
        LOGGER.error(err_msg)
        return None, err_msg

    msg('process_csv_file 4')

    # -----------------------------------------------------
    # Execute the SQL and Create the Table (No data is loaded)
    # -----------------------------------------------------
    conn = psycopg2.connect(
        get_datastore_connection_string(is_dataverse_db=is_dataverse_db))
    try:
        cur = conn.cursor()
        # table_name comes from get_unique_tablename / an earlier save,
        # not directly from user input -- but it is interpolated into SQL,
        # so it must stay sanitized upstream.
        cur.execute('drop table if exists %s CASCADE;' % table_name)
        cur.execute(create_table_sql)
        conn.commit()
        cur.close()
    except Exception as e:
        # FIX: print_exc() takes (limit, file); the original passed the
        # exc_info tuple, which is not a valid argument.
        traceback.print_exc()
        # NOTE(review): original read data_table.name here -- the model
        # field visible in this function is table_name; confirm the model
        # actually defines .name if reverting.
        err_msg = "Error Creating table %s:%s" % (data_table.name, str(e))
        LOGGER.error(err_msg)
        return None, err_msg
    finally:
        conn.close()

    # -----------------------------------------------------
    # Copy Data to postgres csv data to Postgres
    # -----------------------------------------------------
    connection_string = get_datastore_connection_string(
        url_format=True, is_dataverse_db=is_dataverse_db)
    try:
        engine, metadata = sql.get_connection(connection_string)
    except ImportError:
        err_msg = ("Failed to get SQL connection"
                   "for copying csv data to database."
                   "\n{0}".format(str(sys.exc_info()[0])))
        LOGGER.error(err_msg)
        return None, err_msg

    # -----------------------------------------------------
    # Iterate through rows and add data
    # -----------------------------------------------------
    conn = engine.connect()
    trans = conn.begin()
    if csv_table.count_rows() > 0:
        insert = sql_table.insert()      # Generate insert statement
        headers = csv_table.headers()    # Pull table headers
        try:
            # create rows of { column : value } dict's
            rows_to_add = [dict(zip(headers, row))
                           for row in csv_table.to_rows()]
            # Add rows in one executemany-style call
            conn.execute(insert, rows_to_add)
        except Exception:
            # Clean up after ourselves
            conn.close()
            csv_file_handle.close()
            # FIX: original called instance.delete() -- 'instance' does not
            # exist in this scope (NameError); the object to remove is
            # data_table.
            data_table.delete()
            err_msg = "Failed to add csv DATA to table %s.\n%s" % \
                      (table_name, (sys.exc_info()[0]))
            LOGGER.error(err_msg)
            return None, err_msg

    # Commit new rows and close connection
    trans.commit()
    conn.close()
    csv_file_handle.close()

    return data_table, ""
def csv_to_table2(): # -------------------------------------- msgt('(3) csvkit to table reformat') # -------------------------------------- fh = open(t4_out, 'rb') csv_args = dict(delimiter='\t',\ quotechar='"') csv_table = table.Table.from_csv(f=fh,\ name='tname',\ snifflimit=None,\ ) print [c.name for c in csv_table] last_col = csv_table[-1] last_col.type = unicode for idx, val in enumerate(last_col): last_col[idx] = '%s' % val #last_col = ['%s' % x for x in last_col] #print last_col[0] msg(csv_table.to_rows()) print [ '%s, %s' % (c.name, c.type) for c in csv_table] return print 'last_col', last_col.order col_num = len(csv_table) print 'col_num', col_num quoted_data = [u'"%s"' % val for val in last_col] print 'quoted_data', quoted_data #return new_column = table.Column(order=last_col.order,\ name=last_col.name,\ l=quoted_data,\ #normal_type=None,\ ) #normal_type=None) csv_table.pop(-1) csv_table.append(new_column) sql_table = csvkit_sql.make_table(csv_table, 'new_table') create_table_sql = csvkit_sql.make_create_table_statement(sql_table, dialect="postgresql") msg('create_table_sql: %s' % create_table_sql) msg(csv_table.to_rows()) return msgt('new_column') msg(new_column) print new_column.name for val in new_column: print val #print len(new_column) """
#return new_column = table.Column(order=last_col.order,\ name=last_col.name,\ l=quoted_data,\ #normal_type=None,\ ) #normal_type=None) csv_table.pop(-1) csv_table.append(new_column) sql_table = csvkit_sql.make_table(csv_table, 'new_table') create_table_sql = csvkit_sql.make_create_table_statement(sql_table, dialect="postgresql") msg('create_table_sql: %s' % create_table_sql) msg(csv_table.to_rows()) return msgt('new_column') msg(new_column) print new_column.name for val in new_column: print val #print len(new_column) """ print csv_table.columns for col in csv_table:
# Scrape committee reports and load them into a local sqlite database.
# Each scraped report dict is round-tripped through an in-memory one-row
# CSV so csvkit can infer a column schema for the sqlite table.
# NOTE(review): report_pattern, cache_dir, comm_urls and DB_NAME are
# defined earlier in the file (outside this fragment).
report_scraper = ReportScraper(url_pattern=report_pattern)
# Cache fetched pages on disk; allow reads from the cache as well
report_scraper.cache_storage = scrapelib.cache.FileCache(cache_dir)
report_scraper.cache_write_only = False
conn = sqlite3.connect(DB_NAME)
c = conn.cursor()
for comm_url in comm_urls:
    for report_data in report_scraper.scrape_one(comm_url):
        # Committee id comes from the 'id' query parameter of the URL
        comm_id = parse_qs(urlparse(comm_url).query)['id'][0]
        report_data['committee_id'] = comm_id
        # Write the single report as a one-row CSV (header + row) so
        # csvkit's Table.from_csv can infer column names and types
        outp = StringIO()
        writer = UnicodeCSVDictWriter(outp, fieldnames=report_data.keys())
        writer.writeheader()
        writer.writerow(report_data)
        outp.seek(0)
        t = Table.from_csv(outp, name='reports')
        sql_table = make_table(t)
        try:
            # Probe for the table; create it lazily on first use
            c.execute('select * from reports limit 1')
        except sqlite3.OperationalError:
            create_st = make_create_table_statement(sql_table)
            c.execute(create_st)
            conn.commit()
        # Skip reports already stored (dedupe on the report id)
        c.execute('select * from reports where id = ?', (int(report_data['id']), ))
        existing = c.fetchall()
        if not existing:
            insert = sql_table.insert()
            headers = t.headers()
            rows = [dict(zip(headers, row)) for row in t.to_rows()]
            for row in rows:
                # str(insert) renders the SQLAlchemy insert as SQL with
                # named placeholders; sqlite3 binds them from the dict
                c.execute(str(insert), row)