def __init__(self, file_path, filename):
    """Validate an uploaded file, convert it to CSV text, and enforce limits.

    Raises DedupeFileError for unsupported formats, undecodable content,
    or files with more than 10,000 rows.
    """
    # One timestamp so all messages reported for this upload agree.
    now = datetime.now().isoformat()
    self.file_path = file_path
    self.filename = filename
    # Format is guessed from the file NAME (extension), not the contents.
    self.file_type = convert.guess_format(self.filename)
    if self.file_type not in ["xls", "csv", "xlsx"]:
        # `client` is an optional error-reporting client (presumably
        # Sentry/Raven — confirm); only report when it is configured.
        if client:
            client.captureMessage(" %s Unsupported Format: %s, (%s)" % (now, self.file_type, self.filename))
        raise DedupeFileError("%s is not a supported format" % self.file_type)
    try:
        self.converted = convert.convert(open(self.file_path, "rb"), self.file_type)
    except UnicodeDecodeError:
        if client:
            client.captureException()
        raise DedupeFileError(
            "We had a problem with the file you uploaded. \
            This might be related to encoding or the file name having the wrong file extension."
        )
    # Rough row count: one row per newline in the converted CSV text.
    self.line_count = self.converted.count("\n")
    if self.line_count > 10000:
        if client:
            client.captureMessage(" %s File too big: %s, (%s)" % (now, self.line_count, self.filename))
        raise DedupeFileError("Your file has %s rows and we can only currently handle 10,000." % self.line_count)
    if client:
        client.captureMessage(" %s Format: %s, Line Count: %s" % (now, self.file_type, self.line_count))
def process_file(self):
    '''
    Here we will see if the input file is CSV, or if it is an understood
    format that can be converted to CSV. Assuming it's one of those two, we
    will pass the resulting CSV file over to the csv processor.
    '''
    # Scan candidates in order; the first usable file wins.
    for this_filename in self.filelist:
        logger.debug('Filename processing is %s', this_filename)
        # Format guessed from the file name extension.
        self.format = convert.guess_format(this_filename)
        logger.debug('Guessed format of %s', self.format)
        if self.format == 'csv':
            # Already CSV: use the file as-is and stop scanning.
            self.filename = this_filename
            break
        elif self.format:
            # If it is not a CSV file, but some other
            # understood format, we will convert it to a CSV and
            # write it out to a temporary file.
            fh, self.temp_file = tempfile.mkstemp(suffix='.csv')
            # mkstemp returns an open OS-level descriptor; close it since
            # we reopen the path by name below.
            os.close(fh)
            self.filename = self.temp_file
            try:
                logger.debug(
                    'Attempting to convert to format CSV (from %s)',
                    self.format)
                with open(self.temp_file, 'w') as fh:
                    fh.write(
                        convert.convert(open(this_filename, 'rb'),
                                        self.format))
                break
            # Python 2 except syntax.
            except Exception, e:
                logger.exception('Failed to process %s to CSV: %s',
                                 self.filename, e)
                # Conversion failed: remove the temp file and signal
                # "no usable file" to the caller.
                os.unlink(self.filename)
                self.filename = None
def upload():
    """Accept an uploaded file, normalize it to CSV, and start a dedupe session.

    Reads the file from the request, converts non-CSV formats to CSV,
    slugifies the header row, records a new DedupeSession, writes the raw
    CSV to /tmp, and queues the async initializeSession task.

    Returns:
        A JSON response with ``ready=True`` and the new session id.
    """
    session_id = unicode(uuid4())
    f = request.files['input_file']
    flask_session['session_name'] = f.filename
    # Split on the LAST dot so names like "my.data.csv" yield "csv".
    # (The previous rsplit('.')[1] split on every dot and returned the
    # wrong segment for file names containing more than one dot.)
    file_type = f.filename.rsplit('.', 1)[1]
    u = StringIO(f.read())
    u.seek(0)
    if file_type != 'csv':  # pragma: no cover
        # Non-CSV input: convert it to CSV text in memory.
        file_format = convert.guess_format(flask_session['session_name'])
        u = StringIO(convert.convert(u, file_format))
    # First line is the header row; slugify each field name.
    fieldnames = [
        slugify(unicode(i)) for i in u.next().strip('\r\n').split(',')
    ]
    flask_session['fieldnames'] = fieldnames
    user_id = flask_session['user_id']
    user = db_session.query(User).get(user_id)
    group = user.groups[0]
    sess = DedupeSession(
        id=session_id,
        name=request.form.get('name'),
        description=request.form.get('description'),
        filename=f.filename,
        group=group,
        status=STATUS_LIST[0]['machine_name'])
    db_session.add(sess)
    db_session.commit()
    u.seek(0)
    # Persist the raw CSV so the async task can pick it up.
    with open('/tmp/%s_raw.csv' % session_id, 'wb') as s:
        s.write(u.getvalue())
    del u
    initializeSession.delay(session_id)
    flask_session['session_id'] = session_id
    return jsonify(ready=True, session_id=session_id)
def process_file(self):
    '''
    Here we will see if the input file is CSV, or if it is an understood
    format that can be converted to CSV. Assuming it's one of those two, we
    will pass the resulting CSV file over to the csv processor.
    '''
    # Scan candidates in order; the first usable file wins.
    for this_filename in self.filelist:
        logger.debug('Filename processing is %s', this_filename)
        # Format guessed from the file name extension.
        self.format=convert.guess_format(this_filename)
        logger.debug('Guessed format of %s', self.format)
        if self.format == 'csv':
            # Already CSV: use the file as-is and stop scanning.
            self.filename=this_filename
            break
        elif self.format:
            # If it is not a CSV file, but some other
            # understood format, we will convert it to a CSV and
            # write it out to a temporary file.
            fh, self.temp_file=tempfile.mkstemp(suffix='.csv')
            # mkstemp returns an open descriptor; close it since the path
            # is reopened by name below.
            os.close(fh)
            self.filename=self.temp_file
            try:
                logger.debug('Attempting to convert to format CSV (from %s)', self.format)
                with open(self.temp_file,'w') as fh:
                    fh.write(convert.convert(open(this_filename,'rb'), self.format))
                break
            # Python 2 except syntax.
            except Exception, e:
                logger.exception('Failed to process %s to CSV: %s', self.filename, e)
                # Conversion failed: drop the temp file and signal
                # "no usable file" to the caller.
                os.unlink(self.filename)
                self.filename=None
def upload():
    """Flask view: accept an uploaded spreadsheet/CSV, convert it to CSV,
    sample up to 100 rows per column, guess a geo-type per column, and
    stash the results in the session before redirecting to the geography
    picker.  Validation failures are collected in ``context['errors']``.
    """
    context = {}
    if request.method == 'POST':
        big_file = False
        try:
            files = request.files
        # Python 2 except syntax; raised when the body exceeds the
        # configured request size limit.
        except RequestEntityTooLarge, e:
            files = None
            big_file = True
            current_app.logger.info(e)
        if files:
            f = files['input_file']
            if allowed_file(f.filename):
                inp = StringIO(f.read())
                # Format guessed from the file name extension.
                file_format = convert.guess_format(f.filename)
                try:
                    converted = convert.convert(inp, file_format)
                except UnicodeDecodeError:
                    context['errors'] = [
                        'We had a problem with reading your file. \
                        This could have to do with the file encoding or format'
                    ]
                    converted = None
                    # assumes the rewind belongs to the failure path —
                    # TODO confirm original indentation
                    f.seek(0)
                if converted:
                    outp = StringIO(converted)
                    reader = UnicodeCSVReader(outp)
                    # First row is the header.
                    session['header_row'] = reader.next()
                    rows = []
                    columns = [[] for c in session['header_row']]
                    column_ids = range(len(session['header_row']))
                    # Sample at most the first 100 data rows.
                    for row in range(100):
                        try:
                            rows.append(reader.next())
                        except StopIteration:
                            break
                    # Pivot sampled rows into per-column value lists.
                    for i, row in enumerate(rows):
                        for j, d in enumerate(row):
                            columns[j].append(row[column_ids[j]])
                    sample_data = []
                    guesses = {}
                    for index, header_val in enumerate(session['header_row']):
                        guesses[index] = guess_geotype(header_val, columns[index])
                        sample_data.append((index, header_val, columns[index]))
                    session['sample_data'] = sample_data
                    session['guesses'] = json.dumps(guesses)
                    outp.seek(0)
                    session['file'] = outp.getvalue()
                    session['filename'] = f.filename
                    return redirect(url_for('views.select_geo'))
            else:
                context['errors'] = [
                    'Only .xls or .xlsx and .csv files are allowed.'
                ]
        else:
            context['errors'] = ['You must provide a file to upload.']
        # Oversize upload overrides any earlier error message.
        if big_file:
            context['errors'] = ['Uploaded file must be 10mb or less.']
    # NOTE(review): no return statement is visible at the end of this view;
    # presumably a render of `context` follows — confirm nothing was lost.
def main(self):
    """Resolve the input format, open the input file, and write its
    contents to the output file as CSV."""
    # Format precedence: explicit --format flag, then flags that imply a
    # format (--schema => fixed, --key => json), then a guess from the
    # input file name.
    if self.args.filetype:
        fmt = self.args.filetype
        if fmt not in convert.SUPPORTED_FORMATS:
            self.argparser.error('"%s" is not a supported format' % self.args.filetype)
    elif self.args.schema:
        fmt = 'fixed'
    elif self.args.key:
        fmt = 'json'
    else:
        if not self.args.input_path or self.args.input_path == '-':
            self.argparser.error('You must specify a format when providing data via STDIN (pipe).')
        fmt = convert.guess_format(self.args.input_path)
        if not fmt:
            self.argparser.error('Unable to automatically determine the format of the input file. Try specifying a format with --format.')

    # Excel files are binary and must be opened in 'rb' mode; everything
    # else goes through the generic opener.
    if fmt in ('xls', 'xlsx'):
        self.input_file = open(self.args.input_path, 'rb')
    else:
        self.input_file = self._open_input_file(self.args.input_path)

    # Alias (not a copy) of the shared reader options dict.
    kwargs = self.reader_kwargs
    if self.args.schema:
        kwargs['schema'] = self._open_input_file(self.args.schema)
    if self.args.key:
        kwargs['key'] = self.args.key
    if self.args.snifflimit:
        kwargs['snifflimit'] = self.args.snifflimit
    if self.args.sheet:
        kwargs['sheet'] = self.args.sheet
    if self.args.no_inference:
        kwargs['type_inference'] = False
    if fmt == 'csv' and self.args.no_header_row:
        kwargs['no_header_row'] = True
    # Fixed width can be processed as a stream
    if fmt == 'fixed':
        kwargs['output'] = self.output_file

    self.output_file.write(convert.convert(self.input_file, fmt, **kwargs))
def main(self):
    """Resolve the input format, open the input, and write converted CSV
    to the output file."""
    # Format precedence: explicit --format flag, then flags that imply a
    # format (--schema => fixed, --key => json), then a guess from the
    # input file name.
    if self.args.format:
        format = self.args.format
        if format not in convert.SUPPORTED_FORMATS:
            self.argparser.error('"%s" is not a supported format' % self.args.format)
    elif self.args.schema:
        format = 'fixed'
    elif self.args.key:
        format = 'json'
    else:
        if self.args.file == sys.stdin:
            self.argparser.error(
                'You must specify a format when providing data via STDIN (pipe).'
            )
        format = convert.guess_format(self.args.file)
        if not format:
            self.argparser.error(
                'Unable to automatically determine the format of the input file. Try specifying a format with --format.'
            )
    # NOTE: `file` is the Python 2 built-in type; this distinguishes an
    # already-open handle from a path string.
    if isinstance(self.args.file, file):
        f = self.args.file
    elif format in ('xls', 'xlsx'):
        # Excel formats are binary and must be opened in binary mode.
        f = open(self.args.file, 'rb')
    else:
        f = open(self.args.file, 'rU')
    # Alias (not a copy): additions below mutate the shared dict.
    kwargs = self.reader_kwargs
    if self.args.schema:
        kwargs['schema'] = self.args.schema
    if self.args.key:
        kwargs['key'] = self.args.key
    if self.args.snifflimit:
        kwargs['snifflimit'] = self.args.snifflimit
    if self.args.sheet:
        kwargs['sheet'] = self.args.sheet
    if self.args.no_inference:
        kwargs['type_inference'] = False
    if format == 'csv' and self.args.no_header_row:
        kwargs['no_header_row'] = True
    # Fixed width can be processed as a stream
    if format == 'fixed':
        kwargs['output'] = self.output_file
    self.output_file.write(convert.convert(f, format, **kwargs))
def main(self):
    """Resolve the input format, open the input, convert it, and write the
    result to the output file."""
    # Format precedence: explicit --format flag, then flags that imply a
    # format (--schema => fixed, --key => json), then a guess from the
    # input file name.
    if self.args.filetype:
        filetype = self.args.filetype
        if filetype not in convert.SUPPORTED_FORMATS:
            self.argparser.error('"%s" is not a supported format' % self.args.filetype)
    elif self.args.schema:
        filetype = 'fixed'
    elif self.args.key:
        filetype = 'json'
    else:
        if not self.args.input_path or self.args.input_path == '-':
            self.argparser.error('You must specify a format when providing data via STDIN (pipe).')
        filetype = convert.guess_format(self.args.input_path)
        if not filetype:
            self.argparser.error('Unable to automatically determine the format of the input file. Try specifying a format with --format.')
    # Excel formats are binary and must be opened in binary mode.
    if filetype in ('xls', 'xlsx'):
        self.input_file = open(self.args.input_path, 'rb')
    else:
        self.input_file = self._open_input_file(self.args.input_path)
    # Alias (not a copy): additions below mutate the shared dict.
    kwargs = self.reader_kwargs
    if self.args.schema:
        kwargs['schema'] = self._open_input_file(self.args.schema)
    if self.args.key:
        kwargs['key'] = self.args.key
    if self.args.snifflimit:
        kwargs['snifflimit'] = self.args.snifflimit
    if self.args.export:
        kwargs['export'] = self.args.export
    if self.args.sheet:
        kwargs['sheet'] = self.args.sheet
    if self.args.no_inference:
        kwargs['type_inference'] = False
    if filetype == 'csv' and self.args.no_header_row:
        kwargs['no_header_row'] = True
    # Fixed width can be processed as a stream
    if filetype == 'fixed':
        kwargs['output'] = self.output_file
    data = convert.convert(self.input_file, filetype, **kwargs)
    self.output_file.write(data)
def main(self):
    """Convert ``self.file_name`` to CSV and write the result to the path
    held in ``self.output_file``.

    Errors are reported by assigning a translated message to
    ``self.csvResult`` (read by the caller) rather than raising.
    """
    if self.filetype:
        filetype = self.filetype
        if filetype not in convert.SUPPORTED_FORMATS:
            self.csvResult = (_('"%s" is not a supported format') % self.filetype)
            # Bug fix: previously execution fell through and attempted
            # the conversion anyway after reporting the error.
            return
    else:
        if not self.file_name or self.file_name == '-':
            self.csvResult = _('You must specify a format.')
            return
        filetype = convert.guess_format(self.file_name)
        if not filetype:
            self.csvResult = _('Unable to automatically determine the format of the input file. Try specifying a format with --format.')
            return
    if filetype in ('xls', 'xlsx'):
        # Excel formats are binary and must be opened in binary mode.
        self.input_file = open(self.file_name, 'rb')
    else:
        # Bug fix: this class tracks its input as self.file_name; the old
        # code referenced self.args.input_path, which this class does not
        # define in this code path.
        self.input_file = self._open_input_file(self.file_name)
    kwargs = self.reader_kwargs
    data = convert.convert(self.input_file, filetype, **kwargs)
    # Use a context manager so the output handle is closed (the old code
    # leaked it); debug `print` statements removed.
    with open(self.output_file, 'w') as out_file_opened:
        out_file_opened.write(data)
def upload():
    """Flask view: accept an uploaded spreadsheet/CSV, convert it to CSV,
    sample up to 100 rows per column, guess a geo-type per column, and
    stash the results in the session before redirecting to the geography
    picker.  Validation failures are collected in ``context['errors']``.
    """
    context = {}
    if request.method == 'POST':
        big_file = False
        try:
            files = request.files
        # Python 2 except syntax; raised when the body exceeds the
        # configured request size limit.
        except RequestEntityTooLarge, e:
            files = None
            big_file = True
            current_app.logger.info(e)
        if files:
            f = files['input_file']
            if allowed_file(f.filename):
                inp = StringIO(f.read())
                # Format guessed from the file name extension.
                file_format = convert.guess_format(f.filename)
                try:
                    converted = convert.convert(inp, file_format)
                except UnicodeDecodeError:
                    context['errors'] = ['We had a problem with reading your file. \
                        This could have to do with the file encoding or format']
                    converted = None
                    # assumes the rewind belongs to the failure path —
                    # TODO confirm original indentation
                    f.seek(0)
                if converted:
                    outp = StringIO(converted)
                    reader = UnicodeCSVReader(outp)
                    # First row is the header.
                    session['header_row'] = reader.next()
                    rows = []
                    columns = [[] for c in session['header_row']]
                    column_ids = range(len(session['header_row']))
                    # Sample at most the first 100 data rows.
                    for row in range(100):
                        try:
                            rows.append(reader.next())
                        except StopIteration:
                            break
                    # Pivot sampled rows into per-column value lists.
                    for i, row in enumerate(rows):
                        for j,d in enumerate(row):
                            columns[j].append(row[column_ids[j]])
                    sample_data = []
                    guesses = {}
                    for index, header_val in enumerate(session['header_row']):
                        guesses[index] = guess_geotype(header_val, columns[index])
                        sample_data.append((index, header_val, columns[index]))
                    session['sample_data'] = sample_data
                    session['guesses'] = json.dumps(guesses)
                    outp.seek(0)
                    session['file'] = outp.getvalue()
                    session['filename'] = f.filename
                    return redirect(url_for('views.select_geo'))
            else:
                context['errors'] = ['Only .xls or .xlsx and .csv files are allowed.']
        else:
            context['errors'] = ['You must provide a file to upload.']
        # Oversize upload overrides any earlier error message.
        if big_file:
            context['errors'] = ['Uploaded file must be 10mb or less.']
    # NOTE(review): no return statement is visible at the end of this view;
    # presumably a render of `context` follows — confirm nothing was lost.
def upload():
    """Flask view: accept an uploaded file, convert it to CSV, build a
    10-row sample per column, and stash the data in the session before
    redirecting to the geography picker.  Validation failures are
    collected in ``context['errors']`` and re-rendered on the form."""
    context = {}
    if request.method == 'POST':
        f = request.files['input_file']
        if f:
            if allowed_file(f.filename):
                inp = StringIO(f.read())
                # Enforce the size cap on the in-memory payload.
                if sys.getsizeof(inp.getvalue()) <= MAX_CONTENT_LENGTH:
                    inp.seek(0)
                    # Format guessed from the file name extension.
                    file_format = convert.guess_format(f.filename)
                    try:
                        converted = convert.convert(inp, file_format)
                    except UnicodeDecodeError:
                        context['errors'] = [
                            'We had a problem with reading your file. \
                            This could have to do with the file encoding or format'
                        ]
                        converted = None
                        # assumes the rewind belongs to the failure path —
                        # TODO confirm original indentation
                        f.seek(0)
                    if converted:
                        outp = StringIO(converted)
                        reader = UnicodeCSVReader(outp)
                        # First row is the header.
                        session['header_row'] = reader.next()
                        rows = []
                        columns = [[] for c in session['header_row']]
                        column_ids = range(len(session['header_row']))
                        # Sample at most the first 10 data rows.
                        for row in range(10):
                            try:
                                rows.append(reader.next())
                            except StopIteration:
                                break
                        # Pivot sampled rows into per-column value lists.
                        for i, row in enumerate(rows):
                            for j, d in enumerate(row):
                                columns[j].append(row[column_ids[j]])
                        # Collapse each column's samples to one display string.
                        columns = [', '.join(c) for c in columns]
                        sample_data = []
                        for index, _ in enumerate(session['header_row']):
                            sample_data.append(
                                (index, session['header_row'][index], columns[index]))
                        session['sample_data'] = sample_data
                        outp.seek(0)
                        session['file'] = outp.getvalue()
                        session['filename'] = f.filename
                        return redirect(url_for('views.select_geo'))
                else:
                    context['errors'] = ['Uploaded file must be 10mb or less.']
            else:
                context['errors'] = [
                    'Only .xls or .xlsx and .csv files are allowed.'
                ]
        else:
            context['errors'] = ['You must provide a file to upload.']
    return render_template('upload.html', **context)
def __init__(self, incoming_file):
    """Wrap an uploaded file: tee its chunk stream, then detect its
    format, encoding, and field names."""
    self.file = incoming_file
    # Duplicate the chunk generator: one copy is kept for later encoding
    # detection, the other supplies the first chunk for field-name sniffing.
    chunk_stream, header_stream = itertools.tee(self.file.chunks())
    self.chunks = chunk_stream
    self.first_chunk = next(header_stream)
    # Format is guessed from the lower-cased file name.
    self.file_type = guess_format(self.file.name.lower())
    self.file_encoding = self._file_encoding()
    self.field_names = self._field_names()
def convert_to_csv_reader(filename, sheet=None, infer_types=True):
    """Open *filename*, convert it to CSV in memory, and return a
    UnicodeCSVReader over the converted text.

    Args:
        filename: path to the input file; format is guessed from its name.
        sheet: optional worksheet selector forwarded to csvkit.
        infer_types: whether csvkit should infer column types.
    """
    format = convert.guess_format(filename)
    convert_kwargs = {}
    if sheet is not None:
        # Only pass `sheet` to the `convert` function when its set to
        # a non-None value. This is done to satisfy csvkit which checks
        # for the presence of `sheet`, not whether it's valid.
        convert_kwargs['sheet'] = sheet
    # Close the input handle once converted (the old code leaked it).
    with open(filename, "rb") as f:
        converted = StringIO(convert.convert(f, format,
                                             infer_types=infer_types,
                                             **convert_kwargs))
    reader = UnicodeCSVReader(converted)
    return reader
def main(self):
    """Resolve the input format, open the input, and write converted CSV
    to the output file."""
    # Format precedence: explicit --format flag, then flags that imply a
    # format (--schema => fixed, --key => json), then a guess from the
    # input file name.
    if self.args.format:
        format = self.args.format
        if format not in convert.SUPPORTED_FORMATS:
            self.argparser.error('"%s" is not a supported format' % self.args.format)
    elif self.args.schema:
        format = 'fixed'
    elif self.args.key:
        format = 'json'
    else:
        if self.args.file == sys.stdin:
            self.argparser.error('You must specify a format when providing data via STDIN (pipe).')
        format = convert.guess_format(self.args.file)
        if not format:
            self.argparser.error('Unable to automatically determine the format of the input file. Try specifying a format with --format.')
    # NOTE: `file` is the Python 2 built-in type; this distinguishes an
    # already-open handle from a path string.
    if isinstance(self.args.file, file):
        f = self.args.file
    elif format in ('xls', 'xlsx'):
        # Excel formats are binary and must be opened in binary mode.
        f = open(self.args.file, 'rb')
    else:
        f = open(self.args.file, 'rU')
    # Alias (not a copy): additions below mutate the shared dict.
    kwargs = self.reader_kwargs
    if self.args.schema:
        kwargs['schema'] = self.args.schema
    if self.args.key:
        kwargs['key'] = self.args.key
    if self.args.snifflimit:
        kwargs['snifflimit'] = self.args.snifflimit
    if self.args.sheet:
        kwargs['sheet'] = self.args.sheet
    if self.args.no_inference:
        kwargs['type_inference'] = False
    if format == 'csv' and self.args.no_header_row:
        kwargs['no_header_row'] = True
    # Fixed width can be processed as a stream
    if format == 'fixed':
        kwargs['output'] = self.output_file
    self.output_file.write(convert.convert(f, format, **kwargs))
def __init__(self, file_path, filename):
    """Validate an uploaded file, convert it to CSV text, and enforce the
    10,000-row limit; raises DedupeFileError on any violation."""
    # One timestamp so every log line for this upload matches.
    timestamp = datetime.now().isoformat()
    self.file_path = file_path
    self.filename = filename
    # Format is guessed from the file name extension.
    self.file_type = convert.guess_format(self.filename)
    if self.file_type not in ('xls', 'csv', 'xlsx'):
        logger.warning(' %s Unsupported Format: %s, (%s)' % (timestamp, self.file_type, self.filename))
        raise DedupeFileError('%s is not a supported format' % self.file_type)
    self.converted = convert.convert(open(self.file_path, 'rb'), self.file_type)
    # Rough row count: one row per newline in the converted CSV text.
    self.line_count = self.converted.count('\n')
    if self.line_count > 10000:
        logger.warning(' %s File too big: %s, (%s)' % (timestamp, self.line_count, self.filename))
        raise DedupeFileError('Your file has %s rows and we can only currently handle 10,000.' % self.line_count)
    logger.warning(' %s Format: %s, Line Count: %s' % (timestamp, self.file_type, self.line_count))
def upload():
    """Flask view: accept an uploaded file, convert it to CSV, build a
    10-row sample per column, and stash the data in the session before
    redirecting to the geography picker.  Validation failures are
    collected in ``context['errors']`` and re-rendered on the form."""
    context = {}
    if request.method == 'POST':
        f = request.files['input_file']
        if f:
            if allowed_file(f.filename):
                inp = StringIO(f.read())
                # Enforce the size cap on the in-memory payload.
                if sys.getsizeof(inp.getvalue()) <= MAX_CONTENT_LENGTH:
                    inp.seek(0)
                    # Format guessed from the file name extension.
                    file_format = convert.guess_format(f.filename)
                    try:
                        converted = convert.convert(inp, file_format)
                    except UnicodeDecodeError:
                        context['errors'] = ['We had a problem with reading your file. \
                            This could have to do with the file encoding or format']
                        converted = None
                        # assumes the rewind belongs to the failure path —
                        # TODO confirm original indentation
                        f.seek(0)
                    if converted:
                        outp = StringIO(converted)
                        reader = UnicodeCSVReader(outp)
                        # First row is the header.
                        session['header_row'] = reader.next()
                        rows = []
                        columns = [[] for c in session['header_row']]
                        column_ids = range(len(session['header_row']))
                        # Sample at most the first 10 data rows.
                        for row in range(10):
                            try:
                                rows.append(reader.next())
                            except StopIteration:
                                break
                        # Pivot sampled rows into per-column value lists.
                        for i, row in enumerate(rows):
                            for j,d in enumerate(row):
                                columns[j].append(row[column_ids[j]])
                        # Collapse each column's samples to one display string.
                        columns = [', '.join(c) for c in columns]
                        sample_data = []
                        for index,_ in enumerate(session['header_row']):
                            sample_data.append((index, session['header_row'][index], columns[index]))
                        session['sample_data'] = sample_data
                        outp.seek(0)
                        session['file'] = outp.getvalue()
                        session['filename'] = f.filename
                        return redirect(url_for('views.select_geo'))
                else:
                    context['errors'] = ['Uploaded file must be 10mb or less.']
            else:
                context['errors'] = ['Only .xls or .xlsx and .csv files are allowed.']
        else:
            context['errors'] = ['You must provide a file to upload.']
    return render_template('upload.html', **context)
def main(self):
    """Resolve the input format and hand the data to convert.convert,
    which writes CSV to the output file."""
    # Format precedence: explicit --format flag, then flags that imply a
    # format (--schema => fixed, --key => json), then a guess from the
    # input file name.
    if self.args.filetype:
        fmt = self.args.filetype
        if fmt not in convert.SUPPORTED_FORMATS:
            self.argparser.error('"%s" is not a supported format' % self.args.filetype)
    elif self.args.schema:
        fmt = 'fixed'
    elif self.args.key:
        fmt = 'json'
    else:
        if not self.args.input_path or self.args.input_path == '-':
            self.argparser.error('You must specify a format when providing data via STDIN (pipe).')
        fmt = convert.guess_format(self.args.input_path)
        if not fmt:
            self.argparser.error('Unable to automatically determine the format of the input file. Try specifying a format with --format.')

    # Excel files are binary; open them directly. Anything else goes
    # through the shared opener.
    self.input_file = (open(self.args.input_path, 'rb')
                       if fmt in ('xls', 'xlsx')
                       else self._open_input_file(self.args.input_path))

    # Alias (not a copy) of the shared reader options dict.
    kwargs = self.reader_kwargs
    if self.args.schema:
        kwargs['schema'] = self._open_input_file(self.args.schema)
    if self.args.key:
        kwargs['key'] = self.args.key
    if self.args.sniff_limit:
        kwargs['sniff_limit'] = self.args.sniff_limit
    if self.args.sheet:
        kwargs['sheet'] = self.args.sheet
    if self.args.no_inference:
        # limit=0 disables agate's type inference entirely.
        kwargs['column_types'] = agate.TypeTester(limit=0)
    if fmt == 'csv' and self.args.no_header_row:
        kwargs['header'] = False

    convert.convert(self.input_file, fmt, output=self.output_file, **kwargs)
def main(self):
    """Resolve the input format, open the input, and write converted CSV
    to the output file."""
    # Format precedence: explicit --format flag, then flags that imply a
    # format (--schema => fixed, --key => json), then a guess from the
    # input file name.
    if self.args.format:
        format = self.args.format
        if format not in convert.SUPPORTED_FORMATS:
            self.argparser.error('"%s" is not a supported format' % self.args.format)
    elif self.args.schema:
        format = "fixed"
    elif self.args.key:
        format = "json"
    else:
        if self.args.file == sys.stdin:
            self.argparser.error("You must specify a format when providing data via STDIN (pipe).")
        format = convert.guess_format(self.args.file)
        if not format:
            self.argparser.error(
                "Unable to automatically determine the format of the input file. Try specifying a format with --format."
            )
    # NOTE: `file` is the Python 2 built-in type; this distinguishes an
    # already-open handle from a path string.
    if isinstance(self.args.file, file):
        f = self.args.file
    elif format == "xls":
        # xls is binary and must be opened in binary mode.
        f = open(self.args.file, "rb")
    else:
        f = open(self.args.file, "rU")
    # Alias (not a copy): additions below mutate the shared dict.
    kwargs = self.reader_kwargs
    if self.args.schema:
        kwargs["schema"] = self.args.schema
    if self.args.key:
        kwargs["key"] = self.args.key
    if self.args.snifflimit:
        kwargs["snifflimit"] = self.args.snifflimit
    # Fixed width can be processed as a stream
    if format == "fixed":
        kwargs["output"] = self.output_file
    self.output_file.write(convert.convert(f, format, **kwargs))
def main(self):
    """Resolve the input format, open the input, and write converted CSV
    to the output file.  Errors terminate the process via sys.exit."""
    # Format precedence: explicit --format flag, then flags that imply a
    # format (--schema => fixed, --key => json), then a guess from the
    # input file name.
    if self.args.format:
        format = self.args.format
        if format not in convert.SUPPORTED_FORMATS:
            sys.exit('"%s" is not a supported format' % self.args.format)
    elif self.args.schema:
        format = 'fixed'
    elif self.args.key:
        format = 'json'
    else:
        # argparse represents piped stdin with the '<stdin>' name here.
        if self.args.file == '<stdin>':
            sys.exit('You must specify a format when providing data via STDIN (pipe).')
        format = convert.guess_format(self.args.file)
        if not format:
            sys.exit('Unable to automatically determine the format of the input file. Try specifying a format with --format.')
    # NOTE: `file` is the Python 2 built-in type; this distinguishes an
    # already-open handle from a path string.
    if isinstance(self.args.file, file):
        f = self.args.file
    elif format == 'xls':
        # xls is binary and must be opened in binary mode.
        f = open(self.args.file, 'rb')
    else:
        f = open(self.args.file, 'rU')
    # Alias (not a copy): additions below mutate the shared dict.
    kwargs = self.reader_kwargs
    if self.args.schema:
        kwargs['schema'] = self.args.schema
    if self.args.key:
        kwargs['key'] = self.args.key
    if self.args.snifflimit:
        kwargs['snifflimit'] = self.args.snifflimit
    # Fixed width can be processed as a stream
    if format == 'fixed':
        kwargs['output'] = self.output_file
    self.output_file.write(convert.convert(f, format, **kwargs))
def __init__(self, file_path, filename):
    """Validate an uploaded file, convert it to CSV text, and enforce limits.

    Raises DedupeFileError for unsupported formats, undecodable content,
    or files with more than 10,000 rows.
    """
    # One timestamp so all messages reported for this upload agree.
    now = datetime.now().isoformat()
    self.file_path = file_path
    self.filename = filename
    # Format is guessed from the file NAME (extension), not the contents.
    self.file_type = convert.guess_format(self.filename)
    if self.file_type not in ['xls', 'csv', 'xlsx']:
        # `client` is an error-reporting client (presumably Sentry/Raven —
        # confirm); unlike sibling code, no None-guard here.
        client.captureMessage(' %s Unsupported Format: %s, (%s)' % (now, self.file_type, self.filename))
        raise DedupeFileError('%s is not a supported format' % self.file_type)
    try:
        self.converted = convert.convert(open(self.file_path, 'rb'), self.file_type)
    except UnicodeDecodeError:
        client.captureException()
        raise DedupeFileError('We had a problem with the file you uploaded. \
            This might be related to encoding or the file name having the wrong file extension.')
    # Rough row count: one row per newline in the converted CSV text.
    self.line_count = self.converted.count('\n')
    if self.line_count > 10000:
        client.captureMessage(' %s File too big: %s, (%s)' % (now, self.line_count, self.filename))
        raise DedupeFileError('Your file has %s rows and we can only currently handle 10,000.' % self.line_count)
    client.captureMessage(' %s Format: %s, Line Count: %s' % (now, self.file_type, self.line_count))
def test_guess_xls_uppercase(self):
    # Extension matching should be case-insensitive.
    guessed = convert.guess_format('testdata.XLS')
    self.assertEqual('xls', guessed)
def test_guess_xlsx(self):
    # .xlsx should be recognized as its own format, not plain xls.
    guessed = convert.guess_format('testdata.xlsx')
    self.assertEqual('xlsx', guessed)
def main(self):
    """Determine the input format, open the input, and convert it to CSV
    on the output file, using agate for tabular formats."""
    # Determine the file type.
    if self.args.filetype:
        filetype = self.args.filetype
        if filetype not in SUPPORTED_FORMATS:
            self.argparser.error('"%s" is not a supported format' % self.args.filetype)
    elif self.args.schema:
        filetype = 'fixed'
    elif self.args.key:
        filetype = 'json'
    else:
        if not self.args.input_path or self.args.input_path == '-':
            self.argparser.error(
                'You must specify a format when providing data via STDIN (pipe).'
            )
        filetype = convert.guess_format(self.args.input_path)
        if not filetype:
            self.argparser.error(
                'Unable to automatically determine the format of the input file. Try specifying a format with --format.'
            )
    # Buffer stdin when the input is CSV or when type inference needs to
    # re-read the data.
    self.buffers_input = filetype == 'csv' or not self.args.no_inference
    # Set the input file.
    if filetype in ('xls', 'xlsx'):
        # Excel formats are binary and must be opened in binary mode.
        self.input_file = open(self.args.input_path, 'rb')
    else:
        self.input_file = self._open_input_file(self.args.input_path)
    # Set the reader's arguments.
    kwargs = {}
    if self.args.schema:
        schema = self._open_input_file(self.args.schema)
    elif filetype == 'fixed':
        # Fixed-width requires a schema file to define column boundaries.
        raise ValueError('schema must not be null when format is "fixed"')
    if self.args.sheet:
        kwargs['sheet'] = self.args.sheet
    if filetype == 'csv':
        kwargs.update(self.reader_kwargs)
        # Streaming CSV musn't set sniff_limit, but non-streaming should.
        if not self.args.no_inference:
            kwargs['sniff_limit'] = self.args.sniff_limit
        if self.args.no_header_row:
            kwargs['header'] = False
    elif self.args.no_inference:
        # Streaming CSV musn't set column_types, but other formats should.
        kwargs['column_types'] = agate.TypeTester(limit=0)
    # Convert the file.
    if filetype == 'csv' and self.args.no_inference:
        # No inference: CSV can be streamed row by row without buffering.
        reader = agate.csv.reader(self.input_file, **self.reader_kwargs)
        writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
        writer.writerows(reader)
    elif filetype == 'fixed':
        self.output_file.write(
            fixed2csv(self.input_file, schema, output=self.output_file, **kwargs))
    elif filetype == 'geojson':
        self.output_file.write(geojson2csv(self.input_file, **kwargs))
    elif filetype in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
        # Tabular formats are loaded into an agate.Table, then re-emitted
        # as CSV.
        if filetype == 'csv':
            table = agate.Table.from_csv(self.input_file, **kwargs)
        elif filetype == 'json':
            table = agate.Table.from_json(self.input_file, key=self.args.key, **kwargs)
        elif filetype == 'ndjson':
            table = agate.Table.from_json(self.input_file, key=self.args.key, newline=True, **kwargs)
        elif filetype == 'xls':
            table = agate.Table.from_xls(self.input_file, sheet=kwargs.get('sheet'))
        elif filetype == 'xlsx':
            table = agate.Table.from_xlsx(self.input_file, sheet=kwargs.get('sheet'))
        elif filetype == 'dbf':
            if not hasattr(self.input_file, 'name'):
                raise ValueError(
                    'DBF files can not be converted from stdin. You must pass a filename.'
                )
            table = agate.Table.from_dbf(self.input_file.name, **kwargs)
        table.to_csv(self.output_file)
    self.input_file.close()
    if self.args.schema:
        schema.close()
def test_guess_fixed(self):
    # A name with no extension falls back to fixed-width.
    guessed = convert.guess_format('testdata')
    self.assertEqual('fixed', guessed)
def main(self):
    """Determine the input format and either list Excel sheet names
    (-n/--names) or convert the input to CSV on the output file."""
    # Determine the file type.
    if self.args.filetype:
        filetype = self.args.filetype
        if filetype not in SUPPORTED_FORMATS:
            self.argparser.error('"%s" is not a supported format' % self.args.filetype)
    elif self.args.schema:
        filetype = 'fixed'
    elif self.args.key:
        filetype = 'json'
    else:
        if not self.args.input_path or self.args.input_path == '-':
            self.argparser.error(
                'You must specify a format when providing data via STDIN (pipe).'
            )
        filetype = convert.guess_format(self.args.input_path)
        if not filetype:
            self.argparser.error(
                'Unable to automatically determine the format of the input file. Try specifying a format with --format.'
            )
    # Buffer standard input if the input file is in CSV format or if performing type inference.
    self.buffers_input = filetype == 'csv' or not self.args.no_inference
    # Set the input file.
    if filetype in ('xls', 'xlsx'):
        # Excel formats are binary and must be opened in binary mode.
        self.input_file = open(self.args.input_path, 'rb')
    else:
        self.input_file = self._open_input_file(self.args.input_path)
    if self.args.names_only:
        # -n/--names mode: print Excel sheet names instead of converting.
        sheet_names = None
        if filetype == 'xls':
            sheet_names = xlrd.open_workbook(
                file_contents=self.input_file.read()).sheet_names()
        elif filetype == 'xlsx':
            sheet_names = openpyxl.load_workbook(self.input_file,
                                                 read_only=True,
                                                 data_only=True).sheetnames
        if sheet_names:
            for name in sheet_names:
                self.output_file.write('%s\n' % name)
        else:
            self.argparser.error(
                'You cannot use the -n or --names options with non-Excel files.'
            )
        self.input_file.close()
        return
    # Set the reader's arguments.
    kwargs = {}
    if self.args.schema:
        schema = self._open_input_file(self.args.schema)
    elif filetype == 'fixed':
        # Fixed-width requires a schema file to define column boundaries.
        raise ValueError('schema must not be null when format is "fixed"')
    if self.args.sheet:
        kwargs['sheet'] = self.args.sheet
    if filetype == 'csv':
        kwargs.update(self.reader_kwargs)
        kwargs['sniff_limit'] = self.args.sniff_limit
    if filetype not in ('dbf', 'geojson', 'json', 'ndjson'):
        kwargs['skip_lines'] = self.args.skip_lines
    if filetype != 'dbf':
        kwargs['column_types'] = self.get_column_types()
    # Convert the file.
    if filetype == 'csv' and self.args.no_inference and not self.args.skip_lines:
        # No inference and nothing to skip: stream CSV row by row.
        reader = agate.csv.reader(self.input_file, **self.reader_kwargs)
        writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
        writer.writerows(reader)
    elif filetype == 'fixed':
        self.output_file.write(
            fixed2csv(self.input_file, schema, output=self.output_file, **kwargs))
    elif filetype == 'geojson':
        self.output_file.write(geojson2csv(self.input_file, **kwargs))
    elif filetype in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
        # Tabular formats are loaded into an agate.Table, then re-emitted
        # as CSV.
        if filetype == 'csv':
            table = agate.Table.from_csv(self.input_file, **kwargs)
        elif filetype == 'json':
            table = agate.Table.from_json(self.input_file, key=self.args.key, **kwargs)
        elif filetype == 'ndjson':
            table = agate.Table.from_json(self.input_file, key=self.args.key, newline=True, **kwargs)
        elif filetype == 'xls':
            table = agate.Table.from_xls(self.input_file, **kwargs)
        elif filetype == 'xlsx':
            table = agate.Table.from_xlsx(self.input_file, **kwargs)
        elif filetype == 'dbf':
            if not hasattr(self.input_file, 'name'):
                raise ValueError(
                    'DBF files can not be converted from stdin. You must pass a filename.'
                )
            table = agate.Table.from_dbf(self.input_file.name, **kwargs)
        table.to_csv(self.output_file)
    self.input_file.close()
    if self.args.schema:
        schema.close()
def main(self):
    """Determine the input format, open the input, and convert it to CSV
    on the output file, using agate for tabular formats."""
    # Determine the file type.
    if self.args.filetype:
        filetype = self.args.filetype
        if filetype not in SUPPORTED_FORMATS:
            self.argparser.error('"%s" is not a supported format' % self.args.filetype)
    elif self.args.schema:
        filetype = 'fixed'
    elif self.args.key:
        filetype = 'json'
    else:
        if not self.args.input_path or self.args.input_path == '-':
            self.argparser.error('You must specify a format when providing data via STDIN (pipe).')
        filetype = convert.guess_format(self.args.input_path)
        if not filetype:
            self.argparser.error('Unable to automatically determine the format of the input file. Try specifying a format with --format.')
    # Set the input file.
    if filetype in ('xls', 'xlsx'):
        # Excel formats are binary and must be opened in binary mode.
        self.input_file = open(self.args.input_path, 'rb')
    else:
        self.input_file = self._open_input_file(self.args.input_path)
    # Set the reader's arguments.
    kwargs = {}
    if self.args.schema:
        schema = self._open_input_file(self.args.schema)
    elif filetype == 'fixed':
        # Fixed-width requires a schema file to define column boundaries.
        raise ValueError('schema must not be null when format is "fixed"')
    if self.args.sheet:
        kwargs['sheet'] = self.args.sheet
    if filetype == 'csv':
        kwargs.update(self.reader_kwargs)
        # Streaming CSV musn't set sniff_limit, but non-streaming should.
        if not self.args.no_inference:
            kwargs['sniff_limit'] = self.args.sniff_limit
        if self.args.no_header_row:
            kwargs['header'] = False
    elif self.args.no_inference:
        # Streaming CSV musn't set column_types, but other formats should.
        kwargs['column_types'] = agate.TypeTester(limit=0)
    # Convert the file.
    if filetype == 'csv' and self.args.no_inference:
        # No inference: CSV can be streamed row by row without buffering.
        reader = agate.reader(self.input_file, **self.reader_kwargs)
        writer = agate.writer(self.output_file, **self.writer_kwargs)
        writer.writerows(reader)
    elif filetype == 'fixed':
        self.output_file.write(fixed2csv(self.input_file, schema, output=self.output_file, **kwargs))
    elif filetype == 'geojson':
        self.output_file.write(geojson2csv(self.input_file, **kwargs))
    elif filetype in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
        # Tabular formats are loaded into an agate.Table, then re-emitted
        # as CSV.
        if filetype == 'csv':
            table = agate.Table.from_csv(self.input_file, **kwargs)
        elif filetype == 'json':
            table = agate.Table.from_json(self.input_file, key=self.args.key, **kwargs)
        elif filetype == 'ndjson':
            table = agate.Table.from_json(self.input_file, key=self.args.key, newline=True, **kwargs)
        elif filetype == 'xls':
            table = agate.Table.from_xls(self.input_file, sheet=kwargs.get('sheet', None))
        elif filetype == 'xlsx':
            table = agate.Table.from_xlsx(self.input_file, sheet=kwargs.get('sheet', None))
        elif filetype == 'dbf':
            if not hasattr(self.input_file, 'name'):
                raise ValueError('DBF files can not be converted from stdin. You must pass a filename.')
            table = agate.Table.from_dbf(self.input_file.name, **kwargs)
        table.to_csv(self.output_file)
def test_guess_csv(self):
    """A .csv filename is recognized as the 'csv' format."""
    guessed = convert.guess_format('testdata.csv')
    self.assertEqual('csv', guessed)
def test_guess_json(self):
    """A .json filename is recognized as the 'json' format."""
    guessed = convert.guess_format('testdata.json')
    self.assertEqual('json', guessed)
def test_guess_invalid(self):
    """An unknown extension yields no guessed format (None)."""
    guessed = convert.guess_format('testdata.invalid')
    self.assertEqual(None, guessed)
def test_guess_dbf(self):
    """A .dbf filename is recognized as the 'dbf' format."""
    guessed = convert.guess_format('testdata.dbf')
    self.assertEqual('dbf', guessed)
def index():
    """Upload landing page: accept a spreadsheet/CSV upload, convert it
    to CSV, seed the Flask session with the parsed table, and redirect
    to field selection.

    NOTE(review): the unconditional return just below short-circuits the
    whole view and always serves 'back_soon.html' — this looks like a
    maintenance-mode toggle; everything after it is currently dead code.
    """
    status_code = 200
    error = None
    return make_response(render_app_template('back_soon.html', error=error), status_code)
    # ---- unreachable while the maintenance return above is in place ----
    # Establish a Google Analytics client id: reuse the _ga cookie if
    # present, otherwise mint a random UUID.
    if flask_session.get('ga_cid') is None:
        try:
            flask_session['ga_cid'] = request.cookies['_ga']
        except KeyError:
            flask_session['ga_cid'] = str(uuid4())
    if request.method == 'POST':
        f = request.files['input_file']
        if f and allowed_file(f.filename):
            # Timestamp-prefixed, sanitized name avoids collisions and
            # path-traversal in the upload folder.
            fname = secure_filename(str(time.time()) + "_" + f.filename)
            file_path = os.path.abspath(os.path.join(UPLOAD_FOLDER, fname))
            f.save(file_path)
            file_type = convert.guess_format(f.filename)
            # f.save consumed the stream; rewind before converting.
            f.seek(0)
            try:
                file_contents = convert.convert(f, file_type)
            except UnicodeDecodeError:
                # Conversion failed (bad encoding or mislabeled extension):
                # report to Sentry if configured and fall through to the
                # error page below.
                file_contents = None
                if sentry:
                    sentry.captureException()
                status_code = 500
                error = '''
                    We had a problem with the file you uploaded.
                    This might be related to encoding or the file name having the wrong file extension.
                '''
            if file_contents:
                with open('{0}-converted.csv'.format(file_path), 'wb') as o:
                    o.write(file_contents)
                # Delete existing session keys
                sess_keys = ['training_data', 'counter']
                for k in sess_keys:
                    try:
                        del flask_session[k]
                    except KeyError:
                        pass
                flask_session['last_interaction'] = datetime.now()
                flask_session['raw_table'], \
                    flask_session['header'] = makeRawTable(file_contents)
                # Expire cached tables after 30 minutes of inactivity.
                old = datetime.now() - timedelta(seconds=60 * 30)
                if flask_session['last_interaction'] < old:
                    del flask_session['raw_table']
                flask_session['filename'] = f.filename
                flask_session['file_path'] = file_path
                flask_session['file_type'] = file_type
                #send_ga_log(
                #    'Row Count',
                #    flask_session['ga_cid'],
                #    value=inp_file.line_count
                #)
                #send_ga_log(
                #    'File Type',
                #    flask_session['ga_cid'],
                #    label=inp_file.file_type,
                #)
                return redirect(url_for('select_fields'))
            #except DedupeFileError as e:
            #    send_ga_log('Upload Error', flask_session['ga_cid'], label=e.message)
            #    error = e.message
            #    status_code = 500
        else:
            # Missing file or disallowed extension.
            error = 'Error uploading file. Did you forget to select one?'
            send_ga_log('Upload Error', flask_session['ga_cid'], label=error)
            status_code = 500
    # GET request, or POST that failed without redirecting.
    return make_response(render_app_template('index.html', error=error), status_code)
def main(self):
    """Convert the input file to CSV on ``self.output_file``.

    Supports csv, fixed, geojson, json, ndjson, xls, xlsx and dbf input.
    Also implements -n/--names (list Excel sheet names) and
    --write-sheets (dump selected Excel sheets to sibling CSV files).

    Raises:
        ValueError: if format is "fixed" without a schema, or a DBF is
            piped via stdin (DBF conversion needs a real filename).
    """
    path = self.args.input_path

    # Determine the file type: explicit --format wins, then --schema
    # implies fixed-width and --key implies JSON; otherwise guess from
    # the filename (impossible for piped stdin).
    if self.args.filetype:
        filetype = self.args.filetype
    elif self.args.schema:
        filetype = 'fixed'
    elif self.args.key:
        filetype = 'json'
    else:
        if not path or path == '-':
            self.argparser.error(
                'You must specify a format when providing input as piped data via STDIN.'
            )

        filetype = convert.guess_format(path)

        if not filetype:
            self.argparser.error(
                'Unable to automatically determine the format of the input file. Try specifying '
                'a format with --format.')

    # -n/--names: print Excel sheet names, one per line, and exit.
    if self.args.names_only:
        if filetype in ('xls', 'xlsx'):
            sheets = self.sheet_names(path, filetype)

            for sheet in sheets:
                self.output_file.write('%s\n' % sheet)
        else:
            self.argparser.error(
                'You cannot use the -n or --names options with non-Excel files.'
            )

        return

    # Set the input file. Excel workbooks need the binary opener.
    if filetype in ('xls', 'xlsx'):
        self.input_file = self.open_excel_input_file(path)
    else:
        self.input_file = self._open_input_file(path)

    # Set the reader's arguments.
    kwargs = {}

    if self.args.schema:
        schema = self._open_input_file(self.args.schema)
    elif filetype == 'fixed':
        # "fixed" cannot work without a schema describing column widths.
        raise ValueError('schema must not be null when format is "fixed"')

    if filetype == 'csv':
        kwargs.update(self.reader_kwargs)
        kwargs['sniff_limit'] = self.args.sniff_limit

    if filetype in ('xls', 'xlsx'):
        kwargs['header'] = not self.args.no_header_row

    if filetype not in ('dbf', 'geojson', 'json', 'ndjson'):  # csv, fixed, xls, xlsx
        kwargs['skip_lines'] = self.args.skip_lines

    if filetype != 'dbf':
        kwargs['column_types'] = self.get_column_types()

    # Convert the file.
    if (filetype == 'csv' and self.args.no_inference and not self.args.no_header_row
            and not self.args.skip_lines and self.args.sniff_limit == 0):
        # Fast path: stream rows reader -> writer without building a
        # Table (only valid when no inference/sniffing/skipping applies).
        reader = agate.csv.reader(self.input_file, **self.reader_kwargs)
        writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
        writer.writerows(reader)
    elif filetype == 'fixed':
        self.output_file.write(
            fixed2csv(self.input_file, schema, output=self.output_file, **kwargs))
    elif filetype == 'geojson':
        self.output_file.write(geojson2csv(self.input_file, **kwargs))
    elif filetype in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
        # Table-based path: load into an agate.Table, then dump as CSV.
        if filetype == 'csv':
            table = agate.Table.from_csv(self.input_file, **kwargs)
        elif filetype == 'json':
            table = agate.Table.from_json(self.input_file, key=self.args.key, **kwargs)
        elif filetype == 'ndjson':
            # Newline-delimited JSON: one record per line.
            table = agate.Table.from_json(self.input_file, key=self.args.key, newline=True, **kwargs)
        elif filetype == 'xls':
            table = agate.Table.from_xls(
                self.input_file,
                sheet=self.args.sheet,
                encoding_override=self.args.encoding_xls,
                **kwargs)
        elif filetype == 'xlsx':
            table = agate.Table.from_xlsx(self.input_file, sheet=self.args.sheet, **kwargs)
        elif filetype == 'dbf':
            # agate's DBF reader needs a path on disk, not a stream.
            if not hasattr(self.input_file, 'name'):
                raise ValueError(
                    'DBF files can not be converted from stdin. You must pass a filename.'
                )

            table = agate.Table.from_dbf(self.input_file.name, **kwargs)

        table.to_csv(self.output_file, **self.writer_kwargs)

    if self.args.write_sheets:
        # Close and re-open the file, as the file object has been mutated or closed.
        self.input_file.close()
        self.input_file = self.open_excel_input_file(path)

        # '-' means all sheets; otherwise a comma-separated list where
        # pure digits are treated as sheet indexes.
        if self.args.write_sheets == '-':
            sheets = self.sheet_names(path, filetype)
        else:
            sheets = [
                int(sheet) if sheet.isdigit() else sheet
                for sheet in self.args.write_sheets.split(',')
            ]

        if filetype == 'xls':
            tables = agate.Table.from_xls(
                self.input_file,
                sheet=sheets,
                encoding_override=self.args.encoding_xls,
                **kwargs)
        elif filetype == 'xlsx':
            tables = agate.Table.from_xlsx(self.input_file, sheet=sheets, **kwargs)

        # Write each selected sheet next to the input as <base>_<i>.csv.
        base = splitext(self.input_file.name)[0]

        for i, table in enumerate(tables.values()):
            with open('%s_%d.csv' % (base, i), 'w') as f:
                table.to_csv(f, **self.writer_kwargs)

    self.input_file.close()

    if self.args.schema:
        schema.close()
def main(self):
    """Convert the input file to CSV on ``self.output_file``.

    Variant of in2csv's main: validates --format against
    SUPPORTED_FORMATS, supports -n/--names (Excel sheet listing) and
    --write-sheets (dump selected Excel sheets to sibling CSV files).

    Raises:
        ValueError: if format is "fixed" without a schema, or a DBF is
            piped via stdin (DBF conversion needs a real filename).
    """
    path = self.args.input_path

    # Determine the file type: explicit --format wins, then --schema
    # implies fixed-width and --key implies JSON; otherwise guess from
    # the filename (impossible for piped stdin).
    if self.args.filetype:
        filetype = self.args.filetype

        if filetype not in SUPPORTED_FORMATS:
            self.argparser.error('"%s" is not a supported format' % self.args.filetype)
    elif self.args.schema:
        filetype = 'fixed'
    elif self.args.key:
        filetype = 'json'
    else:
        if not path or path == '-':
            self.argparser.error('You must specify a format when providing input as piped data via STDIN.')

        filetype = convert.guess_format(path)

        if not filetype:
            self.argparser.error('Unable to automatically determine the format of the input file. Try specifying a format with --format.')

    # -n/--names: print Excel sheet names and exit. Relies on
    # sheet_names returning something falsy for non-Excel input —
    # presumably None/[] — to produce the error (TODO confirm).
    if self.args.names_only:
        sheets = self.sheet_names(path, filetype)

        if sheets:
            for sheet in sheets:
                self.output_file.write('%s\n' % sheet)
        else:
            self.argparser.error('You cannot use the -n or --names options with non-Excel files.')

        return

    # Set the input file. Excel workbooks need the binary opener.
    if filetype in ('xls', 'xlsx'):
        self.input_file = self.open_excel_input_file(path)
    else:
        self.input_file = self._open_input_file(path)

    # Set the reader's arguments.
    kwargs = {}

    if self.args.schema:
        schema = self._open_input_file(self.args.schema)
    elif filetype == 'fixed':
        # "fixed" cannot work without a schema describing column widths.
        raise ValueError('schema must not be null when format is "fixed"')

    if filetype == 'csv':
        kwargs.update(self.reader_kwargs)
        kwargs['sniff_limit'] = self.args.sniff_limit

    if filetype in ('xls', 'xlsx'):
        kwargs['header'] = not self.args.no_header_row

    if filetype not in ('dbf', 'geojson', 'json', 'ndjson'):  # csv, fixed, xls, xlsx
        kwargs['skip_lines'] = self.args.skip_lines

    if filetype != 'dbf':
        kwargs['column_types'] = self.get_column_types()

    # Convert the file.
    if filetype == 'csv' and self.args.no_inference and not self.args.no_header_row and not self.args.skip_lines and self.args.sniff_limit == 0:
        # Fast path: stream rows reader -> writer without building a
        # Table (only valid when no inference/sniffing/skipping applies).
        reader = agate.csv.reader(self.input_file, **self.reader_kwargs)
        writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
        writer.writerows(reader)
    elif filetype == 'fixed':
        self.output_file.write(fixed2csv(self.input_file, schema, output=self.output_file, **kwargs))
    elif filetype == 'geojson':
        self.output_file.write(geojson2csv(self.input_file, **kwargs))
    elif filetype in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
        # Table-based path: load into an agate.Table, then dump as CSV.
        if filetype == 'csv':
            table = agate.Table.from_csv(self.input_file, **kwargs)
        elif filetype == 'json':
            table = agate.Table.from_json(self.input_file, key=self.args.key, **kwargs)
        elif filetype == 'ndjson':
            # Newline-delimited JSON: one record per line.
            table = agate.Table.from_json(self.input_file, key=self.args.key, newline=True, **kwargs)
        elif filetype == 'xls':
            table = agate.Table.from_xls(self.input_file, sheet=self.args.sheet, encoding_override=self.args.encoding_xls, **kwargs)
        elif filetype == 'xlsx':
            table = agate.Table.from_xlsx(self.input_file, sheet=self.args.sheet, **kwargs)
        elif filetype == 'dbf':
            # agate's DBF reader needs a path on disk, not a stream.
            if not hasattr(self.input_file, 'name'):
                raise ValueError('DBF files can not be converted from stdin. You must pass a filename.')

            table = agate.Table.from_dbf(self.input_file.name, **kwargs)

        table.to_csv(self.output_file, **self.writer_kwargs)

    if self.args.write_sheets:
        # Close and re-open the file, as the file object has been mutated or closed.
        self.input_file.close()
        self.input_file = self.open_excel_input_file(path)

        # '-' means all sheets; otherwise a comma-separated list where
        # pure digits are treated as sheet indexes.
        if self.args.write_sheets == '-':
            sheets = self.sheet_names(path, filetype)
        else:
            sheets = [int(sheet) if sheet.isdigit() else sheet for sheet in self.args.write_sheets.split(',')]

        if filetype == 'xls':
            tables = agate.Table.from_xls(self.input_file, sheet=sheets, encoding_override=self.args.encoding_xls, **kwargs)
        elif filetype == 'xlsx':
            tables = agate.Table.from_xlsx(self.input_file, sheet=sheets, **kwargs)

        # Write each selected sheet next to the input as <base>_<i>.csv.
        base = splitext(self.input_file.name)[0]

        for i, table in enumerate(tables.values()):
            with open('%s_%d.csv' % (base, i), 'w') as f:
                table.to_csv(f, **self.writer_kwargs)

    self.input_file.close()

    if self.args.schema:
        schema.close()