def sniff(self, file_upload: FileUpload, encoding: str = settings.DEFAULT_CHARSET, limit: int = 5) -> SniffResult:
    try:
        with file_upload.open() as csv_file:
            has_header = unicodecsv.Sniffer().has_header(
                csv_file.read(1024).decode(encoding))
            csv_file.seek(0)
            dialect = unicodecsv.Sniffer().sniff(
                csv_file.read(1024).decode(encoding))
            csv_format_opts = dict(dialect=dialect)
            csv_file.seek(0)
            reader = unicodecsv.reader(csv_file, **csv_format_opts)
            if has_header:
                header = next(reader)
            else:
                header = None
            rows = list(islice(reader, max(0, limit))) if limit > 0 else []
    except (UnicodeDecodeError, unicodecsv.Error) as e:
        raise ParsingException(str(e)) from e
    contact_serializer = self.get_contact_serializer(data={})
    fields = {
        name: field
        for name, field in contact_serializer.get_fields().items()
        if not field.read_only
    }
    headers_mapping = {}
    if header:
        for num, name in enumerate(header):
            field_names = difflib.get_close_matches(name, fields.keys(), n=1)
            if field_names:
                field_name = field_names[0]
                headers_mapping[field_name] = num
    return SniffResult(
        dict(
            has_header=has_header,
            delimiter=dialect.delimiter,
        ),
        list(fields.keys()),
        rows,
        headers_mapping,
    )

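# A minimal, self-contained sketch of the sniff-then-rewind pattern the
# method above relies on, using only the stdlib csv module. The sample
# data and file object below are made up for illustration, not taken
# from the original code.
import csv
import io

sample = io.StringIO("name;email\nAda;ada@example.org\nBob;bob@example.org\n")
snippet = sample.read(1024)
sample.seek(0)

sniffer = csv.Sniffer()
has_header = sniffer.has_header(snippet)   # heuristic; True for this sample
dialect = sniffer.sniff(snippet)           # detects ';' as the delimiter

reader = csv.reader(sample, dialect)
header = next(reader) if has_header else None
rows = list(reader)
print(header, rows, dialect.delimiter)
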
def load_from_csv(filename):
    waypoints = []
    _dirname, _name = os.path.split(filename)
    _fs = open_fs(_dirname)
    with _fs.open(_name, "rb") as in_file:
        lines = in_file.readlines()
    if len(lines) < 4:
        raise SyntaxError("CSV file requires at least 4 lines!")
    dialect = csv.Sniffer().sniff(lines[-1].decode("utf-8"))
    csv_reader = csv.reader(lines, encoding="utf-8", dialect=dialect)
    name = next(csv_reader)[0]
    next(csv_reader)  # skip the header line
    for row in csv_reader:
        wp = ft.Waypoint()
        wp.location = row[1]
        wp.lat = float(row[2])
        wp.lon = float(row[3])
        wp.flightlevel = float(row[4])
        wp.pressure = float(row[5]) * 100.
        wp.distance_to_prev = float(row[6])
        wp.distance_total = float(row[7])
        wp.comments = row[8]
        waypoints.append(wp)
    # The file name overrides the name stored in the first CSV row
    name = os.path.basename(filename.replace(".csv", "").strip())
    return name, waypoints

def validate_delimiter(delimiter, codebook):
    """
    Validate that the selected delimiter matches the sniffed delimiter.

    :delimiter: delimiter as selected from the UI
    :codebook: open file object for reading
    :return: boolean of delimiter mismatch and errors list
    """
    errors = []
    delimiter_mismatch = False
    codebook.readline()  # skip the header line; sniff the first data row
    dialect = csv.Sniffer().sniff(codebook.readline())
    sniffed_delimiter = dialect.delimiter
    if delimiter != sniffed_delimiter:
        error = {
            'errors': u"Selected delimiter doesn't match file delimiter",
            'schema_name': 'N/A',
            'schema_title': 'N/A',
            'name': 'N/A',
            'title': 'N/A'
        }
        delimiter_mismatch = True
        errors.append(error)
    codebook.seek(0)
    return delimiter_mismatch, errors

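# A hedged usage sketch for validate_delimiter above; the codebook contents
# are made up for illustration. Note the function deliberately sniffs the
# *second* line, so the header line never influences the result.
import io

codebook = io.StringIO("col_a,col_b\n1,2\n3,4\n")
mismatch, errors = validate_delimiter(';', codebook)
print(mismatch)             # True: ';' was selected but ',' was sniffed
print(errors[0]['errors'])  # "Selected delimiter doesn't match file delimiter"
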
def grab_that(station):
    """
    A method that extracts climate data from CSV and converts it to a
    dictionary object.
    """
    with codecs.open(station, 'rb') as f:
        # Try to figure out the CSV formatting to address encoding issues.
        dialect = csv.Sniffer().sniff(f.read(1024))
        f.seek(0)
        lines = csv.reader(f, dialect)
        for i in range(16):  # Skip the metadata
            next(lines)
        names, datum = [], {}
        for column in lines:
            for name in column:
                names.append(name)
                datum[name] = []
            break
        reader = csv.DictReader(f, fieldnames=names, delimiter=',',
                                quotechar='"')
        for row in reader:
            for column, value in row.items():
                value = convert(value)
                datum.setdefault(column, []).append(value)
        return datum

def load_dataset_csv(filename):
    info = open(filename, "rb")
    has_header = unicodecsv.Sniffer().has_header(info.read(1024))
    info.seek(0)
    incsv = csv.reader(info)
    if has_header:
        next(incsv)  # Skip the header
    dataset = list(incsv)
    return dataset

def next_source_row(self, handle):
    """
    Given a file handle, return the next row of data as a key/value dict.

    Return None to denote EOF.
    Return False to skip this row of data entirely.
    """
    if not getattr(self, "detected_dialect", None):
        # Sniff for the dialect of the CSV file
        pos = handle.tell()
        handle.seek(0)
        readahead = handle.read(1024)
        handle.seek(pos)
        try:
            dialect = csv.Sniffer().sniff(readahead, ",")
        except csv.Error:
            # Fall back to the excel dialect when sniffing fails
            dialect = csv.excel
        dialect_attrs = [
            "delimiter", "doublequote", "escapechar", "lineterminator",
            "quotechar", "quoting", "skipinitialspace"
        ]
        self.detected_dialect = {x: getattr(dialect, x) for x in dialect_attrs}
    if not getattr(self, "reader", None):
        self.reader = csv.reader(handle, **self.detected_dialect)
    if not getattr(self, "detected_columns", None):
        # On the first iteration the line will be the column headings;
        # store those and return False to skip processing.
        columns = next(self.reader)
        self.detected_columns = columns
        return False
    cols = self.detected_columns
    try:
        values = next(self.reader)
    except StopIteration:
        return None
    if not values:
        return None
    return dict(zip(cols, values))

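# A sketch of the calling convention next_source_row implements: False means
# "skip this row" (the header pass) and None means EOF. SourceLoader and
# source.csv are hypothetical names used only for illustration.
def iter_source_rows(loader, handle):
    while True:
        row = loader.next_source_row(handle)
        if row is None:        # EOF
            break
        if row is False:       # header (or otherwise skipped) row
            continue
        yield row

# with open('source.csv') as handle:
#     for record in iter_source_rows(SourceLoader(), handle):
#         print(record)
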
def main():
    dialect = csv.Sniffer().sniff(EJEMPLO)
    reader = csv.reader(open(sys.argv[1]), dialect=dialect)
    writer = csv.DictWriter(open('productos.csv', 'w'),
                            fieldnames=PRODUCTO_COLS)
    writer.writeheader()
    bar = Bar('Normalizando CSV', suffix='%(percent)d%%')
    for l in bar.iter(reader):
        data = normalizar(dict(zip(headers, l)))
        writer.writerow(data)

def csv_data(csv_path, skip_header=True):
    """Pass in the path to a CSV file; returns a CSV reader object."""
    csv_file = open(csv_path, 'r')
    # Determine the CSV dialect.
    dialect = unicodecsv.Sniffer().sniff(csv_file.read(1024))
    csv_file.seek(0)
    data = unicodecsv.reader(csv_file, dialect)
    if skip_header:
        next(data)
    return data

def fectxt2pivot(self, fileobj):
    fieldnames = [
        'journal',        # JournalCode
        False,            # JournalLib
        False,            # EcritureNum
        'date',           # EcritureDate
        'account',        # CompteNum
        False,            # CompteLib
        'partner_ref',    # CompAuxNum
        False,            # CompAuxLib
        'ref',            # PieceRef
        False,            # PieceDate
        'name',           # EcritureLib
        'debit',          # Debit
        'credit',         # Credit
        'reconcile_ref',  # EcritureLet
        False,            # DateLet
        False,            # ValidDate
        False,            # Montantdevise
        False,            # Idevise
    ]
    first_line = fileobj.readline().decode()
    dialect = unicodecsv.Sniffer().sniff(first_line, delimiters="|\t")
    fileobj.seek(0)
    reader = unicodecsv.DictReader(
        fileobj,
        fieldnames=fieldnames,
        delimiter=dialect.delimiter,
        encoding=self.file_encoding)
    res = []
    i = 0
    for l in reader:
        i += 1
        # Skip the header line
        if i == 1:
            continue
        l['credit'] = l['credit'] or '0'
        l['debit'] = l['debit'] or '0'
        vals = {
            'journal': l['journal'],
            'account': l['account'],
            'partner': l['partner_ref'],
            'credit': float(l['credit'].replace(',', '.')),
            'debit': float(l['debit'].replace(',', '.')),
            'date': datetime.strptime(l['date'], '%Y%m%d'),
            'name': l['name'],
            'ref': l['ref'],
            'reconcile_ref': l['reconcile_ref'],
            'line': i,
        }
        res.append(vals)
    return res

def _csv_data_from_file(csv_file, preview_limit=10):
    try:
        dialect = unicodecsv.Sniffer().sniff(csv_file.read(1024))
        csv_file.seek(0)
        csv_reader = unicodecsv.reader(csv_file, dialect)
        csv_values = itertools.islice(csv_reader, preview_limit)
        csv_values = zip(*csv_values)  # transpose rows into columns
        return {'success': True, 'data': csv_values}
    except unicodecsv.Error as exc:
        return {'success': False, 'error': str(exc)}
    except UnicodeDecodeError as exc:
        return {'success': False, 'error': str(exc)}

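# A hedged usage sketch for _csv_data_from_file, assuming the Python 2 era
# this unicodecsv-based helper comes from; the in-memory BytesIO below stands
# in for an uploaded file object, and the data is made up.
from io import BytesIO

result = _csv_data_from_file(BytesIO(b'a,b\n1,2\n3,4\n'))
print(result['success'])   # True
print(result['data'])      # [(u'a', u'1', u'3'), (u'b', u'2', u'4')]
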
def genericcsv2pivot(self, fileobj):
    # Prisme
    fieldnames = [
        'date', 'journal', 'account', 'partner',
        'analytic', 'name', 'debit', 'credit',
        'ref', 'reconcile_ref'
    ]
    first_line = fileobj.readline().decode()
    dialect = unicodecsv.Sniffer().sniff(first_line)
    fileobj.seek(0)
    reader = unicodecsv.DictReader(
        fileobj,
        fieldnames=fieldnames,
        delimiter=dialect.delimiter,
        quotechar='"',
        quoting=unicodecsv.QUOTE_MINIMAL,
        encoding='utf-8')
    res = []
    i = 0
    for l in reader:
        i += 1
        if i == 1 and self.file_with_header:
            continue
        date_str = l['date']
        try:
            date = datetime.strptime(date_str, self.date_format)
        except Exception:
            raise UserError(
                _("time data '%s' in line %s does not match format '%s'")
                % (date_str, i, self.date_format))
        vals = {
            'journal': l['journal'],
            'account': l['account'],
            'credit': float(l['credit'].replace(',', '.') or 0),
            'debit': float(l['debit'].replace(',', '.') or 0),
            'date': date,
            'name': l['name'],
            'ref': l.get('ref', ''),
            'reconcile_ref': l.get('reconcile_ref', ''),
            'line': i,
        }
        if l['analytic']:
            vals['analytic'] = l['analytic']
        if l['partner']:
            vals['partner'] = l['partner']
        res.append(vals)
    return res

def commit_to_csv(commit, csv_filename):
    """Get a CSV DictReader over a file stored in a git commit."""
    repo.git_dir
    data = (commit.tree / csv_filename).data_stream.read()
    dialect = csv.Sniffer().sniff(StringIO(unicode(data)).read(1024))
    data = data.splitlines()
    # Prepend a synthetic ID column: a header cell plus row numbers
    for n, row in enumerate(data):
        if n == 0:
            data[n] = "ID" + dialect.delimiter + row
        else:
            data[n] = str(n) + dialect.delimiter + row
    data = "\n".join(data)
    csv_out = csv.DictReader(StringIO(unicode(data), newline=None),
                             dialect=dialect)
    return csv_out

def validate_delimiter(delimiter, codebook):
    """
    Validate that the selected delimiter matches the sniffed delimiter.

    :delimiter: delimiter as selected from the UI
    :codebook: open file object for reading
    :return: boolean of delimiter mismatch and errors list
    """
    errors = []
    delimiter_mismatch = False
    codebook.readline()  # skip the header line; sniff the first data row
    row = codebook.readline()
    try:
        dialect = csv.Sniffer().sniff(row)
    except Exception as e:
        error = {
            'errors': '{} - Row: {}'.format(e, row),
            'schema_name': 'N/A',
            'schema_title': 'N/A',
            'schema_publish_date': 'N/A',
            'name': 'N/A',
            'title': 'N/A'
        }
        errors.append(error)
    else:
        sniffed_delimiter = dialect.delimiter
        if delimiter != sniffed_delimiter:
            error = {
                'errors': u"Selected delimiter doesn't match file delimiter",
                'schema_name': 'N/A',
                'schema_title': 'N/A',
                'schema_publish_date': 'N/A',
                'name': 'N/A',
                'title': 'N/A'
            }
            delimiter_mismatch = True
            errors.append(error)
    codebook.seek(0)
    return delimiter_mismatch, errors

def place_that(name):
    """
    When given a filename, dump station location headers to the console and
    return a dictionary with raw unicode keys and values for the station
    name and location variables.
    """
    try:
        location = str(name)
        with codecs.open(location, 'rb') as f:
            dialect = csv.Sniffer().sniff(f.read(1024))
            f.seek(0)
            verifier = csv.reader(f, dialect)
            # Read and format metadata
            for count, row in enumerate(verifier):
                if count > 6:
                    break
            f.seek(0)
            names = ('Station Name', 'Province', 'Latitude', 'Longitude',
                     'Elevation', 'Climate Identifier', 'WMO Identifier',
                     'TC Identifier')
            datum = {}
            for name in names:
                datum[name] = []
            for count, row in enumerate(verifier):
                if count == 0:
                    # Special handling to deal with the UTF-8 BOM
                    key = 'Station Name'
                    field = convert(row[1])
                    datum[key] = field
                    continue
                try:
                    if row[0] in names:
                        key = convert(row[0])
                        field = convert(row[1])
                        datum[key] = field
                except Exception as e:
                    print(e)
                    continue
            return datum
    except ValueError:
        raise Exception("Invalid station CSV. Verify that CSVs hold "
                        "Environment Canada station data.")

def detectDialect(self, filename, comment="#"):
    """detectDialect"""
    dialect = None
    with open(filename, "rb") as stream:
        stream = self.skip_commented_or_empty_lines(stream, comment)
        n = 128
        detected = False
        while not detected:
            try:
                dialect = csv.Sniffer().sniff(stream.read(n),
                                              delimiters=";,")
                detected = True
            except Exception:
                # Sniffing failed; double the sample size and retry
                n = n * 2
                stream.seek(0)
    return dialect

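# A standalone sketch of the grow-the-sample retry idea used above: keep
# doubling the number of bytes handed to the Sniffer until it stops raising.
# The upper bound is an addition (not in the original) so that a hopeless
# file cannot loop forever.
import csv

def sniff_with_growing_sample(stream, start=128, max_bytes=64 * 1024):
    n = start
    while n <= max_bytes:
        stream.seek(0)
        try:
            return csv.Sniffer().sniff(stream.read(n), delimiters=";,")
        except csv.Error:
            n *= 2  # too little context; try a bigger sample
    return None  # give up instead of retrying indefinitely
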
def load_csvf(fpath, fieldnames, encoding):
    """
    :param unicode fpath:
    :param Optional[list[unicode]] fieldnames:
    :param unicode encoding:
    :rtype: List[dict]
    """
    with open(fpath, 'rb') as f:
        snippet = f.read(8192)
        f.seek(0)
        dialect = csv.Sniffer().sniff(
            snippet if PYTHON2 else snippet.decode(encoding))
        dialect.skipinitialspace = True
        return list(csv.DictReader(f,
                                   fieldnames=fieldnames,
                                   dialect=dialect,
                                   encoding=encoding))

def writeToFile(row_list):
    if Path('result.csv').is_file():
        f = open("result.csv", "a")
        sniffer = csv.Sniffer()
        csv_dialect = sniffer.sniff(open("result.csv").readline())
        writer = csv.writer(f, encoding='UTF-8', quoting=csv.QUOTE_NONE,
                            escapechar='\\', dialect=csv_dialect)
        writer.writerows(row_list)
        f.close()
    else:
        with open('result.csv', 'w') as file:
            writer = csv.writer(file, encoding='UTF-8', delimiter=',',
                                quoting=csv.QUOTE_NONE, escapechar='\\')
            writer.writerows(row_list)

def writeToFile(row_list):
    file_name = 'percents.csv'
    if Path(file_name).is_file():
        f = open(file_name, "a")
        sniffer = csv.Sniffer()
        csv_dialect = sniffer.sniff(open(file_name).readline())
        writer = csv.writer(f, encoding='UTF-8', quoting=csv.QUOTE_NONE,
                            escapechar='|', dialect=csv_dialect)
        writer.writerows(row_list)
        f.close()
    else:
        with open(file_name, 'w') as file:
            writer = csv.writer(file, encoding='UTF-8', delimiter=',',
                                quoting=csv.QUOTE_NONE, escapechar='|')
            writer.writerows(row_list)

def records(self, in_file):
    import unicodecsv as csv
    with open(in_file, 'rb') as csvfile:
        dialect = csv.Sniffer().sniff(csvfile.read(1024))
        csvfile.seek(0)
        reader = None
        if self.columns:
            reader = csv.DictReader(csvfile, fieldnames=self.columns,
                                    dialect=dialect)
        elif self.read_header:
            reader = csv.DictReader(csvfile, dialect=dialect)
        else:
            reader = csv.reader(csvfile, dialect=dialect)

        def convert(obj):
            if isinstance(obj, dict):
                return obj
            return dict(("_c%s" % idx, v) for idx, v in enumerate(obj))

        for row in reader:
            if len(row) > 0:
                yield convert(row)

def to_internal_value(self, data):
    super(JsonFileField, self).to_internal_value(data)
    if is_zipfile(data):
        with ZipFile(data) as zf:
            raw_data = zf.read(splitext(data.name)[0])
    else:
        data.seek(0)
        raw_data = data.read()
    try:
        data.json = json.loads(raw_data)
    except ValueError:
        try:
            data.json = json.loads(raw_data, encoding='cp1251')
        except ValueError:
            try:
                lines = raw_data.splitlines()
                dialect = csv.Sniffer().sniff(lines[0], [',', ';', '\t'])
                data.json = [item for item in
                             csv.DictReader(lines, dialect=dialect)]
            except (ValueError, csv.Error):
                self.fail('json')
    return data

def get_csv(infile):
    sniff_range = 4096
    sniffer = csv.Sniffer()
    dialect = sniffer.sniff(infile.read(sniff_range), delimiters=DELIMITERS)
    infile.seek(0)
    # Sniff for a header
    header = sniffer.has_header(infile.read(sniff_range))
    infile.seek(0)
    # Get the csv reader
    reader = csv.reader(infile, dialect)
    firstrow = next(reader)
    colnames = []
    for i, h in enumerate(firstrow):
        if len(h) > 0 and header:
            colnames.append(h)
        else:
            colnames.append('COLUMN{}'.format(i + 1))
    if not header:
        infile.seek(0)
    return (reader, colnames)

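# A hedged usage sketch for get_csv above. DELIMITERS is module state in the
# original, so a plausible value is stubbed here, and the data is made up.
import csv
import io

DELIMITERS = ',;\t|'
infile = io.StringIO("a|b|c\n1|2|3\n")
reader, colnames = get_csv(infile)
print(colnames)      # ['a', 'b', 'c'] when a header row is detected
print(list(reader))  # the remaining data rows
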
def upload_randomization_json(context, request):
    """
    Handles RANDID file uploads.

    The file is expected to be a CSV with the following columns:

    * ARM
    * STRATA
    * BLOCKID
    * RANDID

    In addition, the CSV file must have the same columns as the form it is
    using for randomization.
    """
    check_csrf_token(request)
    db_session = request.db_session

    if not context.is_randomized:
        # No form check required as it's checked via a database constraint
        raise HTTPBadRequest(body=_(u'This study is not randomized'))

    input_file = request.POST['upload'].file
    input_file.seek(0)

    # Ensure we can read the CSV
    try:
        csv.Sniffer().sniff(input_file.read(1024))
    except csv.Error:
        raise HTTPBadRequest(body=_(u'Invalid file-type, must be CSV'))
    else:
        input_file.seek(0)

    reader = csv.DictReader(input_file)
    # Case-insensitive lookup
    fieldnames = dict((name.upper(), name) for name in reader.fieldnames)

    stratumkeys = ['ARM', 'BLOCKID', 'RANDID']
    formkeys = context.randomization_schema.attributes.keys()

    # Ensure the CSV defines all required columns
    required = stratumkeys + formkeys
    missing = [name for name in required if name.upper() not in fieldnames]
    if missing:
        raise HTTPBadRequest(
            body=_(u'File upload is missing the following columns ${columns}',
                   mapping={'columns': ', '.join(missing)}))

    # We'll be using this to create new arms as needed
    arms = dict([(arm.name, arm) for arm in context.arms])

    # Default to the complete state since they're generated by a statistician
    complete = (db_session.query(datastore.State)
                .filter_by(name=u'complete').one())

    for row in reader:
        arm_name = row[fieldnames['ARM']]
        if arm_name not in arms:
            arms[arm_name] = models.Arm(study=context,
                                        name=arm_name,
                                        title=arm_name)
        stratum = models.Stratum(study=context,
                                 arm=arms[arm_name],
                                 block_number=int(row[fieldnames['BLOCKID']]),
                                 randid=row[fieldnames['RANDID']])
        if 'STRATA' in fieldnames:
            stratum.label = row[fieldnames['STRATA']]
        db_session.add(stratum)
        entity = datastore.Entity(schema=context.randomization_schema,
                                  state=complete)
        for key in formkeys:
            entity[key] = row[fieldnames[key.upper()]]
        stratum.entities.add(entity)

    try:
        db_session.flush()
    except sa.exc.IntegrityError as e:
        if 'uq_stratum_reference_number' in e.message:
            raise HTTPBadRequest(body=_(
                u'The submitted file contains existing reference numbers. '
                u'Please upload a file with new reference numbers.'))

    return HTTPOk()

from __future__ import unicode_literals

import sys
from io import BytesIO

import six
import unicodecsv

from rows.plugins.utils import (
    create_table,
    get_filename_and_fobj,
    ipartition,
    serialize,
)

sniffer = unicodecsv.Sniffer()
unicodecsv.field_size_limit(sys.maxsize)


def fix_dialect(dialect):
    if not dialect.doublequote and dialect.escapechar is None:
        dialect.doublequote = True

    if dialect.quoting == unicodecsv.QUOTE_MINIMAL and \
            dialect.quotechar == "'":
        # Python csv's Sniffer seems to detect a wrong quotechar when
        # quoting is minimal
        dialect.quotechar = '"'


if six.PY2:

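# A minimal demonstration (separate from the truncated module above) of the
# two normalizations fix_dialect applies. The dialect is hand-built with
# illustrative values rather than being a real sniff result.
import unicodecsv

class _SniffedDialect(unicodecsv.excel):
    doublequote = False
    escapechar = None
    quoting = unicodecsv.QUOTE_MINIMAL
    quotechar = "'"

dialect = _SniffedDialect()
fix_dialect(dialect)
assert dialect.doublequote is True  # no escapechar, so fall back to doubling
assert dialect.quotechar == '"'     # misdetected single quote corrected
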
def process(self, data, url_object):
    """Process the CSV by executing rules and saving matches."""
    from ..scanner_types.scanner import Scanner
    scanner = Scanner.from_scan_id(url_object.scan.pk)

    # If we don't have to do any annotation/replacement, treat it like a
    # normal text file for efficiency
    if not scanner.scan_object.output_spreadsheet_file:
        return self.text_processor.process(data, url_object)

    # Check if the scan is limited to certain columns.
    columns = scanner.scan_object.columns
    columns = list(map(int, columns.split(','))) if columns else []

    # Try to detect the CSV dialect using the first 1024 characters of
    # the data
    try:
        dialect = unicodecsv.Sniffer().sniff(data[:1024])
    except unicodecsv.Error:
        # Couldn't detect the CSV dialect, so processing failed
        scanner.scan_object.log_occurrence("Could not detect CSV "
                                           "dialect. Could not perform "
                                           "annotation/replacement.")
        return False

    # Sniffer.sniff doesn't set the escape character or quoting
    dialect.escapechar = '\\'
    dialect.quoting = unicodecsv.QUOTE_ALL

    # Convert unicode dialect properties to str because csv.reader expects
    # them to be (only the string-valued properties; doublequote and
    # skipinitialspace are booleans and must stay booleans)
    dialect.delimiter = str(dialect.delimiter)
    dialect.quotechar = str(dialect.quotechar)
    dialect.escapechar = str(dialect.escapechar)
    dialect.lineterminator = str(dialect.lineterminator)

    rows = []

    # Read the CSV data; unicodecsv consumes bytes, so wrap the encoded
    # data in BytesIO
    reader = unicodecsv.reader(io.BytesIO(data.encode('utf-8')), dialect)
    first_row = True
    header_row = []
    for row in reader:
        warnings_in_row = []
        if first_row:
            header_row = row
            # Append a column header for the annotations
            row.append("Matches")
            first_row = False
            rows.append(row)
            continue
        for i in range(len(row)):
            # If columns are specified and the present column is not
            # listed, skip it.
            if columns and i + 1 not in columns:
                continue
            # Execute rules on each cell
            matches = scanner.execute_rules(row[i])
            for match in matches:
                # Save matches
                match['url'] = url_object
                match['scan'] = url_object.scan
                match.save()
                warnings_in_row.append((match['matched_rule'], i))
                # Only replace HIGH sensitivity matches
                if not match['sensitivity'] == Sensitivity.HIGH:
                    continue
                if (scanner.scan_object.do_cpr_replace
                        and match['matched_rule'] == 'cpr'):
                    replacement = scanner.scan_object.cpr_replace_text
                elif (scanner.scan_object.do_name_replace
                        and match['matched_rule'] == 'name'):
                    replacement = scanner.scan_object.name_replace_text
                elif (scanner.scan_object.do_address_replace
                        and match['matched_rule'] == 'address'):
                    replacement = scanner.scan_object.address_replace_text
                else:
                    replacement = None
                # Replace the matched text with the replacement text,
                # dependent on the rule matched, if replacement is demanded
                if replacement is not None:
                    # Some rules like the CPR rule mask their matched_data,
                    # so the real matched text is in original_matched_data
                    try:
                        search_text = match['original_matched_data']
                    except KeyError:
                        search_text = match['matched_data']
                    row[i] = row[i].replace(search_text, replacement)
        # Add an annotation cell indicating which rules were matched and
        # in which column
        annotation = ", ".join(
            "%s (%s)" % (Match.get_matched_rule_display_name(warning[0]),
                         header_row[warning[1]])
            for warning in warnings_in_row)
        row.append(annotation)
        rows.append(row)

    # Write to the output file
    with open(scanner.scan_object.scan_output_file, 'w') as f:
        writer = unicodecsv.writer(f, delimiter=';', quotechar='"',
                                   escapechar='|')
        writer.writerows(rows)
    return True

    # return True on success and False on errors
    return pisaStatus.err

import_dir = sys.argv[1] + "/"
user_list = import_dir + "all-students.txt"
students = 1
if not os.path.exists(user_list):
    user_list = import_dir + "all-user.txt"
    students = 0
if not os.path.exists(import_dir + "/passwordfiles"):
    os.mkdir(import_dir + "passwordfiles", 0770)

all_classes = []
with open(user_list) as csvfile:
    # Detect the dialect of the csv file
    dialect = unicodecsv.Sniffer().sniff(csvfile.read(1024))
    csvfile.seek(0)
    # Create an array of dicts from it
    unicodecsv.register_dialect('oss', dialect)
    reader = unicodecsv.DictReader(csvfile, dialect='oss')
    for row in reader:
        fobj = open("/usr/share/oss/templates/password.html", "r")
        template = fobj.read().decode('utf8')
        fobj.close()
        uid = ""
        group = ""
        for field in reader.fieldnames:
            template = template.replace(field, escape(row[field]))
            if field == "UID" or field == "BENUTZERNAME" or field == "LOGIN":
                uid = row[field]
            if students == 1 and (field == "CLASS" or field == "KLASSE"):

def analyse_stream(self, byte_file_obj, **kwargs):
    """
    Analyse a stream of bytes and interpret it as a csv file.

    May want to revert back to this commit if things break:
    https://github.com/derwentx/WooGenerator/commit/c4fabf83d5b4d1e0a4d3ff755cd8eadf1433d253

    Arguments:
    ----
        byte_file_obj (io.IOBase):
            The byte stream to be analysed
        limit (int):
            The number of items to process from the stream
        dialect_suggestion (unicodecsv.Dialect, basestring, optional):
            A suggestion for the dialect to process the csv file as
        encoding (basestring, optional):
            The encoding of the file stream. Defaults to utf8
        stream_name:
            Used to differentiate this stream from others in debugging.

    Raises:
    ----
        UserWarning:
            When analyse_stream is called without clearing transients first
    """
    limit, dialect_suggestion, encoding, stream_name = (
        kwargs.get('limit'), kwargs.get('dialect_suggestion'),
        kwargs.get('encoding'), kwargs.get('stream_name'))

    if hasattr(self, 'rowcount') and self.rowcount > 1:
        warn = UserWarning(
            'rowcount should be 0. Make sure clear_transients is being '
            'called on ancestors')
        self.raise_exception(warn)

    if encoding is None:
        encoding = "utf8"

    if stream_name is None:
        if hasattr(byte_file_obj, 'name'):
            stream_name = byte_file_obj.name
        else:
            stream_name = 'stream'

    if self.DEBUG_PARSER:
        self.register_message(
            "Analysing stream: {0}, encoding: {1}".format(
                stream_name, encoding))

    # I can't imagine this having any problems
    byte_sample = SanitationUtils.coerce_bytes(byte_file_obj.read(1000))
    byte_file_obj.seek(0)

    if self.DEBUG_PARSER:
        self.register_message("dialect_suggestion: %s" % dialect_suggestion)

    if dialect_suggestion:
        csvdialect = UnicodeCsvDialectUtils.get_dialect_from_suggestion(
            dialect_suggestion)
    else:
        csvdialect = unicodecsv.Sniffer().sniff(byte_sample)
        assert csvdialect.delimiter == ',' and isinstance(
            csvdialect.delimiter, str)

    if self.DEBUG_PARSER:
        self.register_message(
            UnicodeCsvDialectUtils.dialect_to_str(csvdialect))

    unicodecsvreader = unicodecsv.reader(byte_file_obj,
                                         dialect=csvdialect,
                                         encoding=encoding,
                                         strict=True)
    return self.analyse_rows(unicodecsvreader,
                             file_name=stream_name,
                             limit=limit)

def build_schema(infile, outfile, delimiter=None, quotechar='"',
                 encoding=None, dataset_name=None,
                 base="https://iisg.amsterdam/"):
    """
    Build a CSVW schema based on the ``infile`` CSV file, and write the
    resulting JSON CSVW schema to ``outfile``. Takes various optional
    parameters for instructing the CSV reader, but is also quite good at
    guessing the right values.
    """
    url = os.path.basename(infile)
    # Get the current date and time (UTC)
    today = datetime.datetime.utcnow().strftime("%Y-%m-%d")

    if dataset_name is None:
        dataset_name = url

    if encoding is None:
        detector = UniversalDetector()
        with open(infile, 'rb') as f:
            for line in f.readlines():
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        encoding = detector.result['encoding']
        logger.info("Detected encoding: {} ({} confidence)".format(
            detector.result['encoding'], detector.result['confidence']))

    if delimiter is None:
        # Read only the header instead of the entire file to determine
        # the delimiter
        try:
            # Python 3
            with open(infile, 'r', errors='ignore') as csvfile:
                dialect = csv.Sniffer().sniff(csvfile.readline())
                csvfile.seek(0)
        except TypeError:
            # Python 2
            with open(infile, 'r') as csvfile:
                dialect = csv.Sniffer().sniff(csvfile.readline())
                csvfile.seek(0)
        logger.info("Detected dialect: {} (delimiter: '{}')".format(
            dialect, dialect.delimiter))
        delimiter = dialect.delimiter

    logger.info("Delimiter is: {}".format(delimiter))

    if base.endswith('/'):
        base = base[:-1]

    metadata = {
        u"@id": iribaker.to_iri(u"{}/{}".format(base, url)),
        u"@context": [
            u"https://raw.githubusercontent.com/CLARIAH/COW/master/csvw.json",
            {u"@language": u"en", u"@base": u"{}/".format(base)},
            get_namespaces(base)],
        u"url": url,
        u"dialect": {u"delimiter": delimiter,
                     u"encoding": encoding,
                     u"quoteChar": quotechar},
        u"dc:title": dataset_name,
        u"dcat:keyword": [],
        u"dc:publisher": {
            u"schema:name": u"CLARIAH Structured Data Hub - Datalegend",
            u"schema:url": {u"@id": u"http://datalegend.net"}
        },
        u"dc:license": {u"@id": u"http://opendefinition.org/licenses/cc-by/"},
        u"dc:modified": {u"@value": today, u"@type": u"xsd:date"},
        u"tableSchema": {
            u"columns": [],
            u"primaryKey": None,
            u"aboutUrl": u"{_row}"
        }
    }

    with io.open(infile, 'rb') as infile_file:
        r = csv.reader(infile_file, delimiter=delimiter, quotechar=quotechar)
        try:
            # Python 2
            header = r.next()
        except AttributeError:
            # Python 3
            header = next(r)

    logger.info(u"Found headers: {}".format(header))

    if u'' in header:
        logger.warning(
            "WARNING: You have one or more empty column headers in your CSV "
            "file. Conversion might produce incorrect results because of "
            "conflated URIs or worse")
    if len(set(header)) < len(header):
        logger.warning(
            "WARNING: You have two or more column headers that are "
            "syntactically the same. Conversion might produce incorrect "
            "results because of conflated URIs or worse")

    # The first column is the primary key
    metadata[u'tableSchema'][u'primaryKey'] = header[0]

    for head in header:
        col = {
            u"@id": iribaker.to_iri(
                u"{}/{}/column/{}".format(base, url, head)),
            u"name": head,
            u"titles": [head],
            u"dc:description": head,
            u"datatype": u"string"
        }
        metadata[u'tableSchema'][u'columns'].append(col)

    with open(outfile, 'w') as outfile_file:
        outfile_file.write(json.dumps(metadata, indent=True))

    logger.info("Done")
    return

def csv_file_dialect(fullpath):
    """Detect the dialect of a CSV or TXT data file.
    parameters:
        fullpath - full path to the file to process (required)
    returns:
        dialect - a csv.Dialect object with the detected attributes
    """
    if fullpath is None or len(fullpath) == 0:
        logging.debug('No file given in csv_file_dialect().')
        return False

    # Cannot function without an actual file where the full path points
    if not os.path.isfile(fullpath):
        logging.debug('File %s not found in csv_file_dialect().' % fullpath)
        return None

    # Let's look at up to readto bytes from the file
    readto = 4096
    filesize = os.path.getsize(fullpath)
    if filesize < readto:
        readto = filesize

    with open(fullpath, 'rb') as file:
        # Try to read the specified part of the file
        try:
            buf = file.read(readto)
            s = 'csv_file_dialect()'
            s += ' buf:\n%s' % buf
            logging.debug(s)
            # Make a determination based on the existence of tabs in the
            # buffer, as the Sniffer is not particularly good at detecting
            # TSV file formats. So, if the buffer has a tab in it, let's
            # treat it as a TSV file.
            if buf.find('\t') > 0:
                return tsv_dialect()
            # Otherwise let's see what we can find invoking the Sniffer.
            dialect = csv.Sniffer().sniff(buf)
        except csv.Error:
            # Something went wrong, so let's try to read a few lines from
            # the beginning of the file
            try:
                file.seek(0)
                s = 'csv_file_dialect()'
                s += ' Re-sniffing with tab to %s' % (readto)
                logging.debug(s)
                sample_text = ''.join(file.readline()
                                      for x in xrange(2, 4, 1))
                dialect = csv.Sniffer().sniff(sample_text)
            # Sorry, couldn't figure it out
            except csv.Error:
                logging.debug('Unable to determine csv dialect')
                return None

    # Fill in some standard values for the remaining dialect attributes
    if dialect.escapechar is None:
        dialect.escapechar = '/'
    dialect.skipinitialspace = True
    dialect.strict = False
    return dialect

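# A hedged usage sketch for csv_file_dialect above, assuming the Python 2
# environment the function itself targets (it uses xrange and byte/str
# mixing). The throwaway file is made up; tsv_dialect() is assumed to be
# defined elsewhere in the same module.
import tempfile

with tempfile.NamedTemporaryFile(mode='w', suffix='.csv',
                                 delete=False) as tmp:
    tmp.write('a,b,c\n1,2,3\n')

dialect = csv_file_dialect(tmp.name)
if dialect is not None:
    print(dialect.delimiter)  # ',' for the sample written above
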
def parse_and_import(
        self,
        file_upload: FileUpload,
        headers: Dict[str, int],
        has_headers: Optional[bool] = None,
        # TODO: maybe it is better to accept a dialect to give more
        # options to configure
        delimiter: Optional[str] = None,
        encoding: str = settings.DEFAULT_CHARSET,
        allow_update: bool = True,
        atomic: bool = False,
        create_failed_rows_file: bool = False,
        detailed_errors_limit: int = 20,
        campaign: Optional[Campaign] = None,
        contact_list: Optional[ContactList] = None) -> ImportResult:
    indexes = {index: header for header, index in headers.items()}
    with file_upload.open() as csv_file:
        csv_format_opts = dict(
            dialect=unicodecsv.excel,
            encoding=encoding,
        )
        try:
            if has_headers is None:
                has_headers = unicodecsv.Sniffer().has_header(
                    csv_file.read(1024).decode(encoding))
                csv_file.seek(0)
            if delimiter is None:
                dialect = unicodecsv.Sniffer().sniff(
                    csv_file.read(1024).decode(encoding))
                csv_format_opts['dialect'] = dialect
                csv_file.seek(0)
            else:
                csv_format_opts['delimiter'] = delimiter
            csv_reader = unicodecsv.reader(csv_file, **csv_format_opts)
            header = next(csv_reader) if has_headers else None
            process_rows = partial(self._process_rows, csv_reader, indexes,
                                   allow_update, atomic,
                                   detailed_errors_limit)
        except (UnicodeDecodeError, unicodecsv.Error) as e:
            raise ParsingException(str(e)) from e

        failed_rows_file_upload = None
        with transaction.atomic(savepoint=False):
            if not create_failed_rows_file:
                created_contacts, updated_contacts, skipped_contacts, errors = \
                    process_rows(None)
            else:
                with tempfile.TemporaryFile() as fp, transaction.atomic(
                        savepoint=False):
                    csv_writer = unicodecsv.writer(fp, **csv_format_opts)
                    if header:
                        csv_writer.writerow(header)
                    created_contacts, updated_contacts, skipped_contacts, errors = \
                        process_rows(csv_writer.writerow)
                    if errors:
                        fp.seek(0)
                        failed_rows_file_upload = FileUpload.objects.create(
                            owner=file_upload.owner,
                            uploader=FileUploader.SYSTEM,
                            ttl=datetime.timedelta(days=2),
                            file=File(
                                fp,
                                "failed-rows-from-%s" % file_upload.name))
            if campaign:
                participating = set(
                    campaign.contacts.values_list('id', flat=True))
                Participation.objects.bulk_create(
                    (Participation(
                        contact_id=contact_id,
                        campaign=campaign,
                    ) for contact_id in chain(
                        created_contacts,
                        filter(
                            lambda contact_id:
                            contact_id not in participating,
                            updated_contacts))))
            if contact_list:
                contact_list.contacts.add(*created_contacts)
                contact_list.contacts.add(*updated_contacts)
    return ImportResult(len(created_contacts), len(updated_contacts),
                        len(skipped_contacts), errors,
                        failed_rows_file_upload)

def parse_csv(myfile, newsletter, ignore_errors=False):
    """
    Parse addresses from a CSV file-object into the newsletter.

    Returns a dictionary mapping email addresses to Subscription objects.
    """
    import unicodecsv

    encoding = get_encoding(myfile)

    # Attempt to detect the dialect
    # Ref: https://bugs.python.org/issue5332
    encodedfile = io.TextIOWrapper(myfile, encoding=encoding, newline='')
    dialect = unicodecsv.Sniffer().sniff(encodedfile.read(1024))

    # Reset the file index
    myfile.seek(0)

    logger.info('Detected encoding %s and dialect %s for CSV file',
                encoding, dialect)

    myreader = unicodecsv.reader(myfile, dialect=dialect, encoding=encoding)

    firstrow = next(myreader)

    # Find the name column
    colnum = 0
    namecol = None
    for column in firstrow:
        if "name" in column.lower() or _("name") in column.lower():
            namecol = colnum

            if "display" in column.lower() or \
                    _("display") in column.lower():
                break

        colnum += 1

    if namecol is None:
        raise forms.ValidationError(_(
            "Name column not found. The name of this column should be "
            "either 'name' or '%s'.") % _("name")
        )

    logger.debug("Name column found: '%s'", firstrow[namecol])

    # Find the email column
    colnum = 0
    mailcol = None
    for column in firstrow:
        if 'email' in column.lower() or \
                'e-mail' in column.lower() or \
                _("e-mail") in column.lower():
            mailcol = colnum
            break

        colnum += 1

    if mailcol is None:
        raise forms.ValidationError(_(
            "E-mail column not found. The name of this column should be "
            "either 'email', 'e-mail' or '%(email)s'.") %
            {'email': _("e-mail")}
        )

    logger.debug("E-mail column found: '%s'", firstrow[mailcol])

    if namecol == mailcol:
        raise forms.ValidationError(
            _(
                "Could not properly determine the proper columns in the "
                "CSV-file. There should be a field called 'name' or "
                "'%(name)s' and one called 'e-mail' or '%(email)s'."
            ) % {
                "name": _("name"),
                "email": _("e-mail")
            }
        )

    logger.debug('Extracting data.')

    address_list = AddressList(newsletter, ignore_errors)

    for row in myreader:
        if not max(namecol, mailcol) < len(row):
            logger.warning(
                "Column count does not match for row number %d",
                myreader.line_num, extra=dict(data={'row': row})
            )
            if ignore_errors:
                # Skip this record
                continue
            else:
                raise forms.ValidationError(_(
                    "Row with content '%(row)s' does not contain a name "
                    "and email field.") % {'row': row}
                )

        address_list.add(
            row[mailcol], row[namecol],
            location="line %d" % myreader.line_num
        )

    return address_list.addresses

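# A hedged, standalone sketch of the header detection parse_csv performs:
# given a first row, locate the name and e-mail columns the same way. The
# row is made up, and gettext plus the Django machinery are deliberately
# left out.
firstrow = ['Display name', 'E-mail address', 'City']
namecol = next(i for i, c in enumerate(firstrow) if 'name' in c.lower())
mailcol = next(i for i, c in enumerate(firstrow)
               if 'email' in c.lower() or 'e-mail' in c.lower())
print(namecol, mailcol)  # 0 1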