def read_csv(args):
    """Parse ``args.csvfile`` as strict unix-dialect CSV.

    The first row is handed to ``read_header`` (together with the
    module-level ``active``); every remaining row goes to ``read_rows``.
    """
    strict_dialect = csv.unix_dialect()
    strict_dialect.strict = True
    with open(args.csvfile, newline='') as handle:
        rows = csv.reader(handle, dialect=strict_dialect)
        header = next(rows)
        read_header(header, active)
        read_rows(rows)
def compute_dialect(self):
    """Determine the CSV dialect of ``self.f`` and store it on ``self.dialect``.

    Sniffs the first 4 KiB of the file (falling back to the unix dialect),
    honours a user-supplied ``delimiter`` form parameter, and works around
    the one-column sniffing issue by substituting a delimiter that does
    not occur in ``self.content``.

    Raises BadRequest when the payload cannot be decoded with
    ``self.input_encoding`` or when no usable delimiter can be found.
    """
    try:
        sample = self.f.read(4096).decode(self.input_encoding)
    except (LookupError, UnicodeDecodeError):
        raise BadRequest('Unknown encoding {}'.format(self.input_encoding))
    try:
        dialect = csv.Sniffer().sniff(sample)
    except csv.Error:
        dialect = csv.unix_dialect()
    self.f.seek(0)
    # Escape double quotes with double quotes if needed.
    # See 2.7 in http://tools.ietf.org/html/rfc4180
    dialect.doublequote = True
    requested = self.request.form.get('delimiter')
    if requested:
        dialect.delimiter = requested
    # See https://github.com/etalab/addok/issues/90#event-353675239
    # and http://bugs.python.org/issue2078:
    # one column files will end up with non-sense delimiters.
    if dialect.delimiter.isalnum():
        # Probably a one-column file: pick a character that does not
        # occur anywhere in the content so rows stay whole.
        fallback = next(
            (char for char in '|~^°' if char not in self.content), None)
        if fallback is None:
            raise BadRequest(self.MISSING_DELIMITER_MSG)
        dialect.delimiter = fallback
    self.dialect = dialect
def load(self, path):
    """Read *path* (decoded with ``self.encoding``) and return a
    DictReader over its lines, sniffing the dialect with a
    unix-dialect fallback."""
    with codecs.open(path, 'r', encoding=self.encoding) as handle:
        text = handle.read()
    try:
        detected = csv.Sniffer().sniff(text)
    except csv.Error:
        detected = csv.unix_dialect()
    return csv.DictReader(text.splitlines(), dialect=detected)
def load(self, path):
    """Return a DictReader over the CSV at *path*.

    The dialect is guessed from the first 4 KiB of the file; when
    sniffing fails the unix dialect is used instead.
    """
    with open(path, 'r') as handle:
        head = handle.read(4096)
        try:
            guessed = csv.Sniffer().sniff(head)
        except csv.Error:
            guessed = csv.unix_dialect()
        handle.seek(0)
        text = handle.read()
    return csv.DictReader(text.splitlines(), dialect=guessed)
def from_csv(cls, csv_path):
    """Build an instance from *csv_path*: the working directory is the
    file's parent and the parsed rows become the media list."""
    working_dir = op.dirname(csv_path)
    with codecs.open(csv_path, 'r') as handle:
        text = handle.read()
    try:
        guessed = csv.Sniffer().sniff(text)
    except csv.Error:
        guessed = csv.unix_dialect()
    medias = list(csv.DictReader(text.splitlines(), dialect=guessed))
    return cls(working_dir, medias)
def load_csv(path, encoding='utf-8'):
    """Open the CSV at *path* and return a DictReader over its lines.

    The dialect is sniffed from the first 4 KiB with a unix-dialect
    fallback.  Calls ``abort`` when the path does not exist.
    """
    path = Path(path)
    if not path.exists():
        abort('Path does not exist: {}'.format(path))
    with path.open(encoding=encoding) as handle:
        head = handle.read(4096)
        try:
            guessed = csv.Sniffer().sniff(head)
        except csv.Error:
            guessed = csv.unix_dialect()
        handle.seek(0)
        text = handle.read()
    return csv.DictReader(text.splitlines(), dialect=guessed)
def load_csv(path_or_file, encoding='utf-8'):
    """Return a DictReader over *path_or_file*.

    Accepts either a filesystem path (str/Path) or an already-open file
    object.  The dialect is sniffed from the first 4 KiB with a
    unix-dialect fallback.  Calls ``abort`` when a given path does not
    exist.

    Fix: the original closed the file only on the success path, leaking
    the handle when sniffing or reading raised; the try/finally closes
    it unconditionally (matching the original's behavior of also
    closing caller-supplied file objects).
    """
    if isinstance(path_or_file, (str, Path)):
        path = Path(path_or_file)
        if not path.exists():
            abort('Path does not exist: {}'.format(path))
        path_or_file = path.open(encoding=encoding)
    try:
        extract = path_or_file.read(4096)
        try:
            dialect = csv.Sniffer().sniff(extract)
        except csv.Error:
            dialect = csv.unix_dialect()
        path_or_file.seek(0)
        content = path_or_file.read()
    finally:
        path_or_file.close()
    return csv.DictReader(content.splitlines(), dialect=dialect)
def split(self, outputdir, no_of_parts=None):
    """Split the CSV file at ``self.file_or_dir`` into part files under
    *outputdir*, delegating each part to ``self._write_part_to_file``.

    ``no_of_parts`` defaults to roughly one part per 2 MB of input.
    Raises ValueError when ``self.file_or_dir`` is not a regular file.
    """
    if not os.path.isfile(self.file_or_dir):
        raise ValueError(
            "The constructor argument file_or_dir {} must be a file to invoke this operation."
            .format(self.file_or_dir))
    # Approximate each line is 2KB
    # NOTE(review): the comment above says 2KB per line but the code
    # uses 1 * KB -- confirm which estimate is intended.
    KB = 1024
    approx_size_of_each_line = 1 * KB
    approx_total_lines = os.path.getsize(
        self.file_or_dir) / approx_size_of_each_line
    # Get number of lines per part
    MB = 2 * (KB * KB)
    no_of_parts = no_of_parts or int(
        os.path.getsize(self.file_or_dir) / MB) + 1
    no_lines_per_part = int(approx_total_lines / no_of_parts) + 1
    self._logger.info("Dividing file {} into estimated {} parts".format(
        self.file_or_dir, no_of_parts))
    with open(self.file_or_dir, encoding=self.encoding) as handle:
        dialect = csv.unix_dialect()
        # # TODO: For some reason sniff doesnt pickup quote all.., hence hardcoded ..
        dialect.quoting = csv.QUOTE_ALL
        # NOTE(review): this dialect is only handed to
        # _write_part_to_file below; the reader uses the raw
        # delimiter/quotechar settings instead.
        csv_reader = csv.reader(handle,
                                delimiter=self.delimiter,
                                quotechar=self.quote_character)
        # Skip first line if header
        header = None
        if self.has_header:
            header = next(csv_reader)
        # Count the number of lines
        part_index = 0
        end_of_file = False
        # _write_part_to_file consumes up to no_lines_per_part rows per
        # call and reports whether the reader is exhausted.
        while (not end_of_file):
            part_index = part_index + 1
            part_name = "{}_part_{:03d}.csv".format(
                os.path.basename(self.file_or_dir), part_index)
            output_part = os.path.join(outputdir, part_name)
            end_of_file = self._write_part_to_file(csv_reader, output_part,
                                                   no_lines_per_part, header,
                                                   dialect)
    self._logger.info("Completed dividing files sucessfully")
def compute_dialect(self, req, file_, content, encoding):
    """Guess and return the CSV dialect for an uploaded file.

    Sniffs the first 4 KiB of ``file_.file`` (unix dialect on failure),
    then honours the request's ``delimiter`` and ``quote`` parameters
    and works around the one-column sniffing issue by substituting a
    delimiter absent from *content*.

    Raises falcon.HTTPBadRequest when the payload cannot be decoded
    with *encoding* or when no usable delimiter can be found.
    """
    try:
        sample = file_.file.read(4096).decode(encoding)
    except (LookupError, UnicodeDecodeError) as e:
        msg = 'Unable to decode with encoding "{}"'.format(encoding)
        raise falcon.HTTPBadRequest(msg, str(e))
    try:
        dialect = csv.Sniffer().sniff(sample)
    except csv.Error:
        dialect = csv.unix_dialect()
    file_.file.seek(0)
    # Escape double quotes with double quotes if needed.
    # See 2.7 in http://tools.ietf.org/html/rfc4180
    dialect.doublequote = True
    delimiter = req.get_param('delimiter')
    if delimiter:
        dialect.delimiter = delimiter
    quote = req.get_param('quote')
    if quote:
        dialect.quotechar = quote
    # See https://github.com/etalab/addok/issues/90#event-353675239
    # and http://bugs.python.org/issue2078:
    # one column files will end up with non-sense delimiters.
    if dialect.delimiter.isalnum():
        # Probably a one-column file: pick a character that does not
        # occur anywhere in the content so rows stay whole.
        replacement = next(
            (char for char in '|~^°' if char not in content), None)
        if replacement is None:
            raise falcon.HTTPBadRequest(self.MISSING_DELIMITER_MSG,
                                        self.MISSING_DELIMITER_MSG)
        dialect.delimiter = replacement
    return dialect
def __init__(self, csvfn=None):
    '''MAPPING - Python access to the SBEM-uCT-VSD mapping
    map = MAPPING(csvfn) opens the given mapping file.
    map = MAPPING() uses the default file in the em170428 directory.
    Result has fields roi2can, roi2uct, roi2sbem that map ROI numbers
    to other IDs; sbem2can, sbem2uct, sbem2roi, sbem2roiid, sbem2tname
    that map SBEM ID to other IDs, and uct2can, uct2roi, uct2sbem that
    map uCT ID numbers to other IDs.'''
    # Start every mapping out empty.
    for attr in ('roi2can', 'roi2uct', 'roi2sbem',
                 'sbem2can', 'sbem2uct', 'sbem2roi', 'sbem2roiid',
                 'sbem2tname',
                 'uct2can', 'uct2roi', 'uct2sbem',
                 'can2sbem', 'can2roi', 'can2uct',
                 'roiid2roi', 'roi2roiid'):
        setattr(self, attr, {})
    if csvfn is None:
        here = os.path.dirname(__file__)
        csvfn = here + '/../data/mapping.csv'
    with open(csvfn) as f:
        rows = list(csv.reader(f, csv.unix_dialect()))
    rows.pop(0)  # drop the header row
    for row in rows:
        roi = self.convert_to_number(row[0])
        roiid = row[1]
        can = row[2]
        uct = self.convert_to_number(row[3])
        sbem = self.convert_to_number(row[5])
        tname = row[6]
        if can == '':
            can = None
        if roiid == '':
            # Without a ROI ID the ROI number is treated as absent too.
            roiid = None
            roi = None
        if roi is not None:
            self.roi2can[roi] = can
            self.roi2uct[roi] = uct
            self.roi2sbem[roi] = sbem
            self.roiid2roi[roiid] = roi
            self.roi2roiid[roi] = roiid
        if sbem is not None:
            self.sbem2can[sbem] = can
            self.sbem2uct[sbem] = uct
            self.sbem2roi[sbem] = roi
            self.sbem2roiid[sbem] = roiid
            self.sbem2tname[sbem] = tname
        if uct is not None:
            self.uct2can[uct] = can
            self.uct2sbem[uct] = sbem
            self.uct2roi[uct] = roi
        if can is not None:
            if can in self.can2roi:
                print(f'Duplicate can: {can}')
            self.can2sbem[can] = sbem
            self.can2uct[can] = uct
            self.can2roi[can] = roi
def __init__(self, csvfn=None):
    '''CONFIDENCE - Python access to the SBEM-uCT-VSD confidence file
    conf = CONFIDENCE(csvfn) opens the given confidence file.
    conf = CONFIDENCE() uses the default file in the em170428 directory.
    Result has several fields that each are dicts with tree IDs as keys:
      uctid (numeric)
      vsdid (letters)
      canoid
      sbemconf (0-100) - confidence of tracing
      vsductconf (0-100) - confidence of vsd to uct mapping
      gmapconf (0-100) - confidence of gmapimg results'''
    self.uctid = {}
    self.vsdid = {}
    self.canoid = {}
    self.sbemconf = {}
    self.vsductconf = {}
    self.gmapconf = {}
    self.vsd2tree = {}
    if csvfn is None:
        here = os.path.dirname(__file__)
        csvfn = here + '/../data/confidence.csv'
    with open(csvfn) as f:
        lines = list(csv.reader(f, csv.unix_dialect()))
    hdr = lines.pop(0)  # header row, unused
    # Tree-ID column format: "<treeid> (<uctid>[-<vsdid>])", the
    # parenthesised part optional.  Fix: raw string -- the original
    # pattern used '\d'/'\s' in a plain string, which are invalid
    # escape sequences (DeprecationWarning, future SyntaxError).
    r = re.compile(r'^(\d+)\s*(\((\d+)(-([a-z]+))?\))?$')
    for l in lines:
        ids = l[0]
        cano = l[1]
        sbemc = l[4]
        vumapc = l[5]
        gmapc = l[6]
        m = r.match(ids)
        if not m:
            # Unparseable tree id: skip the row (matches original).
            continue
        try:
            # Group 1 always matches when m is truthy.
            tid = int(m.group(1))
            uctid = m.group(3)
            if uctid is not None:
                uctid = int(uctid)
            vsdid = m.group(5)
            self.uctid[tid] = uctid
            self.vsdid[tid] = vsdid
            self.vsd2tree[vsdid] = tid
            self.canoid[tid] = cano
            if sbemc == '':
                self.sbemconf[tid] = 100
                print('Caution: no sbemconf for', ids, '- assuming 100%')
            else:
                self.sbemconf[tid] = int(sbemc)
            if vumapc == '-' or vumapc == '':
                self.vsductconf[tid] = None
            else:
                self.vsductconf[tid] = int(vumapc)
            if gmapc == '-' or gmapc == '' or gmapc == '?':
                self.gmapconf[tid] = None
            else:
                self.gmapconf[tid] = int(gmapc)
        except Exception:
            # Fix: narrowed from a bare except so KeyboardInterrupt and
            # SystemExit are not intercepted; still logs and re-raises.
            print('Something wrong at', ids)
            raise
from simplecrypt import decrypt, encrypt
from collections import namedtuple
from io import StringIO
from os import rename
from getpass import getpass
import csv

#TODO: use logging
def debug(*args, **kwargs):
    """Print *args* prefixed with 'DEBUG:', defaulting to stderr."""
    # NOTE(review): 'stderr' is never imported in this preamble -- this
    # raises NameError unless another part of the file does
    # 'from sys import stderr'.  Confirm and add the import.
    if not 'file' in kwargs:
        kwargs['file'] = stderr
    return print('DEBUG:', *args, **kwargs)

# Shared CSV dialect and on-disk text encoding for the database file.
DIALECT = csv.unix_dialect()
ENCODING = 'ascii'

# A database is its column names, the per-row namedtuple type, and the rows.
Database = namedtuple('Database', ['properties', 'Record', 'items'])

def loadDatabase(file_name, password):
    """Decrypt *file_name* with *password* and parse it as CSV; the
    first row names the columns of the Record namedtuple."""
    #TODO: catch
    with open(file_name, 'rb') as f:
        data = decrypt(password, f.read()).decode(ENCODING)
    data = data.rstrip().split(DIALECT.lineterminator)
    reader = csv.reader(data, dialect=DIALECT)
    props = tuple(next(reader))
    Record = namedtuple('Record', props)
    # NOTE(review): definition continues beyond this chunk of the file.
    database = Database(
        properties=props,