def __init__(self, n_samples, out_file, pid, lid, keep_cols, n_retry,
             delimiter, dataset, pred_name, ui, file_context, fast_mode,
             encoding, skip_row_id, output_delimiter):
    """Store batch-scoring run parameters and resolve the CSV dialects.

    NOTE(review): the 'dataset_dialect' and 'writer_dialect' names must
    already have been registered (by investigate_encoding_and_dialect in
    utils) before this constructor runs, otherwise csv.get_dialect raises
    csv.Error — confirm call order.
    """
    self.n_samples = n_samples
    self.out_file = out_file
    self.project_id = pid
    self.model_id = lid
    self.keep_cols = keep_cols
    self.n_retry = n_retry
    self.delimiter = delimiter
    self.dataset = dataset
    self.pred_name = pred_name
    self.out_stream = None  # opened later, not in the constructor
    self._ui = ui
    self.file_context = file_context
    self.fast_mode = fast_mode
    self.encoding = encoding
    self.skip_row_id = skip_row_id
    self.output_delimiter = output_delimiter
    # dataset_dialect and writer_dialect are set by
    # investigate_encoding_and_dialect in utils
    self.dialect = csv.get_dialect('dataset_dialect')
    self.writer_dialect = csv.get_dialect('writer_dialect')
    self.scoring_succeeded = False  # Removes shelves when True
    self.is_open = False  # Removes shelves when True
def __init__(self):
    """Wire up the printer driver: callbacks, stdio CSV streams, verb map."""
    self.printer = printer.printer(self.printer_onlineCallback,
                                   self.printer_offlineCallback,
                                   self.printer_positionCallback,
                                   self.printer_temperatureCallback,
                                   self.printer_receiveCallback,
                                   self.printer_sendCallback)
    self.printProcess = printer.printprocess(
        self.printer, self.printProcess_showImageCallback,
        self.printProcess_hideImageCallback,
        self.printProcess_startedPrintingCallback,
        self.printProcess_finishedPrintingCallback)
    self.isOnline = False
    self.isPrinting = False
    # NOTE(review): assumes the 'StdioPrinterDriverCustom' dialect was
    # registered elsewhere before this constructor runs — confirm.
    self.reader = csv.reader(sys.stdin,
                             csv.get_dialect('StdioPrinterDriverCustom'))
    self.writer = csv.writer(sys.stdout,
                             csv.get_dialect('StdioPrinterDriverCustom'))
    # Dispatch table mapping command verbs read from stdin to handlers.
    self.verbMap = {
        'move': self.move,
        'moveTo': self.moveTo,
        'home': self.home,
        'lift': self.lift,
        'askTemp': self.askTemp,
        'send': self.send,
        'queryOnline': self.queryOnline,
        'queryPrinting': self.queryPrinting,
        'stopPrinting': self.stopPrinting,
        'terminate': self.terminate
    }
def discover_dialect(sample, dialect=None, **kwargs):
    """
    Discover the CSV dialect of *sample* and return it as a plain dict.

    >>> s = '''
    ... 1,1
    ... 2,2'''
    >>> discover_dialect(s) # doctest: +SKIP
    {'escapechar': None, 'skipinitialspace': False, 'quoting': 0, 'delimiter': ',', 'lineterminator': '\r\n', 'quotechar': '"', 'doublequote': False}
    """
    if isinstance(dialect, py2help._strtypes):
        dialect = csv.get_dialect(dialect)
    if not dialect:
        try:
            dialect = csv.Sniffer().sniff(sample)
        except csv.Error:
            # Sniffing fails on short/ambiguous samples; fall back to the
            # standard Excel dialect.  (Previously a bare `except:` which
            # also swallowed KeyboardInterrupt/SystemExit.)
            dialect = csv.get_dialect("excel")
    # Convert dialect to a dictionary of its public attributes
    dialect = dict((key, getattr(dialect, key))
                   for key in dir(dialect) if not key.startswith("_"))
    # Update dialect with any keyword arguments passed in
    # E.g. allow user to override with delimiter=','
    for k, v in kwargs.items():
        if k in dialect:
            dialect[k] = v
    return dialect
def __init__(self, n_samples, out_file, pid, lid, keep_cols, n_retry,
             delimiter, dataset, pred_name, ui, file_context, fast_mode,
             encoding, skip_row_id, output_delimiter, pred_threshold_name,
             pred_decision_name, max_prediction_explanations,
             ):
    """Store batch-scoring run parameters and resolve the CSV dialects.

    NOTE(review): 'dataset_dialect' and 'writer_dialect' must already be
    registered (by investigate_encoding_and_dialect in utils) before this
    runs, otherwise csv.get_dialect raises csv.Error — confirm call order.
    """
    self.n_samples = n_samples
    self.out_file = out_file
    self.project_id = pid
    self.model_id = lid
    self.keep_cols = keep_cols
    self.n_retry = n_retry
    self.delimiter = delimiter
    self.dataset = dataset
    self.pred_name = pred_name
    self.pred_threshold_name = pred_threshold_name
    self.pred_decision_name = pred_decision_name
    self.out_stream = None  # opened later, not in the constructor
    self._ui = ui
    self.file_context = file_context
    self.fast_mode = fast_mode
    self.encoding = encoding
    self.skip_row_id = skip_row_id
    self.output_delimiter = output_delimiter
    # dataset_dialect and writer_dialect are set by
    # investigate_encoding_and_dialect in utils
    self.dialect = csv.get_dialect('dataset_dialect')
    self.writer_dialect = csv.get_dialect('writer_dialect')
    self.scoring_succeeded = False  # Removes shelves when True
    self.is_open = False  # Removes shelves when True
    self.max_prediction_explanations = max_prediction_explanations
def discover_dialect(sample, dialect=None, **kwargs):
    """
    Discover CSV dialect from string sample

    Returns dict
    """
    if isinstance(dialect, compatibility._strtypes):
        dialect = csv.get_dialect(dialect)
    if not dialect:
        try:
            dialect = csv.Sniffer().sniff(sample)
        except csv.Error:
            # Sniffing fails on short/ambiguous samples; fall back to the
            # standard Excel dialect.  (Previously a bare `except:` which
            # also swallowed KeyboardInterrupt/SystemExit.)
            dialect = csv.get_dialect('excel')
    # Convert dialect to a dictionary of its public attributes
    dialect = dict((key, getattr(dialect, key))
                   for key in dir(dialect) if not key.startswith('_'))
    # Update dialect with any keyword arguments passed in
    # E.g. allow user to override with delimiter=','
    for k, v in kwargs.items():
        if k in dialect:
            dialect[k] = v
    return dialect
def register_dialect(cls):
    """Ensure the dialect declared on cls.Meta is registered with csv.

    Idempotent: if a dialect with the same name is already known to the
    csv module, this is a no-op.
    """
    meta_dialect = cls.Meta.dialect
    assert issubclass(meta_dialect, BaseTimeRecordDialect)
    name = meta_dialect.dialect_name
    try:
        csv.get_dialect(name)
    except csv.Error:
        # Unknown name — register it now.
        csv.register_dialect(name, meta_dialect)
def __init__(self, path, mode='r', schema=None, dialect=None,
             has_header=None, **kwargs):
    """Describe a CSV file: path, schema, dialect and header presence.

    Args:
        path: path of an existing CSV file.
        mode: file open mode used while probing the file.
        schema: datashape string/DataShape describing one record.
        dialect: csv dialect name; sniffed from the file when None.
        has_header: whether the file has a header row; sniffed when None.
        **kwargs: per-attribute dialect overrides, e.g. delimiter=','.

    Raises:
        ValueError: if *path* does not exist.
        TypeError: if *schema* cannot be converted to a record dshape.
    """
    if not os.path.isfile(path):
        raise ValueError('CSV file "%s" does not exist' % path)
    self.path = path
    self.mode = mode
    # `with` guarantees the probe handle is closed even when the schema or
    # dialect handling below raises (the old code leaked it on error).
    with open(path, mode=self.mode) as csvfile:
        # Handle Schema
        if isinstance(schema, py2help._strtypes):
            schema = datashape.dshape(schema)
        if isinstance(schema, datashape.DataShape) and len(schema) == 1:
            schema = schema[0]
        if not isinstance(schema, datashape.Record):
            raise TypeError(
                'schema cannot be converted into a blaze record dshape')
        self.schema = str(schema)
        # Handle Dialect
        if dialect is None:
            # Guess the dialect from a 1KB sample
            try:
                dialect = csv.Sniffer().sniff(csvfile.read(1024))
            except csv.Error:
                # Cannot guess dialect. Assume Excel.  (Was a bare except.)
                dialect = csv.get_dialect('excel')
            csvfile.seek(0)
        else:
            dialect = csv.get_dialect(dialect)
        self.dialect = dict((key, getattr(dialect, key))
                            for key in dir(dialect)
                            if not key.startswith('_'))
        # Update dialect with any keyword arguments passed in
        # E.g. allow user to override with delimiter=','
        for k, v in kwargs.items():
            if k in self.dialect:
                self.dialect[k] = v
        # Handle Header
        if has_header is None:
            # Guess whether the file has a header or not
            csvfile.seek(0)
            self.has_header = csv.Sniffer().has_header(csvfile.read(1024))
        else:
            self.has_header = has_header
def _get_dialect(self) -> csv.Dialect:
    """Resolve ``self._dialect`` to a csv.Dialect instance.

    Accepts None (default 'excel'), a Dialect instance, or a registered
    dialect name.

    Raises:
        ValueError: if the name is not a registered dialect.
    """
    # Get default format parameters from dialect
    attr = self._dialect
    if attr is None:
        dialect = csv.get_dialect('excel')  # Default dialect
    elif isinstance(attr, csv.Dialect):
        dialect = attr  # type: ignore
    elif attr in csv.list_dialects():
        dialect = csv.get_dialect(attr)
    else:
        # Fixed typo in the error message ("unkown" -> "unknown").
        raise ValueError(f"unknown CSV-dialect '{attr}'")
    return dialect  # type: ignore
def sniff(filestream):
    """Collect non-comment lines from *filestream* and guess their dialect.

    Lines starting with '#' and empty lines are dropped.  When every
    remaining line contains a tab, the excel-tab dialect is chosen;
    otherwise the joined sample is sniffed, falling back to plain excel.

    Returns:
        (lines, dialect) tuple.
    """
    ##sample = csv.Sniffer().sniff(filestream.read(1024))
    lines = [ln for ln in filestream if ln and not ln.startswith("#")]
    if all("\t" in ln for ln in lines):
        return (lines, csv.get_dialect("excel-tab"))
    sample = "\n".join(lines)
    try:
        dialect = csv.Sniffer().sniff(sample)
    except Exception:
        print("Could not determine delimiter type, proceedings as excel csv",
              file=sys.stderr)
        dialect = csv.get_dialect("excel")
    return (lines, dialect)
def get_reader(self, reader_class=namedtuple_csv_reader):
    """Build a CSV reader for the uploaded file using the form's options.

    Uses the dialect selected on the form; when set to (or defaulted to)
    'autodetect', sniffs the first KB of the file and falls back to the
    'excel' dialect for delimiters outside tab/comma/semicolon.  The
    selected encoding is forwarded only for namedtuple_csv_reader.
    """
    f = self.files['file']
    d = self.cleaned_data['dialect']
    if not d:
        d = "autodetect"
    if d == 'autodetect':
        dialect = csv.Sniffer().sniff(f.read(1024))
        f.seek(0)
        # Distrust sniffed delimiters outside the common set.
        if dialect.delimiter not in "\t,;":
            dialect = csv.get_dialect('excel')
    else:
        dialect = csv.get_dialect(d)
    enc = self.cleaned_data['encoding']
    # Only namedtuple_csv_reader accepts an 'encoding' keyword argument.
    encoding = ({'encoding': ENCODINGS[int(enc)]}
                if enc and reader_class == namedtuple_csv_reader else {})
    return reader_class(f, dialect=dialect, **encoding)
def open(self):
    """Open the run context: re-register dialects, open the shelve and output.

    Idempotent: a second call on an already-open context only logs.
    """
    if self.is_open:
        self._ui.debug('OPEN CALLED ON ALREADY OPEN RUNCONTEXT')
        return
    self.is_open = True
    self._ui.debug('OPEN CALLED ON RUNCONTEXT')
    # self.dialect / self.writer_dialect arrive as plain attribute dicts
    # (see close()); re-register them and swap back to Dialect objects.
    csv.register_dialect('dataset_dialect', **self.dialect)
    csv.register_dialect('writer_dialect', **self.writer_dialect)
    self.dialect = csv.get_dialect('dataset_dialect')
    self.writer_dialect = csv.get_dialect('writer_dialect')
    # writeback=True so mutations of nested objects persist on sync()/close().
    self.db = shelve.open(self.file_context.file_name, writeback=True)
    if six.PY2:
        self.out_stream = open(self.out_file, 'ab')
    elif six.PY3:
        # newline='' lets the csv writer control line endings itself.
        self.out_stream = open(self.out_file, 'a', newline='')
def test_build_csv_multiple_cols():
    """build_csv should serialize multiple columns using the excel dialect."""
    excel = csv.get_dialect('excel')
    line_end = excel.lineterminator
    sep = excel.delimiter
    data = [
        co.OrderedDict([('col1', 'value'), ('col2', 'another value'),
                        ('col3', 'more')]),
        co.OrderedDict([('col1', 'one value'), ('col2', 'two value'),
                        ('col3', 'three')]),
    ]
    header = sep.join(['col1', 'col2', 'col3'])
    first = sep.join(['value', 'another value', 'more'])
    second = sep.join(['one value', 'two value', 'three'])
    expected = line_end.join([header, first, second]) + line_end
    assert build_csv(data) == expected
def __init__(self, queue, batch_gen_args, ui):
    """Batcher init: keep the work queue and the batch-generator arguments.

    NOTE(review): assumes 'dataset_dialect' was registered earlier in this
    process — csv.get_dialect raises csv.Error otherwise.
    """
    self._ui = ui
    self.queue = queue
    self.batch_gen_args = batch_gen_args
    self.dialect = csv.get_dialect('dataset_dialect')
    # The following should only impact Windows
    self._ui.set_next_UI_name('batcher')
def main():
    """Fix csv files: read infile with a given or sniffed dialect, rewrite
    it through a standard csv writer to outfile."""
    parser = argparse.ArgumentParser(description="Fix csv files")
    parser.add_argument('infile', type=argparse.FileType('rt'))
    parser.add_argument('outfile', type=argparse.FileType('wt'))
    parser.add_argument('extras', nargs='*', help=argparse.SUPPRESS)
    parser.add_argument('--in-delimiter', action='store', nargs='?', type=str)
    parser.add_argument('--in-quote', action='store', default='"', type=str)
    args = parser.parse_args()
    try:
        if len(args.extras) > 0:
            # Was `raise BaseException(...)`: BaseException bypasses normal
            # `except Exception` handlers; SystemExit is the idiomatic way
            # to abort a CLI with a message.
            raise SystemExit('Too many arguments')
        if args.in_delimiter:
            # Decide from the parsed option instead of `len(sys.argv) > 3`,
            # which miscounted when only --in-quote was given and then
            # registered a dialect with delimiter=None.  Also use the public
            # csv constant instead of the private _csv module.
            csv.register_dialect('in',
                                 delimiter=args.in_delimiter,
                                 quotechar=args.in_quote,
                                 quoting=csv.QUOTE_MINIMAL)
            d_in = csv.get_dialect('in')
        else:
            # No delimiter hint: sniff the dialect from the first KB.
            sniffer = csv.Sniffer()
            d_in = sniffer.sniff(args.infile.read(1024))
            args.infile.seek(0)
        csv_in = csv.reader(args.infile, dialect=d_in)
        csv_out = csv.writer(args.outfile)
        for row in csv_in:
            csv_out.writerow(row)
    finally:
        args.infile.close()
        args.outfile.close()
def set_dialect(self):
    """Return the name of a usable csv dialect from settings, or None.

    First checks whether the configured name is already registered with
    the csv module; otherwise tries to register it from the user-supplied
    'dialects' mapping in the settings.
    """
    name = self.settings.get('use_dialect')
    try:
        csv.get_dialect(name)
    except Exception:
        pass
    else:
        return name
    # Not a known dialect — try the user-defined definitions.
    user_dialects = self.settings.get('dialects')
    try:
        csv.register_dialect(name, **user_dialects[name])
        print("DataConverter: Using custom dialect", name)
        return name
    except Exception:
        print("DataConverter: Couldn't register custom dialect named", name)
        return None
def sqlite_to_csv(
    input_filename,
    table_name,
    output_filename,
    dialect=csv.excel,
    batch_size=10000,
    encoding="utf-8",
    callback=None,
    query=None,
):
    """Export a table inside a SQLite database to CSV

    Args:
        input_filename: path to the SQLite database file.
        table_name: table to export (used when *query* is None).
        output_filename: destination file (possibly compressed).
        dialect: csv dialect instance or registered dialect name.
        batch_size: rows written per batch.
        encoding: text encoding of the output file.
        callback: optional fn(written, total_written) progress hook.
        query: custom SELECT overriding the default full-table query.
    """
    # TODO: should be able to specify fields
    if isinstance(dialect, six.text_type):
        dialect = csv.get_dialect(dialect)
    if query is None:
        query = "SELECT * FROM {}".format(table_name)
    connection = sqlite3.Connection(input_filename)
    cursor = connection.cursor()
    result = cursor.execute(query)
    header = [item[0] for item in cursor.description]
    fobj = open_compressed(output_filename, mode="w", encoding=encoding)
    try:
        writer = csv.writer(fobj, dialect=dialect)
        writer.writerow(header)
        total_written = 0
        for batch in rows.plugins.utils.ipartition(result, batch_size):
            writer.writerows(batch)
            written = len(batch)
            total_written += written
            if callback:
                callback(written, total_written)
    finally:
        # Previously the output handle leaked when writing raised.
        fobj.close()
        connection.close()
def __call__(self, value, error_callback, convertor_fmt_str):
    """Parse a delimited string into a list, optionally converting elements.

    Args:
        value: the raw input line to split.
        error_callback: unused here; part of the convertor interface.
        convertor_fmt_str: unused here; part of the convertor interface.

    Raises:
        ConvertorError: with self.value_error_str if any element fails its
            per-element conversion.
    """
    buffer = StringIO(value)
    if self._delimeter is None:
        # No configured delimiter: sniff it from the value itself.
        dialect = csv.Sniffer().sniff(value)
        dialect.skipinitialspace = True
    else:
        csv.register_dialect('my_dialect',
                             delimiter=self._delimeter,
                             quoting=csv.QUOTE_MINIMAL,
                             skipinitialspace=True)
        dialect = csv.get_dialect('my_dialect')
    reader = csv.reader(buffer, dialect)
    lst = next(reader)  # only the first row is used
    try:
        if self._elem_get_input:
            converted_list = []
            for item in lst:
                valid, value = self._elem_get_input.process_value(item)
                if valid is True:
                    converted_list.append(value)
                else:
                    raise ConvertorError
        else:
            converted_list = lst
    except ConvertorError:
        # Re-raise with the convertor's own error message.
        raise ConvertorError(self.value_error_str)
    return converted_list
class Manufacturer(db.Model):
    """The normalized information about a manufacturer.

    Ideally users should use the names from this list when submitting
    devices.
    """
    # Delimiter used for the bundled manufacturers.csv ('excel' → ',').
    CSV_DELIMITER = csv.get_dialect('excel').delimiter
    name = db.Column(CIText(), primary_key=True)
    name.comment = """The normalized name of the manufacturer."""
    url = db.Column(URL(), unique=True)
    url.comment = """An URL to a page describing the manufacturer."""
    logo = db.Column(URL())
    logo.comment = """An URL pointing to the logo of the manufacturer."""
    __table_args__ = (
        # from https://niallburkley.com/blog/index-columns-for-like-in-postgres/
        db.Index('name_index', text('name gin_trgm_ops'),
                 postgresql_using='gin'),
        {
            'schema': 'common'
        })

    @classmethod
    def add_all_to_session(cls, session: db.Session):
        """Adds all manufacturers to session."""
        cursor = session.connection().connection.cursor()
        #: Dialect used to write the CSV
        with pathlib.Path(__file__).parent.joinpath(
                'manufacturers.csv').open() as f:
            # Bulk-load through PostgreSQL COPY for speed.
            cursor.copy_expert(
                'COPY common.manufacturer FROM STDIN (FORMAT csv)', f)
def __init__(self, *args, **kwds):
    """CSV import preview dialog: parameter widgets, preview box, buttons.

    Expects the data to preview in kwds['data']; it is popped before the
    wx.Dialog base constructor sees the keyword arguments.
    """
    self.data = kwds.pop('data')
    kwds["style"] = \
        wx.DEFAULT_DIALOG_STYLE | wx.RESIZE_BORDER | wx.THICK_FRAME
    wx.Dialog.__init__(self, *args, **kwds)
    self.csvwidgets = CsvParameterWidgets(self, None)
    # Seed the preview with the first registered csv dialect.
    dialect = csv.get_dialect(csv.list_dialects()[0])
    self.has_header = False
    style = wx.TE_MULTILINE | wx.TE_READONLY | wx.HSCROLL
    self.preview_textctrl = CSVPreviewTextCtrl(self, -1, style=style)
    self.button_cancel = wx.Button(self, wx.ID_CANCEL, "")
    self.button_apply = wx.Button(self, wx.ID_APPLY, "")
    self.button_ok = wx.Button(self, wx.ID_OK, "")
    self._set_properties()
    self._do_layout()
    self.preview_textctrl.fill(data=self.data, dialect=dialect)
    self.Bind(wx.EVT_BUTTON, self.OnButtonApply, self.button_apply)
def infer(location, default=None):
    """
    Return a format to use for the filename provided.

    Walks the path's extensions right-to-left, matching JSON variants and
    registered CSV dialect names; returns *default* when nothing matches.
    """
    url = urllib.parse.urlparse(location)
    path = url.path
    # try and infer using an extension
    while path and '.' in path:
        path, ext = os.path.splitext(path)
        ext = ext[1:].lower()
        # is it JSON?
        if ext.startswith('json'):
            return JSONLines() if 'l' in ext else JSON()
        # is it a registered CSV dialect?  csv.get_dialect raises csv.Error
        # for unknown names — it never returns None, so the previous
        # `is not None` check could never skip to the next extension.
        try:
            csv_dialect = csv.get_dialect(ext)
        except csv.Error:
            continue
        return CSV(
            sep=csv_dialect.delimiter,
            linesep=csv_dialect.lineterminator,
            quotechar=csv_dialect.quotechar,
            escapechar=csv_dialect.escapechar,
        )
    # unable to infer
    return default
def process_recording(recording, csv_out, out_directory, overwrite=False):
    """Process a single recording

    Extracts rows from one recording folder and writes them to a CSV file.

    recording: recording folder to process
    csv_out: CSV file name under which the result will be saved
    out_directory: target directory; when empty, write inside the recording
    overwrite: Boolean indicating if an existing csv file should be
        overwritten
    """
    target_dir = recording if len(out_directory) == 0 else out_directory
    csv_out_path = os.path.join(target_dir, csv_out)
    if os.path.exists(csv_out_path):
        if not overwrite:
            logger.warning(
                "{} exists already! Not overwriting.".format(csv_out_path))
            return
        logger.warning(
            "{} exists already! Overwriting.".format(csv_out_path))
    with open(csv_out_path, "w", newline='') as csv_file:
        writer = csv.writer(csv_file, dialect=csv.get_dialect('excel'))
        writer.writerow(csv_header())
        writer.writerows(load_and_yield_data(recording))
    return
def get_dialect(dialect, **fmtparams):
    """Build this module's Dialect from a csv dialect (name or object).

    Keyword format parameters override the base dialect's attributes.
    """
    if isinstance(dialect, basestring):
        dialect = csv.get_dialect(dialect)
    # Unlike the standard csv module, this module does not have its own
    # universal newline handling, but instead expects the provided file objects
    # to be opened in universal newline mode. We therefore convert all newline
    # line terminators to '\n'.
    lineterminator = fmtparams.get('lineterminator', dialect.lineterminator)
    if lineterminator in {'\r', '\r\n'}:
        lineterminator = '\n'
    #XXX csv.Dialect does for some reason not expose strict. We set strict=False
    # by default, but this will of course be wrong if the original dialect had
    # strict=True.
    strict = False
    if hasattr(dialect, 'strict'):
        strict = dialect.strict
    return Dialect(
        delimiter=fmtparams.get('delimiter', dialect.delimiter),
        quotechar=fmtparams.get('quotechar', dialect.quotechar),
        escapechar=fmtparams.get('escapechar', dialect.escapechar),
        doublequote=fmtparams.get('doublequote', dialect.doublequote),
        skipinitialspace=fmtparams.get('skipinitialspace',
                                       dialect.skipinitialspace),
        lineterminator=lineterminator,
        quoting=fmtparams.get('quoting', dialect.quoting),
        strict=fmtparams.get('strict', strict)
    )
def csv(input, dialect=None, header=True, key=None, sort=False):
    """Parse a csv stream into a list (or keyed OrderedDict) of rows.

    Args:
        input: seekable file-like object holding csv text.
        dialect: optional dialect name; sniffed from the first line if None.
        header: when True, the first row supplies field names and each data
            row becomes an OrderedDict.
        key: 1-based column index; rows are then keyed on that column's
            value (implies header=True).
        sort: when True, sort the keyed result.

    Returns:
        list of rows, or an OrderedDict when *key* is in effect.
    """
    obj = []
    fields = None
    if dialect:
        dialect = ocsv.get_dialect(dialect)
    else:
        sniffer = ocsv.Sniffer()
        # sniff() returns a Dialect subclass; the trailing () instantiates it.
        dialect = sniffer.sniff(input.readline())()
        input.seek(0)
    reader = ocsv.reader(input, dialect=dialect)
    if key:
        header = True
    if header:
        fields = next(reader)
        if key and key <= len(fields):
            obj = OrderedDict()
        else:
            # Out-of-range key: fall back to a plain list of dict rows.
            key = None
    for row in reader:
        if header:
            row = OrderedDict(zip(fields, row))
            if key:
                # The key column's value becomes the mapping key.
                rowkey = row.pop(fields[key - 1])
                obj[rowkey] = row
            else:
                obj.append(row)
        else:
            obj.append(row)
    if sort:
        obj = sort_ordereddict(obj)
    return obj
def sniff(self, sample):
    """Guess a csv dialect for *sample* (Python 2 code).

    Fallback order: sniff the sample; on failure register a 'barebones'
    dialect from the configured delimiter; if even that fails, use the
    stock excel dialect.
    """
    try:
        dialect = csv.Sniffer().sniff(sample)
        print dialect.delimiter
    except Exception as e:
        # NOTE(review): .pop() suggests the 'delimiter' setting is list-like
        # — confirm against the settings schema.
        delimiter = self.settings.get('delimiter', ',').pop()
        delimiter = bytes(delimiter)  # dialect definition takes a 1-char bytestring
        print "DataConverter had trouble sniffing:", e
        try:
            csv.register_dialect('barebones', delimiter=delimiter)
            dialect = csv.get_dialect('barebones')
        except Exception as e:
            dialect = csv.get_dialect('excel')
    return dialect
def __init__(self, filename, renmwo=True):
    """Open a tab-separated csv writer on a temporarily-renamed file.

    When *renmwo* is set, the writer targets '<filename>.~renmwo<pid>~'
    so the real file can be swapped in later; otherwise it writes to
    *filename* directly.
    """
    self.filename = filename
    if renmwo:
        self.renmwo = "%s.~renmwo%d~" % (filename, os.getpid())
    else:
        self.renmwo = filename
    self.file = open(self.renmwo, "w")
    self.writer = csv.writer(self.file, dialect=csv.get_dialect("excel-tab"))
def reportcrossval(args, *cvresults):
    """Report per-parameter cross-validation regressions (Python 2 code).

    Writes one CSV row (slope, intercept, R^2, p-value, error) per
    parameter to stdout and draws an observed-vs-estimated subplot for
    each parameter.
    """
    # Arrange subplots in an (h x w) grid sized for the parameter count.
    h, w = rect(args.parameters)
    fw, fh = pp.rcParams['figure.figsize']
    figsize = h * fh, w * fw
    fig = pp.figure(figsize=figsize)
    fields = ['parameter', 'slope', 'intercept', 'R^2', 'P-value', 'error']
    writer = csv.DictWriter(sys.stdout, fields,
                            dialect=csv.get_dialect('excel'))
    # Emit the header row manually.
    writer.writerow(dict(zip(writer.fieldnames, writer.fieldnames)))
    for i in xrange(args.parameters):
        name = args.paramnames[i]
        x, y = cvresults[i]
        slope, intercept, r, pvalue, err = linregress(x, y)
        row = {'parameter': name}
        row.update(zip(fields[1:], (slope, intercept, r ** 2, pvalue, err)))
        writer.writerow(row)
        ax = pp.subplot(h, w, i)
        ax.plot(x, y, ' o', c='white', figure=fig, axes=ax)
        # Identity line marks perfect estimation.
        xlim = x.min(), x.max()
        ax.plot(xlim, xlim, 'r-', alpha=.75)
        pp.xlim(*xlim)
        pp.ylim(*xlim)
        pp.xlabel(r'observed', fontsize='small')
        pp.ylabel(r'estimated', fontsize='small')
        pp.title(sanetext(name), fontsize='small')
        pp.draw()
    fig.subplots_adjust(hspace=.5, wspace=.3)
    pp.draw()
def handle(self, *args, **options):
    """Django command entry: import GOVI stops from a ';'-separated file.

    args[0] is the input file path.  Each row creates/updates a UserStop
    plus SourceAttribute records, all inside one reversion revision.
    (Python 2 code.)
    """
    if (len(args) < 1):
        return
    # GOVI files are semicolon separated and effectively unquoted.
    csv.register_dialect('quotescolon', quotechar='"', delimiter=';',
                         doublequote=False, lineterminator='\n',
                         quoting=csv.QUOTE_NONE)
    f = codecs.open(args[0], mode='rU')
    stops = file.UnicodeDictReader(f, 'utf-8',
                                   dialect=csv.get_dialect('quotescolon'))
    with reversion.create_revision():
        source, created = Source.objects.get_or_create(
            source_id=u'govi', defaults={u'name': "GOVI"})
        for stop in stops:
            # "Town, Stop name" style TimingPointName takes precedence;
            # otherwise fall back to the separate town column.
            split = unicode(stop['TimingPointName']).split(',')
            if len(split) > 1:
                city = split[0]
                name = split[1].lstrip()
            else:
                city = stop['TimingPointTown'].capitalize()
                name = stop['TimingPointName']
            # Coordinates arrive in Rijksdriehoek (EPSG:28992) — transform.
            point = geo.transform_rd(Point(x=int(stop['LocationX_EW']),
                                           y=int(stop['LocationY_NS']),
                                           srid=28992))
            s, created = UserStop.objects.get_or_create(
                tpc=stop[u"TimingPointCode"],
                defaults={u'common_name': name,
                          u'common_city': city,
                          'point': point.wkt})
            # Get or create our source
            for attr in stop.keys():
                self.get_create_update(SourceAttribute,
                                       {'stop': s,
                                        'source': source,
                                        'key': attr.capitalize()},
                                       {'value': stop[attr]})
        reversion.set_comment(u"GOVI Import")
    f.close()
def print_info(args, cv=False):
    """Print a CSV summary of the run configuration to stdout (Python 2).

    Args:
        args: parsed command-line namespace.
        cv: True when reporting a cross-validation run (skips the
            bootstrap-specific rows).
    """
    writer = csv.writer(sys.stdout, dialect=csv.get_dialect('excel'))
    rows = []
    rows.append(('Command', 'cross-validation' if cv else 'fit'))
    rows.append(('Data', args.datasetname))
    rows.append(('Date', datetime.now()))
    rows.append(('Mixture Truncation', ('yes' if args.truncated else 'no')))
    rows.append(('Mixture components', args.components))
    r = ['GP params']
    # NOTE(review): py2 idiom — map() used purely for its side effect.
    map(r.extend, args.gpparams.items())
    rows.append(r)
    rows.append(('Optimization method',
                 'fmin_l_bfgs_b' if args.bounds else 'fmin'))
    if cv is False:
        rows.append(('Bootstrap', ('yes' if args.bootstrap else 'no')))
        rows.append(('Bootstrap repetitions', args.bootstrap_reps))
        rows.append(('Bootstrap sample',
                     (args.bootstrap_size if args.bootstrap_size
                      else 'same as dataset')))
    if args.weights is not None:
        r = ['Weights']
        map(r.extend, zip(args.auxiliary, args.weights))
        rows.append(r)
    else:
        rows.append(('Weights', 'no'))
    if args.bounds is not None:
        r = ['Bounds']
        map(r.extend, map(lambda k, b: (k,) + b, args.paramnames, args.bounds))
        rows.append(r)
    else:
        rows.append(('Bounds', 'no'))
    writer.writerows(rows)
    # Blank separator line, then flush so output appears immediately.
    print
    sys.stdout.flush()
def makeHVsLon(outFilename, dataFile):
    """Plot values vs longitude through GMT's psxy/pstext (Python 2 code).

    Reads rows from *dataFile* (excel csv), keeps rows whose column 12
    exceeds 5, and appends the GMT plotting output to *outFilename*.
    """
    out = ' >> ' + outFilename
    dialect = csv.get_dialect('excel')
    results = csv.reader(open(dataFile, 'r'), dialect=dialect)
    # Skip the two header rows.
    row = results.next()
    row = results.next()
    rows = []
    for row in results:
        # NOTE(review): column 12 looks like a quality/count filter — confirm.
        if (int(row[12]) > 5):
            rows.append(row)
    # First pass: blue points from columns 4 and 5.
    gmt = os.popen(
        'psxy -B2WESn -P -JX6i/-6i -R-111.0/-102.7/24/61 -Sp.08i -G0/0/255 -K '
        + out, 'w')
    for row in rows:
        gmt.write("%s %s\n" % (row[4], row[5]))
    gmt.close()
    # Second pass: red triangles from columns 4 and 14.
    gmt = os.popen('psxy -P -JX -R -St.1i -G255/0/0 -O -K ' + out, 'w')
    for row in rows:
        gmt.write("%s %s\n" % (row[4], row[14]))
    gmt.close()
    gmt = os.popen('pstext -P -JX -R -O -W' + out, 'w')
    for row in rows:
        thick = row[5].replace(' km', '')
        # NOTE(review): thickness is immediately overridden with '30' —
        # looks like leftover debugging; confirm intent.
        thick = '30'
        gmt.write(row[4] + " " + thick + " 12 90 4 BL " + row[0] + '.' +
                  row[1] + "\n")
    gmt.close()
def makeMagVsDist(outFilename, dataFile):
    """Plot a 3-D histogram of event counts by distance/magnitude (py2).

    Reads '|'-delimited rows from *dataFile*, bins epicentral distance in
    10-degree buckets and magnitude in 0.1 steps, then renders the counts
    with GMT's psxyz, appending to *outFilename*.
    """
    out = ' >> ' + outFilename
    #gmt= os.popen('psxy -B2WESn -P -JX6i/-6i -R20/100/5/10 -Sp.05i -G0 -K '+out, 'w')
    # BUG FIX: dialects returned by csv.get_dialect() are immutable, so the
    # old `dialect.delimiter = '|'` raised at runtime.  Pass the delimiter
    # override directly to the reader instead.
    distMagBin = {}
    results = csv.reader(open(dataFile, 'r'),
                         dialect=csv.get_dialect('excel'), delimiter='|')
    # Skip the two header rows.
    row = results.next()
    row = results.next()
    for row in results:
        distaz = DistAz(float(row[4]), float(row[5]), float(row[2]),
                        float(row[3]))
        distBin = 5 + math.ceil(distaz.getDelta() / 10) * 10
        magBin = round(float(row[7]) * 10) / 10.0
        distMagBin["%i %f" % (distBin, magBin)] = distMagBin.setdefault(
            "%i %f" % (distBin, magBin), 0) + 1
    # gmt.write("%f %s\n" % (distaz.getDelta(), row[7]))
    # gmt.close()
    gmt = os.popen(
        'psxyz -R30/100/5/9/1/1000 -P -JX6.5 -JZ2.5i -So0.3ib1 -Ggray -W0.5p -E150/50 -B10/1/20:"Num Eq for Dist, Mag":WSneZ'
        + out, 'w')
    for key in distMagBin:
        gmt.write("%s %i\n" % (key, distMagBin[key]))
    gmt.close()
def sniff_dialect(sample, encoding, sep, skip_dialect, ui):
    """Detect the csv dialect of *sample* and register it as 'dataset_dialect'.

    Args:
        sample: raw bytes sampled from the dataset.
        encoding: codec used to decode the sample.
        sep: optional delimiter hint from the user (--delimiter).
        skip_dialect: when True, skip sniffing and use excel (+ sep).
        ui: logger-like object with debug()/info().

    Returns:
        The detected csv dialect.

    Raises:
        csv.Error: when neither the sniffer nor the Detector fallback can
            determine the dialect.
    """
    t1 = time()
    try:
        if skip_dialect:
            ui.debug('investigate_encoding_and_dialect - skip dialect detect')
            if sep:
                csv.register_dialect('dataset_dialect', csv.excel,
                                     delimiter=sep)
            else:
                csv.register_dialect('dataset_dialect', csv.excel)
            dialect = csv.get_dialect('dataset_dialect')
        else:
            sniffer = csv.Sniffer()
            dialect = sniffer.sniff(sample.decode(encoding), delimiters=sep)
        ui.debug('investigate_encoding_and_dialect - seconds to detect '
                 'csv dialect: {}'.format(time() - t1))
    except csv.Error:
        # The stdlib sniffer gave up; fall back to the slower Detector.
        decoded_one = sample.decode(encoding)
        t2 = time()
        detector = Detector()
        delimiter, resampled = detector.detect(decoded_one)
        if len(delimiter) == 1:
            delimiter = delimiter[0]
            ui.info("Detected delimiter as %s" % delimiter)
            # A user-supplied separator overrides the detected one.
            if sep is not None and sep != delimiter:
                delimiter = sep
        else:
            raise csv.Error(
                "The csv module failed to detect the CSV dialect. "
                "Try giving hints with the --delimiter argument, "
                "E.g --delimiter=','"
            )
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(resampled, delimiters=delimiter)
        ui.debug('investigate_encoding_and_dialect v2 - seconds to detect '
                 'csv dialect: {}'.format(time() - t2))
    if dialect.escapechar is None:
        # Re-register without escape-char quirks; force doublequote mode.
        csv.register_dialect('dataset_dialect', dialect,
                             delimiter=str(dialect.delimiter),
                             quotechar=str(dialect.quotechar),
                             doublequote=True)
        dialect = csv.get_dialect('dataset_dialect')
    return dialect
def sniff_dialect(sample, sep, skip_dialect, ui):
    """Detect the csv dialect of *sample* (already-decoded text) and
    register it as 'dataset_dialect'.

    Args:
        sample: text sampled from the dataset.
        sep: optional delimiter hint from the user (--delimiter).
        skip_dialect: when True, skip sniffing and use excel (+ sep).
        ui: logger-like object with debug()/info().

    Returns:
        The detected csv dialect.

    Raises:
        csv.Error: when neither the sniffer nor the Detector fallback can
            determine the dialect.
    """
    t1 = time()
    try:
        if skip_dialect:
            ui.debug('investigate_encoding_and_dialect - skip dialect detect')
            if sep:
                csv.register_dialect('dataset_dialect', csv.excel,
                                     delimiter=sep)
            else:
                csv.register_dialect('dataset_dialect', csv.excel)
            dialect = csv.get_dialect('dataset_dialect')
        else:
            sniffer = csv.Sniffer()
            dialect = sniffer.sniff(sample, delimiters=sep)
        ui.debug('investigate_encoding_and_dialect - seconds to detect '
                 'csv dialect: {}'.format(time() - t1))
    except csv.Error:
        # The stdlib sniffer gave up; fall back to the slower Detector.
        decoded_one = sample
        t2 = time()
        detector = Detector()
        delimiter, resampled = detector.detect(decoded_one)
        if len(delimiter) == 1:
            delimiter = delimiter[0]
            ui.info("Detected delimiter as %s" % delimiter)
            # A user-supplied separator overrides the detected one.
            if sep is not None and sep != delimiter:
                delimiter = sep
        else:
            raise csv.Error(
                "The csv module failed to detect the CSV dialect. "
                "Try giving hints with the --delimiter argument, "
                "E.g --delimiter=','"
            )
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(resampled, delimiters=delimiter)
        ui.debug('investigate_encoding_and_dialect v2 - seconds to detect '
                 'csv dialect: {}'.format(time() - t2))
    if dialect.escapechar is None:
        # Re-register without escape-char quirks; force doublequote mode.
        csv.register_dialect('dataset_dialect', dialect,
                             delimiter=str(dialect.delimiter),
                             quotechar=str(dialect.quotechar),
                             doublequote=True)
        dialect = csv.get_dialect('dataset_dialect')
    return dialect
def get_all_tweets(screen_name):
    """Download a user's timeline with tweepy and dump it to CSV.

    Writes '<screen_name>_tweets.csv' (unix csv dialect) with columns
    id, created_at, text, media_url.

    NOTE(review): relies on module-level credentials (consumer_key, etc.)
    and on first_tweet_id being defined at module scope — confirm.
    """
    #Twitter only allows access to a users most recent 3240 tweets with this method
    #authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)
    #initialize a list to hold all the tweepy Tweets
    alltweets = []
    #make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=1)
    #save most recent tweets
    alltweets.extend(new_tweets)
    #save the id of the oldest tweet less one
    oldest = alltweets[-1].id - 1
    #keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        print("getting tweets before %s" % (oldest))
        #all subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name, count=200,
                                       max_id=oldest,
                                       since_id=first_tweet_id)
        #save most recent tweets
        alltweets.extend(new_tweets)
        #update the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1
        print("...%s tweets downloaded so far" % (len(alltweets)))
    #go through all found tweets and remove the ones with no images
    outtweets = []  #initialize master list to hold our ready tweets
    for tweet in alltweets:
        #not all tweets will have media url, so lets skip them
        try:
            print(tweet.entities['media'][0]['media_url'])
        except (NameError, KeyError):
            media_url = ''
        else:
            #got media_url - means add it to the output
            media_url = tweet.entities['media'][0]['media_url']
            outtweets.append([
                tweet.id_str, tweet.created_at,
                tweet.text.replace("\n", '\\n'), media_url
            ])
    #write the csv
    with open('%s_tweets.csv' % screen_name, 'w') as f:
        writer = csv.writer(f, dialect=csv.get_dialect("unix"))
        writer.writerow(["id", "created_at", "text", "media_url"])
        writer.writerows(outtweets)
def readCsv(pathToCsv, ui):
    """ Reads csv file to a dict containing lists, each representing a column.

    The keys of the dictionary represent the column names, and the value
    contains the corresponding list of the column.

    Args:
        pathToCsv (string): a path to the csv file to be parsed
        ui: optional UI object used to surface error messages; may be None.

    Returns:
        a dictionary with for every key the corresponding column list of
        data, or None when the file cannot be parsed.
    """
    encoding = 'utf-8'
    # delimiter = ','
    # quotechar = '"'
    with open(pathToCsv, 'rt', newline='', encoding=encoding) as fp:
        # Guess the dialect from the header line; default to excel on failure.
        try:
            dialect = csv.Sniffer().sniff(fp.readline())
        except Exception:
            dialect = csv.get_dialect('excel')
        fp.seek(0)
        try:
            data = csv.reader(fp, dialect=dialect)
        except Exception as e:
            errorMessage = ("Cannot process csv file, unknown format, see the log file for more information")
            if ui is not None:
                logging.exception("Cannot process csv file: %s", e)
                ui.showErrorMessage(errorMessage)
            return None
        allRows = list(data)
    headerList = allRows[0]
    # Transpose the data rows into per-column tuples.
    columns = list(zip(*allRows[1:]))
    dataDict = {}
    try:
        for index, headerString in enumerate(headerList):
            dataDict[headerString] = list(columns[index])
    except Exception as e:
        errorMessage = ("Cannot process csv file, unknown format")
        if ui is not None:
            logging.exception("Cannot process csv file: %s", e)
            ui.showErrorMessage(errorMessage)
        return None
    return dataDict
def determine_dialect(file):
    """Sniff the csv dialect from the first KB of *file*.

    The stream position is rewound afterwards in both paths.  Falls back
    to the standard 'excel' dialect when sniffing fails.
    """
    try:
        result = csv.Sniffer().sniff(file.read(1024))
        file.seek(0)
        return result
    except csv.Error:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # genuine I/O errors.  Sniffer.sniff raises csv.Error on failure.
        file.seek(0)
        return csv.get_dialect('excel')
def sniff(self, sample):
    """Guess a csv dialect for *sample* (Python 2 code).

    Fallback order: sniff the sample; on failure register a 'barebones'
    dialect from the configured delimiter; if even that fails, use the
    stock excel dialect.
    """
    try:
        dialect = csv.Sniffer().sniff(sample)
        print dialect.delimiter
    except Exception as e:
        # NOTE(review): .pop() suggests the 'delimiter' setting is list-like
        # — confirm against the settings schema.
        delimiter = self.settings.get('delimiter', ',').pop()
        delimiter = bytes(
            delimiter)  # dialect definition takes a 1-char bytestring
        print "DataConverter had trouble sniffing:", e
        try:
            csv.register_dialect('barebones', delimiter=delimiter)
            dialect = csv.get_dialect('barebones')
        except Exception as e:
            dialect = csv.get_dialect('excel')
    return dialect
def sniff_dialect(self):
    """Sniff the csv dialect from a fresh sample; fall back to excel.

    Side effects: caches the sample on self.sample and the resulting
    dialect data on self.dialect.
    """
    self.sample = self.read_sample()
    try:
        self.dialect = get_dialect_data(self.csv_sniffer.sniff(
            self.sample))
    except csv.Error:
        # Sniffing failed — assume the standard excel dialect.
        self.dialect = get_dialect_data(csv.get_dialect("excel"))
    return self.dialect
def detect_csv_dialect(self):
    """Sniff the dialect of self.path, register it as 'auto', and print it.

    Exits the process when the file cannot be opened.  (Python 2 code.)
    """
    try:
        f = open(self.path, "rb")
    except IOError:
        sys.exit('No such file or directory:' + self.path)
    d = csv.Sniffer().sniff(f.read(1024))
    # Normalize the line terminator; the sniffer cannot detect it anyway.
    d.lineterminator = "\n"
    csv.register_dialect('auto', d)
    self.dialect = 'auto'
    print "Detected dialect:"
    print "delimiter=", csv.get_dialect(self.dialect).delimiter
    print "quote char=", csv.get_dialect(self.dialect).quotechar
    print "quoting=", csv.get_dialect(self.dialect).quoting
    print "line terminator=", csv.get_dialect(self.dialect)\
        .lineterminator.replace('\n', '\\n').replace('\r', "\\r")
    print "escape char=", csv.get_dialect(self.dialect).escapechar
    print "----------------------------"
def test_register_kwargs(self):
    """Registering a dialect by keyword args makes it retrievable and usable."""
    name = 'fedcba'
    csv.register_dialect(name, delimiter=';')
    try:
        # BUG FIX: failUnless(x, y) treated y as a failure *message* and only
        # asserted truthiness, so neither check verified anything.  Compare
        # the registered delimiter and the parsed row explicitly.  The reader
        # also needs an iterable of lines (not a bare string, which iterates
        # per character) to see one three-field row.
        self.assertEqual(csv.get_dialect(name).delimiter, ';')
        self.assertEqual(list(csv.reader(['X;Y;Z'], name)), [['X', 'Y', 'Z']])
    finally:
        csv.unregister_dialect(name)
def test_register_kwargs(self):
    """Registering a dialect by keyword args makes it retrievable and usable."""
    name = "fedcba"
    csv.register_dialect(name, delimiter=";")
    try:
        # BUG FIX: failUnless(x, y) treated y as a failure *message* and only
        # asserted truthiness, so neither check verified anything.  Compare
        # the registered delimiter and the parsed row explicitly.  The reader
        # also needs an iterable of lines (not a bare string, which iterates
        # per character) to see one three-field row.
        self.assertEqual(csv.get_dialect(name).delimiter, ";")
        self.assertEqual(list(csv.reader(["X;Y;Z"], name)), [["X", "Y", "Z"]])
    finally:
        csv.unregister_dialect(name)
def send_sms(to, msg, mask="ICICIPRU"):
    """Send SMS through the SMSGupShup gateway (Python 2 code).

    Bulk path (list of more than 10 numbers): writes (phone, message)
    pairs to a temp CSV and uploads it via multipart 'xlsUpload', in
    buckets.  Otherwise sends a single 'sendMessage' request.

    Returns the gateway's raw response body (for the bulk path, the
    response of the last bucket).

    NOTE(review): credentials are hard-coded in this function — confirm
    whether they should come from configuration.
    """
    p = "http://enterprise.smsgupshup.com/GatewayAPI/rest"
    if len(to) > 10 and type(to) == type([]):
        register_openers()
        at_one_time = 100000
        # Broadcast a single message to every recipient.
        if type(msg) != type([]):
            msg = [msg for i in range(len(to))]
        zipped = zip(to, msg)
        for bucket in [zipped[i:i+at_one_time]
                       for i in range(0, len(zipped))
                       if i%at_one_time == 0]:
            csv.register_dialect('gupshup', delimiter=',',
                                 quoting=csv.QUOTE_ALL)
            filename = os.path.join("/tmp/", "%s.csv" %
                                    hashlib.md5(str(time.time())).hexdigest())
            file_stream = open(filename,'wb')
            writer = UnicodeWriter(file_stream,
                                   dialect=csv.get_dialect('gupshup'))
            writer.writerow(["PHONE","MESSAGE"])
            if type(msg) == type([]):
                for i_to, i_msg in bucket:
                    writer.writerow([i_to, "%s" % i_msg])
            file_stream.close()
            wfile_stream = open(filename,'rb')
            datagen, headers = multipart_encode({
                "file": wfile_stream,
                'method' : 'xlsUpload',
                'filetype' : 'csv',
                'msg_type' : 'text',
                'mask' : mask,
                'v' : '1.1',
                'userid' : '2000058874',
                'password' : 'glitterfuck',
            })
            request = urllib2.Request(url=p, data=datagen, headers=headers)
            res = urllib2.urlopen(request).read()
            response_logger.info("Response %s" % (res))
    else:
        # Single request: comma-join list recipients into one field.
        if type(to) == type([]):
            to = ",".join(to)
        data = {
            'msg' : msg,
            'send_to' : to,
            'v' : '1.1',
            'userid' : '2000058874',
            'password' : 'glitterfuck',
            'msg_type' : 'text',
            'method' : 'sendMessage',
            'mask' : mask,
        }
        querystring = urllib.urlencode(data)
        request = urllib2.Request(url=p, data=querystring)
        res = urllib2.urlopen(request).read()
        response_logger.info("Response %s" % (res))
    return res
def test_init2(self): dsv1_fh = DSV.getHandle(self.num_dsv_path) # predefined delimiter, resolved successfully # NOTE: class does not check if delimiter is valid at this point dsv1 = DSV(self.dbm, self.testdb, dsv1_fh, dtname=self.test_dtname, delimiter='\t') self.assertFalse(dsv1.isCreated()) self.assertEqual(csv.get_dialect('excel-tab'), dsv1.dialect) self.assertEqual('\t', dsv1.dialect.delimiter) dsv1.close()
def sniff(self, sample):
    # Return a csv dialect for `sample`: the preset one, a sniffed one, or a
    # registered fallback built from the configured delimiter.
    # NOTE(review): Python 2 code (print statements; bytes() is str()).
    if self.dialect:
        # A dialect configured up front wins over sniffing.
        return self.dialect
    try:
        dialect = csv.Sniffer().sniff(sample)
        print 'DataConverter is using this delimiter:', dialect.delimiter
        return dialect
    except Exception as e:
        print "DataConverter had trouble sniffing:", e
        # Fall back to the user-configured delimiter (comma by default).
        delimiter = self.settings.get('delimiter', ',')
        delimiter = bytes(delimiter) # dialect definition takes a 1-char bytestring
        try:
            csv.register_dialect('barebones', delimiter=delimiter)
            return csv.get_dialect('barebones')
        except Exception as e:
            # Registration failed (e.g. multi-char delimiter): plain 'excel'.
            return csv.get_dialect('excel')
def close(self):
    """Tear down the run context: snapshot dialects, flush the shelf, close output.

    Safe to call more than once; subsequent calls are no-ops.
    """
    if not self.is_open:
        self._ui.debug('CLOSE CALLED ON CLOSED RUNCONTEXT')
        return
    self.is_open = False
    self._ui.debug('CLOSE CALLED ON RUNCONTEXT')

    attrs = ['delimiter', 'doublequote', 'escapechar', 'lineterminator',
             'quotechar', 'quoting', 'skipinitialspace', 'strict']

    def snapshot(dialect_name):
        # Freeze the registered dialect into a plain dict of its attributes
        # so it survives after the csv registry entries are gone.
        registered = csv.get_dialect(dialect_name)
        return {a: getattr(registered, a) for a in attrs if hasattr(registered, a)}

    self.dialect = snapshot('dataset_dialect')
    self.writer_dialect = snapshot('writer_dialect')

    self.db.sync()
    self.db.close()
    if self.out_stream is not None:
        self.out_stream.close()
def read_xsv(
    file: IO,
    dialect: str,
    fieldnames: Optional[List[str]] = None,
    first_line_is_column_header: bool = True,
    discard: Optional[int] = None,
    load_at_most: Optional[int] = None,
) -> Iterable[Dict]:
    """Returns an iterable of dict. Must be iterated while file is still open.

    Args:
        file: An open file.
        dialect: As used in built-in module `csv`.
        fieldnames: Explicit column names; only allowed when
            first_line_is_column_header is False.
        first_line_is_column_header: If True, parses first line as column headers.
        discard: Non-negative integer or None. Initial rows of _data_ to discard.
        load_at_most: Non-negative integer or None. Rows of _data_ to load.

    Raises:
        NotImplementedError: if fieldnames is given together with
            first_line_is_column_header=True.

    Notes:
        Use 'excel' dialect for CSV. Use 'excel-tab' for TSV.

    Warnings:
        Must be iterated while file is still open.
    """
    kwargs = {
        'fieldnames': fieldnames,
        'dialect': dialect,
    }
    if not first_line_is_column_header and fieldnames is None:
        # Use 'Column X' as fieldnames like in OpenRefine.
        # Bug fixes vs. the original: readline() instead of readline(1)
        # (which read at most ONE character, not the first line), and
        # seek(0) instead of seek(-1) (negative absolute seek raises
        # ValueError on text files).
        first_line = file.readline()
        file.seek(0)
        delimiter = csv.get_dialect(dialect).delimiter
        num_cols = len(first_line.split(delimiter))
        kwargs['fieldnames'] = [f'Column {i + 1}' for i in range(num_cols)]
    if first_line_is_column_header and fieldnames is not None:
        raise NotImplementedError(
            "Changing column names isn't supported for simplicity")
    # Drop None-valued entries so DictReader falls back to its own defaults
    # (inlined equivalent of the project's select_not_null helper).
    reader = csv.DictReader(file,
                            **{k: v for k, v in kwargs.items() if v is not None})
    stop = None
    if load_at_most is not None:
        stop = load_at_most
        if discard is not None:
            stop += discard
    return islice(reader, discard, stop)
def test_investigate_encoding_and_dialect():
    """Windows-encoded fixture is detected as iso-8859-2 with a CRLF/comma dialect."""
    with UI(None, logging.DEBUG, stdout=False) as ui:
        fixture = 'tests/fixtures/windows_encoded.csv'
        detected = investigate_encoding_and_dialect(fixture, None, ui)
        # investigate_encoding_and_dialect registers 'dataset_dialect' as a
        # side effect; inspect the registered dialect directly.
        registered = csv.get_dialect('dataset_dialect')
        assert detected == 'iso-8859-2'
        assert registered.lineterminator == '\r\n'
        assert registered.quotechar == '"'
        assert registered.delimiter == ','
def get_reader(self, reader_class=namedtuple_csv_reader):
    """Build a reader for the uploaded file, sniffing the dialect if asked.

    xlsx uploads are routed to the dedicated xlsx reader; everything else
    goes through csv with either the selected or an autodetected dialect.
    """
    infile = self.decode_file(self.files['file'])
    if infile.name.endswith(".xlsx"):
        if reader_class != namedtuple_csv_reader:
            raise ValueError("Cannot handle xlsx files with non-default reader, sorry!")
        return namedtuple_xlsx_reader(infile)
    choice = self.cleaned_data['dialect'] or "autodetect"
    if choice == 'autodetect':
        dialect = csv.Sniffer().sniff(infile.readline())
        infile.seek(0)
        # Distrust exotic sniffed delimiters; fall back to plain 'excel'.
        if dialect.delimiter not in "\t,;":
            dialect = csv.get_dialect('excel')
    else:
        dialect = csv.get_dialect(choice)
    return reader_class(infile, dialect=dialect)
def go(self):
    """Start the shovel worker process and return it."""
    source_dialect = csv.get_dialect('dataset_dialect')
    worker_args = [self.batch_gen_args,
                   SerializableDialect.from_dialect(source_dialect),
                   self.queue]
    worker = multiprocessing.Process(target=self._shove,
                                     args=worker_args,
                                     name='Shovel_Proc')
    self.p = worker
    worker.start()
    return worker
def set_dialect(dialectname, user_dialects):
    '''Get a CSV dialect from csv.dialects or register one from passed dict.

    Args:
        dialectname: Name of a built-in/registered dialect, or a key into
            user_dialects describing a custom one.
        user_dialects: Mapping of name -> dict of csv format parameters;
            'quoting' may be given as a csv constant name (e.g. 'QUOTE_ALL').

    Returns:
        The dialect name (usable with csv.reader/csv.writer) on success,
        or None when the name is unknown and cannot be registered.
    '''
    try:
        csv.get_dialect(dialectname)
        return dialectname
    except _csv.Error:
        try:
            # Work on a copy: the original popped 'quoting' out of the
            # caller's dict in place, corrupting it for later calls.
            options = dict(user_dialects[dialectname])
            user_quoting = options.pop('quoting', 'QUOTE_MINIMAL')
            # Resolve the quoting constant by name; unknown names fall back
            # to QUOTE_MINIMAL.
            quoting = getattr(csv, user_quoting, csv.QUOTE_MINIMAL)
            csv.register_dialect(dialectname, quoting=quoting, **options)
            print("DataConverter: Using custom dialect", dialectname)
            return dialectname
        except (KeyError, _csv.Error):
            # KeyError: dialectname missing from user_dialects entirely
            # (previously escaped uncaught instead of returning None).
            print("DataConverter: Couldn't register custom dialect named", dialectname)
            return None
def pgexport(
    database_uri,
    table_name,
    filename,
    encoding="utf-8",
    dialect=csv.excel,
    callback=None,
    timeout=0.1,
    chunk_size=8388608,
):
    """Export data from PostgreSQL into a CSV file using the fastest method

    Required: psql command

    Streams the output of `psql` COPY TO into `filename` in `chunk_size`
    pieces, invoking ``callback(written, total_written)`` after each chunk.

    Returns:
        dict with key "bytes_written".

    Raises:
        RuntimeError: when `psql` is not installed or reports an error.

    Note:
        `timeout` is accepted for interface compatibility but is not used
        by this implementation.
    """
    if isinstance(dialect, six.text_type):
        dialect = csv.get_dialect(dialect)

    # Prepare the `psql` command to be executed to export data
    command = get_psql_copy_command(
        database_uri=database_uri,
        direction="TO",
        encoding=encoding,
        header=None,  # Needed when direction = 'TO'
        table_name=table_name,
        dialect=dialect,
    )
    fobj = open_compressed(filename, mode="wb")
    try:
        process = subprocess.Popen(
            shlex.split(command),
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        total_written = 0
        data = process.stdout.read(chunk_size)
        while data != b"":
            written = fobj.write(data)
            total_written += written
            if callback:
                callback(written, total_written)
            data = process.stdout.read(chunk_size)
        stdout, stderr = process.communicate()
        if stderr != b"":
            raise RuntimeError(stderr.decode("utf-8"))
    except FileNotFoundError:
        raise RuntimeError("Command `psql` not found")
    except BrokenPipeError:
        raise RuntimeError(process.stderr.read().decode("utf-8"))
    finally:
        # Fix: the output file object was never closed in the original, so
        # buffered/compressed data could be lost and the descriptor leaked
        # on every call (including all error paths).
        fobj.close()
    return {"bytes_written": total_written}
def test_registry(self): class myexceltsv(csv.excel): delimiter = "\t" name = "myexceltsv" expected_dialects = csv.list_dialects() + [name] expected_dialects.sort() csv.register_dialect(name, myexceltsv) self.addCleanup(csv.unregister_dialect, name) self.assertEqual(csv.get_dialect(name).delimiter, '\t') got_dialects = sorted(csv.list_dialects()) self.assertEqual(expected_dialects, got_dialects)
def get_reader(self, reader_class=namedtuple_csv_reader):
    """Build a reader for the uploaded file, honoring dialect and encoding.

    xlsx uploads go to the dedicated xlsx reader; otherwise the selected
    (or autodetected) csv dialect is used, and an explicit encoding is
    passed through when the default reader is in use.
    """
    f = self.files['file']
    if f.name.endswith(".xlsx"):
        if reader_class != namedtuple_csv_reader:
            # Consistency fix: the sibling implementation raises ValueError
            # here; bare Exception gave callers nothing specific to catch.
            # (ValueError is a subclass of Exception, so existing handlers
            # still work.)
            raise ValueError("Cannot handle xlsx files with non-default reader, sorry!")
        return namedtuple_xlsx_reader(f)
    d = self.cleaned_data['dialect'] or "autodetect"
    if d == 'autodetect':
        dialect = csv.Sniffer().sniff(f.readline())
        f.seek(0)
        # Distrust exotic sniffed delimiters; fall back to plain 'excel'.
        if dialect.delimiter not in "\t,;":
            dialect = csv.get_dialect('excel')
    else:
        dialect = csv.get_dialect(d)
    enc = self.cleaned_data['encoding']
    # Only the default reader understands the encoding keyword.
    encoding = ({'encoding': ENCODINGS[int(enc)]}
                if enc and reader_class == namedtuple_csv_reader else {})
    return reader_class(f, dialect=dialect, **encoding)
def initialize(self):
    """Initialize CSV source stream:

    #. perform autodetection if required:
        #. detect encoding from a sample data (if requested)
        #. detect whether CSV has headers from a sample data (if requested)
    #. create CSV reader object
    #. read CSV headers if requested and initialize stream fields

    NOTE(review): Python 2 code (`unicode` type, `.next()` iterator calls).
    """
    self.file, self.close_file = base.open_resource(self.resource)

    handle = None  # NOTE(review): unused; superseded by self.file below

    if self._autodetection:
        sample = self.file.read(self.sample_size)

        # Encoding test
        if self.detect_encoding and type(sample) == unicode:
            self.encoding = "utf-8"

        if self.detect_header:
            sample = sample.encode('utf-8')
            sniffer = csv.Sniffer()
            self.read_header = sniffer.has_header(sample)

        # Rewind after consuming the detection sample.
        self.file.seek(0)

    if self.dialect:
        # Accept either a registered dialect name or a Dialect object.
        if type(self.dialect) == str:
            dialect = csv.get_dialect(self.dialect)
        else:
            dialect = self.dialect
        self.reader_args["dialect"] = dialect

    # self.reader = csv.reader(handle, **self.reader_args)
    self.reader = UnicodeReader(self.file, encoding = self.encoding,
                                **self.reader_args)

    # Skip leading non-data rows before any header handling.
    if self.skip_rows:
        for i in range(0, self.skip_rows):
            self.reader.next()

    # Initialize field list
    if self.read_header:
        field_names = self.reader.next()
        # All columns are typed "string" with "default" storage; presumably
        # refined later by the pipeline -- confirm against FieldList usage.
        fields = [ (name, "string", "default") for name in field_names]
        self._fields = brewery.metadata.FieldList(fields)