def load_gtfs(self, gtfs_filename, tables=None, reporter=None, verbose=False):
    """Create and populate the GTFS tables from a feed directory or zip archive.

    gtfs_filename -- path to a directory holding the feed's ``.txt`` files,
                     or to a zip archive containing them.
    tables        -- optional collection of table names to load; any table
                     in ``self.GTFS_DEF`` that is not listed is skipped.
                     ``None`` loads every table.
    reporter      -- accepted for interface compatibility; not used here.
    verbose       -- forwarded to ``load_gtfs_table_to_sqlite``.

    A table whose source file is missing (KeyError from the archive, IOError
    from the directory) is reported and skipped; the load continues.
    Indices are created and the connection committed at the end.
    """
    c = self.conn.cursor()

    # Decide once whether the feed is a directory or an archive.
    is_directory = os.path.isdir(gtfs_filename)
    zf = None if is_directory else ZipFile(gtfs_filename)

    try:
        for tablename, table_def in self.GTFS_DEF:
            # The feed file name is the table name minus its first five characters.
            filename = tablename[5:] + '.txt'

            if tables is not None and tablename not in tables:
                print("skipping table %s - not included in 'tables' list" % tablename)
                continue

            print("creating table %s\n" % tablename)
            create_table(c, tablename, table_def)
            print("loading table %s\n" % tablename)

            source = None
            try:
                if is_directory:
                    source = open(os.path.join(gtfs_filename, filename))
                    lines = source
                else:
                    lines = zf.read(filename).split("\n")
                trips_file = iterdecode(lines, "utf-8")
                load_gtfs_table_to_sqlite(trips_file, tablename, c, table_def, verbose=verbose)
            except (KeyError, IOError):
                print("NOTICE: GTFS feed has no file %s.txt, cannot load\n" % tablename)
            finally:
                # NOTE(review): assumes the loader has consumed the iterator
                # by the time it returns -- confirm against its definition.
                if source is not None:
                    source.close()
    finally:
        if zf is not None:
            zf.close()

    self._create_indices(c)
    self.conn.commit()
    c.close()
def testIncrementalDecoder(self):
    """Check piecewise IDNA decoding through iterdecode and an incremental decoder."""
    # Tests derived from Python standard library test/test_codecs.py
    cases = (
        (u"python.org", b"python.org"),
        (u"python.org.", b"python.org."),
        (u"pyth\xf6n.org", b"xn--pythn-mua.org"),
        (u"pyth\xf6n.org.", b"xn--pythn-mua.org."),
    )
    for expected, raw in cases:
        if sys.version_info[0] == 2:
            pieces = raw
        else:
            # Iterating bytes yields ints on Python 3; rebuild one-byte chunks.
            pieces = (bytes([octet]) for octet in raw)
        self.assertEqual("".join(codecs.iterdecode(pieces, "idna")), expected)

    idna_decoder = codecs.getincrementaldecoder("idna")()

    # First pass: no trailing dot, so the last label is only released by the
    # final (flushing) call.
    first_pass = ((b"xn--xam", u""), (b"ple-9ta.o", u"\xe4xample."), (b"rg", u""))
    for chunk, emitted in first_pass:
        self.assertEqual(idna_decoder.decode(chunk), emitted)
    self.assertEqual(idna_decoder.decode(b"", True), u"org")

    # Second pass after reset(): the trailing dot completes the last label,
    # leaving nothing for the final call.
    idna_decoder.reset()
    second_pass = ((b"xn--xam", u""), (b"ple-9ta.o", u"\xe4xample."), (b"rg.", u"org."))
    for chunk, emitted in second_pass:
        self.assertEqual(idna_decoder.decode(chunk), emitted)
    self.assertEqual(idna_decoder.decode(b"", True), u"")
def convert_column(data, schemae):
    """Convert known types from primitive to rich.

    data    -- iterable of primitive values read for one column.
    schemae -- schema element for the column; ``converted_type`` selects the
               conversion, ``type`` and ``scale`` are consulted for DECIMAL.

    Returns a new list of converted values, or ``data`` unchanged (after an
    info-level log) when the converted type is not handled.

    Two conversions follow the Parquet logical-type definitions:
    DECIMAL is ``unscaled * 10**-scale`` and DATE counts days from
    1970-01-01.
    """
    ctype = schemae.converted_type
    converted = parquet_thrift.ConvertedType

    if ctype == converted.DECIMAL:
        # "10e-N" would be 10 * 10**-N, i.e. one power of ten too large.
        scale_factor = Decimal(10) ** -schemae.scale
        if schemae.type in (parquet_thrift.Type.INT32, parquet_thrift.Type.INT64):
            return [Decimal(unscaled) * scale_factor for unscaled in data]
        # Other physical types go through intbig() first.
        return [Decimal(intbig(unscaled)) * scale_factor for unscaled in data]
    elif ctype == converted.DATE:
        # Days since the Unix epoch, not proleptic Gregorian ordinals.
        epoch = datetime.date(1970, 1, 1)
        return [epoch + datetime.timedelta(days=d) for d in data]
    elif ctype == converted.TIME_MILLIS:
        return [datetime.timedelta(milliseconds=d) for d in data]
    elif ctype == converted.TIMESTAMP_MILLIS:
        return [datetime.datetime.utcfromtimestamp(d / 1000.0) for d in data]
    elif ctype == converted.UTF8:
        return list(codecs.iterdecode(data, "utf-8"))
    elif ctype == converted.UINT_8:
        return _convert_unsigned(data, 'b')
    elif ctype == converted.UINT_16:
        return _convert_unsigned(data, 'h')
    elif ctype == converted.UINT_32:
        return _convert_unsigned(data, 'i')
    elif ctype == converted.UINT_64:
        return _convert_unsigned(data, 'q')
    elif ctype == converted.JSON:
        return [json.loads(s) for s in codecs.iterdecode(data, "utf-8")]
    elif ctype == converted.BSON and bson:
        return [bson.BSON(s).decode() for s in data]
    else:
        # .get() keeps an unknown or missing converted type from turning a
        # log line into a KeyError.
        logger.info("Converted type '%s'' not handled",
                    converted._VALUES_TO_NAMES.get(ctype, ctype))  # pylint:disable=protected-access
    return data
def test_incremental_decode(self):
    """Check piecewise IDNA decoding on both Python 2 and Python 3.

    Inputs are byte strings (identical to the former ``str`` literals on
    Python 2).  On Python 3 they are fed to ``iterdecode`` one byte at a
    time, because iterating ``bytes`` yields ints there.  The un-dotted
    punycode name replaces an assertion that was listed twice.
    """
    import sys

    cases = (
        (b"python.org", u"python.org"),
        (b"python.org.", u"python.org."),
        (b"xn--pythn-mua.org", u"pyth\xf6n.org"),
        (b"xn--pythn-mua.org.", u"pyth\xf6n.org."),
    )
    on_py2 = sys.version_info[0] == 2
    for raw, expected in cases:
        pieces = raw if on_py2 else (bytes([octet]) for octet in raw)
        self.assertEqual("".join(codecs.iterdecode(pieces, "idna")), expected)

    decoder = codecs.getincrementaldecoder("idna")()

    # Without a trailing dot the last label is only emitted on the final call.
    self.assertEqual(decoder.decode(b"xn--xam"), u"")
    self.assertEqual(decoder.decode(b"ple-9ta.o"), u"\xe4xample.")
    self.assertEqual(decoder.decode(b"rg"), u"")
    self.assertEqual(decoder.decode(b"", True), u"org")

    # After reset() the same decoder must start from a clean buffer.
    decoder.reset()
    self.assertEqual(decoder.decode(b"xn--xam"), u"")
    self.assertEqual(decoder.decode(b"ple-9ta.o"), u"\xe4xample.")
    self.assertEqual(decoder.decode(b"rg."), u"org.")
    self.assertEqual(decoder.decode(b"", True), u"")
def upsert_dataset(self, datasetCode):
    """Download the 'esri' CSV, derive its dimensions and store the dataset.

    datasetCode -- must be ``'esri'``; anything else raises ``Exception``.

    The CSV at ``self.url_amount[0]`` is read as latin-1.  Rows 5 and 6
    (zero-based) are treated as a two-level header and flattened into
    ``group_subgroup`` names in row 5; every non-empty flattened name except
    ``'year'`` becomes part of the dimension list.  A ``Dataset`` document is
    built, the series are updated once, and the document is written to the
    database and to the ES database.

    The file is fetched once (the header row and the response headers come
    from the same request) and the series/database update runs once.
    """
    if datasetCode != 'esri':
        raise Exception("The name of dataset was not entered!")

    url = self.url_amount[0]
    with urllib.request.urlopen(url) as response:
        list_csv = list(csv.reader(codecs.iterdecode(response, 'latin-1'), delimiter=','))
        # NOTE(review): takes the value of the *first* response header and
        # parses it as an HTTP date below -- presumably the Date header;
        # confirm the server's header order.
        releaseDates = response.getheaders()[0][1]

    header = list_csv[5]
    sub_header = list_csv[6]
    header[0] = 'year'

    # Flatten the two-level header: a non-empty cell in the upper row names
    # a group that also applies to the following columns left blank.
    # NOTE(review): `keep` is only bound once a non-empty upper cell has
    # been seen; a blank upper cell before that raises NameError, as before.
    for i, sub in enumerate(sub_header):
        if sub != '':
            if header[i] != '':
                keep = header[i]
                header[i] = header[i] + '_' + sub
            else:
                header[i] = keep + '_' + sub

    dimensionList = {
        'content': [name for name in header if name != '' and name != 'year'],
    }

    # Drop the weekday prefix ("Tue, ") before parsing the HTTP date.
    lastUpdate = datetime.datetime.strptime(releaseDates[5:], "%d %b %Y %H:%M:%S GMT")

    document = Dataset(provider='esri',
                       name=list_csv[1][0],
                       datasetCode='esri',
                       lastUpdate=lastUpdate,
                       dimensionList=dimensionList,
                       docHref="http://www.cao.go.jp/index-e.html")

    effective_dimension_list = self.update_series('esri', dimensionList)
    document.update_database()
    document.update_es_database(effective_dimension_list)
def get_rows(self, filename):
    """Return a csv reader over the UTF-8 decoded lines of one feed file.

    When ``self.zf`` is set the file is read from that archive; a member
    that is missing raises ``KeyError`` with a feed-specific message.
    Otherwise the file is opened from the ``self.filename`` directory.
    """
    if not self.zf:
        source_path = os.path.join(self.filename, filename)
        return csv.reader(iterdecode(open(source_path), "utf-8"))

    try:
        raw = self.zf.read(filename)
    except KeyError:
        raise KeyError("%s is not present feed" % filename)

    archive_lines = raw.split("\n")
    return csv.reader(iterdecode(archive_lines, "utf-8"))
def vola_importer(url="https://raw.githubusercontent.com/flyingeek/editolido/gh-pages/ext-sources/vola_legacy_report.txt"):
    """Yield one tuple per named row of the tab-separated VOLA report.

    Each tuple is ``(row[5], geo_normalize(row[9]), geo_normalize(row[8]),
    row[28].split(', '))``.  The first row is discarded as a header and rows
    whose column 5 is empty are skipped.
    """
    # https://oscar.wmo.int/oscar/vola/vola_legacy_report.txt

    def geo_normalize(value):
        """Turn 'deg min sec[NSEW]' into signed decimal degrees."""
        # A trailing N/S/E/W is the hemisphere; without one the value is
        # taken as positive (interpreted as North).
        hemisphere = value[-1]
        if hemisphere in 'SW':
            sign = -1
        else:
            sign = 1
        if hemisphere in 'NEWS':
            coords = value[:-1]
        else:
            coords = value
        # Pad so that missing minutes or seconds read as 0.
        coords += ' 0 0'
        parts = coords.split(' ', 3)[:3]
        degrees, minutes, seconds = map(float, parts)
        return sign * (degrees + (minutes / 60) + (seconds / 3600))

    if PY2:
        delimiter = b'\t'
        data = urlopen(url)
    else:
        import codecs
        delimiter = '\t'
        data = codecs.iterdecode(urlopen(url), 'utf-8')

    reader = csv.reader(data, delimiter=delimiter, quoting=csv.QUOTE_NONE)
    next(reader)  # discard the header row
    for row in reader:
        name = row[5]
        if name:
            yield name, geo_normalize(row[9]), geo_normalize(row[8]), row[28].split(', ')
def __iter__(self):
    """Yield ``self._parse_line(record)`` for every record of the response.

    Zipped responses are unpacked in memory: members whose name ends in
    'csv' go through ``csv.reader``, other members are iterated line by
    line.  Plain responses are decoded line by line (response encoding, or
    the apparent encoding as fallback) and read with ``csv.reader``.  When
    ``self.is_header_present`` is set, the first record of each source is
    dropped.
    """
    def without_header(records):
        # Plain function (not a generator) so that an empty source behaves
        # exactly as an inline next() call would.
        cursor = iter(records)
        if self.is_header_present:
            next(cursor)
        return cursor

    if not self.is_zipped:
        stream = codecs.iterdecode(self.response.iter_lines(),
                                   self.response.encoding or self.response.apparent_encoding)
        for record in without_header(csv.reader(stream, delimiter=self.delimiter)):
            yield self._parse_line(record)
        stream.close()
        return

    byte_stream = BytesIO(self.response.content)
    with zipfile.ZipFile(byte_stream) as self.zipfile:
        for name in self.zipfile.namelist():
            with self.zipfile.open(name) as single_file:
                if name[-3:] == 'csv':
                    records = csv.reader(single_file, delimiter=self.delimiter)
                else:
                    records = single_file
                for record in without_header(records):
                    yield self._parse_line(record)
    byte_stream.close()
def unicode_csv_reader(self, file_handle, encoding='utf-8'):
    """Yield CSV rows from a byte stream as lists of text cells.

    file_handle -- iterable of encoded (byte) lines.
    encoding    -- encoding of the input.  For 'utf-8' the lines are
                   decoded with 'utf-8-sig' so a leading byte-order mark is
                   dropped; any other encoding is used as given (it
                   previously left the decoder name unbound).

    On Python 2 the csv module works on byte strings, so lines are
    re-encoded for parsing and each cell decoded afterwards.  On Python 3
    the decoded text is parsed directly.
    """
    import sys

    encoding_sig = 'utf-8-sig' if encoding == 'utf-8' else encoding
    decoded_lines = iterdecode(file_handle, encoding_sig)

    if sys.version_info[0] >= 3:
        for row in csv.reader(decoded_lines):
            yield row
        return

    reader = csv.reader([x.encode(encoding) for x in decoded_lines])
    for row in reader:
        yield [unicode(x, encoding) for x in row]  # noqa: F821 (Python 2 builtin)
def _handle_output(self, buffer_, hide, output, reader, indices):
    """Pump data from ``reader`` into ``buffer_`` (and ``output`` unless hidden).

    Each chunk read is passed through ``self.encode`` and the resulting
    stream is decoded incrementally with ``self.encoding`` (undecodable
    bytes are replaced).  After every decoded piece is stored,
    ``self.respond(buffer_, indices)`` is invoked.
    """
    # NOTE: the stream is read until a read returns nothing, rather than
    # waiting for an out-of-band "process finished" signal; that signal can
    # arrive before all data has been read (a race condition).
    def encoded_chunks():
        chunk = reader(self.read_chunk_size)
        while chunk:
            yield self.encode(chunk)
            chunk = reader(self.read_chunk_size)

    # iterdecode brings the chunks into our local encoding.
    decoded_stream = codecs.iterdecode(
        encoded_chunks(), self.encoding, errors='replace'
    )
    for text in decoded_stream:
        # Echo to the local stream if necessary.
        # TODO: should hiding be expressed by passing a dummy output stream
        # (something like /dev/null) instead?  As it stands, 'hide' plus an
        # explicit out_stream means that stream is never written to.
        if not hide:
            output.write(text)
            output.flush()
        # Store in the shared buffer for use after execution completes.
        # NOTE: this is threadsafe insofar as no reading occurs until after
        # the thread is join()'d.
        buffer_.append(text)
        # Run our specific buffer & indices through the autoresponder.
        self.respond(buffer_, indices)
def io(self, reader, output, buffer_, hide):
    """
    Perform I/O (reading, capturing & writing).

    * ``reader`` is called repeatedly with a chunk size of 1000 until it
      returns something falsy. (Typically this function is the result of
      `stdout_reader` or `stderr_reader`.)
    * Chunks that are not already binary are encoded with
      ``self.encoding``; the byte stream is then decoded incrementally with
      the same encoding, replacing undecodable data.
    * Every decoded piece is appended to ``buffer_``, typically a `list`,
      which the caller will expect to be mutated.
    * If ``hide`` is ``False``, each piece is also written to ``output``
      (a stream such as `sys.stdout`) and flushed.
    """
    def byte_chunks():
        while True:
            chunk = reader(1000)
            if not chunk:
                return
            # os.read does not always hand back bytes under Python 3.
            # six.b is not usable here because it assumes latin-1.
            if isinstance(chunk, six.binary_type):
                yield chunk
            else:
                yield chunk.encode(self.encoding)

    decoded_stream = codecs.iterdecode(byte_chunks(), self.encoding, errors='replace')
    for text in decoded_stream:
        if not hide:
            output.write(text)
            output.flush()
        buffer_.append(text)
def stock(self, s):
    """Fetch the quote CSV for symbol ``s`` and return its first row.

    Returns the first CSV record as a list of strings.  The connection is
    closed even when reading or parsing fails.
    """
    url = 'http://quote.yahoo.com/d/quotes.csv?s=%s&f=l1c1p2d1t1'
    u = urlopen(url % s)
    try:
        # On Python 3 the response yields bytes, which must be decoded
        # before the csv module can read them.
        reader = csv.reader(codecs.iterdecode(u, 'utf-8'))
        return next(reader)  # the first item of the iterable
    finally:
        u.close()
def wmo_importer(url='https://raw.githubusercontent.com/flyingeek/editolido/gh-pages/ext-sources/nsd_bbsss.txt'):
    """Yield one tuple per row of the semicolon-separated station list.

    Each tuple is ``(name, row[0] + row[1], geo_normalize(row[8]),
    geo_normalize(row[7]))`` where ``name`` is column 2, or the
    concatenation of columns 0 and 1 when column 2 is ``'----'``.
    """
    # http://tgftp.nws.noaa.gov/data/nsd_bbsss.txt

    def geo_normalize(value):
        """Turn 'deg-min-sec[NSEW]' into signed decimal degrees."""
        # A trailing N/S/E/W is the hemisphere; without one the value is
        # taken as positive (interpreted as North).
        hemisphere = value[-1]
        if hemisphere in 'SW':
            sign = -1
        else:
            sign = 1
        if hemisphere in 'NEWS':
            coords = value[:-1]
        else:
            coords = value
        # Pad so that missing minutes or seconds read as 0.
        coords += '-0-0'
        parts = coords.split('-', 3)[:3]
        degrees, minutes, seconds = map(float, parts)
        return sign * (degrees + (minutes / 60) + (seconds / 3600))

    if PY2:
        delimiter = b';'
        data = urlopen(url)
    else:
        import codecs
        delimiter = ';'
        data = codecs.iterdecode(urlopen(url), 'utf-8')

    reader = csv.reader(data, delimiter=delimiter, quoting=csv.QUOTE_NONE)
    not_airport = '----'
    for row in reader:
        station_id = row[0] + row[1]
        if row[2] == not_airport:
            name = station_id
        else:
            name = row[2]
        yield name, station_id, geo_normalize(row[8]), geo_normalize(row[7])
def query_nasdaq(self, exch_name):
    """Query Nasdaq for list of tickers by exchange

    exch_name -- 'nasdaq', 'nyse' or 'amex'; the downloaded CSV rows are
                 stored on ``self.nasdaq_list``, ``self.nyse_list`` or
                 ``self.amex_list`` respectively.  Other names store nothing.

    On a URLError, ``self.exchange_flag[0]`` is set to '1' and the error's
    reason (or ``('Error', code)``) is returned.

    NOTE(review): the success path also returns the string
    'Unknown Exception in query_nasdaq', as the original did; callers
    presumably check ``exchange_flag`` instead -- confirm before changing.
    """
    header = {'user-agent': 'Mozilla/5.0 '
                            '(Macintosh; Intel Mac OS X 10.9; rv:32.0)'
                            ' Gecko/20100101 Firefox/32.0'}
    url = ('http://www.nasdaq.com/screening/companies-by-name.aspx?letter=0'
           '&exchange=%s&render=download' % (exch_name))
    req = Request(url, headers=header)

    try:
        response = urlopen(req)
    except URLError as e:
        # Catch errors.
        self.exchange_flag[0] = '1'
        if hasattr(e, 'reason'):
            return e.reason
        if hasattr(e, 'code'):
            return 'Error', e.code
        # Never continue without a response object.
        return 'Unknown Exception in query_nasdaq'

    try:
        rows = list(csv.reader(iterdecode(response, 'utf-8')))
    finally:
        response.close()

    # Setup list(s) of exchange names.
    if exch_name == 'nasdaq':
        self.nasdaq_list = rows
    elif exch_name == 'nyse':
        self.nyse_list = rows
    elif exch_name == 'amex':
        self.amex_list = rows
    return 'Unknown Exception in query_nasdaq'
def _add_csv_file_to_db(self, decoder):
    """Store the rows of the uploaded marketing CSV.

    decoder -- codec name used to decode the uploaded file.

    If no ``UploadedFile`` record exists yet, one is created for the upload
    and saved.  Every non-blank row is handed to ``_add_csv_row_to_db``
    together with its zero-based position in the file (blank rows are
    counted) and a flag marking the first non-blank row.  The column count
    is taken from the first row with a non-zero length and saved on the
    record.
    """
    lines = codecs.iterdecode(
        self.upload_file_form.cleaned_data['marketing_file'], decoder
    )
    rows = csv.reader(lines)

    if not self.uploaded_file:
        record = UploadedFile(
            filename=self.upload_file_form.cleaned_data['marketing_file'].name,
            uploaded_by=self.request.user,
            num_columns=0,
        )
        record.save()
        self.uploaded_file = record

    self.num_cols = None
    awaiting_first_row = True
    for row_number, row in enumerate(rows):
        if not self.num_cols:
            self.num_cols = len(row)
        if self._csv_row_is_not_blank(row):
            self._add_csv_row_to_db(row, awaiting_first_row, row_number)
            awaiting_first_row = False

    if self.num_cols:
        self.uploaded_file.num_columns = self.num_cols
        self.uploaded_file.save()
def buildDruidCache(self,cutoff_druid_score=0.2):
    """Fill ``self.keyword_dict`` from the bz2-compressed DRUID list.

    cutoff_druid_score -- reading stops at the first number-free entry whose
                          score is not above this value.
                          NOTE(review): stopping early presumably relies on
                          the file being sorted by descending score.

    Each line is tab-separated with the words in column 1 and the score in
    column 2.  Entries containing digits (``self.RE_D``) or any stopword are
    not added.  Afterwards, when ``self.extra_keywords`` names a file, each
    of its lines is added as a keyword with score 3.0.

    NOTE(review): the nesting of the final ``else: break`` was reconstructed
    from a flattened source; it is attached to the score comparison.
    """
    num_added_words = 0
    druid_bz2 = bz2.BZ2File(self.druid_mwe_file, mode='r')
    try:
        for line in codecs.iterdecode(druid_bz2, 'utf-8'):
            split = line.split(u'\t')
            words = split[1].lower()
            druid_score = split[2]
            # exclude any lines that have one or more numbers in them
            if self.RE_D.search(words):
                continue
            words_split = [filterHyphens(word) for word in words.split(u' ')]
            float_druid_score = float(druid_score)
            if float_druid_score > cutoff_druid_score:
                if not any((word in self.stopwords) for word in words_split):
                    self.keyword_dict[words] = float_druid_score
                    num_added_words += 1
                    if num_added_words % 1000 == 0:
                        # Single-argument form prints the same text on
                        # Python 2 and Python 3.
                        print('%s %s' % (words, self.keyword_dict[words]))
            else:
                break
    finally:
        druid_bz2.close()

    if self.extra_keywords != '':
        with codecs.open(self.extra_keywords) as infile:
            for line in infile:
                # Strip only the line terminator; slicing off the last
                # character would cut a real letter from an unterminated
                # final line.
                words = line.rstrip('\r\n').lower()
                print('Loading user set keyword: %s' % words)
                self.keyword_dict[words] = 3.0
def test_csv(self):
    """A CSV export of three reports has three rows; the third row's
    'Description' matches the third report."""
    expected_reports = make(Report, _quantity=3)
    exported = _export(expected_reports, format="csv")

    decoded_lines = codecs.iterdecode(exported, "utf8")
    parsed_rows = list(csv.DictReader(decoded_lines))

    self.assertEqual(3, len(parsed_rows))
    self.assertEqual(parsed_rows[2]['Description'], expected_reports[2].description)
def fixed2csv(f, schema, output=None, **kwargs):
    """
    Convert a fixed-width file to csv using a CSV-formatted schema description.

    The schema CSV starts with a header row containing (at least) the
    columns "column", "start" and "length"; other columns are ignored. Each
    following row names one output column and gives its starting index and
    its length, both integers. 'start' values are taken as zero-based
    unless the first one is 1, in which case all are taken as one-based.

    When ``output`` is given (and truthy) the rows are written to it and an
    empty string is returned; otherwise the complete CSV text is returned.
    A truthy ``encoding`` keyword argument makes ``f`` be decoded with that
    encoding before parsing.
    """
    streaming = bool(output)
    if not streaming:
        output = StringIO()

    encoding = kwargs.get('encoding')
    if encoding:
        f = iterdecode(f, encoding)

    writer = CSVKitWriter(output)
    writer.writerows(FixedWidthReader(f, schema))

    if streaming:
        # Nothing to hand back when the caller supplied the sink.
        return ''
    return output.getvalue()
def __init__(self, f, schema, encoding=None):
    """Keep the input (decoded when ``encoding`` is given) and build the
    row parser for ``schema``."""
    self.file = f if encoding is None else iterdecode(f, encoding)
    self.parser = FixedWidthRowParser(schema)
    self.header = True
def load_data_for_year(db, y):
    '''Loads data for the year from the CSV file in the current directory into the sqlite DB.

    db -- database connection; the table ``ucpay<y>`` is dropped, recreated
          and filled, then the connection is committed.
    y  -- the year.  It selects the archive ``./ucpay<y>.csv.zip``, the
          member name and dialect (2010 uses ``ucpay2010.csv`` and
          tab-separated data) and whether a header row is skipped (every
          year but 2009).

    Rows whose last field is '1' are skipped; the remaining rows must end in
    '0' and are inserted without that field.  The archive and its member
    stream are closed when loading finishes or fails.
    '''
    print('Loading data for {y}'.format(y=y))
    # NOTE: the table name cannot be bound as a SQL parameter, so `y` is
    # formatted into the statement; it must come from trusted code.
    db.execute('DROP TABLE IF EXISTS ucpay{y}'.format(y=y))
    # Some of these columns probably won't be needed, but we may as well store them.
    db.execute('''
        CREATE TABLE ucpay{y} (
            ucpay_id INT PRIMARY KEY
            , year INT
            , campus TEXT
            , name TEXT
            , job_title TEXT
            , gross_pay NUMERIC
            , base_pay NUMERIC
            , overtime_pay NUMERIC
            , extra_pay NUMERIC
        )
    '''.format(y=y))

    inner_filename = 'ucpay.csv' if y != 2010 else 'ucpay2010.csv'
    dialect = csv.excel_tab if y == 2010 else csv.excel
    expected_fields = len(["ID", "year", "campus", "name", "title",
                           "gross", "base", "overtime", "extra", "exclude"])
    insert_sql = 'INSERT INTO ucpay{y} VALUES (?,?,?,?,?,?,?,?,?)'.format(y=y)

    with ZipFile('./ucpay{y}.csv.zip'.format(y=y), mode='r') as archive:
        with archive.open(inner_filename, mode='r') as member:
            data = csv.reader(iterdecode(member, 'utf8'), dialect=dialect)
            if y != 2009:
                print(next(data))  # skip header
            for e in data:
                assert len(e) == expected_fields
                # Skip tuples with "exclude"=="1", which are grad students and
                # temporary employees rather than research professors
                if e[-1] != '1':
                    assert e[-1] == '0'
                    db.execute(insert_sql, e[:-1])

    db.commit()
    print(' done')
def populateStep(csvfile):
    """Create Manip, Product and StepProto records from a UTF-8 CSV upload.

    For each row, the 'Quoi' column names the Manip and 'Nom du produit'
    names the Product; 'echantillon' and 'manip' hold the per-sample and
    per-manip quantities, with an empty cell counting as 0.0.  All records
    are obtained with ``get_or_create``.
    """
    def quantity(cell):
        # An empty cell means "no quantity".
        return 0.0 if cell == '' else float(cell)

    rows = csv.DictReader(codecs.iterdecode(csvfile, 'utf-8'), delimiter=',', quotechar='"')
    for row in rows:
        # Read and convert every cell before touching the database, so a
        # malformed row fails before anything is created for it.
        manip_name = row['Quoi']
        product_name = row['Nom du produit']
        per_sample = quantity(row['echantillon'])
        per_manip = quantity(row['manip'])

        manip_object, _ = Manip.objects.get_or_create(name=manip_name)
        product, _ = Product.objects.get_or_create(name=product_name)
        StepProto.objects.get_or_create(
            manip=manip_object,
            product=product,
            reac_by_sample=per_sample,
            reac_by_manip=per_manip,
        )
def parse_binary(fd):
    """Parse binary file ``fd`` using the charset found in its header.

    A first pass over the raw input locates the header message (the one
    with an empty msgid) and extracts the charset.  The lines consumed by
    that pass, followed by the rest of ``fd``, are then decoded with that
    charset and parsed again.  The header message is always yielded first,
    then any messages that preceded it, then the remainder.
    """
    def find_header():
        rbuf = ReadBuffer(fd)
        for candidate in parse_encoded(rbuf):
            if candidate.msgid == '':
                charset, _headers = parse_header_data(candidate.msgstrs[0])
                return charset, rbuf.bytelines
        raise PoError('no-header',
                      'No header found in file %s' % getfilename(fd))

    # Non-strict parsing to find header and extract charset:
    charset, consumed_lines = find_header()

    decoded = iterdecode(itertools.chain(consumed_lines, fd), encoding=charset)
    parser = parse_encoded(decoded)

    # Buffer messages (again) until the header turns up, so the header can
    # be emitted ahead of whatever came before it.
    held_back = []
    for msg in parser:
        held_back.append(msg)
        if msg.msgid == '':
            break
    yield held_back.pop()

    for msg in itertools.chain(held_back, parser):
        yield msg
def scrape(file_obj=None, include_header=True):
    """
    Download the source CSV data from Google Spreadsheets, convert it from
    wide-form to long-form, and output it to a file-like object (stdout by
    default).

    Args:
        file_obj: file-like object in which to output the parsed CSV data
        include_header: if True (default), include a header row in the output

    Cells that cannot be parsed as a number are skipped.  The HTTP response
    is closed when the download has been processed or fails.
    """
    if file_obj is None:
        file_obj = sys.stdout

    output = csv.writer(file_obj)
    if include_header:
        output.writerow(("date", "pollster", "party", "support"))

    with urllib.request.urlopen(EXPORT_URL.format(SPREADSHEET_ID)) as response:
        # Since urllib returns bytes, iterate through the CSV data and
        # decode it as UTF-8.
        rows = csv.reader(codecs.iterdecode(response, "UTF-8"))

        # Party names are in the first row, from the second column onwards.
        parties = next(rows)[1:]

        for row in rows:
            # Dates are in the first column, from the second row onwards.
            date = datetime.datetime.strptime(row[0], "%d/%m/%Y").date()
            for party, support in zip(parties, row[1:]):
                try:
                    value = float(support.replace(",", "."))
                except ValueError:
                    # No value given for the party, and so no need to
                    # output a row.
                    continue
                # Written outside the try so that a ValueError from the
                # writer itself is not mistaken for an empty cell.
                output.writerow((date, "MMR", party, value))
def carga(self,file):
    """Append the data rows of a latin-1, semicolon-separated file to ``self.dados``.

    file -- path of the CSV file ('|' is the quote character).

    A row is kept when its first field is non-empty and is not the "CNPJ"
    header label (compared case-insensitively).  Blank lines, which the csv
    module reports as empty rows, are skipped instead of raising IndexError.
    """
    with open(file,'rb') as csvfile:
        spamreader = csv.reader(codecs.iterdecode(csvfile,'latin1'), delimiter=';', quotechar='|')
        for row in spamreader:
            if not row:
                continue
            first_field = row[0]
            if first_field and first_field.upper() != "CNPJ":
                self.dados.append(row)
def _read_data():
    ''' Build a dict of boundary data keyed by the third CSV column.

    The bundled, gzip-compressed CSV is read with its first line discarded.
    Each row carries five fields: region, name, code, an XML geometry and
    one unused field.  The coordinate lists of all outer boundary rings are
    concatenated, with a NaN entry separating consecutive rings.

    '''
    nan = float('NaN')
    ring_path = './/outerBoundaryIs/LinearRing/coordinates'
    data = {}

    with gzip.open(package_path('US_Regions_State_Boundaries.csv.gz')) as f:
        decoded = codecs.iterdecode(f, "utf-8")
        next(decoded)  # discard the first line
        reader = csv.reader(decoded, delimiter=str(','), quotechar=str('"'))
        for region, name, code, geometry, _unused in reader:
            tree = et.fromstring(geometry)
            lats, lons = [], []
            for ring_index, ring in enumerate(tree.findall(ring_path)):
                if ring_index > 0:
                    # NaN marks the break between two rings.
                    lats.append(nan)
                    lons.append(nan)
                # Each whitespace-separated token starts with "lon,lat".
                points = [
                    (float(lat_text), float(lon_text))
                    for lon_text, lat_text in (
                        token.split(',')[:2] for token in ring.text.split()
                    )
                ]
                ring_lats, ring_lons = list(zip(*points))
                lats.extend(ring_lats)
                lons.extend(ring_lons)
            data[code] = {
                'name': name,
                'region': region,
                'lats': lats,
                'lons': lons,
            }

    return data
def build_druid_cache(self, cutoff_druid_score):
    """Fill ``self.keyword_dict`` from the bz2-compressed DRUID list.

    cutoff_druid_score -- reading stops at the first number-free entry whose
                          score is below this value.
                          NOTE(review): stopping early presumably relies on
                          the file being sorted by descending score.

    Each line is tab-separated with the words in column 1 and the score in
    column 2.  Entries containing digits (``self.RE_D``) or any stopword are
    not added.  The compressed file is closed when loading finishes or
    fails.

    NOTE(review): nesting was reconstructed from a flattened source; the
    counter and progress print are placed inside the stopword check.
    """
    num_added_words = 0
    logger.info("Loading DRUID cache...")
    start_time = time.time()

    druid_bz2 = bz2.BZ2File(self.druid_mwe_file, mode='r')
    try:
        for line in codecs.iterdecode(druid_bz2, 'utf-8'):
            split = line.split(u'\t')
            words = split[1].lower()
            druid_score = split[2]
            # exclude any lines that have one or more numbers in them
            if self.RE_D.search(words):
                continue
            words_split = [filter_hyphens(word) for word in words.split(u' ')]
            float_druid_score = float(druid_score)
            if float_druid_score < cutoff_druid_score:
                break
            if not any((word in self.stopwords) for word in words_split):
                self.keyword_dict[words] = float_druid_score
                num_added_words += 1
                if num_added_words % 1000 == 0:
                    # Single-argument form prints the same text on Python 2
                    # and Python 3.
                    print('%s %s' % (words, self.keyword_dict[words]))
    finally:
        druid_bz2.close()

    logger.info("Finished loading DRUID cache. Time needed: " + str(time.time() - start_time))
def scrape(file_obj=None, include_header=True):
    """
    Download the source data as a CSV using DataMarket's API, clean it up
    and discard uninteresting rows, and output it to a file-like object
    (stdout by default).

    Args:
        file_obj: file-like object in which to output the parsed CSV data
        include_header: if True (default), include a header row in the output

    Rows that do not have exactly three fields (for instance blank lines)
    are ignored.  The HTTP response is closed when processing finishes or
    fails.
    """
    if file_obj is None:
        file_obj = sys.stdout

    output = csv.writer(file_obj)
    if include_header:
        output.writerow(("date", "pollster", "party", "support"))

    with urllib.request.urlopen(SOURCE_URL) as response:
        # Since urllib returns bytes, iterate through the CSV data and
        # decode it as ISO-8859-1.
        rows = csv.reader(codecs.iterdecode(response, "ISO-8859-1"))

        for row in rows:
            if len(row) != 3:
                continue
            party, date, support = row

            # Only match rows that contain data for distinct political
            # parties. That way we can ignore rows containing data on
            # government support (e.g. "1995-2007 (B og D)").
            match = PARTY_NAME_RE.match(party)
            if not match:
                continue

            # Dates are only year-and-month. Normalise it to a full date,
            # although that won't actually be an accurate date.
            date = datetime.datetime.strptime(date, "%Y-%m").date()
            # Use the first non-null group in the regex as the party name.
            party = next(g for g in match.groups() if g is not None)
            output.writerow((date, "Gallup", party, float(support)))
def open_file(infile, informat='raw', encoding="utf-8", **kwargs):
    """Open ``infile`` according to ``informat`` and return the handle.

    infile   -- a path (string) or an already opened file-like object.
    informat -- 'vnd.ms-excel' or 'xls' (xlrd workbook), 'xml' (parsed with
                etree), 'csv' (csv reader) or anything else (decoded text).
    encoding -- text encoding for the csv and raw formats.
    kwargs   -- extra arguments for the csv reader.

    NOTE(review): ``csv.reader`` is called with an ``encoding`` argument,
    which the standard-library reader does not accept -- presumably ``csv``
    is an encoding-aware replacement in this module; confirm.
    """
    logger.debug('Opening file: {}'.format(infile))
    from_path = isinstance(infile, basestring)  # noqa: F821 (Python 2 builtin)

    if informat == "vnd.ms-excel" or informat == 'xls':
        import xlrd
        logger.debug('An office file!')
        if from_path:
            return xlrd.open_workbook(infile, on_demand=True)
        return xlrd.open_workbook(file_contents=infile.read(), on_demand=True)

    if informat == "xml":
        logger.debug('An XML file!')
        if from_path:
            return etree.parse(infile)
        # fromstring() needs the document text, not the file object; this
        # mirrors how the workbook branch reads the stream.
        document = infile.read() if hasattr(infile, 'read') else infile
        return etree.fromstring(document)

    if informat == "csv":
        if from_path:
            logger.debug('Opening as csv')
            return csv.reader(open(infile, 'r'), encoding=encoding, **kwargs)
        logger.debug("CSV file")
        return csv.reader(infile, encoding=encoding, **kwargs)

    if from_path:
        return codecs.open(infile, 'r', encoding)
    # Read line by line until readline() returns the empty string.
    return codecs.iterdecode(iter(infile.readline, ""), encoding)
def parse_data(fname):
    """Download a semicolon-separated series and return it nested by date.

    fname -- URL to open.

    The first row (the 'date;area;concentration;volume' header) and the
    last row (expected to be empty) are dropped.  For every other row the
    date part before 'T' is passed to ``format_date`` and the values are
    stored under ``data[year][month][day]`` as 'area', 'conc' and 'vol'.
    Concentration and volume default to 0.0 when their column is absent.
    """
    response = urlopen(fname)
    try:
        rows = list(csv.reader(codecs.iterdecode(response, 'utf-8')))
    finally:
        response.close()

    rows.pop(0)  # remove 'date;area;concentration;volume' string
    rows.pop()   # remove empty string

    data = {}
    for row in rows:
        # The reader splits on commas, so the whole ';'-separated record is
        # expected in the first cell.
        row_data = row[0].split(';')
        raw_date = re.match('(.*)T', row_data[0]).group(1)
        date = format_date(raw_date)
        prepare.append_to_data(data, date['year'], date['month'], date['day'], {})

        area = float(row_data[1])
        # Each optional column is checked on its own, so a three-field row
        # no longer raises IndexError.
        conc = float(row_data[2]) if len(row_data) > 2 else 0.0
        vol = float(row_data[3]) if len(row_data) > 3 else 0.0

        day_entry = data[date['year']][date['month']][date['day']]
        day_entry['conc'] = conc
        day_entry['vol'] = vol
        day_entry['area'] = area
    return data
def forex(self, s='usd', t='eur'):
    """Fetch the exchange-rate CSV for the pair ``s``/``t`` and return its first row.

    Returns the first CSV record as a list of strings.  The connection is
    closed even when reading or parsing fails.
    """
    url = 'http://quote.yahoo.com/d/quotes.csv?s=%s%s=X&f=nl1d1t1'
    u = urlopen(url % (s, t))
    try:
        # On Python 3 the response yields bytes, which must be decoded
        # before the csv module can read them.
        reader = csv.reader(codecs.iterdecode(u, 'utf-8'))
        return next(reader)  # the first item of the iterable
    finally:
        u.close()
def create_canada_population_table(category="VALUE"):
    """Creates a csv file that contains a table for all regions and the
    selected category (header) for the most recent date

    The last row seen for each region in ``selection_list`` wins.  A region
    that never appears in the feed is written with value 0 and date "0"
    (it previously raised TypeError while writing).  The download and the
    output file are closed even when processing fails.

    :param category: Header name in CSV file, defaults to "VALUE"
    :type category: str, optional
    """
    desired_row = 0
    most_recent_date = "0"
    # One [value, date] pair per region, pre-filled for regions with no data.
    most_recent_values = {region: [0, "0"] for region in selection_list}

    with urlopen(get_csv_data()) as response, \
            open(FILE_PATH + 'canada-population-data.csv', 'w', newline='') as f:
        csv_reader = csv.reader(codecs.iterdecode(response, 'utf-8'), delimiter=',')
        writer = csv.writer(f)

        for line_count, row in enumerate(csv_reader):
            if line_count == 0:
                writer.writerow(['Region', 'Population', 'Date'])
                # Locate the requested category; the last match wins.
                for index, header in enumerate(row):
                    if header == category:
                        desired_row = index
            elif len(row) > 1 and row[1] in selection_list:
                # Gets the most recent date in the data
                most_recent_date = row[0]
                most_recent_values[row[1]] = [row[desired_row], most_recent_date]

        for reg, (value, date) in most_recent_values.items():
            writer.writerow([reg, value, date])  # name, total, date

        if most_recent_date != "0":
            # Update the posts file with the most recent dated update
            update_date(most_recent_date)
def _execute(self, cmd, params=None, data=None, headers=None, method=None):
    """execute a tomcat command and check status returning a file obj
    for further processing

    cmd     -- command path appended to the manager URL.
    params  -- optional query parameters, URL-encoded onto the request.
    data    -- optional request body.
    headers -- optional request headers; ``None`` means no extra headers.
    method  -- optional HTTP method override.

    Returns an iterator over the decoded lines that follow the status line.
    Raises TomcatException when the status line does not start with "OK -";
    an empty response counts as such a failure.

    fobj = _execute(url)
    """
    if headers is None:
        # A fresh dict per call instead of one shared default.
        headers = {}

    url = self.__managerURL + "/" + cmd
    if params:
        url = url + "?%s" % urllib.parse.urlencode(params)

    req = ExtendedRequest(url, data, headers)
    if method:
        req.method = method
    response = self.__opener.open(req)

    content = codecs.iterdecode(response, "utf-8")
    # The default keeps an empty body from leaking a bare StopIteration.
    status = next(content, "").rstrip()
    self.hasConnected = True
    if status[:4] != "OK -":
        raise TomcatException(status)
    return content
def parser(_, objconf, skip=False, **kwargs):
    """ Parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from meza.fntools import Objectify
        >>> from riko import get_path
        >>> from meza._compat import decode
        >>>
        >>> url = get_path('cnn.html')
        >>> conf = {'url': url, 'start': '<title>', 'end': '</title>'}
        >>> objconf = Objectify(conf)
        >>> kwargs = {'stream': {}, 'assign': 'content'}
        >>> result = parser(None, objconf, **kwargs)
        >>> resp = next(result)['content'][:21]
        >>> decode(resp) == 'CNN.com International'
        True
    """
    if skip:
        # Nothing to fetch: hand back the stream that was passed in.
        return kwargs['stream']

    url = get_abspath(objconf.url)

    with closing(urlopen(url)) as response:
        f = response.fp
        encoding = get_response_encoding(response, 'utf-8')
        decoded = iterdecode(f, encoding)
        # Only the lines between the start and end markers are kept.
        sliced = betwix(decoded, objconf.start, objconf.end, True)
        content = '\n'.join(sliced)

    parsed = get_string(content, objconf.start, objconf.end)

    if objconf.detag:
        detagged = get_text(parsed)
    else:
        detagged = parsed

    if objconf.token:
        splits = detagged.split(objconf.token)
    else:
        splits = [detagged]

    return ({kwargs['assign']: chunk} for chunk in splits)
def load_data(file, group, tsv_file, type='events'):
    """Load downloaded event summary data into PyTables file.

    If you've previously downloaded event summary data from
    http://data.hisparc.nl/ in TSV format, you can load them into a PyTables
    file using this method. The result is equal to directly downloading data
    using :func:`download_data`.

    :param file: the PyTables datafile handler.
    :param group: the PyTables destination group, which need not exist.
    :param tsv_file: path to the tsv file downloaded from the HiSPARC Public
        Database.
    :param type: the datatype to load, either 'events', 'weather',
        'singles' or 'lightning'.
    :raises ValueError: if ``type`` is not one of the above.

    Example::

        >>> import tables
        >>> import sapphire.esd
        >>> data = tables.open_file('data.h5', 'w')
        >>> sapphire.esd.load_data(data, '/s501', 'events-s501-20130910.tsv')

    """
    # (datatype, table factory, line-storing context manager class)
    known_types = (
        ('events', _get_or_create_events_table,
         _read_line_and_store_event_class),
        ('weather', _get_or_create_weather_table,
         _read_line_and_store_weather_class),
        ('singles', _get_or_create_singles_table,
         _read_line_and_store_singles_class),
        ('lightning', _get_or_create_lightning_table,
         _read_line_and_store_lightning_class),
    )

    for kind, get_or_create_table, read_and_store_class in known_types:
        if type == kind:
            table = get_or_create_table(file, group)
            break
    else:
        raise ValueError("Data type not recognized.")

    with open(tsv_file, 'rb') as data:
        reader = csv.reader(iterdecode(data, 'utf-8'), delimiter='\t')
        with read_and_store_class(table) as writer:
            for line in reader:
                writer.store_line(line)
def read_data(url, method):
    """Load the data set at ``url`` with one of three loaders and print a summary.

    url    -- location of the comma-separated data.
    method -- 1: pandas ``read_csv``; 2: numpy ``loadtxt``; 3: ``csv.reader``
              over the decoded HTTP response, wrapped in a DataFrame.

    Prints the shape, the first ten rows and the last ten rows.  Raises
    ValueError for any other ``method`` (previously a NameError on the
    unbound result).
    """
    if method == 1:
        # Method one: Pandas read_csv() function
        print("\nLoading in data via the read_csv method in pandas using a url\n")
        data = pd.read_csv(url, names=columns)
    elif method == 2:
        # Method two: Numpy's loadtxt().  The delimiter must be given,
        # otherwise it throws an error.
        print("\nLoading in data via the loadtxt method in numpy using a url\n")
        data = np.loadtxt(url, dtype=float, delimiter=',')
    elif method == 3:
        # Method three: csv.reader() only reads text lines, so the URL is
        # opened with urllib, decoded lazily with codecs, collected into a
        # list and finally turned into a DataFrame.
        print("\nLoading in data via the csv.reader() method using a url\n")
        with urllib.request.urlopen(url) as response:
            rows = list(csv.reader(codecs.iterdecode(response, 'utf-8')))
        data = pd.DataFrame(rows, columns=columns)
    else:
        raise ValueError("method must be 1, 2 or 3, got %r" % (method,))

    print("\nHere is the shape of the data for Method %s:\n" % (method), data.shape)

    try:
        # DataFrames (methods 1 and 3)
        head, tail = data.head(10), data.tail(10)
    except AttributeError:
        # Plain arrays (method 2): slice the first and the last ten rows.
        head, tail = data[:10], data[-10:]

    print("\nHere is the head of the data for Method %s:\n" % (method), head)
    print("\nHere is the tail of the data for Method %s:\n" % (method), tail)
def check_partial(self, input, partialresults):
    """Feed the encoded ``input`` piece by piece and compare running results.

    After each piece the accumulated decoded text must equal the matching
    entry of ``partialresults``.  The check is run with a StreamReader, then
    twice with one incremental decoder (the second time after ``reset()``),
    and finally the whole input is round-tripped through ``iterdecode``.
    """
    encoded = input.encode(self.encoding)
    steps = list(zip(encoded, partialresults))

    # StreamReader fed through a queue, one piece at a time; read whatever
    # is available after each write.
    q = Queue()
    r = codecs.getreader(self.encoding)(q)
    result = u""
    for piece, expected in steps:
        q.write(piece)
        result += r.read()
        self.assertEqual(result, expected)
    # check that there's nothing left in the buffers
    self.assertEqual(r.read(), u"")
    self.assertEqual(r.bytebuffer, "")
    self.assertEqual(r.charbuffer, u"")

    # Same check with an incremental decoder; the second round verifies
    # that the reset method works properly.
    d = codecs.getincrementaldecoder(self.encoding)()
    for round_number in range(2):
        if round_number:
            d.reset()
        result = u""
        for piece, expected in steps:
            result += d.decode(piece)
            self.assertEqual(result, expected)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

    # check iterdecode()
    self.assertEqual(
        input,
        u"".join(codecs.iterdecode(encoded, self.encoding))
    )
def order_csv(self):
    """Insert the orders found in the CSV held by ``self.stream``.

    The first CSV row is skipped. Every later row is turned into an
    ``OrderCSV`` from its first three columns (id, purchase date, total
    price) and committed on its own.

    Returns:
        A success message, or a generic error message if any exception
        was raised while reading or storing the rows.
    """
    try:
        decoded_lines = codecs.iterdecode(self.stream, 'utf-8')
        rows = csv.reader(decoded_lines, dialect=csv.excel)
        # Drop the first row before processing the rest.
        next(rows, None)
        for row in rows:
            order = OrderCSV(id=int(row[0]),
                             purchase_date=row[1],
                             total_price=int(row[2]))
            db.session.add(order)
            db.session.commit()
        return 'CSV data populated into database successfully'
    except Exception:
        return 'Some Error Occurred'
def __init__(self, wb_string, logger_obj=None):
    """Split a Tableau workbook into leading XML, datasources and trailing XML.

    :param wb_string: The workbook content. If the value contains ``.twb``
        it is assumed to be a file name and the file's content is read.
    :param logger_obj: Optional logger; stored on the instance and passed to
        every ``TableauDatasource`` created here.

    After construction ``start_xml`` holds the lines up to and including the
    one containing ``<datasources``, ``datasources`` maps each datasource
    name to its ``TableauDatasource``, and ``end_xml`` holds the line
    containing ``</datasources>`` plus everything after it.
    """
    self.logger = logger_obj
    self.log(u'Initializing a TableauWorkbook object')
    self.wb_string = wb_string
    if '.twb' in self.wb_string:
        self.log(
            u".twb found in wb_string, assuming it is actually a filename. Opening file"
        )
        # Close the file as soon as its content has been read.
        with open(self.wb_string, 'rb') as fh:
            self.wb_string = fh.read()
    self.wb = StringIO(self.wb_string)
    self.start_xml = ""
    self.end_xml = ""
    self.datasources = {}
    start_flag = True   # still in the part before <datasources>
    ds_flag = False     # currently inside <datasources> ... </datasources>
    current_ds = ""     # lines of the datasource being collected

    if self.logger is not None:
        self.enable_logging(self.logger)

    for line in codecs.iterdecode(self.wb, 'utf-8'):
        if start_flag and not ds_flag:
            self.start_xml += line
        if not start_flag and not ds_flag:
            self.end_xml += line
        if ds_flag:
            current_ds += line
            # A closing tag completes one datasource: build it and start
            # collecting the next one.
            if u"</datasource>" in line:
                self.log(u"Building TableauDatasource object")
                ds_obj = TableauDatasource(current_ds,
                                           logger_obj=self.logger)
                self.datasources[ds_obj.get_datasource_name()] = ds_obj
                current_ds = ""
        # The state changes below take effect from the next line on, so the
        # opening <datasources line itself still belongs to start_xml.
        if u"<datasources" in line and start_flag:
            start_flag = False
            ds_flag = True
        if u"</datasources>" in line and ds_flag:
            self.end_xml += line
            ds_flag = False
def execute(self, cache=True):
    '''Retrieve a world bank indicator and convert it to a data package.

    With ``cache`` true the sources are fetched through ``self.retrieve()``
    and read from ``self.meta_dest`` / ``self.data_dest``; otherwise they
    are read straight from ``self.meta_url`` / ``self.data_url``.

    The data package is stored at ./indicators/{indicator-name}; that path
    is returned.
    '''
    if not cache:
        meta_source = urllib.request.urlopen(self.meta_url)
        data_source = codecs.iterdecode(
            urllib.request.urlopen(self.data_url), 'utf-8')
    else:
        self.retrieve()
        meta_source = open(self.meta_dest)
        data_source = open(self.data_dest)

    meta, data = self.extract(meta_source, data_source)

    target_dir = os.path.join('indicators', meta['name'])
    os.makedirs(target_dir, exist_ok=True)
    self.datapackage(meta, data, target_dir)
    return target_dir
def get_aadf_by_direction_data(local_authority_id):
    """
    Return a ``csv.DictReader`` over the AADF By Direction dataset of the
    given local authority.
    """
    # The URL is expected to change from time to time and is not guaranteed
    # to stay easy to construct, so it is kept here rather than in config.
    url = (
        "https://dft-statistics.s3.amazonaws.com/road-traffic/downloads/"
        "aadfbydirection/local_authority_id/"
        f"dft_aadfbydirection_local_authority_id_{local_authority_id}.csv"
    )

    # Errors are deliberately left unhandled: let it crash and inspect
    # manually.
    response = urlopen(url)

    # The files tend to be a few MB; decoding lazily with codecs.iterdecode
    # streams the CSV instead of holding it all in memory.
    decoded_lines = codecs.iterdecode(response, "utf-8")
    return csv.DictReader(decoded_lines)
def _resolve_preference_object(row, prefix):
    """Look up one side (``object_1`` or ``object_2``) of a preference row.

    Returns a ``(object id, content type model name)`` tuple. The id is -1
    when the row's content type is none of course, teacher, timeblock or
    section; the model name is then the unchanged content type.
    """
    content_type = row[prefix + '_content_type']
    id_column = prefix + '_natural_id'
    if content_type == 'course':
        return Course.objects.get(name=row[id_column]).id, content_type
    if content_type == 'teacher':
        # Teachers are looked up as BaseUser rows, so the content type has
        # to be resolved under 'baseuser' later.
        return BaseUser.objects.get(email=row[id_column]).id, 'baseuser'
    if content_type == 'timeblock':
        return Timeblock.objects.get(block_id=row[id_column]).id, content_type
    if content_type == 'section':
        return Section.objects.get(section_id=row[id_column]).id, content_type
    return -1, content_type


def create_preferences(file):
    """Build unsaved ``Preference`` objects from an uploaded CSV file.

    Each row must provide ``weight`` plus, for both ``object_1`` and
    ``object_2``, a ``*_content_type`` and a ``*_natural_id`` column.

    :param file: iterable of UTF-8 encoded byte lines (the CSV content).
    :return: list of ``Preference`` instances, one per CSV row.
    """
    shipback = []
    for row in DictReader(codecs.iterdecode(file, 'utf-8')):
        obj_1_id, obj_1_model = _resolve_preference_object(row, 'object_1')
        obj_2_id, obj_2_model = _resolve_preference_object(row, 'object_2')
        shipback.append(
            Preference(
                weight=row["weight"],
                object_1_content_type=ContentType.objects.filter(
                    model=obj_1_model)[0],
                object_2_content_type=ContentType.objects.filter(
                    model=obj_2_model)[0],
                object_1_id=obj_1_id,
                object_2_id=obj_2_id))
    return shipback
def get_canton_data(canton='ZH'):
    """Summarise the most recent usable COVID-19 report of a Swiss canton.

    Downloads the openZH CSV for ``canton`` and returns a sentence with the
    new cases (difference of cumulative confirmed cases between the latest
    usable row and the row before it), the isolation and quarantine counts,
    the date and the source.

    Raises:
        ValueError: If the CSV contains no row from which all values could
            be derived.
    """
    # This CSV is messy. There are very many missing fields, because each
    # canton reports different data, and each new row is populated in
    # increments (eg. they might know how many are in quarantine that day in
    # the morning, but might not take hospital counts until the evening).
    csv_url = f'https://github.com/openZH/covid_19/raw/master/fallzahlen_kanton_total_csv_v2/COVID19_Fallzahlen_Kanton_{canton}_total.csv'
    csv_lines = requests.get(csv_url).iter_lines()
    reader = csv.reader(codecs.iterdecode(csv_lines, 'utf-8'), delimiter=',')
    # Columns of CSV:
    # [0: date, 1: time, 2: abbreviation_canton_and_fl, 3: ncumul_tested,
    #  4: ncumul_conf, 5: new_hosp, 6: current_hosp, 7: current_icu,
    #  8: current_vent, 9: ncumul_released, 10: ncumul_deceased, 11: source,
    #  12: current_isolated, 13: current_quarantined,
    #  14: current_quarantined_riskareatravel, 15: current_quarantined_total]

    latest = None        # values of the most recent row that parsed fully
    previous_row = None  # the row read just before the current one
    for index, row in enumerate(reader):
        older_row, previous_row = previous_row, row
        # Skip the first few rows so that a predecessor row always exists,
        # and to skip the header row.
        if index < 3:
            continue
        # Make sure all the fields we need are present, as they are not all
        # updated at the same time.
        if len(row) < 12 or any(row[col] == '' for col in (0, 4, 11)):
            continue
        try:
            new_cases = int(row[4]) - int(older_row[4])
            isolated = row[12] or '(idk)'
            quarantined = row[13] or '(idk)'
        except (ValueError, IndexError):
            # A missing or non-numeric field: keep the values of the last
            # row that worked rather than mixing two rows.
            continue
        latest = (row[0], new_cases, isolated, quarantined, row[11])

    if latest is None:
        raise ValueError(f'No usable rows found for canton {canton}')

    date, num_new_cases, num_isolated, num_quarantined, source = latest
    return (f'On {date}, there were {num_new_cases} new cases reported in ' +
            f'Canton {canton}, with {num_isolated} in isolation and ' +
            f'{num_quarantined} in quarantine. Source: {source}')
def _read_wiki_data(corpus_filename):
    """Read text from the xz-compressed Wikipedia tar archive into memory.

    In an effort to cut down on runtime, at most 320000 decoded characters
    are read from each archive member (32K words * 10 characters per word;
    a deliberately high estimate that a later step refines). Members
    without a data stream, such as directories, are skipped.

    :param corpus_filename: path of the ``.tar.xz`` corpus archive
    :return: The raw text read from the archive members, concatenated
    """
    import tarfile

    pieces = []
    with tarfile.open(corpus_filename, 'r:xz') as tar_file:
        for member in tar_file.getmembers():
            _log.info('Reading from file %s' % member.name)
            member_stream = tar_file.extractfile(member)
            if member_stream is None:
                # extractfile() returns None for members that are neither
                # regular files nor links.
                continue

            count = 0
            # read() returns bytes, so the end-of-data sentinel must be
            # b"": a str sentinel never matches on Python 3 and the loop
            # would not end for members shorter than the limit.
            binary_chunks = iter(functools.partial(member_stream.read, 1),
                                 b"")
            for unicode_chunk in codecs.iterdecode(binary_chunks, 'utf-8'):
                pieces.append(unicode_chunk)
                count += 1

                if count % 10000 == 0:
                    _log.info('Read in %s characters' % count)

                if count >= 320000:
                    break

    return ''.join(pieces)
def merge_namespaces():
    """Serve the page for merging BEL namespaces.

    Renders the form until it validates; afterwards the values of all
    uploaded namespace files are combined and sent back as a ``.belns``
    attachment.
    """
    form = MergeNamespaceForm()

    if not form.validate_on_submit():
        return render_template('merge_namespaces.html', form=form)

    log.warning(form.file)

    uploads = request.files.getlist("file")

    merged_values = set()
    for upload in uploads:
        log.warning('file: %s', upload)
        parsed = parse_bel_resource(codecs.iterdecode(upload, 'utf-8'))
        merged_values.update(parsed['Values'])

    buffer = StringIO()

    write_namespace(
        namespace_name=form.name.data,
        namespace_keyword=form.keyword.data,
        namespace_species=form.species.data,
        namespace_description=form.description.data,
        author_name=current_user.name,
        author_contact=current_user.email,
        citation_name=form.citation.data,
        citation_description='This namespace was created by the PyBEL Web namespace merge service',
        namespace_domain=form.domain.data,
        author_copyright=form.licenses.data,
        values=merged_values,
        cacheable=False,
        file=buffer)

    response = make_response(buffer.getvalue())
    response.headers["Content-Disposition"] = (
        "attachment; filename={}.belns".format(form.keyword.data))
    response.headers["Content-type"] = "text/plain"
    return response
def readBase(csvFile):
    """Read a labelled text corpus from a UTF-8 CSV file and normalise it.

    Column 0 of every row is the emotion label, column 1 the text. The text
    is split on whitespace, each token is lemmatized, stopword tokens are
    dropped and every run of non-letters is replaced by a single space.
    Rows with fewer than two columns are ignored.

    :param csvFile: path of the CSV file
    :return: tuple ``(base, labels, listaPorEmocao, qtdEmocao)``: the
        cleaned texts, their labels (same order), the cleaned texts
        concatenated per emotion, and the number of texts per emotion
    """
    import codecs

    labels = []
    base = []
    listaPorEmocao = {'anger': "", 'boredom': "", 'empty': "",
                      'enthusiasm': "", 'fun': "", 'happiness': "",
                      'hate': "", 'love': "", 'neutral': "", 'relief': "",
                      'sadness': "", 'sentiment': "", 'surprise': "",
                      'worry': ""}
    qtdEmocao = {'anger': 0, 'boredom': 0, 'empty': 0, 'enthusiasm': 0,
                 'fun': 0, 'happiness': 0, 'hate': 0, 'love': 0,
                 'neutral': 0, 'relief': 0, 'sadness': 0, 'sentiment': 0,
                 'surprise': 0, 'worry': 0}

    # Open the file once, in binary mode, and make sure it gets closed.
    with open(csvFile, "rb") as ifile:
        for row in csv.reader(codecs.iterdecode(ifile, 'utf-8')):
            # Check the row before touching any result list so that labels
            # and base always stay aligned.
            if len(row) < 2:
                continue
            label = str(row[0])

            # Lemmatize token by token and drop stopwords as whole tokens;
            # removing them as substrings of the joined text would also cut
            # them out of the middle of other words.
            tokens = [lemmatizer.lemmatize(token)
                      for token in str(row[1]).split()]
            kept = [token for token in tokens if token not in stopwords]

            # filtering
            text = re.sub('[^A-Za-z]+', ' ', " ".join(kept))

            labels.append(label)
            base.append(text)
            listaPorEmocao[label] = listaPorEmocao[label] + " " + text
            qtdEmocao[label] = qtdEmocao[label] + 1

    return base, labels, listaPorEmocao, qtdEmocao
def read_doc_annotations(archive_file, force_redownload=False,
                         pos_type='DOCUMENT_PNEUMONIA_YES'):
    """Read documents and their BRAT annotations from a zip archive.

    :param archive_file: path of a local zip file, or a URL (any value
        containing ``http``); a URL is downloaded to a local file named
        after its last path segment
    :param force_redownload: download again even if that local file exists
    :param pos_type: annotation type that marks a document as positive
    :return: dict mapping each document base name to its
        ``AnnotatedDocument``, with ``positive_label`` set to 1 if any of
        its annotations has type ``pos_type`` and to 0 otherwise
    """
    print('Reading annotations from file : ' + archive_file)

    if 'http' in archive_file:
        # The local copy is named after the last URL segment; both the
        # existence check and the download must use that name, not the URL.
        filename = archive_file.split('/')[-1]
        if force_redownload or not os.path.isfile(filename):
            print('Downloading remote file : ' + archive_file)
            urllib.request.urlretrieve(archive_file, filename)
    else:
        filename = archive_file

    annotated_doc_map = {}

    print('Opening local file : ' + filename)
    with zipfile.ZipFile(filename, "r") as z:
        for name in z.namelist():
            if name.endswith('.txt') or name.endswith('.ann'):
                basename = name.split('.')[0].split('/')[-1]
                if basename not in annotated_doc_map:
                    annotated_doc_map[basename] = AnnotatedDocument()
                anno_doc = annotated_doc_map[basename]
                # handle text and BRAT annotation files (.ann) differently
                if name.endswith('.txt'):
                    with z.open(name) as f1:
                        anno_doc.text = f1.read().decode('utf8')
                else:
                    with z.open(name) as f1:
                        # handle this as utf8 or we get back byte arrays
                        anno_doc.annotations = read_brat_annotations(
                            codecs.iterdecode(f1, 'utf8'))

    # Finally assign a 0 or 1 to each document based on whether the expected
    # annotation type is present.
    for key, anno_doc in annotated_doc_map.items():
        anno_doc.positive_label = 0
        for anno in anno_doc.annotations:
            # NOTE : This "positive_label" relates to positive/possible
            # cases of pneumonia
            if anno.type == pos_type:
                anno_doc.positive_label = 1

    return annotated_doc_map
def load_votes_from_stream(stream, filename):
    """Parse a votes table from a ``.csv`` or ``.xlsx`` stream.

    The first row is the header and the first column holds the constituency
    names. If the next header cell is "cons" (any case) that column holds
    the constituency seats; if the header cell after that is "adj" that
    column holds the adjustment seats. The following header cells, up to
    the first empty one, are the party names and the cells below them the
    votes. Empty seat or vote cells count as 0.

    Returns a dict with the keys ``constituencies``, ``parties``, ``votes``
    and, when present, ``constituency_seats`` and
    ``constituency_adjustment_seats``. For any other file extension the
    tuple ``(None, None, None)`` is returned.
    """
    if filename.endswith(".csv"):
        if isinstance(stream, io.BytesIO):
            stream = codecs.iterdecode(stream, 'utf-8')
        table = list(csv.reader(stream, skipinitialspace=True))
    elif filename.endswith(".xlsx"):
        sheet = openpyxl.load_workbook(stream).active
        table = [[cell.value for cell in cells] for cells in sheet.rows]
    else:
        return None, None, None

    def take_first_column():
        # Return the data rows' leading cells, then strip that column
        # (header cell included) from the table.
        column = [row[0] for row in table[1:]]
        for row in table:
            del row[0]
        return column

    def as_seats(cells):
        return [int(cell) if cell else 0 for cell in cells]

    result = {"constituencies": take_first_column()}

    if table[0][0].lower() == "cons":
        result["constituency_seats"] = as_seats(take_first_column())

    if table[0][0].lower() == "adj":
        result["constituency_adjustment_seats"] = as_seats(
            take_first_column())

    # The party names end at the first empty header cell.
    header = table[0]
    party_count = len(header)
    for position, cell in enumerate(header):
        if not cell:
            party_count = position
            break

    result["parties"] = header[:party_count]
    result["votes"] = [
        [int(value) if value else 0 for value in row[:party_count]]
        for row in table[1:]
    ]

    return result
def getrates(archivo,
             url="http://www.bankofcanada.ca/en/markets/csv/exchange_eng.csv"):
    """Procedure to update change rates.

    Downloads the exchange-rate CSV, takes the last column of every
    currency row as its rate, re-bases all rates on the "usd" rate and
    writes the result, together with the date of the last column, as JSON.

    :param archivo: path of the JSON file to write
    :param url: location of the CSV; defaults to the Bank of Canada file
    """
    rates = {}
    with urllib.request.urlopen(url) as fh:
        for row in csv.reader(codecs.iterdecode(fh, "utf-8")):
            if not row:
                # Blank line: nothing to read.
                continue
            if row[0].startswith("Date "):
                date = row[-1]
            elif not row[0].startswith("#"):
                value = float(row[-1])
                # The second column looks like " XXX_NOON": drop the
                # leading character and the suffix to get the key.
                rates[row[1][1:].replace("_NOON", "").lower()] = value

    del rates["iexe0124"]
    del rates["iexe0125"]
    rates["cad"] = 1.

    # Read the divisor once: dividing by rates["usd"] inside the loop would
    # use the already normalised value (1.0) for every entry after "usd".
    usd = rates["usd"]
    for rate in rates:
        rates[rate] = rates[rate] / usd

    rates["date"] = date
    with open(archivo, "w") as output:
        json.dump(rates, output, indent=4)
def download_products(db, catalog, token, CURRENT_VERSION):
    """Stream a catalog CSV and hand every product row to ``add_product``.

    The CSV is fetched from ``catalog["CatalogCSVUrl"]`` with ``token`` sent
    as the ``grs`` cookie. The first row is skipped; rows with more than one
    column are added, all other rows are printed.
    """
    session = requests.Session()
    response = session.get(catalog["CatalogCSVUrl"],
                           stream=True,
                           cookies={'grs': token})
    with closing(response) as r:
        rows = csv.reader(codecs.iterdecode(r.iter_lines(), encoding='utf-8'))
        # The first row is not a product.
        next(rows, None)
        for row in rows:
            if len(row) > 1:
                add_product(db, catalog, row, CURRENT_VERSION)
            else:
                print(row)
def requests_get(url, result_type='text'):
    """
    GET ``url`` and return the body in the requested form.

    :param url: url to GET
    :param result_type: text (default), json, or csv
    :return: the response text, the decoded JSON, or (for csv) a list with
        one dict per CSV row
    :raises requests.exceptions.HTTPError: if the status code is not 200
    """
    logger.debug(f'GET {url}')
    response = requests.get(url, allow_redirects=True)

    status = response.status_code
    if status != 200:
        message = f"GET {url}: HTTP {status}: {response.text}"
        logger.error(message)
        raise requests.exceptions.HTTPError(message)

    if result_type == 'csv':
        decoded_lines = codecs.iterdecode(response.iter_lines(), 'utf-8')
        # Materialise the reader so callers get a plain list of dicts.
        return list(csv.DictReader(decoded_lines))
    if result_type == 'json':
        return response.json()
    return response.text
def test_refresh():
    """Check token expiration and refresh."""
    client = make_test_app().test_client()

    with patch('time.time', Mock(return_value=time.time())):
        # Walk through the login flow to obtain an ID token cookie.
        login_redirect = client.get('/')
        callback_response = client.get(callback_url_for(login_redirect))
        landing_page = client.get(callback_response.headers['Location'])
        body_text = ''.join(codecs.iterdecode(landing_page.response, 'utf-8'))
        assert body_text == 'too many secrets', "Authentication failed"

    # With the clock moved ten seconds ahead the app should now try to use
    # the refresh token.
    with patch('time.time', Mock(return_value=time.time() + 10)):
        client.get('/')
        refresh_body = parse_qs(last_request['body'])
        assert refresh_body.get('refresh_token') == ['mock_refresh_token'], \
            "App should have tried to refresh credentials"
def insert_into_table(url):
    """Download the CSV at ``url`` and insert its rows into ``masini``.

    The delimiter is sniffed from the first 101 characters of the file. The
    first row is skipped; from every other row the first six columns are
    inserted (column 3 passed through ``text_extract``) together with the
    year derived from ``url``. All inserts are committed at the end.
    """
    # Download once and keep the bytes; the content is needed both for
    # sniffing the dialect and for reading the records.
    with urllib.request.urlopen(url) as data:
        response = data.read()
        encoding = data.headers.get_content_charset('utf-8')

    data_string = response.decode(encoding)
    dialect = csv.Sniffer().sniff(data_string[0:101])

    # Keep the line endings so the csv reader sees the content unchanged.
    reader = csv.reader(
        codecs.iterdecode(response.splitlines(True), 'utf-8'),
        delimiter=dialect.delimiter)
    year = year_extraction(url)

    sqlQuery = "INSERT INTO masini (judet, categorie_nationala, categorie_comunitara, marca, descriere_comerciala, total, an) VALUES (%s, %s, %s, %s, %s, %s, %s)"

    # The first row is not inserted.
    next(reader, None)
    for record in reader:
        mycursor.execute(
            sqlQuery,
            (record[0], record[1], record[2], text_extract(record[3]),
             record[4], record[5], year))

    mydb.commit()
def verify_encode(file_obj, encoding, blocks=1, chunk_size=4096):
    """
    Report whether the start of ``file_obj`` decodes with ``encoding``.

    The file is rewound and read in chunks of ``chunk_size`` bytes that are
    decoded incrementally. Decoding stops once ``blocks`` chunks have been
    accepted and one further chunk has been decoded, or at the end of the
    file, whichever comes first. Any exception raised on the way (bad data,
    unknown codec, ...) yields ``False``; otherwise ``True`` is returned.
    """
    file_obj.seek(0)
    read_chunk = functools.partial(file_obj.read, chunk_size)
    remaining = blocks

    try:
        decoded_chunks = codecs.iterdecode(iter(read_chunk, b""), encoding)
        for _ in decoded_chunks:
            if not remaining:
                break
            remaining -= 1
    except Exception:
        return False
    return True
def _get_job_version(self):
    """Set ``version`` on every record.

    Records with an archive take the value of the ``jobVersion`` key from
    the ``jobInfo.properties`` file inside their zip file. Records without
    an archive take the highest version among their children, or an empty
    string when no child has one.
    """
    for record in self:
        if record.archive:
            with record._get_zipfile() as zf:
                filename = 'jobInfo.properties'
                # INFO: can't use configparser because this file
                # has no section
                with zf.open(filename) as f:
                    reader = csv.reader(
                        codecs.iterdecode(f.readlines(), 'utf-8'),
                        delimiter='=',
                        escapechar='\\',
                        quoting=csv.QUOTE_NONE)
                    for row in reader:
                        # Blank lines parse to an empty row and lines
                        # without '=' to a single cell; neither can hold
                        # the version.
                        if len(row) > 1 and row[0] == 'jobVersion':
                            record.version = row[1]
        else:
            record.version = max(
                record._get_all_children().filtered('version').mapped(
                    'version'),
                default='')
def import_csv(url="https://cockroach-university-public.s3.amazonaws.com/10000row_json_column.csv"):
    """
    Downloads a CSV and returns a dictionary of key: value pairs.

    Inputs
    ------

    url (str): location of the CSV file. The file should be pipe (`|`)
        delimited and UTF-8 encoded, with the Primary Key (in string
        format) in the first column and the JSON data you want to add in
        the new column in the second. Defaults to the course's sample file.

    Returns
    -------

    dict mapping each row's first column to its second column.
    """
    with urllib.request.urlopen(url) as stream:
        rows = csv.reader(codecs.iterdecode(stream, 'utf-8'), delimiter='|')
        return {row[0]: row[1] for row in rows}
def parse_csv_file(fp_or_filename,
                   has_title=False,
                   has_header=False,
                   encoding='utf-8'):
    """Read the data rows of a comma-separated file.

    :param fp_or_filename: path of the file, or an iterable of encoded byte
        lines (for example a binary file object)
    :param has_title: skip one leading title row
    :param has_header: skip one header row (after the title row, if any)
    :param encoding: text encoding of the file
    :return: list of rows (each a list of strings); if the csv module
        reports a parsing error, the error message as a string instead
    """
    opened = None
    if isinstance(fp_or_filename, str):
        # We opened this file ourselves, so we are responsible for closing it.
        opened = open(fp_or_filename, newline='', encoding=encoding)
        lines = opened
    else:
        lines = codecs.iterdecode(fp_or_filename, encoding)

    try:
        reader = csv.reader(lines, delimiter=',')
        try:
            if has_title:
                next(reader)
            if has_header:
                next(reader)
            return list(reader)
        except csv.Error as e:
            _logger.error('CSV Loading error!')
            return str(e)
    finally:
        if opened is not None:
            opened.close()
def test_run(self, fetcher, response, file_num, *args, **kwargs):
    """Compare every CSV row produced by ``fetcher.run`` with the expected data."""
    # The decorator must not supply a response object here: producing the
    # responses is what this test exercises.
    assert response is None

    # validate settings
    assert fetcher.settings.start_date == utils.get_expected_start_date()

    expected_rows = utils.get_expected_data_files_as_csv(
        finra.source, file_num)

    for fetched in fetcher.run(show_progress=False):
        assert fetched is not None
        assert fetched.status_code == 200

        decoded_lines = codecs.iterdecode(
            fetched.iter_lines(), 'utf-8', errors="replace")
        for actual_row in csv.reader(decoded_lines):
            assert actual_row == next(expected_rows)
def main():
    """Download the iris CSV and insert every non-empty converted row."""
    url = "https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv"
    stream = urlopen(url)
    reader = csv.reader(codecs.iterdecode(stream, 'utf-8'))

    # Read and wrap the complete file first, then convert and insert.
    parsed_rows = [ExtendedList(raw).lst for raw in reader]

    for parsed in parsed_rows:
        values = tuple(ExtendedList.next_val(parsed))
        if values:
            insert_iris_list(values)

    print('Done!')
def read_whois(npc_name='list'):
    """Look up an NPC description in the CSV configured as ``WHOIS_CSV``.

    The first CSV column is the NPC name, the second its description. With
    ``npc_name`` equal to "list" (any case) the response lists all names,
    one per line; otherwise the name returned first by
    ``process.extractOne`` is used as title and its description as text.

    Returns a dict with the keys ``text`` and ``title``.
    """
    response = {'text': ''}
    npcs = {}

    url = config['WHOIS_CSV']
    with closing(requests.get(url, stream=True)) as r:
        decoded_lines = codecs.iterdecode(r.iter_lines(), 'utf-8')
        for row in csv.reader(decoded_lines, delimiter=',', quotechar='"'):
            npcs[row[0]] = row[1]

    if npc_name.lower() == 'list':
        response['title'] = 'List of NPCs'
        response['text'] += ''.join('\n' + name for name in npcs)
        return response

    best_name = process.extractOne(npc_name, list(npcs.keys()))[0]
    response['title'] = best_name
    response['text'] = npcs[best_name]
    return response
def read_text_resource(finput, encoding='utf-8', ignore_prefix='#'):
    """Yield the non-empty lines of a text resource, without comments

    Everything from the first occurrence of ``ignore_prefix`` to the end of
    a line is dropped, the rest is stripped of surrounding whitespace, and
    lines that end up empty are skipped.

    :param finput: path or file handle
    :type finput: str, file
    :param encoding: which encoding to use (default: UTF-8)
    :type encoding: str
    :param ignore_prefix: comment marker; pass None to keep lines whole
    :type ignore_prefix: str, unicode
    :rtype: generator
    """
    if isiterable(finput):
        # NOTE(review): presumably isiterable() is false for path strings
        # so that they reach codecs.open() below - confirm.
        ctx = joint_context(codecs.iterdecode(finput, encoding=encoding))
    else:
        ctx = codecs.open(finput, 'r', encoding=encoding)

    with ctx as fhandle:
        for raw_line in fhandle:
            if ignore_prefix is None:
                text = raw_line
            else:
                text = raw_line.split(ignore_prefix)[0]
            text = text.strip()
            if text:
                yield text