def _read_one_data(self, url, params):
    """Read and parse one Fama/French zip file.

    Returns a dict of DataFrames keyed by table number, plus a "DESCR"
    entry with a human-readable description of the tables.
    """
    params = {
        "index_col": 0,
        "parse_dates": [0],
        "date_parser": _parse_date_famafrench,
    }

    # headers in these files are not valid
    if self.symbols.endswith("_Breakpoints"):
        if self.symbols.find("-") > -1:
            c = ["<=0", ">0"]
        else:
            c = ["Count"]
        r = list(range(0, 105, 5))
        params["names"] = ["Date"] + c + list(zip(r, r[1:]))

        if self.symbols != "Prior_2-12_Breakpoints":
            params["skiprows"] = 1
        else:
            params["skiprows"] = 3

    doc_chunks, tables = [], []
    data = self._read_zipfile(url)

    # Short chunks are documentation text, long chunks are data tables.
    for chunk in data.split(2 * "\r\n"):
        if len(chunk) < 800:
            doc_chunks.append(chunk.replace("\r\n", " ").strip())
        else:
            tables.append(chunk)

    datasets, table_desc = {}, []
    for i, src in enumerate(tables):
        match = re.search(r"^\s*,", src, re.M)  # the table starts there
        start = 0 if not match else match.start()

        df = read_csv(StringIO("Date" + src[start:]), **params)
        try:
            idx_name = df.index.name  # hack for pandas 0.16.2
            df = df.to_period(df.index.inferred_freq[:1])
            df.index.name = idx_name
        except Exception:
            pass
        df = df.truncate(self.start, self.end)
        datasets[i] = df

        title = src[:start].replace("\r\n", " ").strip()
        shape = "({0} rows x {1} cols)".format(*df.shape)
        table_desc.append("{0} {1}".format(title, shape).strip())

    descr = "{0}\n{1}\n\n".format(
        self.symbols.replace("_", " "), len(self.symbols) * "-"
    )
    if doc_chunks:
        descr += " ".join(doc_chunks).replace(2 * " ", " ") + "\n\n"
    table_descr = map(lambda x: "{0:3} : {1}".format(*x), enumerate(table_desc))
    datasets["DESCR"] = descr + "\n".join(table_descr)

    return datasets

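
# --- Usage sketch (illustrative, not part of the reader) --------------------
# This parser is normally reached through the public DataReader entry point
# with the "famafrench" source. The dataset name and date range below are
# assumptions chosen for illustration; any name returned by
# pandas_datareader.famafrench.get_available_datasets() works the same way.
import pandas_datareader.data as web

ds = web.DataReader(
    "5_Industry_Portfolios", "famafrench", start="2015-01-01", end="2020-12-31"
)
print(ds["DESCR"])  # description assembled from the short doc chunks above
monthly = ds[0]     # first table parsed by _read_one_data
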
def read_all_boards(self):
    """Read all data from every board for every ticker"""
    markets_n_engines, boards = self._get_metadata()
    try:
        self.__markets_n_engines = markets_n_engines
        urls = self.url  # generate urls per symbols
        dfs = []  # an array of pandas dataframes per symbol to concatenate
        for i in range(len(urls)):
            out_list = []
            date_column = None
            while True:  # read in a loop with small date intervals
                if len(out_list) > 0:
                    if date_column is None:
                        date_column = out_list[0].split(";").index("TRADEDATE")

                    # get the last downloaded date
                    start_str = out_list[-1].split(";", 4)[date_column]
                    start = dt.datetime.strptime(start_str, "%Y-%m-%d").date()
                else:
                    start_str = self.start.strftime("%Y-%m-%d")
                    start = self.start

                if start > self.end or start > dt.date.today():
                    break

                params = self._get_params(start_str)
                strings_out = self._read_url_as_String(
                    urls[i], params
                ).splitlines()[2:]
                strings_out = list(filter(lambda x: x.strip(), strings_out))

                if len(out_list) == 0:
                    out_list = strings_out
                    if len(strings_out) < 101:  # all data received - break
                        break
                else:
                    out_list += strings_out[1:]  # remove a CSV head line
                    if len(strings_out) < 100:  # all data received - break
                        break

            if len(out_list) > 0:
                str_io = StringIO("\r\n".join(out_list))
                dfs.append(self._read_lines(str_io))  # add a new DataFrame
    finally:
        self.close()

    if len(dfs) == 0:
        raise IOError(
            "{} returned no data; "
            "check URL or correct a date".format(self.__class__.__name__)
        )
    elif len(dfs) > 1:
        b = concat(dfs, axis=0, join="outer", sort=True)
    else:
        b = dfs[0]
    return b

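
# --- Pattern sketch (illustrative) ------------------------------------------
# A standalone sketch of the pagination loop above: re-request from the last
# date already received, drop the repeated CSV header on follow-up pages, and
# stop once a page comes back shorter than the server's page size.
# fetch_page() is a hypothetical stand-in for _read_url_as_String() plus
# splitlines(); PAGE_SIZE mirrors the 100-row limit assumed by the loop.
PAGE_SIZE = 100

def fetch_all(fetch_page, first_start):
    rows, start = [], first_start
    while True:
        page = fetch_page(start)                # CSV header + up to PAGE_SIZE rows
        rows += page if not rows else page[1:]  # keep only one header line
        if len(page) < PAGE_SIZE + 1:           # short page: nothing left to fetch
            return rows
        date_col = rows[0].split(";").index("TRADEDATE")
        start = rows[-1].split(";")[date_col]   # resume from the last date seen
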
def read(self):
    """Read data"""
    try:
        self.__markets, self.__engines = self._get_metadata()
        urls = self.url  # generate urls per symbols
        dfs = []  # an array of pandas dataframes per symbol to concatenate
        for i in range(len(self.symbols)):
            out_list = []
            date_column = None
            while True:  # read in a loop with small date intervals
                if len(out_list) > 0:
                    if date_column is None:
                        date_column = out_list[0].split(";").index("TRADEDATE")

                    # get the last downloaded date
                    start_str = out_list[-1].split(";", 4)[date_column]
                    start = dt.datetime.strptime(start_str, "%Y-%m-%d").date()
                else:
                    start_str = self.start.strftime("%Y-%m-%d")
                    start = self.start

                if start >= self.end or start >= dt.date.today():
                    break

                params = self._get_params(start_str)
                strings_out = self._read_url_as_String(
                    urls[i], params
                ).splitlines()[2:]
                strings_out = list(filter(lambda x: x.strip(), strings_out))

                if len(out_list) == 0:
                    out_list = strings_out
                    if len(strings_out) < 101:  # all data received - break
                        break
                else:
                    out_list += strings_out[1:]  # remove a CSV head line
                    if len(strings_out) < 100:  # all data received - break
                        break

            str_io = StringIO("\r\n".join(out_list))
            dfs.append(self._read_lines(str_io))  # add a new DataFrame
    finally:
        self.close()

    if len(dfs) > 1:
        return concat(dfs, axis=0, join="outer", sort=True)
    else:
        return dfs[0]

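
# --- Usage sketch (illustrative, not part of the reader) --------------------
# The ticker and date range below are assumptions for illustration; only the
# MoexReader class itself comes from this module. The "moex" source name for
# the generic DataReader entry point is assumed to be registered as well.
import pandas_datareader.data as web
from pandas_datareader.moex import MoexReader

df = MoexReader("SBER", start="2020-01-01", end="2020-03-31").read()
# or, through the generic entry point:
df = web.DataReader("SBER", "moex", start="2020-01-01", end="2020-03-31")
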
def _read_url_as_StringIO(self, url, params=None):
    """
    Open url (and retry)
    """
    response = self._get_response(url, params=params)
    text = self._sanitize_response(response)
    out = StringIO()
    if len(text) == 0:
        service = self.__class__.__name__
        raise IOError(
            "{} request returned no data; check URL for invalid "
            "inputs: {}".format(service, self.url)
        )
    if isinstance(text, binary_type):
        out.write(bytes_to_str(text))
    else:
        out.write(text)
    out.seek(0)
    return out

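
# --- Pattern sketch (illustrative) ------------------------------------------
# The same bytes-vs-text handling in isolation: StringIO only accepts str, so
# byte payloads must be decoded before buffering and handing off to read_csv.
# The inline CSV payload below is made up for illustration.
from io import StringIO

import pandas as pd

def to_string_buffer(payload):
    if isinstance(payload, bytes):  # decode byte responses
        payload = payload.decode("utf-8")
    buf = StringIO()
    buf.write(payload)              # text responses pass through unchanged
    buf.seek(0)
    return buf

frame = pd.read_csv(to_string_buffer(b"a,b\r\n1,2\r\n3,4\r\n"))
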
def _download_nasdaq_symbols(timeout):
    """
    @param timeout: the time to wait for the FTP connection
    """
    try:
        ftp_session = FTP(_NASDAQ_FTP_SERVER, timeout=timeout)
        ftp_session.login()
    except all_errors as err:
        raise RemoteDataError(
            "Error connecting to %r: %s" % (_NASDAQ_FTP_SERVER, err)
        )

    lines = []
    try:
        ftp_session.retrlines("RETR " + _NASDAQ_TICKER_LOC, lines.append)
    except all_errors as err:
        raise RemoteDataError(
            "Error downloading from %r: %s" % (_NASDAQ_FTP_SERVER, err)
        )
    finally:
        ftp_session.close()

    # Sanity Checking
    if not lines[-1].startswith("File Creation Time:"):
        raise RemoteDataError("Missing expected footer. Found %r" % lines[-1])

    # Convert Y/N to True/False.
    converter_map = dict(
        (col, _bool_converter) for col, t in _TICKER_DTYPE if t is bool
    )

    # For pandas >= 0.20.0, the Python parser issues a warning if
    # both a converter and dtype are specified for the same column.
    # However, this measure is probably temporary until the read_csv
    # behavior is better formalized.
    with warnings.catch_warnings(record=True):
        data = read_csv(
            StringIO("\n".join(lines[:-1])),
            "|",
            dtype=_TICKER_DTYPE,
            converters=converter_map,
            index_col=1,
        )

    # Properly cast enumerations
    for cat in _CATEGORICAL:
        data[cat] = data[cat].astype("category")

    return data

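
# --- Usage sketch (illustrative, not part of the reader) --------------------
# The public wrapper around this helper is get_nasdaq_symbols(), which caches
# the parsed table. The timeout value and the "Security Name" lookup below are
# assumptions; column names follow the nasdaqtraded.txt header.
from pandas_datareader.nasdaq_trader import get_nasdaq_symbols

symbols = get_nasdaq_symbols(timeout=30)     # DataFrame indexed by ticker symbol
print(symbols.loc["AAPL", "Security Name"])
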
def _read(self):
    """Export the latest snapshot of the dataset and parse it as CSV."""
    snapshot_id = self.get_current_snapshot_id(self._dataset_id)
    exported_data = self.get_snapshot_export(snapshot_id)
    # TODO: Retry?
    decoded_data = exported_data.decode("utf-8")
    return pd.read_csv(StringIO(decoded_data))
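
# --- Usage sketch (illustrative, not part of the reader) --------------------
# Assumed construction of the surrounding Enigma reader: a dataset id plus an
# Enigma API key, both placeholders here. The public read() method is expected
# to delegate to _read() and return the parsed snapshot as a DataFrame.
import os

from pandas_datareader.enigma import EnigmaReader

reader = EnigmaReader(
    dataset_id="your-dataset-uuid",            # placeholder dataset id
    api_key=os.environ.get("ENIGMA_API_KEY"),  # placeholder key source
)
df = reader.read()
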