Example #1
    def _read_one_data(self, url, params):
        # Requires: import re; from io import StringIO; from pandas import read_csv
        # Note: the incoming ``params`` argument is immediately replaced;
        # Fama-French files always need these fixed CSV-parsing options.
        params = {
            "index_col": 0,
            "parse_dates": [0],
            "date_parser": _parse_date_famafrench,
        }

        # headers in these files are not valid
        if self.symbols.endswith("_Breakpoints"):

            if "-" in self.symbols:
                c = ["<=0", ">0"]
            else:
                c = ["Count"]
            r = list(range(0, 105, 5))
            # column labels: Date, the count/sign columns, then the twenty
            # percentile buckets (0, 5), (5, 10), ..., (95, 100)
            params["names"] = ["Date"] + c + list(zip(r, r[1:]))

            if self.symbols != "Prior_2-12_Breakpoints":
                params["skiprows"] = 1
            else:
                params["skiprows"] = 3

        doc_chunks, tables = [], []
        data = self._read_zipfile(url)

        # chunks shorter than 800 characters are treated as prose
        # documentation; longer ones as data tables
        for chunk in data.split(2 * "\r\n"):
            if len(chunk) < 800:
                doc_chunks.append(chunk.replace("\r\n", " ").strip())
            else:
                tables.append(chunk)

        datasets, table_desc = {}, []
        for i, src in enumerate(tables):
            match = re.search(r"^\s*,", src, re.M)  # the table starts there
            start = 0 if not match else match.start()

            df = read_csv(StringIO("Date" + src[start:]), **params)
            try:
                # convert to a PeriodIndex while preserving the index name
                # (older pandas, e.g. 0.16.2, dropped it in to_period)
                idx_name = df.index.name
                df = df.to_period(df.index.inferred_freq[:1])
                df.index.name = idx_name
            except Exception:
                pass
            df = df.truncate(self.start, self.end)
            datasets[i] = df

            title = src[:start].replace("\r\n", " ").strip()
            shape = "({0} rows x {1} cols)".format(*df.shape)
            table_desc.append("{0} {1}".format(title, shape).strip())

        descr = "{0}\n{1}\n\n".format(self.symbols.replace("_", " "),
                                      len(self.symbols) * "-")
        if doc_chunks:
            descr += " ".join(doc_chunks).replace(2 * " ", " ") + "\n\n"
        table_descr = map(lambda x: "{0:3} : {1}".format(*x),
                          enumerate(table_desc))
        datasets["DESCR"] = descr + "\n".join(table_descr)

        return datasets
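
A minimal usage sketch for context: this method runs under the hood when a
Fama-French dataset is requested through pandas-datareader's public API (the
dataset name and dates below are just an illustration).

    import pandas_datareader.data as web

    # Returns a dict of DataFrames keyed by table number, plus a "DESCR" string.
    ds = web.DataReader("5_Industry_Portfolios", "famafrench",
                        start="2015-01-01", end="2015-12-31")
    print(ds["DESCR"])   # description assembled from doc_chunks above
    ds[0].head()         # first table, truncated to the requested date range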
Example #2
    def read_all_boards(self):
        """Read all data from every board for every ticker"""

        markets_n_engines, boards = self._get_metadata()
        try:
            self.__markets_n_engines = markets_n_engines

            urls = self.url  # generate urls per symbols
            dfs = []  # an array of pandas dataframes per symbol to concatenate

            for i in range(len(urls)):
                out_list = []
                date_column = None

                while True:  # read in a loop with small date intervals
                    if len(out_list) > 0:
                        if date_column is None:
                            date_column = out_list[0].split(";").index("TRADEDATE")

                        # get the last downloaded date
                        start_str = out_list[-1].split(";", 4)[date_column]
                        start = dt.datetime.strptime(start_str, "%Y-%m-%d").date()
                    else:
                        start_str = self.start.strftime("%Y-%m-%d")
                        start = self.start

                    if start > self.end or start > dt.date.today():
                        break

                    params = self._get_params(start_str)
                    strings_out = self._read_url_as_String(
                        urls[i], params
                    ).splitlines()[2:]
                    strings_out = list(filter(lambda x: x.strip(), strings_out))

                    if len(out_list) == 0:
                        out_list = strings_out
                        if len(strings_out) < 101:  # all data received - break
                            break
                    else:
                        out_list += strings_out[1:]  # remove a CSV head line
                        if len(strings_out) < 100:  # all data received - break
                            break

                if len(out_list) > 0:
                    str_io = StringIO("\r\n".join(out_list))
                    dfs.append(self._read_lines(str_io))  # add a new DataFrame
        finally:
            self.close()

        if len(dfs) == 0:
            raise IOError(
                "{} returned no data; "
                "check URL or correct a date".format(self.__class__.__name__)
            )
        elif len(dfs) > 1:
            b = concat(dfs, axis=0, join="outer", sort=True)
        else:
            b = dfs[0]
        return b
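
A brief usage sketch (ticker and dates are illustrative), assuming this is
the all-boards variant of pandas-datareader's MoexReader:

    from pandas_datareader.moex import MoexReader

    rdr = MoexReader("SBER", start="2020-01-01", end="2020-03-31")
    df = rdr.read_all_boards()  # rows from every board the ticker trades on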
Example #3
    def read(self):
        """Read data"""

        try:
            self.__markets, self.__engines = self._get_metadata()
            urls = self.url  # generate urls per symbols
            dfs = []  # an array of pandas dataframes per symbol to concatenate

            for i in range(len(self.symbols)):
                out_list = []
                date_column = None

                while True:  # read in a loop with small date intervals
                    if len(out_list) > 0:
                        if date_column is None:
                            date_column = out_list[0].split(";").index(
                                "TRADEDATE")

                        # get the last downloaded date
                        start_str = out_list[-1].split(";", 4)[date_column]
                        start = dt.datetime.strptime(start_str,
                                                     "%Y-%m-%d").date()
                    else:
                        start_str = self.start.strftime("%Y-%m-%d")
                        start = self.start

                    if start >= self.end or start >= dt.date.today():
                        break

                    params = self._get_params(start_str)
                    strings_out = self._read_url_as_String(
                        urls[i], params).splitlines()[2:]
                    strings_out = list(filter(lambda x: x.strip(),
                                              strings_out))

                    if len(out_list) == 0:
                        out_list = strings_out
                        if len(strings_out) < 101:  # all data received - break
                            break
                    else:
                        out_list += strings_out[1:]  # remove a CSV head line
                        if len(strings_out) < 100:  # all data received - break
                            break
                str_io = StringIO("\r\n".join(out_list))
                dfs.append(self._read_lines(str_io))  # add a new DataFrame
        finally:
            self.close()

        if len(dfs) == 0:
            # guard against an empty result; dfs[0] would raise IndexError
            raise IOError(
                "{} returned no data; "
                "check URL or correct a date".format(self.__class__.__name__)
            )
        elif len(dfs) > 1:
            return concat(dfs, axis=0, join="outer", sort=True)
        else:
            return dfs[0]
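
Both MOEX readers use the same cursor-style pagination: the endpoint returns
at most 100 data rows per request, so the loop re-requests starting from the
last date already received until a short page signals the end. A condensed,
self-contained sketch of that pattern (fetch_page and PAGE_SIZE are
illustrative stand-ins, not part of the library):

    PAGE_SIZE = 100

    def fetch_all(fetch_page, start):
        # fetch_page(start) -> list: one header line + up to PAGE_SIZE data rows
        rows = []
        while True:
            page = fetch_page(start)
            rows += page if not rows else page[1:]  # keep the CSV header once
            if len(page) - 1 < PAGE_SIZE:           # short page: all data in
                return rows
            # resume from the last date seen (the real code locates the
            # TRADEDATE column instead of assuming it comes first)
            start = rows[-1].split(";")[0]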
Example #4
    def _read_url_as_StringIO(self, url, params=None):
        """
        Open url (and retry)
        """
        response = self._get_response(url, params=params)
        text = self._sanitize_response(response)
        out = StringIO()
        if len(text) == 0:
            service = self.__class__.__name__
            raise IOError("{} request returned no data; check URL for invalid "
                          "inputs: {}".format(service, self.url))
        if isinstance(text, binary_type):  # six compat alias for bytes
            out.write(bytes_to_str(text))  # decode before writing
        else:
            out.write(text)
        out.seek(0)  # rewind so callers can read from the start
        return out
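
The helper's job is to normalize whatever the transport layer returns, bytes
or text, into a rewound file-like buffer that read_csv can consume. The same
idea without the six compat shims (binary_type/bytes_to_str correspond to
bytes/bytes.decode here), as a standalone sketch:

    from io import StringIO

    def to_string_io(text):
        if isinstance(text, bytes):
            text = text.decode("utf-8")  # bytes_to_str equivalent; UTF-8 assumed
        if not text:
            raise IOError("request returned no data")
        return StringIO(text)  # StringIO(s) is already positioned at 0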
Example #5
def _download_nasdaq_symbols(timeout):
    """
    @param timeout: the time to wait for the FTP connection
    """
    # Requires: from ftplib import FTP, all_errors; import warnings;
    # from io import StringIO; from pandas import read_csv
    try:
        ftp_session = FTP(_NASDAQ_FTP_SERVER, timeout=timeout)
        ftp_session.login()
    except all_errors as err:
        raise RemoteDataError("Error connecting to %r: %s" %
                              (_NASDAQ_FTP_SERVER, err))

    lines = []
    try:
        ftp_session.retrlines("RETR " + _NASDAQ_TICKER_LOC, lines.append)
    except all_errors as err:
        raise RemoteDataError("Error downloading from %r: %s" %
                              (_NASDAQ_FTP_SERVER, err))
    finally:
        ftp_session.close()

    # Sanity Checking
    if not lines[-1].startswith("File Creation Time:"):
        raise RemoteDataError("Missing expected footer. Found %r" % lines[-1])

    # Convert Y/N to True/False.
    converter_map = {col: _bool_converter
                     for col, t in _TICKER_DTYPE if t is bool}

    # For pandas >= 0.20.0, the Python parser issues a warning if
    # both a converter and dtype are specified for the same column.
    # However, this measure is probably temporary until the read_csv
    # behavior is better formalized.
    with warnings.catch_warnings(record=True):
        data = read_csv(
            StringIO("\n".join(lines[:-1])),
            sep="|",
            dtype=_TICKER_DTYPE,
            converters=converter_map,
            index_col=1,
        )

    # Properly cast enumerations
    for cat in _CATEGORICAL:
        data[cat] = data[cat].astype("category")

    return data
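
For context, pandas-datareader exposes this routine through
get_nasdaq_symbols, which adds retries and caches the result for the
session; a short usage sketch (the ticker is illustrative):

    from pandas_datareader.nasdaq_trader import get_nasdaq_symbols

    symbols = get_nasdaq_symbols()  # DataFrame indexed by ticker symbol
    symbols.loc["IBM"]              # one row of listing metadata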
Example #6
    def _read(self):
        snapshot_id = self.get_current_snapshot_id(self._dataset_id)
        exported_data = self.get_snapshot_export(snapshot_id)  # TODO: Retry?
        decoded_data = exported_data.decode("utf-8")  # export arrives as bytes
        return pd.read_csv(StringIO(decoded_data))
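
The decode-then-parse step in isolation; the same pattern applies whenever an
API hands back CSV content as raw bytes (the function name is a placeholder):

    import pandas as pd
    from io import StringIO

    def csv_bytes_to_df(exported: bytes) -> pd.DataFrame:
        return pd.read_csv(StringIO(exported.decode("utf-8")))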