Example #1
0
    def test_filename(self):
        """
        Runs *is_file_string* on strings it is expected to accept and
        on strings it is expected to reject. The first string which gets
        the wrong answer is raised inside an exception.
        """
        fLOG(__file__, self._testMethodName,
             OutputPrint=__name__ == "__main__")

        # each entry: (expected answer, strings to check), checked in order
        expectations = [
            (True, ["c:\\test",
                    "c:\\test.txt",
                    "..",
                    ".txt",
                    "r.",
                    "r",
                    "mqldkfnqmodnsc/\\y"]),
            (False, ["h\ng",
                     "r\tr",
                     "cd:ggd.h"]),
        ]

        for expected, candidates in expectations:
            for candidate in candidates:
                if bool(is_file_string(candidate)) != expected:
                    raise Exception(candidate)
    def test_zip_to_df(self):
        """
        Calls *read_csv* on ``data/mynotebooks.zip`` (located next to this
        file) with a filter *fvalid* which returns False for
        ``bank-names.txt``, then checks the result is a dictionary with
        three entries and that entry ``bank-full.csv`` is a dataframe.
        """
        fLOG(__file__, self._testMethodName,
             OutputPrint=__name__ == "__main__")

        here = os.path.dirname(__file__)
        archive = os.path.join(os.path.abspath(here),
                               "data", "mynotebooks.zip")
        self.assertEqual(os.path.exists(archive), True)
        self.assertEqual(is_file_string(archive), True)

        def keep(member):
            # every name is accepted except this one
            return member != 'bank-names.txt'

        frames = read_csv(archive, encoding="utf8", fvalid=keep)
        assert isinstance(frames, dict)
        self.assertEqual(len(frames), 3)
        fLOG(list(frames.keys()))

        bank_full = frames["bank-full.csv"]
        assert isinstance(bank_full, pandas.DataFrame)
Example #3
0
    def __init__(self, tick, url="yahoo", folder="cache",
                 begin=None, end=None, sep=",",
                 intern=False):
        """
        Loads a stock price from a dataframe, a file, a url or a folder where the data was cached.
        If a filename ``<folder>/<tick>.<day1>.<day2>.txt`` already exists, it takes it from here.
        Otherwise, it downloads it.

        If *url* is ``'yahoo'``, the data is downloaded from
        ``http://ichart.finance.yahoo.com/table.csv``. If *url* is one of
        ``'google'``, ``'fred'``, ``'famafrench'``, the data is retrieved with
        ``pandas_datareader`` and stored in the cache folder.
        The CAC40 composition is described `here <http://fr.wikipedia.org/wiki/CAC_40>`_.

        @param      tick        tick name, ex ``BNP.PA``, or the name of an existing file
                                which is then read with ``pandas.read_csv``
        @param      url         ``'yahoo'``, ``'google'``, ``'fred'``, ``'famafrench'``
                                (the data is downloaded if it was not done before)
                                or a dataframe with a column ``Date`` which is used as is
        @param      folder      cache folder (created if it does not exist)
        @param      begin       first day (datetime), see below
        @param      end         last day (datetime), see below
        @param      sep         column separator
        @param      intern      do not use unless you know what to do (see :meth:`__getitem__ <pyensae.finance.astock.StockPrices.__getitem__>`),
                                if True, the data is neither sorted nor indexed by ``Date``

        If begin is None, the date will be 2000/01/03 (it seems Yahoo Finance does not provide
        prices for a date before this one).
        If end is None, the date will be the date of yesterday.

        The constructor raises ``Exception`` if the dataframe has no column ``Date``,
        if the file looks like an HTML or firewall page, if the cache folder or file
        cannot be written, if the download fails or if *url* is not supported.
        It raises ``StockPricesException`` if the loaded data has no column ``Date``.

        .. exref::
            :title: Compute the average returns and correlation matrix

            ::

                import pyensae, pandas
                from pyensae import StockPrices

                # download the CAC 40 composition from my website
                pyensae.download_data('cac40_2013_11_11.txt', website = 'xd')

                # download all the prices (if not already done) and store them into files
                actions = pandas.read_csv("cac40_2013_11_11.txt", sep = "\t")

                # we remove stocks with not enough historical data
                stocks = { k:StockPrices(tick = k) for k,v in actions.values  if k != "SOLB.PA"}
                dates = StockPrices.available_dates( stocks.values() )
                stocks = { k:v for k,v in stocks.items() if len(v.missing(dates)) <= 10 }
                print ("nb left", len(stocks))

                # we remove dates with missing prices
                dates = StockPrices.available_dates( stocks.values() )
                ok    = dates[ dates["missing"] == 0 ]
                print ("all dates before", len(dates), " after:" , len(ok))
                for k in stocks : stocks[k] = stocks[k].keep_dates(ok)

                # we compute correlation matrix and returns
                ret, cor = StockPrices.covariance(stocks.values(), cov = False, ret = True)

        You should also look at `pyensae et notebook <http://www.xavierdupre.fr/blog/notebooks/example%20pyensae.html>`_.
        If you use Google Finance as a provider, the tick name is usually
        prefixed by the market places (NASDAQ for example). The export
        does not work for all markets places.

        """
        if isinstance(url, pandas.DataFrame):
            # the prices are directly given as a dataframe
            self.datadf = url
            self.tickname = tick
            if "Date" not in url.columns:
                raise Exception(
                    "the dataframe does not contain any column 'Date': {0}".format(
                        ",".join(url.columns)))
        elif isinstance(tick, str) and is_file_string(tick) and os.path.exists(tick):
            # *tick* is an existing file, the first line is enough
            # to detect an HTML page stored instead of the data
            # NOTE(review): self.tickname is not set in this branch,
            # confirm nothing relies on it for prices loaded from a file
            with open(tick, "r") as f:
                first_line = f.readline()
            if first_line.startswith('<!DOCTYPE html PUBLIC'):
                raise Exception(
                    "pandas cannot parse the file, check your have access to internet: " + str(tick))
            try:
                self.datadf = pandas.read_csv(tick, sep=sep)
            except Exception as e:
                with open(tick, "r") as t:
                    content = t.read()
                if "Firewall Authentication" in content:
                    raise Exception(
                        "pandas cannot parse the file, check your have access to internet: " + str(tick)) from e
                raise
        else:
            if not os.path.exists(folder):
                try:
                    os.mkdir(folder)
                except PermissionError as e:
                    raise Exception(
                        "PermissionError, unable to create directory " +
                        folder +
                        ", check you execute the program in a folder you have permission to modify (" +
                        os.getcwd() +
                        ")") from e
            self.tickname = tick

            if begin is None:
                begin = datetime.datetime(2000, 1, 3)
            if end is None:
                # yesterday
                end = datetime.datetime.now() - datetime.timedelta(1)

            sbeg = begin.strftime("%Y-%m-%d")
            send = end.strftime("%Y-%m-%d")
            # name of the cached file, ':' is replaced in the file name
            name = os.path.join(
                folder,
                tick.replace(":", "_") + ".{0}.{1}.txt".format(sbeg, send))

            if not os.path.exists(name):
                if url == "yahoo":
                    # NOTE(review): this Yahoo endpoint may no longer be
                    # available, confirm before relying on it
                    url = "http://ichart.finance.yahoo.com/table.csv?s=%s&d={0}&e={1}&f={2}&g=d&a={3}&b={4}&c={5}&ignore=.csv".format(
                        end.month - 1, end.day, end.year,
                        begin.month - 1, begin.day, begin.year)
                    url = url % tick
                    use_url = True
                elif url in ("google", "fred", "famafrench"):
                    # 'yahoo' is already handled by the previous test
                    import pandas_datareader.data as web
                    df = web.DataReader(self.tickname, url,
                                        begin, end).reset_index(drop=False)
                    df.to_csv(name, sep=sep, index=False)
                    use_url = False
                else:
                    # str(url): the message must not fail if url is not a string
                    raise Exception(
                        "unable to download data from the following website" + str(tick) + " - " +
                        str(url))

                if use_url:
                    try:
                        with urllib.request.urlopen(url) as u:
                            text = u.read()
                    except urllib.error.HTTPError as e:
                        raise Exception(
                            "HTTPError, unable to load tick " + tick + "\nURL: " + url) from e

                    if len(text) < 10:
                        raise Exception("nothing to download for " + tick +
                                        " less than 10 downloaded bytes")

                    try:
                        # the file is closed even if the writing fails
                        with open(name, "wb") as f:
                            f.write(text)
                    except PermissionError as e:
                        raise Exception(
                            "PermissionError, unable to create directory " +
                            folder +
                            ", check you execute the program in a folder you have permission to modify (" +
                            os.getcwd() +
                            ")") from e

            try:
                self.datadf = pandas.read_csv(name, sep=sep)
            except Exception as e:
                # the file which failed is the cached file *name*,
                # *tick* is not a file in this branch
                with open(name, "r") as t:
                    content = t.read()
                if "Firewall Authentication" in content:
                    raise Exception(
                        "pandas cannot parse the file, check your have access to internet" + str(tick)) from e
                raise

        if not intern:
            try:
                self.datadf = self.datadf.sort_values("Date")
            except AttributeError:
                # fallback, presumably for pandas versions
                # which do not implement sort_values
                self.datadf = self.datadf.sort("Date")
            except KeyError as e:
                raise StockPricesException("schema: {}".format(
                    ",".join(self.datadf.columns))) from e
            # rows are numbered again, then indexed by date,
            # column Date is kept as a column too
            self.datadf.reset_index(drop=True, inplace=True)
            self.datadf.set_index("Date", drop=False, inplace=True)
Example #4
0
    def hive_submit(
        self, hive_file_or_query, params=None, redirection="redirection.hive", no_exception=True, fLOG=noLOG
    ):
        """
        Submits a Hive script or a Hive query. If *hive_file_or_query*
        is an existing local file, it is first uploaded under its base name
        and submitted with option ``-f``, otherwise it is
        considered as a query and submitted with option ``-e``.

        @param      hive_file_or_query  Hive script (existing local file) or Hive query
        @param      params              parameters to send to the job
                                        (formatted by ``build_command_line_parameters`` with prefix ``-hiveconf``)
        @param      redirection         prefix of the files which receive the standard output
                                        and error, None or empty for no redirection
        @param      no_exception        sent to @see me execute_command
        @param      fLOG                logging function
        @return                         out, err from @see me execute_command

        If *redirection* is not empty, the job is submitted in background,
        the standard output and error are
        redirected to ``<redirection>.out`` and ``<redirection>.err``.

        The function executes the command line::

            hive -f <filename>

        Or::

            hive -e <query>

        With redirection::

            hive -f <filename> 2> redirection.hive.err 1> redirection.hive.out &

        If there is no redirection, the function
        waits and return the output.

        .. exref::
            :title: Submit a HIVE query
            :tag: Hadoop

            ::

                client = ASSHClient()

                hive_sql = '''
                    DROP TABLE IF EXISTS bikes20;
                    CREATE TABLE bikes20 (sjson STRING);
                    LOAD DATA INPATH "/user/__USERNAME__/unittest2/paris*.txt" INTO TABLE bikes20;
                    SELECT * FROM bikes20 LIMIT 10;
                    '''.replace("__USERNAME__", client.username)

                out,err = client.hive_submit(hive_sql, redirection=None)

        .. versionadded:: 1.1
        """
        if is_file_string(hive_file_or_query) and os.path.exists(hive_file_or_query):
            dest = os.path.split(hive_file_or_query)[-1]
            self.upload(hive_file_or_query, dest)
            command = "-f"
        else:
            command = "-e"
            # the query is sent on a single line
            query = hive_file_or_query.replace("\n", " ").replace("\r", "").replace("\t", " ")
            # a backslash does not escape a quote inside a single-quoted
            # shell string: the string is closed, an escaped quote is
            # added, and the string is opened again ('\'')
            query = query.strip().replace("'", "'\\''")
            dest = "'{}'".format(query)

        sparams = ""
        if params is not None:
            sparams = ASSHClient.build_command_line_parameters(params, "-hiveconf")
            if len(sparams) > 0:
                sparams = " " + sparams

        if not redirection:
            # None or empty string: no redirection, the output is returned
            cmd = "hive {0} {1}{2}".format(command, dest, sparams)
        else:
            cmd = "hive {0} {1}{2} 2> {3}.err 1> {3}.out &".format(command, dest, sparams, redirection)

        warnings.warn("Hive submission is not tested. It will probably fail.")

        fLOG("[hive_submit]:", cmd)
        out, err = self.execute_command(cmd, no_exception=no_exception)
        return out, err
Example #5
0
    def __init__(self,
                 tick,
                 url="yahoo",
                 folder="cache",
                 begin=None,
                 end=None,
                 sep=",",
                 intern=False):
        """
        Loads a stock price from a dataframe, a file, a url or a folder where the data was cached.
        If a filename ``<folder>/<tick>.<day1>.<day2>.txt`` already exists, it takes it from here.
        Otherwise, it downloads it.

        If *url* is ``'yahoo'``, the data is downloaded from
        ``http://ichart.finance.yahoo.com/table.csv``. If *url* is one of
        ``'google'``, ``'fred'``, ``'famafrench'``, the data is retrieved with
        ``pandas_datareader`` and stored in the cache folder.
        The CAC40 composition is described `here <http://fr.wikipedia.org/wiki/CAC_40>`_.

        @param      tick        tick name, ex ``BNP.PA``, or the name of an existing file
                                which is then read with ``pandas.read_csv``
        @param      url         ``'yahoo'``, ``'google'``, ``'fred'``, ``'famafrench'``
                                (the data is downloaded if it was not done before)
                                or a dataframe with a column ``Date`` which is used as is
        @param      folder      cache folder (created if it does not exist)
        @param      begin       first day (datetime), see below
        @param      end         last day (datetime), see below
        @param      sep         column separator
        @param      intern      do not use unless you know what to do (see :meth:`__getitem__ <pyensae.finance.astock.StockPrices.__getitem__>`),
                                if True, the data is neither sorted nor indexed by ``Date``

        If begin is None, the date will be 2000/01/03 (it seems Yahoo Finance does not provide
        prices for a date before this one).
        If end is None, the date will be the date of yesterday.

        The constructor raises ``Exception`` if the dataframe has no column ``Date``,
        if the file looks like an HTML or firewall page, if the cache folder or file
        cannot be written, if the download fails or if *url* is not supported.
        It raises ``StockPricesException`` if the loaded data has no column ``Date``.

        .. exref::
            :title: Compute the average returns and correlation matrix

            ::

                import pyensae, pandas
                from pyensae import StockPrices

                # download the CAC 40 composition from my website
                pyensae.download_data('cac40_2013_11_11.txt', website = 'xd')

                # download all the prices (if not already done) and store them into files
                actions = pandas.read_csv("cac40_2013_11_11.txt", sep = "\t")

                # we remove stocks with not enough historical data
                stocks = { k:StockPrices(tick = k) for k,v in actions.values  if k != "SOLB.PA"}
                dates = StockPrices.available_dates( stocks.values() )
                stocks = { k:v for k,v in stocks.items() if len(v.missing(dates)) <= 10 }
                print ("nb left", len(stocks))

                # we remove dates with missing prices
                dates = StockPrices.available_dates( stocks.values() )
                ok    = dates[ dates["missing"] == 0 ]
                print ("all dates before", len(dates), " after:" , len(ok))
                for k in stocks : stocks[k] = stocks[k].keep_dates(ok)

                # we compute correlation matrix and returns
                ret, cor = StockPrices.covariance(stocks.values(), cov = False, ret = True)

        You should also look at `pyensae et notebook <http://www.xavierdupre.fr/blog/notebooks/example%20pyensae.html>`_.
        If you use Google Finance as a provider, the tick name is usually
        prefixed by the market places (NASDAQ for example). The export
        does not work for all markets places.

        """
        if isinstance(url, pandas.DataFrame):
            # the prices are directly given as a dataframe
            self.datadf = url
            self.tickname = tick
            if "Date" not in url.columns:
                raise Exception(
                    "the dataframe does not contain any column 'Date': {0}".
                    format(",".join(url.columns)))
        elif isinstance(tick,
                        str) and is_file_string(tick) and os.path.exists(tick):
            # *tick* is an existing file, the first line is enough
            # to detect an HTML page stored instead of the data
            # NOTE(review): self.tickname is not set in this branch,
            # confirm nothing relies on it for prices loaded from a file
            with open(tick, "r") as f:
                first_line = f.readline()
            if first_line.startswith('<!DOCTYPE html PUBLIC'):
                raise Exception(
                    "pandas cannot parse the file, check your have access to internet: "
                    + str(tick))
            try:
                self.datadf = pandas.read_csv(tick, sep=sep)
            except Exception as e:
                with open(tick, "r") as t:
                    content = t.read()
                if "Firewall Authentication" in content:
                    raise Exception(
                        "pandas cannot parse the file, check your have access to internet: "
                        + str(tick)) from e
                raise
        else:
            if not os.path.exists(folder):
                try:
                    os.mkdir(folder)
                except PermissionError as e:
                    raise Exception(
                        "PermissionError, unable to create directory " +
                        folder +
                        ", check you execute the program in a folder you have permission to modify ("
                        + os.getcwd() + ")") from e
            self.tickname = tick

            if begin is None:
                begin = datetime.datetime(2000, 1, 3)
            if end is None:
                # yesterday
                end = datetime.datetime.now() - datetime.timedelta(1)

            sbeg = begin.strftime("%Y-%m-%d")
            send = end.strftime("%Y-%m-%d")
            # name of the cached file, ':' is replaced in the file name
            name = os.path.join(
                folder,
                tick.replace(":", "_") + ".{0}.{1}.txt".format(sbeg, send))

            if not os.path.exists(name):
                if url == "yahoo":
                    # NOTE(review): this Yahoo endpoint may no longer be
                    # available, confirm before relying on it
                    url = "http://ichart.finance.yahoo.com/table.csv?s=%s&d={0}&e={1}&f={2}&g=d&a={3}&b={4}&c={5}&ignore=.csv".format(
                        end.month - 1, end.day, end.year, begin.month - 1,
                        begin.day, begin.year)
                    url = url % tick
                    use_url = True
                elif url in ("google", "fred", "famafrench"):
                    # 'yahoo' is already handled by the previous test
                    import pandas_datareader.data as web
                    df = web.DataReader(self.tickname, url, begin,
                                        end).reset_index(drop=False)
                    df.to_csv(name, sep=sep, index=False)
                    use_url = False
                else:
                    # str(url): the message must not fail if url is not a string
                    raise Exception(
                        "unable to download data from the following website" +
                        str(tick) + " - " + str(url))

                if use_url:
                    try:
                        with urllib.request.urlopen(url) as u:
                            text = u.read()
                    except urllib.error.HTTPError as e:
                        raise Exception("HTTPError, unable to load tick " +
                                        tick + "\nURL: " + url) from e

                    if len(text) < 10:
                        raise Exception("nothing to download for " + tick +
                                        " less than 10 downloaded bytes")

                    try:
                        # the file is closed even if the writing fails
                        with open(name, "wb") as f:
                            f.write(text)
                    except PermissionError as e:
                        raise Exception(
                            "PermissionError, unable to create directory " +
                            folder +
                            ", check you execute the program in a folder you have permission to modify ("
                            + os.getcwd() + ")") from e

            try:
                self.datadf = pandas.read_csv(name, sep=sep)
            except Exception as e:
                # the file which failed is the cached file *name*,
                # *tick* is not a file in this branch
                with open(name, "r") as t:
                    content = t.read()
                if "Firewall Authentication" in content:
                    raise Exception(
                        "pandas cannot parse the file, check your have access to internet"
                        + str(tick)) from e
                raise

        if not intern:
            try:
                self.datadf = self.datadf.sort_values("Date")
            except AttributeError:
                # fallback, presumably for pandas versions
                # which do not implement sort_values
                self.datadf = self.datadf.sort("Date")
            except KeyError as e:
                raise StockPricesException("schema: {}".format(",".join(
                    self.datadf.columns))) from e
            # rows are numbered again, then indexed by date,
            # column Date is kept as a column too
            self.datadf.reset_index(drop=True, inplace=True)
            self.datadf.set_index("Date", drop=False, inplace=True)