Example #1
    def ticker_history(self, start, end, ticker, info='quote', start_dic=None):
        """
        Gets and returns the historic prices for a given ticker between the
        start and end dates provided, inclusive.
        """

        start_str = start.strftime('%Y%m%d')
        end_str = end.strftime('%Y%m%d')

        # info = 'quote', 'dividend', 'split'
        try:
            data = yqd.load_yahoo_quote(ticker, start_str, end_str, info=info)
        except (HTTPError, URLError, gaierror) as e:
            LOGGER.info("Yahoo request failed. Blocked?")
            return []

        titles = tuple(t.replace(' ', '_').lower() for t in data[0].split(','))

        history = []
        for row in data[1:-1]:
            history_row = dict(start_dic) if start_dic else {}
            iter_list = row.split(',')

            for element, title in zip(iter_list, titles):
                converted = self.convert_yahoo_element(element)
                history_row[title] = converted
            history.append(history_row)
        return history
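
A minimal standalone sketch of the parsing step above, for reference: the header row is normalized into snake_case keys and each data row is zipped into a dict seeded with the shared base fields. Names here are illustrative, and the value conversion done by convert_yahoo_element is omitted.

def rows_to_dicts(lines, base=None):
    # Normalize the CSV header into snake_case dict keys.
    titles = tuple(t.replace(' ', '_').lower() for t in lines[0].split(','))
    out = []
    for line in lines[1:]:
        row = dict(base or {})            # start from the shared base fields
        row.update(zip(titles, line.split(',')))
        out.append(row)
    return out

# rows_to_dicts(["Date,Adj Close", "2018-01-02,10.5"], {"ticker": "XYZ"})
# -> [{'ticker': 'XYZ', 'date': '2018-01-02', 'adj_close': '10.5'}]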
Example #2
    def train(self):
        LOGGER.info("Starting to train...")
        
        ticker_names = get_ticker_names(self.db, "TSX")[:10]
        random.shuffle(ticker_names)

        sdate = datetime.today().date() - timedelta(days=2*365)
        edate = sdate + timedelta(days=365)

        data, targets = [], []
        for t in ticker_names:
            c = self.rc.calculate_return(t, sdate, edate)
            if c is None:
                continue

            d = self.ff.fetch(t, sdate, edate)
            d = list(itertools.chain.from_iterable(d))
            
            targets.append(c)
            data.append(d)

        train_tickers = ceil(len(targets) * self.training_perc)

        self.training_data = data[:train_tickers]
        self.training_target  = targets[:train_tickers]
        
        self.goal_data   = data[train_tickers:]
        self.goal_target = targets[train_tickers:]

        LOGGER.info("Starting to train...")
        print("TEST", self.training_data, self.training_target)
        self.lr.fit(self.training_data, self.training_target)
Example #3
    def binary_train(self):
        LOGGER.info("Starting to train...")

        train_data, train_targets, test_data, test_targets = self.fh.fetch_binary_feature_data(
        )
        train_tickers, test_tickers = self.fh.fetch_feature_tickers()

        scaler = StandardScaler()
        train_data = scaler.fit_transform(train_data)
        test_data = scaler.transform(test_data)  # scale with training-set statistics

        print("Shapes", train_data.shape, train_targets.shape, test_data.shape,
              test_targets.shape)

        self.mlpc.fit(train_data, train_targets)

        predictions = self.mlpc.predict(test_data)

        acc_score = accuracy_score(test_targets, predictions)
        roc_score = roc_auc_score(test_targets, predictions)

        print("Baseline accuracy (always True)",
              accuracy_score(test_targets, [True] * len(test_targets)))

        print("Accuracy Score", acc_score, 'ROC Score', roc_score)
        print("Average True Return", sum(train_targets) / len(train_targets))
Example #4
    def train(self):
        LOGGER.info("Starting to train...")

        train_data, train_targets, test_data, test_targets = self.fh.fetch_feature_data(
        )
        tickers = self.fh.fetch_feature_tickers()

        print("Shapes", train_data.shape, train_targets.shape, test_data.shape,
              test_targets.shape)

        features_names = self.fh.fetch_feature_names()
        self.kbest.fit(train_data, train_targets)

        feature_scores = self.kbest.scores_

        combined = [(s, f) for s, f in zip(feature_scores, features_names)
                    if not np.isnan(s)]
        best_features = sorted(combined, key=lambda x: -x[0])
        scores, names = zip(*best_features)

        self.graph(scores, names)
        print('Best Features', best_features[:5])
        print('Worst Features', best_features[-5:])
Example #5
    def get_historic_events(self):
        """
        Gets all the historical events from yahoo, updating only the new entries
        based on the date of the last fetch.
        """
        exchange = "TSX"
        listings = self.session.query(
            Listings.ticker,
            Listings.dateoflisting).filter(Listings.exchange == exchange)

        dict_fields = ["index", "action", "value"]
        fields = ["exchange", "ticker", "date", "action", "value"]
        total_listings = listings.count()

        for counter, (ticker, listdate) in enumerate(listings):
            lastdate, = self.session.query(func.max(
                EventHistory.updatedate)).filter(
                    EventHistory.exchange == exchange,
                    EventHistory.ticker == ticker).first()

            startdate = listdate if lastdate is None else lastdate + timedelta(
                days=1)

            rows = []
            if startdate < self.today:
                yahoo_ticker = ticker + ".TO"

                dividend_dict = self.ticker_history(startdate,
                                                    self.today,
                                                    yahoo_ticker,
                                                    info='dividend')
                split_dict = self.ticker_history(startdate,
                                                 self.today,
                                                 yahoo_ticker,
                                                 info='split')
                for row in dividend_dict:
                    rows.append([
                        exchange, ticker, row["date"], "DIVIDEND",
                        row["dividends"], self.today
                    ])
                for row in split_dict:
                    rows.append([
                        exchange, ticker, row["date"], "SPLIT",
                        row["stock_splits"], self.today
                    ])

            if rows:
                LOGGER.info("{}/{} Inserting {} from {} to {}".format(
                    counter + 1, total_listings, ticker, startdate,
                    self.today))
                stmt = insert(EventHistory).values(
                    rows).on_conflict_do_nothing(
                        constraint='event_history_pkey')
                self.session.execute(stmt)
                self.session.commit()
            else:
                LOGGER.info("{}/{} Skipping ticker {}".format(
                    counter + 1, total_listings, ticker))
Example #6
    def dic_parse(self, session, url, html):
        def innerHtml(ele):
            return ele.decode_contents(formatter="html")

        soup = BeautifulSoup(html, "lxml")
        ticker = self.url_ticker_pat.search(url).group(1)
        exchange = "TSX"

        on_yahoo = soup.find('section', attrs={'data-test': 'lookup-page'
                                               }) is None
        session.query(Listings).filter(Listings.exchange == exchange,
                                       Listings.ticker == ticker).update(
                                           {Listings.onyahoo: on_yahoo})

        if not on_yahoo:  # if quote not found, exit
            LOGGER.error("Failed to find quote for {} skipping".format(url))
            return

        div_test = soup.find('section', attrs={'data-test': 'qsp-statistics'})
        if div_test is None:
            LOGGER.error("Unknown error for {} skipping".format(url))
            return

        db_dic = {}
        for table in div_test.find_all('table'):
            for row in table.find_all('tr'):
                td_list = row.find_all('td')
                title = innerHtml(td_list[0].find('span'))
                val = innerHtml(td_list[1]) if td_list[1].find(
                    'span') is None else innerHtml(td_list[1].find('span'))
                if title in self.y_to_db_map:
                    db_dic[self.y_to_db_map[title]] = self.parse_numeric(val)

        if db_dic:
            db_dic["ticker"] = ticker
            db_dic["exchange"] = exchange
            exists = session.query(KeyStatistics).filter_by(
                **db_dic).scalar() is not None

            if exists:
                LOGGER.info("Skipping {} due to prior existence".format(url))
            else:
                db_dic["update_date"] = self.today

                stmt = insert(KeyStatistics).values(
                    db_dic).on_conflict_do_nothing(
                        constraint='key_statistics_pkey', )
                session.execute(stmt)
                session.commit()

                LOGGER.info("Done parsing {}".format(url))
        else:
            LOGGER.info("Skipping {}".format(url))
Example #7
    def fetch_all(self, exchange):
        extension = '.TO'
        listings = self.session.query(Listings.ticker).filter(
            Listings.exchange == exchange,
            or_(Listings.onyahoo == True, Listings.onyahoo.is_(None)))
        tickers = tuple(x.ticker + extension for x in listings)

        ticker_groups = self.chunks(tickers, 200)

        LOGGER.info("Fetching/Updating {} urls.".format(len(ticker_groups)))

        for ticker_group in ticker_groups:
            url = self.create_url(ticker_group)
            self.handle_url(ticker_group, url, exchange)
            sleep(1) # limit requests to 1/s
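
The chunks helper used above is not shown in these examples; a plausible implementation (an assumption, not the original) splits the ticker tuple into consecutive fixed-size groups:

def chunks(seq, size):
    # Split a sequence into consecutive groups of at most `size` elements.
    return [seq[i:i + size] for i in range(0, len(seq), size)]

# chunks(('A.TO', 'B.TO', 'C.TO'), 2) -> [('A.TO', 'B.TO'), ('C.TO',)]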
Example #8
    def dic_parse(self, db, url, html):
        def innerHtml(ele):
            return ele.decode_contents(formatter="html")

        soup = BeautifulSoup(html, "lxml")
        ticker = self.url_ticker_pat.search(url).group(1)
        exchange = "TSX"

        on_yahoo = soup.find('div', attrs={'data-test': 'unknown-quote'
                                           }) is None
        db.update("listings", ["onyahoo"], [on_yahoo],
                  "exchange=%s AND ticker=%s", [exchange, ticker])

        if not on_yahoo:  # if quote not found, exit
            LOGGER.error("Failed to find quote for", url, "skipping")
            return

        div_test = soup.find('div', attrs={'data-test': 'qsp-statistics'})
        if div_test is None:
            LOGGER.error("Unknown error for", url, "skipping")
            return

        db_dic = {}
        for table in div_test.find_all('table'):
            for row in table.find_all('tr'):
                td_list = row.find_all('td')
                title = innerHtml(td_list[0].find('span'))
                val = innerHtml(td_list[1]) if td_list[1].find(
                    'span') is None else innerHtml(td_list[1].find('span'))
                if title in self.y_to_db_map:
                    db_dic[self.y_to_db_map[title]] = self.parse_numeric(val)

        if db_dic:
            db_dic["ticker"] = ticker
            db_dic["exchange"] = exchange
            col_names, vals = list(db_dic.keys()), list(db_dic.values())
            where = db.create_conditional_string(col_names)
            if db.exists("key_statistics", where, vals):
                LOGGER.info("Skipping {} due to prior existence".format(url))
            else:
                col_names.append("update_date")
                vals.append(self.today)
                db.insert_into("key_statistics",
                               col_names,
                               vals,
                               multiple=False)
                LOGGER.info("Done parsing {}".format(url))
        else:
            LOGGER.info("Skipping {}".format(url))
Example #9
    def binary_train(self):
        LOGGER.info("Starting to train...")

        train_data, train_targets, test_data, test_targets = self.fh.fetch_binary_feature_data()
        train_tickers, test_tickers = self.fh.fetch_feature_tickers()

        print("Shapes", train_data.shape, train_targets.shape, test_data.shape, test_targets.shape)

        self.lc.fit(train_data, train_targets)

        predictions = self.lc.predict(test_data)

        acc_score = accuracy_score(test_targets, predictions)
        roc_score = roc_auc_score(test_targets, predictions)
        print("Accuracy Score", acc_score, 'ROC Score', roc_score)
        print("Average True Return", sum(train_targets) / len(train_targets), sum(test_targets) / len(test_targets))
Example #10
    def get_historic_prices(self):
        """
        Gets all the historical prices from yahoo, updating only the new entries
        based on the date of the last fetch.
        """

        exchange = "TSX"

        listings = list(
            self.session.query(
                Listings.ticker,
                Listings.dateoflisting).filter(Listings.exchange == exchange))
        total_listings = len(listings)

        for counter, (ticker, listdate) in enumerate(listings):
            lastdate, = self.session.query(func.max(PriceHistory.date)).filter(
                PriceHistory.exchange == exchange,
                PriceHistory.ticker == ticker).first()

            startdate = listdate if lastdate is None else lastdate + timedelta(
                days=1)

            his_dict = []
            if startdate < self.today:
                yahoo_ticker = ticker + ".TO"
                start_dic = {"exchange": exchange, "ticker": ticker}
                his_dict = self.ticker_history(startdate,
                                               self.today,
                                               yahoo_ticker,
                                               info="quote",
                                               start_dic=start_dic)

            if his_dict:
                LOGGER.info("{}/{} Inserting {} from {} to {}".format(
                    counter, total_listings, ticker, startdate, self.today))

                for d in his_dict:
                    stmt = insert(PriceHistory).values(
                        d).on_conflict_do_update(
                            constraint='price_history_pkey', set_=d)
                    self.session.execute(stmt)

                self.session.commit()
            else:
                LOGGER.info("{}/{} Skipping ticker {}".format(
                    counter, total_listings, ticker))
Example #11
def get_html(urlQ, callback, xpath_hooks):
    """
    This page takes a url from the URL Queue (urlQ) and
    calls a callbac that will handle the page source.

    xpage_hooks is a list used to determine when the page is loaded,
    see the docs for more details (e.g. ["//div[@data-test='whatever']"] ).
    """
    svr = webkit_server.Server()
    svrconn = webkit_server.ServerConnection(server=svr)
    driver = dryscrape.driver.webkit.Driver(connection=svrconn)

    sess = dryscrape.Session(driver=driver)
    sess.set_header(
        "User-Agent",
        "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36"
    )
    sess.set_attribute("auto_load_images", False)

    valid_page_func = lambda: any(
        sess.at_xpath(xpath) for xpath in xpath_hooks)
    session = Session()

    while not urlQ.empty():
        url = urlQ.get()

        try:
            sess.visit(url)
        except webkit_server.InvalidResponseError:
            LOGGER.error(
                "Got invalid response from something? Skipping {}".format(url))
            continue

        try:
            sess.wait_for(valid_page_func, interval=1, timeout=15)
        except dryscrape.mixins.WaitTimeoutError:
            LOGGER.error("Timeout so skipping {}".format(url))
            continue

        response = sess.body()
        callback(session, url, response)
        sess.reset()

    svr.kill()
    session.close()
Example #12
    def train(self):
        LOGGER.info("Starting to train...")

        train_data, train_targets, test_data, test_targets = self.fh.fetch_feature_data(
        )
        tickers = self.fh.fetch_feature_tickers()

        print("Shapes", train_data.shape, train_targets.shape, test_data.shape,
              test_targets.shape)

        self.svm.fit(train_data, train_targets)

        predictions = self.svm.predict(test_data)

        mean_err = mean_absolute_error(test_targets, predictions)
        mean_s_err = mean_squared_error(test_targets, predictions)
        print("Got Mean error", mean_err, "Squared Err", mean_s_err)
        print("Average Expected Return", sum(predictions) / len(predictions))
        print("Average True Return", sum(train_targets) / len(train_targets))
Example #13
    def fetch_all(self, exchange):
        q = self.session.query(Listings).filter(
            Listings.exchange == exchange,
            or_(Listings.onyahoo == True, Listings.onyahoo.is_(None)))

        extension = '.TO'
        urls = [
            "https://ca.finance.yahoo.com/quote/{}{}/key-statistics".format(
                l.ticker, extension) for l in q
        ]

        xpath_hooks = [
            "//section[@data-test='qsp-statistics']",
            "//section[@data-test='lookup-page']"
        ]

        LOGGER.info("Fetching/Updating {} urls.".format(len(urls)))

        jsps = JSPageScraper(self.dic_parse, xpath_hooks, "key_statistics")
        jsps.go(urls)
Example #14
    def convert_yahoo_element(self, element):
        converted = None
        try:
            converted = float(element)
        except ValueError:
            try:
                converted = datetime.strptime(element, "%Y-%m-%d")
            except ValueError:
                if element == 'null':
                    converted = None
                elif '/' in element:
                    try:
                        a, b = element.split('/')
                        converted = float(a) / float(b)
                    except ValueError:
                        LOGGER.info("Unable to convert {}".format(element))
                else:
                    LOGGER.info("Unable to convert {}".format(element))

        return converted
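
The converter handles four cases: plain floats, ISO dates, Yahoo's literal 'null', and split ratios written as fractions; anything else is logged and returned as None. Hypothetical calls (instance name is illustrative) and their results:

# fetcher.convert_yahoo_element("12.50")       -> 12.5
# fetcher.convert_yahoo_element("2018-03-01")  -> datetime(2018, 3, 1, 0, 0)
# fetcher.convert_yahoo_element("null")        -> None
# fetcher.convert_yahoo_element("4/1")         -> 4.0   (a 4-for-1 split ratio)
# fetcher.convert_yahoo_element("N/A")         -> None  (fails float(), logged)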
Example #15
    def go(self, urls):
        LOGGER.info("Preparing threads...")

        manager = Manager()
        urlQ = manager.Queue()
        for url in urls:
            urlQ.put(url)

        procs = [
            Process(target=get_html,
                    args=(urlQ, self.callback, self.xpath_hooks),
                    daemon=True) for i in range(self.nproc)
        ]

        LOGGER.info("Threads started. Fetching n' parsing!")
        for proc in procs:
            proc.start()

        for proc in procs:
            proc.join()
Example #16
    def get_historic_prices(self):
        """
        Gets all the historical prices from yahoo, updating only the new entries
        based on the date of the last fetch.
        """

        exchange = "TSX"
        listings = self.db.select("ticker, dateoflisting",
                                  "listings",
                                  where="exchange = %s",
                                  vals=[exchange])
        dict_fields = ["Adj Close", "High", "Close", "Open", "Low", "Date"]
        fields = ["exchange", "ticker"] + [x.lower() for x in dict_fields]

        total_listings = len(listings)
        for counter, (ticker, listdate) in enumerate(listings):
            lastdate = self.db.select("MAX(date)",
                                      "price_history",
                                      fetch="one",
                                      where="exchange = %s AND ticker = %s",
                                      vals=[exchange, ticker],
                                      unroll=True)
            startdate = listdate if lastdate is None else lastdate + timedelta(
                days=1)

            his_dict = []
            if startdate < self.today:
                yahoo_ticker = ticker + ".TO"
                his_dict = self.ticker_price_history(startdate, self.today,
                                                     yahoo_ticker)

            if his_dict:
                LOGGER.info("{}/{} Inserting {} from {} to {}".format(
                    counter, total_listings, ticker, startdate, self.today))
                rows = [[exchange, ticker] + [row[k] for k in dict_fields]
                        for row in his_dict]
                self.db.insert_into("price_history", fields, rows)
            else:
                LOGGER.info("{}/{} Skipping ticker {}".format(
                    counter, total_listings, ticker))
Example #17
    def get_key_stats(self, ticker, db_exchange="TSX"):
        """
        This function gets key statistics from
        Morning Star.
        """
        url = ("http://financials.morningstar.com/ajax/exportKR2CSV.html?t={}&"
               "culture=en-CA&region=CAN&order=asc&r={}").format(
                   ticker, randint(1, 500000))
        req = urllib.request.Request(url, headers=self.headers)
        resp = urllib.request.urlopen(req)
        csv_r = csv.reader(codecs.iterdecode(resp, 'utf-8'))

        on_morningstar = csv_r and resp.headers['content-length'] != '0'
        if on_morningstar:
            LOGGER.info("Getting key statistics for {}... ".format(ticker))
        else:
            LOGGER.info("Skipping", ticker)
            return 1

        col_names, vals = self.parse_csv(
            csv_r,
            10,
            self.special_key_titles,
            self.column_key_map,
            extra_cols=["ticker", "exchange", "update_date"],
            extra_vals=[ticker, db_exchange, self.today])

        self.db.insert_into("ms_key_statistics",
                            col_names,
                            vals,
                            unique_conflict=True)
        LOGGER.info("Done")
        return 0
Example #18
    def get_financial(self, ticker, period_name, exchange="XTSE"):
        """
        This function gets yearly and quarterly information from
        Morning Star.

        period_name: "quarter" or "annual"
        exchanges: XTSE (TSX),
        """

        # this converts the morning star exchange name to our database name
        if exchange in self.exchange_map:
            db_exchange = self.exchange_map[exchange]
        else:
            raise ValueError("Exchange unsupported {}".format(exchange))

        period = 3 if period_name == "quarter" else 12

        url = (
            "http://financials.morningstar.com/ajax/ReportProcess4CSV.html?&t="
            "{}:{}&region=can&culture=en-US&cur=&reportType=is&period={}&"
            "dataType=A&order=desc&columnYear=5&curYearPart=1st5year&"
            "rounding=1&view=raw&r={}&denominatorView=raw&number=1").format(
                exchange, ticker, period, randint(1, 500000))
        req = urllib.request.Request(url, headers=self.headers)

        resp = urllib.request.urlopen(req)
        csv_r = csv.reader(codecs.iterdecode(resp, 'utf-8'))

        on_morningstar = csv_r and resp.headers['content-length'] != '0'

        if on_morningstar:
            LOGGER.info("Getting {} financial data for {}... ".format(
                period_name, ticker))
        else:
            LOGGER.info("Skipping {}".format(ticker))
            return 1

        num_cols = 6 if period_name == "quarter" else 5  # skip last column if not quarter view (removes TTM)
        return_dics = self.parse_csv(csv_r,
                                     num_cols,
                                     self.special_financials_titles,
                                     self.column_financials_map,
                                     start_dic={
                                         "ticker": ticker,
                                         "exchange": db_exchange,
                                         "period": period,
                                         "update_date": self.today
                                     })

        for d in return_dics:
            stmt = insert(MorningStarFinancials).values(
                d).on_conflict_do_update(constraint='fiscal_year_unique',
                                         set_=d)
            self.session.execute(stmt)

        self.session.commit()

        LOGGER.info("Done")
        return 0
Example #19
    def screen_and_save_feature_data(self):
        train_ticker_names, test_ticker_names = self.fetch_feature_tickers()
        train_data, train_targets, test_data, test_targets = self.fetch_feature_data(
        )

        tickers = set(find_small_cap_tickers(
            self.sess))  # finds ticker < 10m value
        train_rm_indexes = []
        for i, (ticker,
                target) in enumerate(zip(train_ticker_names, train_targets)):
            if target > 10 or ticker in tickers:
                train_rm_indexes.append(i)
        test_rm_indexes = []
        for i, (ticker,
                target) in enumerate(zip(test_ticker_names, test_targets)):
            if target > 10 or ticker in tickers:
                test_rm_indexes.append(i)

        train_ticker_names = np.delete(train_ticker_names,
                                       train_rm_indexes,
                                       axis=0)
        train_data = np.delete(train_data, train_rm_indexes, axis=0)
        train_targets = np.delete(train_targets, train_rm_indexes, axis=0)
        test_ticker_names = np.delete(test_ticker_names,
                                      test_rm_indexes,
                                      axis=0)
        test_data = np.delete(test_data, test_rm_indexes, axis=0)
        test_targets = np.delete(test_targets, test_rm_indexes, axis=0)

        LOGGER.info("Saving file at: {}".format(self.file_path))

        np.savez(self.file_path,
                 train_data=train_data,
                 train_targets=train_targets,
                 train_ticker_names=train_ticker_names,
                 test_data=test_data,
                 test_targets=test_targets,
                 test_ticker_names=test_ticker_names)
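
The screening relies on passing the same index list to np.delete for every parallel array so tickers, features, and targets stay aligned. A small self-contained illustration with made-up values:

import numpy as np

tickers = np.array(["AAA", "BBB", "CCC"])
targets = np.array([0.5, 12.0, 1.1])
rm = [1]                                    # e.g. the row whose target > 10
tickers = np.delete(tickers, rm, axis=0)    # -> ['AAA' 'CCC']
targets = np.delete(targets, rm, axis=0)    # -> [0.5 1.1]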
Example #20
    def handle_url(self, tickers, url, exchange):
        """
        Fetches the url and inserts the data into the appropriate cols in the DB.
        """
        LOGGER.info("Starting to add url: {} ...".format(url))

        req = urllib.request.Request(url, headers=self.headers)
        resp = urllib.request.urlopen(req)
        csv_r = csv.reader(codecs.iterdecode(resp, 'utf-8'))

        db_list = []
        for row, ticker in zip(csv_r, tickers):
            assert(len(row) == len(self.url_flags))

            db_dic = {db_col: self.handle_csv_string(cell) for cell, db_col in zip(row, self.db_entries)}

            onyahoo = any(v is not None for v in db_dic.values())

            self.session.query(Listings).filter(Listings.exchange == exchange,
                                                Listings.ticker == ticker
            ).update({Listings.onyahoo: onyahoo})

            if not onyahoo: # not found, skip
                LOGGER.error("Failed to find quote for {} skipping".format(ticker))
                continue

            db_dic["ticker"] = ticker
            db_dic["exchange"] = exchange

            exists = self.session.query(YahooKeyStatistics).filter_by(**db_dic).scalar() is not None
            if exists:
                LOGGER.info("Skipping {} due to prior existence".format(ticker))
                continue

            db_dic["update_date"] = self.today

            # Annoyingly enough, sqlalchemy doesn't allow PostgreSQL bulk inserts
            # when checking constraints, RIP performance
            stmt = insert(YahooKeyStatistics).values(db_dic).on_conflict_do_nothing(
                constraint = 'yahoo_key_statistics_pkey',
            )
            self.session.execute(stmt)
        self.session.commit()

        LOGGER.info("Done url.")
Example #21
    def get_key_stats(self, ticker, db_exchange="TSX"):
        """
        This function gets key statistics from
        Morning Star.
        """
        url = ("http://financials.morningstar.com/ajax/exportKR2CSV.html?t={}&"
               "culture=en-CA&region=CAN&order=asc&r={}").format(
                   ticker, randint(1, 500000))
        req = urllib.request.Request(url, headers=self.headers)
        resp = urllib.request.urlopen(req)
        csv_r = csv.reader(codecs.iterdecode(resp, 'utf-8'))

        on_morningstar = csv_r and resp.headers['content-length'] != '0'
        if on_morningstar:
            LOGGER.info("Getting key statistics for {}... ".format(ticker))
        else:
            LOGGER.info("Skipping {}".format(ticker))
            return 1

        return_dics = self.parse_csv(csv_r,
                                     10,
                                     self.special_key_titles,
                                     self.column_key_map,
                                     start_dic={
                                         "ticker": ticker,
                                         "exchange": db_exchange,
                                         "update_date": self.today
                                     })

        for d in return_dics:
            stmt = insert(MorningStarKeyStatistics).values(
                d).on_conflict_do_update(constraint='ms_key_statistics_pkey',
                                         set_=d)
            self.session.execute(stmt)

        self.session.commit()

        LOGGER.info("Done")
        return 0
Example #22
    def generate_and_save_feature_data(self, independent=True):
        rc = ReturnCalculator()

        ticker_names = sorted(get_ms_ticker_names(self.sess, "TSX"))
        num_tickers = len(ticker_names)

        train_data, train_targets = [], []
        train_ticker_names = []
        test_data, test_targets = [], []
        test_ticker_names = []

        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

        for i, t in enumerate(ticker_names, 1):
            LOGGER.info("[{:d}/{:d}] Working on {}...".format(
                i, num_tickers, t))

            dates = self.ff.ms_key_stats_date(self.sess, t)

            if len(dates) < 1:
                continue

            date_gap = dates[1] - dates[0] if len(dates) > 2 else timedelta(
                days=365)
            last_date = dates[-1]

            rows = self.ff.ms_key_stats_data(self.sess, t)

            if not independent:
                # Window sliding for time series
                empty_row = (None, ) * len(rows[0])
                new_rows = []
                for j in range(len(rows)):
                    first_part = rows[j - 1] if j > 0 else empty_row
                    second_part = rows[j]
                    new_rows.append(first_part + second_part)
                rows = new_rows

            # Add the start date to the list of dates
            return_dates = [dates[0] - date_gap] + dates

            returns = rc.calculate_return_between_dates(t, return_dates)
            for row, date, ret in zip(rows, dates, returns):
                if ret is None:  # if return date are out of range
                    continue

                if date == last_date:
                    test_data.append(row)
                    test_targets.append(ret)
                    test_ticker_names.append(t)
                else:
                    train_data.append(row)
                    train_targets.append(ret)
                    train_ticker_names.append(t)

        # Convert the python lists to numpy arrays and fill missing values
        train_data = np.array(train_data, dtype=float)
        imp = imp.fit(train_data)

        train_ticker_names = np.array(train_ticker_names, dtype=str)
        train_data = imp.transform(train_data)
        train_targets = np.array(train_targets, dtype=float)
        test_ticker_names = np.array(test_ticker_names, dtype=str)
        test_data = imp.transform(np.array(test_data, dtype=float))
        test_targets = np.array(test_targets, dtype=float)

        if not os.path.exists(self.dir_path):
            os.makedirs(self.dir_path)

        LOGGER.info("Saving file at: {}".format(self.file_path))

        np.savez(self.file_path,
                 train_data=train_data,
                 train_targets=train_targets,
                 train_ticker_names=train_ticker_names,
                 test_data=test_data,
                 test_targets=test_targets,
                 test_ticker_names=test_ticker_names)
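
The window-sliding branch above pairs each feature row with the previous period's row (padding the first period with Nones) so a sample sees two consecutive reporting periods. A standalone sketch of that step, with illustrative names:

def slide_windows(rows):
    # Concatenate each row with its predecessor; the first row gets None padding.
    empty = (None,) * len(rows[0])
    return [(rows[j - 1] if j > 0 else empty) + rows[j] for j in range(len(rows))]

# slide_windows([(1, 2), (3, 4)]) -> [(None, None, 1, 2), (1, 2, 3, 4)]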
Example #23
 def dic_parse(db, url, html):
     LOGGER.notice("Got url {} and html with length {}".format(
         url, len(html)))
Example #24
 def dic_parse(db, url, html):
     LOGGER.notice("Got url", url, "and html with length", len(html))
Example #25
    def get_quotes(self):
        """
        This function gets the tickers and various other random information
        from the TSX website from a hardcoded file and inserts it into the database
        """
        recent_date = self.db.select("MAX(updatedate)",
                                     "listings",
                                     fetch="one",
                                     unroll=True)

        if self.url.startswith("http"):
            req = create_url_request(self.url)
            self.url = urllib.request.urlopen(req)

        sheet = pd.read_excel(self.url,
                              skiprows=5,
                              header=1,
                              keep_default_na=False)
        sheet.fillna('', inplace=True)
        sheet.rename(columns=self.cleanse_str, inplace=True)

        file_date = self.find_date_in_list(list(sheet.columns.values))

        if recent_date is None or (file_date > recent_date):
            xlsx_dict = sheet.to_dict(orient="records")
            recent_date = file_date
            if self.cache_path:
                self.write_cache(recent_date, sheet)
        else:
            LOGGER.info("Already up to date")
            return

        row_names = [
            "updatedate",
            "ticker",
            "exchange",
            "name",
            "sector",
            "osshares",
            "dateoflisting",
            "listingtype",
            "volume",
            "value",
        ]

        all_excel_names = tuple(xlsx_dict[0].keys())
        base_wanted_excel_names = [
            "Root Ticker",
            "Exchange",
            "Name",
            "Sector",
            "O/S",
            "Date of TSX Listing",
            "Listing Type",
            "Volume YTD",
            "Value (C$)",
        ]
        types = [
            "str",
            "str",
            "str",
            "str",
            "int",
            "date",
            "str",
            "int",
            "int",
        ]

        wanted_excel_names = []
        for bxn in base_wanted_excel_names:
            for xn in all_excel_names:
                if xn.startswith(bxn):
                    wanted_excel_names.append(xn)
                    break

        num_rows = len(wanted_excel_names)
        table_name = "listings"
        values = []
        for row in xlsx_dict:
            value_list = [recent_date]
            for i in range(num_rows):
                excel_name = wanted_excel_names[i]
                val = row[excel_name]
                if types[i] == "date":
                    val = datetime.strptime(str(val),
                                            "%Y%m%d")  # assume YYYYMMDD
                value_list.append(val)
            values.append(value_list)

        self.db.insert_into(table_name, row_names, values)
Example #26
 def write_cache(self, date, sheet):
     os.makedirs(self.cache_path, exist_ok=True)
     json_name = date.strftime('TSX-%Y-%m-%d.json')
     full_path = os.path.join(self.cache_path, json_name)
     sheet.to_json(full_path, orient="records")
     LOGGER.info("Wrote file to {}".format(full_path))
Example #27
    def get_quotes(self):
        """
        This function gets the tickers and various other random information
        from the TSX website from a hardcoded file and inserts it into the database
        """
        recent_date, = self.session.query(func.max(
            Listings.updatedate)).first()

        if self.url.startswith("http"):
            req = create_url_request(self.url)
            self.url = urllib.request.urlopen(req)

        sheet = pd.read_excel(self.url,
                              skiprows=5,
                              header=1,
                              keep_default_na=False)
        sheet.fillna('', inplace=True)
        sheet.rename(columns=self.cleanse_str, inplace=True)

        file_date = self.find_date_in_list(list(sheet.columns.values))

        if recent_date is None or (file_date > recent_date):
            xlsx_dict = sheet.to_dict(orient="records")
            recent_date = file_date
        else:
            LOGGER.info("Already up to date")
            return

        row_names = [
            "ticker",
            "exchange",
            "name",
            "sector",
            "osshares",
            "dateoflisting",
            "listingtype",
            "volume",
            "value",
        ]

        all_excel_names = tuple(xlsx_dict[0].keys())
        base_wanted_excel_names = [
            "Root Ticker",
            "Exchange",
            "Name",
            "Sector",
            "O/S",
            "Date of TSX Listing",
            "Listing Type",
            "Volume YTD",
            "Value (C$)",
        ]
        wanted_excel_names = []
        for bxn in base_wanted_excel_names:
            for xn in all_excel_names:
                if xn.startswith(bxn):
                    wanted_excel_names.append(xn)
                    break

        assert (len(base_wanted_excel_names) == len(wanted_excel_names) ==
                len(row_names))

        value_dics = []
        for row in xlsx_dict:
            value_dic = {"updatedate": recent_date}
            for excel_name, row_name in zip(wanted_excel_names, row_names):
                val = row[excel_name]
                if row_name == "dateoflisting":
                    val = datetime.strptime(str(val),
                                            "%Y%m%d")  # assume YYYYMMDD
                if val == '':
                    val = None
                value_dic[row_name] = val
            value_dics.append(value_dic)

        self.session.execute(insert(Listings).values(value_dics))
        self.session.commit()