Example #1
    def get_summary_data(self, tickers, threads=True):
        tickers = utils.format_tickers(tickers)
        # reset the shared results dict
        shared._DSSS = {}
        if threads:
            if threads is True:
                threadsQty = min(len(tickers), _multitasking.cpu_count() * 2)
                _multitasking.set_max_threads(threadsQty)
                for ticker in tickers:
                    self.download_summary_threaded(ticker)
                # poll until every worker thread has written its result
                while len(shared._DSSS) < len(tickers):
                    _time.sleep(0.01)
            elif len(tickers) <= 5:
                # small batch: download sequentially
                for ticker in tickers:
                    shared._DSSS[ticker.upper()] = self.download_summary(ticker)
            else:
                # large batch: fall back to the threaded path
                self.get_summary_data(tickers, threads=True)
        else:
            for ticker in tickers:
                shared._DSSS[ticker.upper()] = self.download_summary(ticker)
        return shared._DSSS
    def get_prices_data(self, tickers, start=0, end=9999999999,
                        interval="1d", threads=True):
        tickers = utils.format_tickers(tickers)
        # reset the shared results dict
        shared._DSHP = {}
        if threads:
            if threads is True:
                threadsQty = min(len(tickers), _multitasking.cpu_count() * 2)
                _multitasking.set_max_threads(threadsQty)
                for ticker in tickers:
                    self.download_prices_threaded(ticker, start, end, interval)
                # poll until every worker thread has written its result
                while len(shared._DSHP) < len(tickers):
                    _time.sleep(0.01)
            elif len(tickers) <= 5:
                # small batch: download sequentially
                for ticker in tickers:
                    shared._DSHP[ticker.upper()] = self.download_prices(
                        ticker, start, end, interval)
            else:
                # large batch: fall back to the threaded path
                self.get_prices_data(tickers, start, end, interval,
                                     threads=True)
        else:
            for ticker in tickers:
                shared._DSHP[ticker.upper()] = self.download_prices(
                    ticker, start, end, interval)
        return shared._DSHP
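Both methods above share the same fan-out pattern: launch one @multitasking.task worker per ticker, let each worker write its result into a shared dict, and poll the dict's size until every result has arrived. A minimal self-contained sketch of that pattern, assuming only the multitasking package (names here are illustrative, not from the source):

import time
import multitasking

results = {}  # shared dict the worker threads write into

@multitasking.task
def fetch_one(key):
    time.sleep(0.1)  # stand-in for a network call
    results[key] = key.lower()

keys = ["AAPL", "MSFT", "GOOG"]
multitasking.set_max_threads(min(len(keys), multitasking.cpu_count() * 2))
for key in keys:
    fetch_one(key)
# poll until every worker has reported back, as the methods above do
while len(results) < len(keys):
    time.sleep(0.01)
print(results)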
Example #3
def download_advanced_stats(symbol_list, module_name_map, threads=True):
    """
    Downloads advanced Yahoo stats for many tickers, one HTTP request per ticker.
    """
    num_requests = len(symbol_list)
    if threads:
        num_threads = min([num_requests, multitasking.cpu_count() * 2])
        multitasking.set_max_threads(num_threads)

    # get raw responses
    for request_idx, symbol in enumerate(symbol_list):
        if threads:
            get_ticker_stats_threaded(symbol, symbol, module_name_map)
        else:
            shared.response_dict[symbol] = get_ticker_stats(symbol, module_name_map)

    if threads:
        while len(shared.response_dict) < num_requests:
            time.sleep(0.01)

    # construct stats table from responses
    stats_table = []
    for symbol, retrieved_modules_dict in shared.response_dict.items():

        stats_list = [symbol]

        for module_name, stat_name_dict in module_name_map.items():
            retrieved_module_dict = None
            if retrieved_modules_dict is not None and module_name in retrieved_modules_dict:
                retrieved_module_dict = retrieved_modules_dict[module_name]

            if retrieved_module_dict is not None:
                for stat_name in stat_name_dict.keys():
                    stat_val = 'N/A'
                    if stat_name in retrieved_module_dict:
                        stat = retrieved_module_dict[stat_name]
                        if isinstance(stat, dict):
                            if stat:  # only if non-empty otherwise N/A
                                stat_val = stat['raw']
                        elif isinstance(stat, str) or isinstance(stat, numbers.Number):
                            stat_val = stat
                        else:
                            raise TypeError('Expected dictionary, string or number.')
                    stats_list.append(stat_val)
            else:
                stats_list.extend(['N/A'] * len(stat_name_dict))

        stats_table.append(stats_list)

    # reset for future reuse
    shared.response_dict = {}

    columns = ['Symbol']
    for stat_name_dict in module_name_map.values():
        columns.extend(list(stat_name_dict.values()))

    financial_data_df = pd.DataFrame(stats_table, columns=columns)
    financial_data_df.set_index('Symbol', inplace=True)

    return financial_data_df
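The `module_name_map` argument is not shown in this snippet; from the way it is iterated, it plausibly maps Yahoo module names to {raw stat key: output column name} dicts. A hypothetical example of that shape, and the columns it would produce via the same loop as the function above:

module_name_map = {
    "defaultKeyStatistics": {"floatShares": "Float", "beta": "Beta"},
    "financialData": {"totalRevenue": "Revenue"},
}
columns = ['Symbol']  # built exactly as in download_advanced_stats
for stat_name_dict in module_name_map.values():
    columns.extend(list(stat_name_dict.values()))
print(columns)  # ['Symbol', 'Float', 'Beta', 'Revenue']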
Example #4
def estimate_matches(tickers: list, mu: np.ndarray, tt: np.ndarray) -> dict:
    """
    Estimates, for each stock, the most closely matching (correlated) stock.

    Parameters
    ----------
    tickers: list
        List of tickers
    mu: np.ndarray
        Parameters of the regression polynomial.
    tt: np.ndarray
        Array of times corresponding to days of trading.

    Returns
    -------
    matches: dict
        For each symbol, this dictionary contains a corresponding `match` symbol, the `index` of the match symbol in the
        list of symbols and the computed `distance` between the two.
    """
    dtt = np.arange(1, tt.shape[0])[:, None] * tt[1:] / tt[1, None]
    dlogp_est = np.dot(mu[:, 1:], dtt)
    num_stocks = len(tickers)

    try:
        assert num_stocks <= 2000
        match_dist = np.sum((dlogp_est[:, None] - dlogp_est[None])**2, 2)
        match_minidx = np.argsort(match_dist, 1)[:, 1]
        match_mindist = np.sort(match_dist, 1)[:, 1]
        matches = {
            tickers[i]: {
                "match": tickers[match_minidx[i]],
                "index": match_minidx[i],
                "distance": match_mindist[i]
            }
            for i in range(num_stocks)
        }
    except Exception:  # more than 2000 stocks: fall back to per-stock worker tasks
        num_threads = min([len(tickers), multitasking.cpu_count() * 2])
        multitasking.set_max_threads(num_threads)

        matches = {}

        @multitasking.task
        def _estimate_one(i, tickers, dlogp_est):
            match_dist = np.sum((dlogp_est[i] - dlogp_est)**2, 1)
            match_minidx = np.argsort(match_dist)[1]
            match_mindist = np.sort(match_dist)[1]
            matches[tickers[i]] = {
                "match": tickers[match_minidx],
                "index": match_minidx,
                "distance": match_mindist
            }

        for i in range(num_stocks):
            _estimate_one(i, tickers, dlogp_est)
        # wait for all worker tasks, otherwise `matches` may be returned incomplete
        multitasking.wait_for_tasks()

    return matches
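The reason both branches take element 1 (not 0) of the sorted distances: each row's distance to itself is zero and always sorts first, so the nearest other stock sits at position 1. A small standalone check of the broadcasting trick used above:

import numpy as np

x = np.array([[0.0, 0.0], [1.0, 0.0], [10.0, 0.0]])
dist = np.sum((x[:, None] - x[None]) ** 2, 2)  # pairwise squared distances, zero diagonal
print(np.argsort(dist, 1)[:, 1])  # nearest non-self row: [1 0 1]
print(np.sort(dist, 1)[:, 1])     # its squared distance: [ 1.  1. 81.]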
Example #5
    def calculate(self, alphas='ALL', threaded=True, groupby='alpha'):
        """
        Calculate alphas.
        :param alphas: 'ALL' (default, meaning every alpha) or list of alpha numbers
        :param threaded: use the multithreaded path, defaults to True
        :param groupby: str; if == 'stock', the returned DataFrame is grouped by stock
        :return: DataFrame of concatenated results, also stored in self.result
        """
        # check parameter validity
        if isinstance(alphas, str):
            # count the alpha_* attributes defined on the instance
            num_of_alpha = sum('alpha_' in i for i in dir(self.apbase))
            # calculate every alpha by number
            alphas = list(range(1, num_of_alpha + 1))
        elif not isinstance(alphas, list):
            raise TypeError("alphas input should be 'ALL' or list of int")

        # reset result
        self.result = {}

        # calculate using threads
        if threaded:
            # set the threads
            threads = min(len(alphas), _multitasking.cpu_count() * 2)
            # set maximum threads
            _multitasking.set_max_threads(threads=threads)
            for n in alphas:
                self._calculate_one_threaded(n)
            while len(self.result) < len(alphas):
                _time.sleep(0.01)
        else:
            for n in alphas:
                data = self._calculate_one(n)
                self.result['alpha_' + str(n)] = data

        # concatenating results
        data = _pd.concat(self.result.values(),
                          axis=1,
                          keys=self.result.keys())

        if groupby == 'stock':
            data.columns = data.columns.swaplevel(0, 1)
            # data.sort_index(level=0, axis=1, inplace=True)

        return data
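What groupby='stock' does to the columns: pd.concat with keys puts the alpha names on level 0 of a column MultiIndex, and swaplevel moves the stock level to the front instead. A small hypothetical illustration:

import pandas as pd

a1 = pd.DataFrame({"AAPL": [1], "MSFT": [2]})
a2 = pd.DataFrame({"AAPL": [3], "MSFT": [4]})
data = pd.concat([a1, a2], axis=1, keys=["alpha_1", "alpha_2"])
print(data.columns.tolist())  # [('alpha_1', 'AAPL'), ('alpha_1', 'MSFT'), ...]
data.columns = data.columns.swaplevel(0, 1)
print(data.columns.tolist())  # [('AAPL', 'alpha_1'), ('MSFT', 'alpha_1'), ...]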
Example #6
def estimate_clusters(tickers: list, mu: np.ndarray, tt: np.ndarray):
    dtt = np.arange(1, tt.shape[0])[:, None] * tt[1:] / tt[1, None]
    dlogp_est = np.dot(mu[:, 1:], dtt)
    num_stocks = len(tickers)

    # note: a thread pool is configured here, but `_estimate_one` below is
    # called synchronously, so these two lines currently have no effect
    num_threads = min([len(tickers), multitasking.cpu_count() * 2])
    multitasking.set_max_threads(num_threads)

    clusters = []

    def _unite_clusters(clusters):
        k = 0
        flag = 0
        while k < len(clusters):
            for j in range(k + 1, len(clusters)):
                if clusters[j] & clusters[k]:
                    clusters[j] = clusters[j].union(clusters[k])
                    flag = 1
                    break
            if flag:
                del clusters[k]
                flag = 0
            else:
                k += 1
        return clusters

    def _estimate_one(i, dlogp_est):
        dist = np.sum((dlogp_est[i] - dlogp_est)**2, 1)
        clusters.append(set(np.argsort(dist)[:2].tolist()))
        return _unite_clusters(clusters)

    for i in range(num_stocks):
        clusters = _estimate_one(i, dlogp_est)

    return [
        np.where([j in clusters[k] for k in range(len(clusters))])[0][0]
        for j in range(num_stocks)
    ]
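`_unite_clusters` repeatedly merges any two sets that share a member, so after the loop the list holds disjoint connected components; the final comprehension then maps each stock index to the component containing it. A quick standalone check of the merging behaviour (function copied verbatim from above):

def _unite_clusters(clusters):
    k = 0
    flag = 0
    while k < len(clusters):
        for j in range(k + 1, len(clusters)):
            if clusters[j] & clusters[k]:
                clusters[j] = clusters[j].union(clusters[k])
                flag = 1
                break
        if flag:
            del clusters[k]
            flag = 0
        else:
            k += 1
    return clusters

# {0,1} and {1,2} overlap, {1,2} and {2,3} overlap; {4} stays alone
print(_unite_clusters([{0, 1}, {2, 3}, {1, 2}, {4}]))  # [{0, 1, 2, 3}, {4}]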
Example #7
import multitasking
import time


multitasking.set_max_threads(10)
counter = 0


@multitasking.task
def count(n):
    global counter
    for _ in range(n):
        counter += 1


if __name__ == "__main__":
    start = time.time()
    k = 10
    n = 1000000
    for _ in range(k):
        count(n)
    multitasking.wait_for_tasks()
    end = time.time()
    expected = k * n
    print(f'done actual: {counter} expected: {expected}. Missing: {expected-counter}')
    print(f'Elapsed time {end-start}')
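The demo above can print a nonzero "Missing" count because `counter += 1` is a read-modify-write: concurrent threads can overwrite each other's updates. A sketch of one fix, serializing the increment with a threading.Lock (not part of the original example):

import threading
import time
import multitasking

multitasking.set_max_threads(10)
counter = 0
lock = threading.Lock()

@multitasking.task
def count_safe(n):
    global counter
    for _ in range(n):
        with lock:  # serialize the read-modify-write
            counter += 1

if __name__ == "__main__":
    for _ in range(10):
        count_safe(100000)
    multitasking.wait_for_tasks()
    print(counter)  # 1000000, nothing missing (at the cost of lock overhead)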
Example #8
def download(tickers,
             start=None,
             end=None,
             actions=False,
             threads=True,
             group_by='column',
             auto_adjust=False,
             progress=True,
             period="max",
             interval="1d",
             prepost=False,
             proxy=None,
             **kwargs):
    """Download yahoo tickers
    :Parameters:
        tickers : str, list
            List of tickers to download
        period : str
            Valid periods: 1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max
            Either use the period parameter or use start and end
        interval : str
            Valid intervals: 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo
            Intraday data cannot extend past the last 60 days
        start: str
            Download start date string (YYYY-MM-DD) or _datetime.
            Default is 1900-01-01
        end: str
            Download end date string (YYYY-MM-DD) or _datetime.
            Default is now
        group_by : str
            Group by 'ticker' or 'column' (default)
        prepost : bool
            Include Pre and Post market data in results?
            Default is False
        auto_adjust: bool
            Adjust all OHLC automatically? Default is False
        actions: bool
            Download dividend + stock splits data. Default is False
        threads: bool / int
            How many threads to use for mass downloading. Default is True
        proxy: str
            Optional. Proxy server URL scheme. Default is None
    """
    global _PROGRESS_BAR, _DFS

    # create ticker list
    tickers = tickers if isinstance(tickers, list) else tickers.replace(
        ',', ' ').split()
    tickers = list(set([ticker.upper() for ticker in tickers]))

    if progress:
        _PROGRESS_BAR = _ProgressBar(len(tickers), 'downloaded')

    # reset _DFS
    _DFS = {}

    # download using threads
    if threads:
        if threads is True:
            threads = min([len(tickers), _multitasking.cpu_count() * 2])
        _multitasking.set_max_threads(threads)
        for i, ticker in enumerate(tickers):
            _download_one_threaded(ticker,
                                   period=period,
                                   interval=interval,
                                   start=start,
                                   end=end,
                                   prepost=prepost,
                                   actions=actions,
                                   auto_adjust=auto_adjust,
                                   progress=(progress and i > 0),
                                   proxy=proxy)
        while len(_DFS) < len(tickers):
            _time.sleep(0.01)

    # download synchronously
    else:
        for i, ticker in enumerate(tickers):
            data = _download_one(ticker,
                                 period=period,
                                 interval=interval,
                                 start=start,
                                 end=end,
                                 prepost=prepost,
                                 actions=actions,
                                 auto_adjust=auto_adjust)
            _DFS[ticker.upper()] = data
            if progress:
                _PROGRESS_BAR.animate()

    if progress:
        _PROGRESS_BAR.completed()

    if len(tickers) == 1:
        return _DFS[tickers[0]]

    try:
        data = _pd.concat(_DFS.values(), axis=1, keys=_DFS.keys())
    except Exception:
        _realign_dfs()
        data = _pd.concat(_DFS.values(), axis=1, keys=_DFS.keys())

    if group_by == 'column':
        data.columns = data.columns.swaplevel(0, 1)
        data.sort_index(level=0, axis=1, inplace=True)

    return data
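This function matches the module-level download() of an early yfinance release; assuming that provenance, typical usage looks like the following (the package import is the assumption here):

import yfinance as yf  # assumes the snippet above is yfinance's download()

# one ticker returns a single DataFrame; several return MultiIndex columns
df = yf.download("AAPL MSFT", period="1mo", interval="1d",
                 group_by="column", threads=True)
print(df.head())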
Example #9
import multitasking
import time
import random
import signal

multitasking.set_max_threads(multitasking.config["CPU_CORES"] * 2)
# kill all tasks on ctrl-c
signal.signal(signal.SIGINT, multitasking.killall)

# or, wait for task to finish on ctrl-c:
# signal.signal(signal.SIGINT, multitasking.wait_for_tasks)


@multitasking.task  # <== this is all it takes :-)
def hello(count):
    sleep = random.randint(1, 3) / 2
    print("Hello %s (sleeping for %ss)" % (count, sleep))
    time.sleep(sleep)
    print("Goodbye %s (after %ss)" % (count, sleep))


for i in range(0, 10):
    hello(i + 1)
Example #10
def download(tickers,
             start=None,
             end=None,
             actions=False,
             threads=True,
             group_by='column',
             auto_adjust=False,
             back_adjust=False,
             progress=True,
             period="max",
             interval="1d",
             prepost=False,
             proxy=None,
             rounding=False,
             **kwargs):
    """Download yahoo tickers
    :Parameters:
        tickers : str, list
            List of tickers to download
        period : str
            Valid periods: 1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max
            Either use the period parameter or use start and end
        interval : str
            Valid intervals: 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo
            Intraday data cannot extend past the last 60 days
        start: str
            Download start date string (YYYY-MM-DD) or _datetime.
            Default is 1900-01-01
        end: str
            Download end date string (YYYY-MM-DD) or _datetime.
            Default is now
        group_by : str
            Group by 'ticker' or 'column' (default)
        prepost : bool
            Include Pre and Post market data in results?
            Default is False
        auto_adjust: bool
            Adjust all OHLC automatically? Default is False
        actions: bool
            Download dividend + stock splits data. Default is False
        threads: bool / int
            How many threads to use for mass downloading. Default is True
        proxy: str
            Optional. Proxy server URL scheme. Default is None
        rounding: bool
            Optional. Round values to 2 decimal places?
    """

    # create ticker list
    tickers = tickers if isinstance(tickers,
                                    (list, set, tuple)) else tickers.replace(
                                        ',', ' ').split()
    # replace dots with dashes so share classes work (e.g. BRK.B -> BRK-B)
    tickers = [ticker.replace(".", "-") for ticker in tickers]

    tickers = list(set([ticker.upper() for ticker in tickers]))

    if progress:
        shared._PROGRESS_BAR = utils.ProgressBar(len(tickers), 'completed')

    # reset shared._DFS
    shared._DFS = {}
    shared._ERRORS = {}

    # download using threads
    if threads:
        if threads is True:
            threads = min([len(tickers), _multitasking.cpu_count() * 2])
        _multitasking.set_max_threads(threads)
        for i, ticker in enumerate(tickers):
            _download_one_threaded(ticker,
                                   period=period,
                                   interval=interval,
                                   start=start,
                                   end=end,
                                   prepost=prepost,
                                   actions=actions,
                                   auto_adjust=auto_adjust,
                                   back_adjust=back_adjust,
                                   progress=(progress and i > 0),
                                   proxy=proxy,
                                   rounding=rounding)
        while len(shared._DFS) < len(tickers):
            _time.sleep(0.01)

    # download synchronously
    else:
        for i, ticker in enumerate(tickers):
            data = _download_one(ticker,
                                 period=period,
                                 interval=interval,
                                 start=start,
                                 end=end,
                                 prepost=prepost,
                                 actions=actions,
                                 auto_adjust=auto_adjust,
                                 back_adjust=back_adjust,
                                 proxy=proxy,
                                 rounding=rounding)
            shared._DFS[ticker.upper()] = data
            if progress:
                shared._PROGRESS_BAR.animate()

    if progress:
        shared._PROGRESS_BAR.completed()

    if shared._ERRORS:
        print('\n%.f Failed download%s:' %
              (len(shared._ERRORS), 's' if len(shared._ERRORS) > 1 else ''))
        # print(shared._ERRORS)
        print("\n".join(['- %s: %s' % v
                         for v in list(shared._ERRORS.items())]))

    if len(tickers) == 1:
        return shared._DFS[tickers[0]]

    try:
        data = _pd.concat(shared._DFS.values(),
                          axis=1,
                          keys=shared._DFS.keys())
    except Exception:
        _realign_dfs()
        data = _pd.concat(shared._DFS.values(),
                          axis=1,
                          keys=shared._DFS.keys())

    if group_by == 'column':
        data.columns = data.columns.swaplevel(0, 1)
        data.sort_index(level=0, axis=1, inplace=True)

    return data
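A quick illustration of why the dot-to-dash rewrite above matters: Yahoo identifies share classes with a dash, so a ticker like "BRK.B" must be requested as "BRK-B":

tickers = ["BRK.B", "BF.B", "AAPL"]
tickers = [ticker.replace(".", "-") for ticker in tickers]
print(tickers)  # ['BRK-B', 'BF-B', 'AAPL']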
Example #11
import argparse
import os
import pickle
import random
import signal
from collections import defaultdict
from random import shuffle

import multitasking
import numpy as np
import pandas as pd
from tqdm import tqdm

from utils import Logger, evaluate

max_threads = multitasking.config['CPU_CORES']
multitasking.set_max_threads(max_threads)
multitasking.set_engine('process')
signal.signal(signal.SIGINT, multitasking.killall)

random.seed(2020)

# command-line arguments
parser = argparse.ArgumentParser(description='binetwork recall')
parser.add_argument('--mode', default='valid')
parser.add_argument('--logfile', default='test.log')

args = parser.parse_args()

mode = args.mode
logfile = args.logfile
Example #12
def download(tickers: list,
             start: Union[str, int] = None,
             end: Union[str, int] = None,
             interval: str = "1d") -> dict:
    """
    Download historical data for tickers in the list.

    Parameters
    ----------
    tickers: list
        Tickers for which to download historical information.
    start: str or int
        Date (or Unix timestamp) from which to start downloading data.
    end: str or int
        Date (or Unix timestamp) at which to stop downloading data.
    interval: str
        Sampling frequency of the data.

    Returns
    -------
    data: dict
        Dictionary including the following keys:
        - tickers: list of tickers;
        - dates: dates corresponding to the trading days;
        - price: array of adjusted closing prices, shape=(num stocks, length of period);
        - volume: array of volumes, shape=(num stocks, length of period);
        - currencies: list of stock currencies;
        - exchange_rates: exchange rates from each currency to the most common (default) currency;
        - default_currency: the most common currency among the tickers;
        - sectors: dictionary of stock sector for each ticker;
        - industries: dictionary of stock industry for each ticker.
    """
    tickers = tickers if isinstance(tickers,
                                    (list, set, tuple)) else tickers.replace(
                                        ',', ' ').split()
    tickers = list(set([ticker.upper() for ticker in tickers]))

    data = {}
    si_columns = ["SYMBOL", "CURRENCY", "SECTOR", "INDUSTRY"]
    si_filename = "stock_info.csv"
    if not os.path.exists(si_filename):
        # create a .csv to store stock information
        with open(si_filename, 'w') as file:
            wr = csv.writer(file)
            wr.writerow(si_columns)
    # load stock information file
    si = pd.read_csv(si_filename)
    missing_tickers = [
        ticker for ticker in tickers if ticker not in si['SYMBOL'].values
    ]
    missing_si, na_si = {}, {}
    currencies = {}

    if end is None:
        end = int(dt.datetime.timestamp(dt.datetime.today()))
    elif type(end) is str:
        end = int(dt.datetime.timestamp(dt.datetime.strptime(end, '%Y-%m-%d')))
    if start is None:
        start = int(
            dt.datetime.timestamp(dt.datetime.today() - dt.timedelta(365)))
    elif type(start) is str:
        start = int(
            dt.datetime.timestamp(dt.datetime.strptime(start, '%Y-%m-%d')))

    @multitasking.task
    def _download_one_threaded(ticker: str,
                               start: int,
                               end: int,
                               interval: str = "1d"):
        """
        Download historical data for a single ticker on a worker thread; also scrapes missing stock information.

        Parameters
        ----------
        ticker: str
            Ticker for which to download historical information.
        start: int
            Unix timestamp from which to start downloading data.
        end: int
            Unix timestamp at which to stop downloading data.
        interval: str
            Sampling frequency of the data.
        """
        data_one = _download_one(ticker, start, end, interval)

        try:
            data_one = data_one["chart"]["result"][0]
            data[ticker] = _parse_quotes(data_one)

            if ticker in missing_tickers:
                currencies[ticker] = data_one['meta']['currency']
                try:
                    html = requests.get(
                        url='https://finance.yahoo.com/quote/' + ticker).text
                    json_str = html.split('root.App.main =')[1].split(
                        '(this)')[0].split(';\n}')[0].strip()
                    info = json.loads(json_str)['context']['dispatcher'][
                        'stores']['QuoteSummaryStore']['summaryProfile']
                    assert (len(info['sector']) > 0) and (len(info['industry'])
                                                          > 0)
                    missing_si[ticker] = dict(sector=info["sector"],
                                              industry=info["industry"])
                except Exception:
                    pass
        except Exception:
            pass
        progress.animate()

    num_threads = min([len(tickers), multitasking.cpu_count() * 2])
    multitasking.set_max_threads(num_threads)

    progress = ProgressBar(len(tickers), 'completed')

    for ticker in tickers:
        _download_one_threaded(ticker, start, end, interval)
    multitasking.wait_for_tasks()

    progress.completed()

    if len(data) == 0:
        raise Exception("No symbol with full information is available.")

    data = pd.concat(data.values(), keys=data.keys(), axis=1, sort=True)
    data.drop(
        columns=data.columns[data.isnull().sum(0) > 0.33 * data.shape[0]],
        inplace=True)
    data = data.fillna(method='bfill').fillna(method='ffill').drop_duplicates()

    info = zip(list(missing_si.keys()),
               [currencies[ticker] for ticker in missing_si.keys()],
               [v['sector'] for v in missing_si.values()],
               [v['industry'] for v in missing_si.values()])
    with open(si_filename, 'a+', newline='') as file:
        wr = csv.writer(file)
        for row in info:
            wr.writerow(row)
    si = pd.read_csv('stock_info.csv').set_index("SYMBOL").to_dict(
        orient='index')

    missing_tickers = [
        ticker for ticker in tickers
        if ticker not in data.columns.get_level_values(0)[::2].tolist()
    ]
    tickers = data.columns.get_level_values(0)[::2].tolist()
    if len(missing_tickers) > 0:
        print(
            '\nRemoving {} from list of symbols because we could not collect full information.'
            .format(missing_tickers))

    # download exchange rates and convert to most common currency
    currencies = [
        si[ticker]['CURRENCY'] if ticker in si else currencies[ticker]
        for ticker in tickers
    ]
    ucurrencies, counts = np.unique(currencies, return_counts=True)
    default_currency = ucurrencies[np.argmax(counts)]
    xrates = get_exchange_rates(currencies, default_currency, data.index,
                                start, end, interval)

    return dict(tickers=tickers,
                dates=pd.to_datetime(data.index),
                price=data.iloc[:,
                                data.columns.get_level_values(1) ==
                                'Adj Close'].to_numpy().T,
                volume=data.iloc[:,
                                 data.columns.get_level_values(1) ==
                                 'Volume'].to_numpy().T,
                currencies=currencies,
                exchange_rates=xrates,
                default_currency=default_currency,
                sectors={
                    ticker:
                    si[ticker]['SECTOR'] if ticker in si else "NA_" + ticker
                    for ticker in tickers
                },
                industries={
                    ticker:
                    si[ticker]['INDUSTRY'] if ticker in si else "NA_" + ticker
                    for ticker in tickers
                })
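The start/end handling near the top of this function converts calendar dates into Unix-epoch seconds, which is the form the Yahoo chart endpoint takes. For example:

import datetime as dt

ts = int(dt.datetime.timestamp(dt.datetime.strptime("2021-01-04", "%Y-%m-%d")))
print(ts)  # epoch seconds for local midnight of 2021-01-04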
Example #13
def download_quick_stats(symbol_list, quick_stats_dict, threads=True):
    """
    Downloads select ("quick") stats for many tickers using a minimal number of HTTP requests. Splits the ticker
    list into groups of 1000 and performs one request per group, e.g. a list of 2350 tickers is split into two
    groups of 1000 and one group of the remaining 350, so the stats arrive in only 3 HTTP requests. Only returns
    tickers that are valid, so this can also be used to validate tickers efficiently.
    """
    # through trial and error, 1179 was the max without returning an error, but that number feels too arbitrary
    max_params = 1000
    num_requests = math.ceil(len(symbol_list) / max_params)
    last_request_size = len(symbol_list) % max_params
    if last_request_size == 0:
        last_request_size = max_params

    if threads:
        num_threads = min([num_requests, multitasking.cpu_count() * 2])
        multitasking.set_max_threads(num_threads)

    # get raw responses
    for request_idx in range(num_requests):

        if request_idx == num_requests - 1:
            num_symbols = last_request_size
        else:
            num_symbols = max_params

        request_symbol_list = symbol_list[request_idx *
                                          max_params:request_idx * max_params +
                                          num_symbols]

        if threads:
            quick_stats_request_threaded(request_idx, request_symbol_list,
                                         list(quick_stats_dict.keys()))
        else:
            shared.response_dict[request_idx] = quick_stats_request(
                request_symbol_list, list(quick_stats_dict.keys()))

    if threads:
        while len(shared.response_dict) < num_requests:
            time.sleep(0.01)

    # construct stats table from responses
    stats_table = []
    for response_list in shared.response_dict.values():
        # each entry is one symbol (e.g. SIGL, AAPL)
        for retrieved_stats_dict in response_list:
            if retrieved_stats_dict is None:
                # no usable response for this entry; skip it
                continue
            symbol = retrieved_stats_dict['symbol']
            stats_list = [symbol]
            for quick_stat_name in quick_stats_dict.keys():
                stat_val = 'N/A'
                if quick_stat_name in retrieved_stats_dict:
                    stat = retrieved_stats_dict[quick_stat_name]
                    if isinstance(stat, dict):
                        if stat:  # only if non-empty, otherwise N/A
                            if quick_stat_name == "floatShares":
                                stat_val = stat['fmt']
                            else:
                                stat_val = stat['raw']
                    elif isinstance(stat, (str, numbers.Number)):
                        stat_val = stat
                    else:
                        raise TypeError(
                            'Expected dictionary, string or number.')
                stats_list.append(stat_val)

            stats_table.append(stats_list)

    # reset for future reuse
    shared.response_dict = {}

    # construct dataframe
    columns = ['Symbol'] + list(quick_stats_dict.values())
    stats_df = pd.DataFrame(stats_table, columns=columns)
    stats_df.set_index('Symbol', inplace=True)

    return stats_df
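The docstring's request arithmetic, checked directly with a standalone sketch of the batching logic above:

import math

num_symbols, max_params = 2350, 1000
num_requests = math.ceil(num_symbols / max_params)          # 3
last_request_size = num_symbols % max_params or max_params  # 350
sizes = [max_params] * (num_requests - 1) + [last_request_size]
print(num_requests, sizes)  # 3 [1000, 1000, 350]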
Example #14
def download(tickers: list, interval: str = "1d", period: str = "1y"):
    """
    Download historical data for tickers in the list.

    Parameters
    ----------
    tickers: list
        Tickers for which to download historical information.
    interval: str
        Sampling frequency of the data.
    period: str
        Data period to download.

    Returns
    -------
    data: dict
        Dictionary including the following keys:
        - tickers: list of tickers;
        - dates: dates corresponding to the trading days;
        - logp: array of log-adjusted closing prices, shape=(num stocks, length of period);
        - volume: array of volumes, shape=(num stocks, length of period);
        - sectors: list of stock sectors;
        - industries: list of stock industries.
    """
    tickers = tickers if isinstance(tickers,
                                    (list, set, tuple)) else tickers.replace(
                                        ',', ' ').split()
    tickers = list(set([ticker.upper() for ticker in tickers]))

    data = {}
    si_columns = ["SYMBOL", "SECTOR", "INDUSTRY"]
    si_filename = "stock_info.csv"
    if not os.path.exists(si_filename):
        # create a .csv to store stock information
        with open(si_filename, 'w') as file:
            wr = csv.writer(file)
            wr.writerow(si_columns)
    # load stock information file
    si = pd.read_csv(si_filename)
    missing_tickers = [
        ticker for ticker in tickers if ticker not in si['SYMBOL'].values
    ]
    missing_si, na_si = {}, {}

    @multitasking.task
    def _download_one_threaded(ticker: str,
                               interval: str = "1d",
                               period: str = "1y"):
        """
        Download historical data for a single ticker on a worker thread; also scrapes missing stock information.

        Parameters
        ----------
        ticker: str
            Ticker for which to download historical information.
        interval: str
            Sampling frequency of the data.
        period: str
            Data period to download.
        """
        data_one = _download_one(ticker, interval, period)

        try:
            data[ticker] = parse_quotes(data_one["chart"]["result"][0])

            if ticker in missing_tickers:
                try:
                    html = requests.get(
                        url='https://finance.yahoo.com/quote/' + ticker).text
                    json_str = html.split('root.App.main =')[1].split(
                        '(this)')[0].split(';\n}')[0].strip()
                    info = json.loads(json_str)['context']['dispatcher'][
                        'stores']['QuoteSummaryStore']['summaryProfile']
                    assert (len(info['sector']) > 0) and (len(info['industry'])
                                                          > 0)
                    missing_si[ticker] = dict(sector=info["sector"],
                                              industry=info["industry"])
                except Exception:
                    pass
        except Exception:
            pass
        progress.animate()

    num_threads = min([len(tickers), multitasking.cpu_count() * 2])
    multitasking.set_max_threads(num_threads)

    progress = ProgressBar(len(tickers), 'completed')

    for ticker in tickers:
        _download_one_threaded(ticker, interval, period)
    multitasking.wait_for_tasks()

    progress.completed()

    if len(data) == 0:
        raise Exception("No symbol with full information is available.")

    data = pd.concat(data.values(), keys=data.keys(), axis=1)
    data.drop(
        columns=data.columns[data.isnull().sum(0) > 0.33 * data.shape[0]],
        inplace=True)
    data = data.fillna(method='bfill').fillna(method='ffill').drop_duplicates()

    info = zip(list(missing_si.keys()),
               [v['sector'] for v in missing_si.values()],
               [v['industry'] for v in missing_si.values()])
    with open(si_filename, 'a+', newline='') as file:
        wr = csv.writer(file)
        for row in info:
            wr.writerow(row)
    si = pd.read_csv('stock_info.csv').set_index("SYMBOL").to_dict(
        orient='index')

    missing_tickers = [
        ticker for ticker in tickers
        if ticker not in data.columns.get_level_values(0)[::2].tolist()
    ]
    tickers = data.columns.get_level_values(0)[::2].tolist()
    if len(missing_tickers) > 0:
        print(
            '\nRemoving {} from list of symbols because we could not collect full information.'
            .format(missing_tickers))

    return dict(tickers=tickers,
                dates=pd.to_datetime(data.index),
                logp=np.log(data.iloc[:,
                                      data.columns.get_level_values(1) ==
                                      'Adj Close'].to_numpy().T),
                volume=data.iloc[:,
                                 data.columns.get_level_values(1) ==
                                 'Volume'].to_numpy().T,
                sectors=[
                    si[ticker]['SECTOR'] if ticker in si else "NA_" + ticker
                    for ticker in tickers
                ],
                industries=[
                    si[ticker]['INDUSTRY'] if ticker in si else "NA_" + ticker
                    for ticker in tickers
                ])
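A note on data.columns.get_level_values(0)[::2] above: after the concat, each surviving ticker appears to contribute exactly two columns ('Adj Close' and 'Volume'), so level 0 repeats every ticker twice and taking every other entry leaves one copy per ticker. Illustration with hypothetical columns:

import pandas as pd

cols = pd.MultiIndex.from_product([["AAPL", "MSFT"], ["Adj Close", "Volume"]])
df = pd.DataFrame([[1, 100, 2, 200]], columns=cols)
print(df.columns.get_level_values(0)[::2].tolist())  # ['AAPL', 'MSFT']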
Example #15
def download(tickers,
             start=None,
             end=None,
             threads=True,
             group_by='column',
             interval="1d"):
    """Download tickers
    :parameter
        tickers: str, list
            List of tickers to download
        interval: str
            Data interval, default "1d"
        start: str
            Download start date, default None
        end: str
            Download end date, default None
        threads: bool
            Use multithreaded downloading, defaults to True
    """
    global _DFS

    # reset DFS
    _DFS = {}

    # create ticker list
    tickers = tickers if isinstance(tickers,
                                    (list, set, tuple)) else tickers.replace(
                                        ',', ' ').split()

    tickers = list(set([ticker.upper() for ticker in tickers]))

    # download using threads
    if threads:
        # set the threads
        threads = min([len(tickers), _multitasking.cpu_count() * 2])
        # set the maximum threads
        _multitasking.set_max_threads(threads)
        for i, ticker in enumerate(tickers):
            _download_one_threaded(ticker=ticker,
                                   start=start,
                                   end=end,
                                   interval=interval)
        while len(_DFS) < len(tickers):
            _time.sleep(0.01)

    else:
        for i, ticker in enumerate(tickers):
            data = _download_one(ticker,
                                 start=start,
                                 end=end,
                                 interval=interval)
            _DFS[ticker.upper()] = data

    # concatenate the results
    data = _pd.concat(_DFS.values(), axis=1, keys=_DFS.keys())

    if group_by == 'column':
        data.columns = data.columns.swaplevel(0, 1)
        data.sort_index(level=0, axis=1, inplace=True)

    return data
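One caveat that applies to the `while len(_DFS) < len(tickers)` polling used throughout these examples: if a worker raises and never writes its slot, the loop spins forever. A variant with a timeout (illustrative only, not from any of the sources above):

import time

def wait_for_results(results, expected, timeout=30.0, poll=0.01):
    # poll like the examples do, but give up after `timeout` seconds
    deadline = time.monotonic() + timeout
    while len(results) < expected:
        if time.monotonic() > deadline:
            raise TimeoutError(f"only {len(results)}/{expected} results arrived")
        time.sleep(poll)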