Example #1
def create_historical_prices_csv_link(stockSummaryDict):
    fromDate = (datetime.now() - timedelta(days=365 * 3)).strftime('%Y-%m-%d')
    toDate = datetime.now().strftime('%Y-%m-%d')
    csvLink = "https://companyresearch-nzx-com.ezproxy.aut.ac.nz/deep_ar/functions/csv_prices.php?"
    csvLink += ("default=" + stockSummaryDict["Ticker"] + "&" + "fd=" + fromDate + "&" + "td=" + toDate)
    logger.info("Pulling historical price data from: " + csvLink)
    return csvLink
Example #2
def send_files_to_server():
    """
    This method is used to retrieve all pdf files from the temp folder and send them to the appropriate URL
    """
    fileList = os.listdir("temp")
    fileIteration = 0
    pdfIteration = 0
    destinationURL = getDestinationURL()
    logger.info("Sending files to: " + destinationURL)

    for file in fileList:
        fileIteration += 1
        fileJSON = {}
        if file.endswith(".pdf"):
            pdfIteration += 1
            with open(os.path.join(r'temp', file), 'rb') as fileContent:
                fileJSON[file] = fileContent
                r = requests.post(destinationURL, files=fileJSON)
            logger.info("Sent file: " + file)
            printProgressBar(
                fileIteration,
                len(fileList),
                prefix='Saving {} data'.format(file).ljust(24),
                suffix='| {} files completed'.format(pdfIteration),
                length=10)
Example #3
def get_browser():
    """
    Creates a chrome driver which will be used by selenium to conduct the website navigation
    Sets the following options to aid in webscraping
        - Auto file download
        - Removal the images
        - Disables internal pdf viewer

    Returns:
        webdriver: Driver for site navigation
    """
    # Set up driver options
    chromeOptions = Options()
    chromeOptions.add_argument('log-level=3')  # Remove warnings
    chromeOptions.add_argument('--disable-gpu')
    chromeOptions.add_argument('headless')
    chromeOptions.add_argument("--proxy-server='direct://'")
    chromeOptions.add_argument("--proxy-bypass-list=*")
    chromeOptions.add_argument('--no-proxy-server')
    prefs = {
        # Sets the default directory for downloads
        "download.default_directory": downloadDirectory,
        # Provides write permissions to the directory
        "directory_upgrade": True,
        # Disables the built-in pdf viewer (helps with pdf download)
        "plugins.always_open_pdf_externally": True,
        # Tells the driver all file downloads and sites are safe
        "safebrowsing.enabled": True,
        # Auto downloads files into the default directory
        "download.prompt_for_download": False,
        # Removes images for faster load times
        "profile.managed_default_content_settings.images": 2
    }
    chromeOptions.add_experimental_option("prefs", prefs)
    browser = webdriver.Chrome(chromeDriverLocation,
                               chrome_options=chromeOptions)  # Apply options
    browser.command_executor._commands["send_command"] = (
        "POST", '/session/$sessionId/chromium/send_command')
    params = {
        'cmd': 'Page.setDownloadBehavior',
        'params': {
            'behavior': 'allow',
            'downloadPath': downloadDirectory
        }
    }
    browser.execute("send_command", params)
    homeURL = "https://library.aut.ac.nz/databases/nzx-deep-archive"

    browser.get(homeURL)

    delay = 15  # seconds
    # Wait 15 seconds for the driver to get started and get to the landing page
    try:
        myElem = WebDriverWait(browser, delay).until(
            EC.presence_of_element_located((By.CLASS_NAME, "form-field")))
        logger.info("Browser is ready!")
    except TimeoutException:
        logger.error("Loading took too much time!")
    logger.info("get_browser() complete")
    print("Chromium open")
    return browser
Example #4
def score_companies(stockDataArray):
    """
    Scores each company based on their own values compared to other companies.
	For this, we are using the geometric average to get a more accurate represention.
    Args:
        stockDataArray (List): dictionary of all company information
    """

    normalisationRanges = find_normal_ranges(stockDataArray)

    for stock in stockDataArray:
        debtEquityIndexValue = findDebtEquityIndexValue(
            stock, normalisationRanges['Debt Equity Max'],
            normalisationRanges['Debt Equity Min'])
        netDividendYield = findNetDividendYield(
            stock, normalisationRanges['Dividend Yield Max'],
            normalisationRanges['Dividend Yield Min'])
        sharpeRatioIndexValue = findSharpeRatioIndexValue(
            stock, normalisationRanges['Sharpe Ratio Max'],
            normalisationRanges['Sharpe Ratio Min'])
        returnOnEquityIndexValue = findReturnOnEquityIndexValue(
            stock, normalisationRanges['Return on Equity Max'],
            normalisationRanges['Return on Equity Min'])

        # Geometric average to make score more accurate
        score = (debtEquityIndexValue * sharpeRatioIndexValue *
                 returnOnEquityIndexValue * netDividendYield)**0.25
        stock['Summary']['Score'] = score
        logger.info("{} | Score: {}".format(stock['Summary']['Ticker'], score))
        print("{} got a score of: {}".format(stock['Summary']['Ticker'],
                                             score))
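A minimal worked sketch of the scoring step above, with made-up index values, showing why the geometric average is used: a single weak metric drags the score down more than a plain arithmetic average would.

# Illustrative only: four normalised index values for a hypothetical company
indices = {
    'Debt Equity Index': 0.8,
    'Sharpe Ratio Index': 0.6,
    'Return on Equity Index': 0.7,
    'Net Dividend Yield Index': 0.2,  # one weak metric
}

geometric = (indices['Debt Equity Index'] * indices['Sharpe Ratio Index'] *
             indices['Return on Equity Index'] *
             indices['Net Dividend Yield Index']) ** 0.25
arithmetic = sum(indices.values()) / len(indices)

print(round(geometric, 3))   # 0.509
print(round(arithmetic, 3))  # 0.575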
Example #5
def create_historical_dividends_csv_link(stockTicker):
    """
    Creates a csv link used to download the historical dividends csv. Using todays date, and 3 years prior

    Args:
        stockTicker (String): Contains the ticker of the company

    Returns:
        csvLink (String): url which holds the csv
    """
    csvLink = "https://companyresearch-nzx-com.ezproxy.aut.ac.nz/deep_ar/divhistory_csv.php?selection=" + stockTicker
    logger.info("Pulling historical dividend data from: " + csvLink)
    return csvLink
Example #6
def print_overview_sheet(workbook, stockDataArray, formats):
    logger.info("Printing Overview")
    overviewSheet = workbook.add_worksheet("Overview")
    overviewSheet.write_string(0, 0, "Stocks")
    row = 1
    col = 0
    for stock in stockDataArray:
        overviewSheet.write_url(
            row,
            col,
            "internal:" + stock.stockSummaryDict["Ticker"] + "_Summary!A1",
            string=stock.stockSummaryDict["Name"])
        row += 1
Example #7
def send_to_server(scrapeInsert):
    """
    Sends the given JSON object to the appropriate URL

    Args:
        scrapeInsert (JSON): JSON object with all company information
    """
    destinationURL = getDestinationURL()
    headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
    r = requests.post(destinationURL,
                      data=json.dumps(scrapeInsert),
                      headers=headers)
    logger.info("Sent JSON data to {}".format(destinationURL))
    logger.info("Received response {}".format(r.status_code))
Example #8
def findSharpeRatioIndexValue(stock, max, min):
    """
    Args:
        stock (Dict): dictionary of company information
        max (Float): the maximum sharpe ratio within this scrape + 1
        min (Float): the minimum sharpe ratio within this scrape 1

    Returns:
	    index (Float): The normalised value of the company's sharpe ratio (Always between 0 and 1)
    """
    stockSharpeRatio = stock['Ratio']['Sharpe Ratio']
    index = (stockSharpeRatio - min) / (max - min)
    stock['Summary']['Sharpe Ratio Index'] = index
    logger.info("{} | Sharpe: {}".format(stock['Summary']['Ticker'], index))
    return index
Example #9
def findReturnOnEquityIndexValue(stock, max, min):
    """
    Args:
        stock (Dict): dictionary of company information
        max (Float): the maximum return on equity within this scrape + 1
        min (Float): the minimum return on equity within this scrape 1

    Returns:
	    index (Float): The normalised value of the company's return on equity (Always between 0 and 1)
    """
    stockRoE = stock['Ratio']['Return on Equity']
    index = (stockRoE - min) / (max - min)
    stock['Summary']['Return on Equity Index'] = index
    logger.info("{} | RoE Index: {}".format(stock['Summary']['Ticker'], index))
    return index
Example #10
def findNetDividendYield(stock, max, min):
    """
    Args:
        stock (Dict): dictionary of company information
        max (Float): the maximum dividend yield within this scrape + 1
        min (Float): the minimum dividend yield within this scrape 1

    Returns:
	    index (Float): The normalised value of the company's dividend yield (Always between 0 and 1)
    """
    netDividendYield = stock['Ratio']['Net Yield']
    index = (netDividendYield - min) / (max - min)
    stock['Summary']['Net Dividend Yield Index'] = index
    logger.info("{} | Yield: {}".format(stock['Summary']['Ticker'], index))
    return index
Example #11
def findDebtEquityIndexValue(stock, max, min):
    """
    Args:
        stock (Dict): dictionary of company information
        max (Float): the maximum debt equity within this scrape + 1
        min (Float): the minimum debt equity within this scrape 1

    Returns:
	    index (Float): The normalised value of the company's debt equity (Always between 0 and 1)
    """
    stockDebtEquity = stock['Ratio']['Debt Equity']
    index = 1 - ((stockDebtEquity - min) / (max - min))
    stock['Summary']['Debt Equity Index'] = index
    logger.info("{} | Debt Equity: {}".format(stock['Summary']['Ticker'],
                                              index))
    return index
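A minimal sketch of the min-max normalisation used by the four index functions above (numbers invented). Because find_normal_ranges pads the max with +1 and the min with -1, no company lands exactly on 0 or 1, and the debt/equity index is inverted so that lower leverage scores higher.

# Illustrative only: normalising a few Sharpe ratios with the +1/-1 buffer
sharpe_values = [0.4, 1.2, 2.0]
sharpe_max = 2.0 + 1  # buffered maximum
sharpe_min = 0.4 - 1  # buffered minimum

for value in sharpe_values:
    index = (value - sharpe_min) / (sharpe_max - sharpe_min)
    print(round(index, 3))  # 0.278, 0.5, 0.722 -- never exactly 0 or 1

# Debt/equity is inverted: a low ratio gives a high index
de_max, de_min = 3.0 + 1, 0.1 - 1
de_index = 1 - ((0.5 - de_min) / (de_max - de_min))
print(round(de_index, 3))  # 0.714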
Example #12
def analyse_company_risk(stockDataArray):
    """
	For each company, creates a list of that company's stock price.
	The standard deviation of this list is used as an indicator for risk.
    Saves the calculate risk score into the Summary Dictionary.

    Args:
		stockDataArray (List): dictionary of all company information
	"""
    for stock in stockDataArray:
        priceData = stock['HistoricalPrices']
        priceList = []
        for price in priceData:
            priceList.append(price['Last'])
        risk = statistics.stdev(priceList)
        logger.info("{} | Risk: {}".format(stock['Summary']['Ticker'], risk))
        stock['Summary']['Risk'] = risk
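A minimal illustration of the risk measure above (prices invented): statistics.stdev over the 'Last' price series, so the more volatile series gets the larger risk value.

import statistics

steady_prices = [3.10, 3.12, 3.09, 3.11, 3.10]
volatile_prices = [3.10, 3.60, 2.80, 3.40, 2.90]

print(round(statistics.stdev(steady_prices), 4))    # 0.0114
print(round(statistics.stdev(volatile_prices), 4))  # 0.3362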
Example #13
def create_historical_prices_csv_link(stockTicker):
    """
    Creates a csv link used to download the historical prices csv. Using todays date, and 3 years prior

    Args:
        stockTicker (String): Contains the ticker of the company

    Returns:
        csvLink (String): url which holds the csv
    """
    fromDate = (datetime.now() - timedelta(days=365 * 3)).strftime('%Y-%m-%d')
    toDate = datetime.now().strftime('%Y-%m-%d')
    csvLink = "https://companyresearch-nzx-com.ezproxy.aut.ac.nz/deep_ar/functions/csv_prices.php?"
    csvLink += ("default=" + stockTicker + "&" + "fd=" + fromDate + "&" +
                "td=" + toDate)
    logger.info("Pulling historical price data from: " + csvLink)
    return csvLink
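For illustration, with a hypothetical ticker 'ABC' and a scrape run on 2021-06-01, the link built above would look like this (the fd/td dates shift with the current day):

https://companyresearch-nzx-com.ezproxy.aut.ac.nz/deep_ar/functions/csv_prices.php?default=ABC&fd=2018-06-02&td=2021-06-01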
Example #14
def print_summary_sheet(workbook, stock, formats):
    logger.info("       Printing Summary & Ratios for " +
                stock.stockSummaryDict["Ticker"])
    row = 0
    col = 0
    worksheet = workbook.add_worksheet(stock.stockSummaryDict["Ticker"] +
                                       "_Summary")
    for key, value in stock.stockSummaryDict.items():
        worksheet.write_string(row, col, key)
        worksheet.write_string(row, col + 2, value)
        row += 1

    worksheet.write_url(0, col + 5, "internal:Overview!A1", string="BACK")
    worksheet.write_url(0,
                        col + 7,
                        "internal:" + stock.stockSummaryDict["Ticker"] +
                        "_HistoricalPrices!A1",
                        string="Historical Prices")
Example #15
def save_log_to_pastebin():
    """
    This method is used to retrieve logs from Heroku, where it would otherwise be impossible
    It sends any log files to Pastebin, where we can monitor how it is functioning
    """
    pastebinApiURL = 'https://pastebin.com/api/api_post.php'
    dev_key = '5f996bee7fa49af7481927ddce874367'
    user_key = '77787566e1fa286ab849d7b0e22169c9'

    # Check number of pastes
    dataList = {}
    dataList['api_dev_key'] = dev_key
    dataList['api_option'] = 'list'
    dataList['api_user_key'] = user_key
    r = requests.post(pastebinApiURL, data=dataList)
    pastesString = "<pastes>" + r.text + "</pastes>"
    root = ElementTree.fromstring(pastesString)
    numPastes = len(root.findall('paste'))

    # Find the oldest paste then delete it
    if numPastes == 10:
        logger.info("10 Pastes found, deleting one to make paste for next one")
        oldestPaste = root[0][0].text
        oldestDate = int(root[0][1].text)
        for paste in root:
            if int(paste[1].text) < oldestDate:
                oldestPaste = paste[0].text
                oldestDate = int(paste[1].text)

        dataDelete = {}
        dataDelete['api_dev_key'] = dev_key
        dataDelete['api_option'] = 'delete'
        dataDelete['api_user_key'] = user_key
        dataDelete['api_paste_key'] = oldestPaste
        r = requests.post(pastebinApiURL, data=dataDelete)
        logger.info("Deleted {} paste".format(oldestPaste))

    logger.info("Sending logs to Pastebin")
    logger.info("Bye, Felicia")

    dataPaste = {}
    dataPaste['api_dev_key'] = dev_key
    dataPaste['api_option'] = 'paste'
    with open("python_logging.log", "r") as logging_file:
        dataPaste['api_paste_code'] = logging_file.read()
    dataPaste['api_user_key'] = user_key
    dataPaste['api_paste_name'] = 'JSON Scrape Backup ' + str(datetime.now())
    # dataPaste['api_paste_format'] = 'json'
    dataPaste['api_paste_private'] = '2'
    dataPaste['api_paste_expire_date'] = '6M'

    r = requests.post(pastebinApiURL, data=dataPaste)
    print("New Paste at: " + r.text)
Example #16
def print_financial_profile_sheet(workbook, stock, formats):
    logger.info("       Printing Financial Profile for " +
                stock.stockSummaryDict["Ticker"])
    row = 0
    col = 0

    # Create sheet
    worksheet = workbook.add_worksheet(stock.stockSummaryDict["Ticker"] +
                                       "_FinancialProfile")
    # Print Headers & Values

    logger.debug(stock.stockFinancialProfileDict.items())
    for key, value in stock.stockFinancialProfileDict.items():
        worksheet.write_string(row, col, key)
        worksheet.write_string(row, col + 1, value)
        row += 1
    worksheet.write_url(0,
                        13,
                        "internal:" + stock.stockSummaryDict["Ticker"] +
                        "_Summary!A1",
                        string="BACK")
Example #17
def print_historical_dividends_sheet(workbook, stock, formats):
    if stock.stockHistoricalDividendsDict is not None:
        logger.info("       Printing Historical Dividends for " +
                    stock.stockSummaryDict["Ticker"])
        row = 0
        col = 0

        # Create sheet
        worksheet = workbook.add_worksheet(stock.stockSummaryDict["Ticker"] +
                                           "_HistoricalDividends")
        # Print Headers
        keys = stock.stockHistoricalDividendsDict[0].keys()
        logger.debug(stock.stockHistoricalDividendsDict[0].keys())
        for key in keys:
            worksheet.write_string(row, col, key)
            col += 1
        worksheet.write_url(row,
                            col + 13,
                            "internal:" + stock.stockSummaryDict["Ticker"] +
                            "_Summary!A1",
                            string="BACK")

        row = 1
        col = 0

        # Print Items
        for rowItems in stock.stockHistoricalDividendsDict:
            logger.debug(rowItems)
            for key, value in rowItems.items():
                logger.debug(value)
                if (key == 'Date'):
                    worksheet.write_datetime(
                        row, col, datetime.strptime(value, '%d %b %Y'),
                        formats['dateFormat'])
                else:
                    worksheet.write_number(row, col, float(value))
                col += 1
            row += 1
            col = 0
Example #18
def print_excel(stockDataArray):
    logger.info("Printing excel document")
    # Create excel workbook
    workbook = xlsxwriter.Workbook('StockDB.xlsx')

    # Excel Cell Formats
    formats = {}
    formats['dateFormat'] = workbook.add_format({'num_format': 'd mmm yyyy'})
    formats['moneyFormat'] = workbook.add_format({'num_format': '$#,##0'})
    formats['number2decFormat'] = workbook.add_format({'num_format': '#.##'})

    print_overview_sheet(workbook, stockDataArray, formats)

    for stock in stockDataArray:
        print_summary_sheet(workbook, stock, formats)
        print_historical_prices_sheet(workbook, stock, formats)
        print_Directors(workbook, stock, formats)
        print_company_profile(workbook, stock, formats)
        print_historical_dividends_sheet(workbook, stock, formats)
        print_financial_profile_sheet(workbook, stock, formats)

    print_ratios_db(workbook, stockDataArray, formats)

    workbook.close()
Example #19
def find_normal_ranges(stockDataArray):
    """
	Finds the max and min for each index. To prevent a company from receiving a perfect or 0 zero, a buffer of 1 has been added.

    Args:
        stockDataArray (List): dictionary of all company information

    Returns:
		Dict: Contains the max and min of each index
	"""
    normalisationRanges = {
        "Dividend Yield Max": 0,
        "Dividend Yield Min": 0,
        "Return on Equity Max": 0,
        "Return on Equity Min": 0,
        "Sharpe Ratio Max": 0,
        "Sharpe Ratio Min": 0,
        "Debt Equity Max": 0,
        "Debt Equity Min": 0,
    }
    for stock in stockDataArray:
        # Dividend Yield Ranges
        if stock['Ratio']['Net Yield'] >= normalisationRanges[
                'Dividend Yield Max']:
            normalisationRanges[
                'Dividend Yield Max'] = stock['Ratio']['Net Yield'] + 1
        if stock['Ratio']['Net Yield'] <= normalisationRanges[
                'Dividend Yield Min']:
            normalisationRanges[
                'Dividend Yield Min'] = stock['Ratio']['Net Yield'] - 1

        # Return on Equity Ranges
        netIncome = stock['FinancialProfile']['Data']['Income']['Net Income']
        shareholderEquity = stock['FinancialProfile']['Data']['Balance'][
            'Total Equity']
        stock['Ratio']['Return on Equity'] = (netIncome /
                                              shareholderEquity) * 100

        if stock['Ratio']['Return on Equity'] >= normalisationRanges[
                'Return on Equity Max']:
            normalisationRanges['Return on Equity Max'] = stock['Ratio'][
                'Return on Equity'] + 1
        if stock['Ratio']['Return on Equity'] <= normalisationRanges[
                'Return on Equity Min']:
            normalisationRanges['Return on Equity Min'] = stock['Ratio'][
                'Return on Equity'] - 1

        # Sharpe Ratio
        if stock['Ratio']['Sharpe Ratio'] >= normalisationRanges[
                'Sharpe Ratio Max']:
            normalisationRanges[
                'Sharpe Ratio Max'] = stock['Ratio']['Sharpe Ratio'] + 1
        if stock['Ratio']['Sharpe Ratio'] <= normalisationRanges[
                'Sharpe Ratio Min']:
            normalisationRanges[
                'Sharpe Ratio Min'] = stock['Ratio']['Sharpe Ratio'] - 1

        # Debt Equity
        totalLiability = stock['FinancialProfile']['Data']['Balance'][
            'Total Liabilities']
        totalEquity = stock['FinancialProfile']['Data']['Balance'][
            'Total Equity']
        stock['Ratio']['Debt Equity'] = totalLiability / totalEquity

        if stock['Ratio']['Debt Equity'] >= normalisationRanges[
                'Debt Equity Max']:
            normalisationRanges[
                'Debt Equity Max'] = stock['Ratio']['Debt Equity'] + 1
        if stock['Ratio']['Debt Equity'] <= normalisationRanges[
                'Debt Equity Min']:
            normalisationRanges[
                'Debt Equity Min'] = stock['Ratio']['Debt Equity'] - 1
    logger.info(normalisationRanges)
    return normalisationRanges
Example #20
def save_data(stockDataArray, success):
    """
    Constructs a dictionary of company information. Converts it JSON, and sends it externally using send_to_server()

    Args:
        stockDataArray (List): dictionary of all company information
        success (Boolean): To indicate whether the scraping was succesful, to identify if processing needs to occur
    """
    currentTimeStamp = datetime.now().strftime('%Y/%m/%d')
    scrapeInsert = {currentTimeStamp: {'Date': currentTimeStamp}}
    if success:
        logger.info("Saving data")
        print("Saving data")

        dividendInsert = {'Data': {}, 'Name': 'HistoricalDividends'}
        priceInsert = {'Data': {}, 'Name': 'HistoricalPrices'}

        stockIteration = 0
        # Select stock
        for stock in stockDataArray:
            currentStockTicker = stock['Summary']['Ticker']
            logger.info("Saving data for: " + currentStockTicker)
            stockInsert = {}

            # Create stock dict from scraped data
            for sectionKey, sectionData in stock.items():
                logger.info(sectionKey)
                sectionInsert = {}
                if sectionKey == 'HistoricalPrices':
                    for line in sectionData:
                        logger.debug(line)
                        dateString = line.pop('Date')
                        dateString = (datetime.strptime(
                            dateString, '%d %b %Y')).strftime("%Y-%m-%d")
                        sectionInsert[dateString] = line
                    stockInsert[sectionKey] = sectionInsert
                elif sectionKey == 'HistoricalDividends':
                    try:
                        for line in sectionData:
                            logger.debug(line)
                            dateString = line.pop('Date')
                            dateString = (datetime.strptime(
                                dateString, '%d %b %Y')).strftime("%Y-%m-%d")
                            sectionInsert[dateString] = line.pop(
                                'Dividend Paid')
                        stockInsert[sectionKey] = sectionInsert
                    except TypeError:
                        pass
                else:
                    for elementKey, elementValue in sectionData.items():
                        sectionInsert[elementKey] = elementValue
                    stockInsert[sectionKey] = sectionInsert

            scrapeInsert[currentTimeStamp][stock['Summary']
                                           ['Ticker']] = stockInsert
            stockIteration += 1
            printProgressBar(
                stockIteration,
                len(stockDataArray),
                prefix='Saving {} data'.format(stock['Summary']['Ticker']),
                suffix='of {} companies completed'.format(len(stockDataArray)))

        with open('data.txt', 'w') as outfile:
            json.dump(scrapeInsert, outfile, indent=4)

        # save_result_to_pastebin(scrapeInsert, currentTimeStamp)
        send_to_server(scrapeInsert)
        send_files_to_server()
    else:
        scrapeInsert[currentTimeStamp] = {}
        # save_result_to_pastebin(scrapeInsert, currentTimeStamp)
        send_to_server(scrapeInsert)
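As a rough sketch, the scrapeInsert dictionary that save_data builds and sends takes roughly this shape. The nesting and date keys follow from the code above; the inner field values are invented for illustration.

# Illustrative shape of scrapeInsert (values invented)
{
    "2021/06/01": {
        "Date": "2021/06/01",
        "ABC": {
            "Summary": {"Ticker": "ABC", "Name": "ABC Ltd", "Score": 0.51},
            "HistoricalPrices": {"2021-05-31": {"Last": 3.10}},
            "HistoricalDividends": {"2021-03-15": 0.05},
        },
    },
}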
Example #21
from time import time
import shutil

from nzxscraper import logger

startTime = time()
browser = get_browser()

try:
    stockTickersList = list_companies(browser)

    # Initialise the array which is going to store Stock class objects
    stockDataArray = []

    # For each ticker in the list, find the link to the respective summary page
    for stock in stockTickersList:
        stockData = scrape_company(browser, stock)
        stockDataArray.append(stockData)
    logger.info("Scraping complete")
finally:
    browser.quit()
    shutil.rmtree(downloadDirectory)
    logger.info("Temporary files deleted")

    # print_excel(stockDataArray)
    # logger.info("Excel ready")

    endTime = time()
    logger.info("That took a total of: " + str(round(endTime - startTime)) +
                " seconds.")
    logger.info(
        str(round((endTime - startTime) / COMPANIES)) +
        " seconds per company.")
Example #22
def create_historical_dividends_csv_link(stockTicker):
    csvLink = "https://companyresearch-nzx-com.ezproxy.aut.ac.nz/deep_ar/divhistory_csv.php?selection=" + stockTicker
    logger.info("Pulling historical dividend data from: " + csvLink)
    return csvLink
Example #23
def scrape_company(browser, stock):
    """
    Contains the logic behind the scraping of an entire company's data

    Navigating to pages, downloading files

    Args:
        browser (Selenium.WebDriver): The automated Chrome browser
        stock (String): The stock ticker currently being scraped

    Returns:
        stockData (Stock): Class containing dictionaries of data
    """
    logger.info("Current Stock: " + stock)
    stockInnerIteration = 0
    numFuncs = 10
    printProgressBar(stockInnerIteration,
                     numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))

    # Arrive at Summary & Ratios page and pull information
    browser.find_element_by_link_text(stock).click()
    summarySoup = BeautifulSoup(browser.page_source, 'lxml')
    logger.info("Pulling ratio information")
    stockSummaryDict = get_stock_summary(summarySoup)
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration,
                     numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))
    stockRatioDict = get_ratios(summarySoup)
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration,
                     numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))

    # Create csv link for historical prices and pull it into a temporary folder
    csvLink = create_historical_prices_csv_link(stock)
    logger.info("Pulling historical prices information")
    browser.get(csvLink)

    # Create csv link for dividends and pull it into a temporary folder
    csvLink = create_historical_dividends_csv_link(stock)
    logger.info("Pulling historical dividends information")
    browser.get(csvLink)

    # Arrive at Annual Reports and pull latest annual report
    # TODO May require refactor of xpath to shorten it (Looks nicer)
    # TODO change dl directory outside temp
    # Try/except in case the annual report is unavailable
    try:
        logger.info("Pulling annual report")
        year = int(datetime.now().strftime('%Y'))
        annualReportLink = create_annual_report_link(stock, str(year))
        browser.get(annualReportLink)
        if browser.find_element_by_xpath(
                ".//title[contains(text(), '404 Not Found')]"):
            browser.execute_script(
                "window.history.go(-1)")  # Go back to summary page
            annualReportLink = create_annual_report_link(stock, str(year - 1))
            browser.get(annualReportLink)
            if browser.find_element_by_xpath(
                    ".//title[contains(text(), '404 Not Found')]"):
                browser.execute_script(
                    "window.history.go(-1)")  # Go back to summary page
    except Exception:
        pass
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration,
                     numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))
    # browser.execute_script("window.history.go(-1)") # Go back to summary page

    # Create and get the tear sheet for the company
    tearSheetLink = 'https://companyresearch-nzx-com.ezproxy.aut.ac.nz/tearsheets/' + stock + '.pdf'
    browser.get(tearSheetLink)
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration,
                     numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))

    # Arrive at Company Directory and pull directors information
    browser.find_element_by_xpath(
        ".//span[contains(text(), 'Company Directory')]").click()
    directorSoup = BeautifulSoup(browser.page_source, 'lxml')
    logger.info("Pulling Director's information")
    stockDirectorDict = get_director_information(directorSoup)
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration,
                     numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))
    browser.execute_script("window.history.go(-1)")  # Go back to summary page

    # Arrive at Company Profile and pull description information
    browser.find_element_by_xpath(
        ".//span[contains(text(), 'Company Profile')]").click()
    profileSoup = BeautifulSoup(browser.page_source, 'lxml')
    logger.info("Pulling company description")
    stockProfileDict = get_company_profile(profileSoup)
    logger.debug(stockProfileDict)
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration,
                     numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))
    browser.execute_script("window.history.go(-1)")  # Go back to summary page

    # Arrive at Financial Profile and pull debt-equity information
    browser.find_element_by_xpath(
        ".//span[contains(text(), 'Financial Profile')]").click()
    stockSoup = BeautifulSoup(browser.page_source, 'lxml')
    logger.info("Pulling financial profile information")
    stockFinancialProfileDict = get_financial_profile(stockSoup)
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration,
                     numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))
    browser.execute_script("window.history.go(-1)")  # Go back to summary page

    # Read in the prices csv
    stockHistoricalPricesDict = get_stock_historical_prices(
        tempDirectory + stock + " Historical Prices.csv")
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration,
                     numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))

    # Read in dividends csv
    stockHistoricalDividendsDict = get_stock_historical_dividends(
        tempDirectory + stock + " Historical Dividends.csv")
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration,
                     numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))

    # Go back to the stock ticker page
    logger.info("Back to company listings")
    browser.execute_script("window.history.go(-1)")
    stockInnerIteration += 1
    printProgressBar(stockInnerIteration,
                     numFuncs,
                     prefix='Scraping {} data'.format(stock),
                     suffix='of {} completed'.format(stock))

    # Create the stock obj and store it in an array
    stockData = {
        'Summary': stockSummaryDict,
        'Ratio': stockRatioDict,
        'HistoricalPrices': stockHistoricalPricesDict,
        'HistoricalDividends': stockHistoricalDividendsDict,
        'FinancialProfile': stockFinancialProfileDict,
        'Profile': stockProfileDict,
        'Directors': stockDirectorDict
    }

    return stockData
Example #24
def list_companies(browser):
    """
    Creates a list which will be used to iterate through selected companies

    Args:
        browser (Selenium.WebDriver): The automated Chrome browser

    Returns:
        stockNames (List): list of company tickers to be scraped
    """
    # Login
    browser.find_element_by_xpath('//*[@id="username"]').send_keys(username)
    browser.find_element_by_xpath('//*[@id="password"]').send_keys(password)
    browser.find_element_by_xpath('//*[@id="login"]/section[4]/button').click()
    logger.info("Logged into NZX System")

    # Arrive at Market Activity Page
    browser.find_element_by_xpath(
        ".//a[contains(text(), 'Company Research')]").click()
    logger.info("Arrived at Market Activity Page")
    # Click "View all" for main market
    browser.find_elements_by_xpath(
        ".//a[contains(text(), 'view all')]")[0].click()
    logger.info("Arrived at Market Overview Page")
    # Sort in descending order by clicking the 26th "a" tag
    browser.find_elements_by_css_selector('td > a')[25].click()
    logger.info(
        "Arrived at Market Overview sorted by marketcap in descending order")

    # Parse the page source into BeautifulSoup
    # The page is the list of stocks in Descending order of Market Cap
    html = browser.page_source
    htmlSoup = BeautifulSoup(html, 'lxml')
    logger.info("Market Overview Page parsed")

    # Put all the stock tickers into a list
    stocksSoup = htmlSoup.find_all('a', {'class': 'text'}, limit=COMPANIES)
    stockNames = []
    for stock in stocksSoup:
        stockNames.append(stock.getText())

    logger.info("List of companies to scrape finalised")
    return stockNames
Example #25
def save_data():
    logger.info("Im free")