Example no. 1
    def update_stocker_stats(self, num_urls, source, num_nodes):
        if self.verbose:
            utility.sysprint('updating stocker stats at {}'.format(
                self.stats_file))
        logger.debug('updating stocker stats at {}'.format(self.stats_file))

        with open(self.stats_file, 'r') as f:
            data = json.load(f)

        # add urls to json
        if source in data.keys():
            original = data[source]

            # error occurring here
            updated = {}
            updated['num_urls'] = original['num_urls'] + num_urls
            updated['num_nodes'] = original['num_nodes'] + num_nodes
            data.update({source: updated})
        else:
            data.update(
                {source: {
                    'num_urls': num_urls,
                    'num_nodes': num_nodes
                }})

        # write the updated JSON
        with open(self.stats_file, 'w') as f:
            json.dump(data, f, indent=4)
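
The if/else branch above can also be collapsed with dict.get. Below is a minimal standalone sketch of the same read-modify-write pattern; STATS_FILE and bump_stats are hypothetical stand-ins for self.stats_file and the method, not part of the original code.

import json

STATS_FILE = 'stocker_stats.json'  # hypothetical path standing in for self.stats_file

def bump_stats(source, num_urls, num_nodes):
    """Read-modify-write the per-source counters, mirroring update_stocker_stats above."""
    with open(STATS_FILE, 'r') as f:
        data = json.load(f)
    entry = data.get(source, {'num_urls': 0, 'num_nodes': 0})
    entry['num_urls'] += num_urls
    entry['num_nodes'] += num_nodes
    data[source] = entry
    with open(STATS_FILE, 'w') as f:
        json.dump(data, f, indent=4)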
Example no. 2
    def update_parsed_urls(self, urls, query):
        """writes parsed links to JSON file to avoid reparsing"""
        if self.verbose:
            utility.sysprint('writing {} link(s) to {}'.format(
                len(urls), self.url_file))
        logger.debug('writing {} link(s) to {}'.format(len(urls),
                                                       self.url_file))

        ticker, source = query.ticker.upper(), query.source.lower()
        with open(self.url_file, 'r') as f:
            data = json.load(f)

        # add urls to object
        if ticker in data.keys():
            if source in data[ticker].keys():
                original = data[ticker][source]
                updated = original + urls
                data[ticker].update({source: updated})
            else:
                data[ticker].update({source: urls})
        else:
            sub_obj = {ticker: {source: urls}}
            data.update(sub_obj)

        with open(self.url_file, 'w') as f:
            json.dump(data, f, indent=4)
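
The nested membership checks can also be expressed with dict.setdefault. A small sketch under the assumed {TICKER: {source: [urls]}} layout follows; the ticker, source, and URLs are only illustrative values.

data = {'AAPL': {'bloomberg': ['https://example.com/old-article']}}  # assumed layout
ticker, source = 'AAPL', 'bloomberg'
urls = ['https://example.com/new-article']

entry = data.setdefault(ticker, {})
entry[source] = entry.get(source, []) + urls
print(entry[source])  # contains both the old and the new url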
Example no. 3
 def get_nasdaq_top_100(self):
     """returns a list of the NASDAQ top 100 stock tickers"""
     if self.verbose:
         utility.sysprint('Loading NASDAQ100')
     url = 'https://www.cnbc.com/nasdaq-100/'
     resp, err = self.requestHandler.get(url)
     if err:
         logger.error(
             'Error retrieving top nasdaq 100, request returned: {}'.format(
                 err))
         return []
     soup = BeautifulSoup(resp.content, 'html.parser')
     table = soup.find('table', {'class': 'quoteTable'})
     if table is None:
         logger.error('Could not find quote table when parsing NASDAQ100')
         return []
     tds = table.find_all('td')
     a_tags = [td.find('a') for td in tds if td.find('a')]
     tickers = [re.sub(r'\s+', '', a.get_text()) for a in a_tags]
     if self.verbose:
         utility.sysprint('Finished loading NASDAQ100')
     return tickers
Example no. 4
 def get_nyse(self):
     """ """
     if self.verbose:
         utility.sysprint('Loading NYSE')
     page = 1
     tickers = []
     while page < 64:  # the screener paginates NYSE listings across pages 1-63
         url = 'https://www.nasdaq.com/screening/companies-by-industry.aspx?exchange=NYSE&page={}'.format(
             page)
         if self.verbose:
             utility.sysprint('getting url: {}'.format(url))
         resp, err = self.requestHandler.get(url)
         if err:
             logger.error(
                 'Error retrieving page {} when getting NYSE tickers, request returned: {}'
                 .format(page, err))
             return tickers
         page += 1
         soup = BeautifulSoup(resp.content, 'html.parser')
         table = soup.find('table', {'id': 'CompanylistResults'})
         if table is None:
             logger.error(
                 'Could not find company list table on page {}'.format(
                     page - 1))
             continue
         trs = [
             tr for idx, tr in enumerate(table.find_all('tr')) if idx % 2
         ]
         for t in trs:
             td = t.find_all('td')[1]
             ticker = td.find('a').get_text()
             tickers.append(re.sub(r'\s+', '', ticker))
     if self.verbose:
         utility.sysprint('Finished loading NYSE')
     return tickers
Example no. 5
    def build_nodes(self, query, urls, flags):
        """uses the urls to build WebNodes to be written to the csv output"""
        nodes = []
        if len(urls) == 0:
            logger.debug(
                'Build nodes found no (new) urls to parse (query: {})'.format(
                    query.string))
            return nodes, None
        j = '.'
        company_info, err = self.financeHelper.get_company_info(query.ticker)
        if err:
            company_info = {}
            logger.error('Error getting company information for {}'.format(
                query.ticker))
        for i, url in enumerate(urls):
            if self.verbose:
                utility.sysprint(
                    'parsing urls for query: {}'.format(query.string) + j *
                    (i % 3))

            if self.is_homepage(url, query.source):
                logger.debug(
                    'Hit a homepage ({}) ... continuing to next iteration.'.
                    format(url))
                continue

            articleParser = ArticleParser(url, query, company_info, **flags)
            node, err = articleParser.generate_web_node()
            if err:
                logger.error(
                    'unable to generate node for {} ... continuing to next iteration'
                    .format(url))
                continue
            nodes.append(node)

        if self.verbose:
            utility.sysprint('built {} Web Nodes'.format(len(nodes)))
        logger.debug('built {} Web Nodes'.format(len(nodes)))

        return nodes, None
Example no. 6
    def get_stock_movers(self):
        """Includes common stocks, ADRs and REITs listed on NYSE, Nasdaq or NYSE American with a prior day close of $2 a share or higher and volume of at least 50,000"""
        if self.verbose:
            utility.sysprint('Loading stock movers')

        url = 'https://www.wsj.com/market-data/stocks/us/movers'
        resp, err = self.requestHandler.get(url)
        if err:
            logger.error(
                'Error retrieving stock movers, request returned: {}'.format(
                    err))
            return []

        tickers = []
        soup = BeautifulSoup(resp.content, 'html.parser')
        rows = soup.find_all('td')
        for row in rows:
            a = row.find('a')
            if a:
                # findall() returns a list; use a capture group to pull out the parenthesised ticker
                match = re.search(r'\(([^)]+)\)', a.get_text())
                if match:
                    tickers.append(match.group(1))
        if self.verbose: utility.sysprint('Finished loading stock movers')
        return list(tickers)
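
For clarity, here is the parenthesised-ticker extraction in isolation. The cell text 'Apple Inc. (AAPL)' is only an assumed example of what a cell in the movers table contains.

import re

cell_text = 'Apple Inc. (AAPL)'  # hypothetical contents of a <td> link
match = re.search(r'\(([^)]+)\)', cell_text)
if match:
    print(match.group(1))  # -> AAPL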
Example no. 7
    def get_snp_500(self):
        """returns a list of the S&P500 stock tickers"""
        if self.verbose:
            utility.sysprint('Loading SNP500')

        url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
        resp, err = self.requestHandler.get(url)
        if err:
            logger.error(
                'Error retrieving snp 500s, request returned: {}'.format(err))
            return []

        soup = BeautifulSoup(resp.content, 'lxml')
        table = soup.find('table', {'id': 'constituents'})
        tickers = []
        for row in table.find_all('tr'):
            col = row.find_all('td')
            if len(col):
                ticker = str(col[0].get_text().strip())
                tickers.append(ticker)

        if self.verbose: utility.sysprint('Finished loading SNP500')
        return tickers
Example no. 8
    def update_data_file(self, nodes, query):
        """writes the data gathered to a json file"""
        if self.verbose:
            utility.sysprint('writing {} node(s) to {}'.format(
                len(nodes), self.data_file))
        logger.debug('writing {} node(s) to {} for query {}'.format(
            len(nodes), self.data_file, query))

        with open(self.data_file, 'r') as f:
            data = json.load(f)

        for node in nodes:
            self.qualtricsHandler.submit_node(node)
            key = '+'.join([query.ticker, query.source])
            if key in data.keys():
                sub_nodes = data[key]
                sub_nodes.append(dict(node))
                data[key] = sub_nodes
            else:
                data[key] = [dict(node)]

        with open(self.data_file, 'w') as f:
            json.dump(data, f, indent=4)
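
For orientation, the data file ends up keyed by 'TICKER+source' with a list of serialized nodes per key. A minimal illustrative layout follows; the node fields shown are placeholders, since dict(node) depends on the WebNode class.

data = {
    'AAPL+bloomberg': [
        {'url': 'https://example.com/article-1'},  # placeholder node fields
        {'url': 'https://example.com/article-2'},
    ],
}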
Example no. 9
 def build_queries(self, depth=1):
     """creates google queries based on the provided stocks and news sources"""
     i = 0
     tot = len(self.tickers) * len(self.sources) * depth
     for t in self.tickers:
         for s in self.sources:
             if self.verbose:
                 utility.sysprint(
                     'Building {} out of {} queries | {:.0%} done'.format(
                         i, tot, i / tot))
             i += 1
             string1 = t + '+' + s + '+' + 'stock+articles'
             if depth > 1:
                 company_name = self.financeHelper.get_name_from_ticker(t)
                 if company_name:
                     garbage = ['Inc.']
                     string2 = '+'.join([
                         j for j in company_name.split(' ')
                         if j not in garbage
                     ]) + '+' + s + '+stock+news'
                     self.queries.append(utility.Query(t, s, string2))
                     i += 1
             self.queries.append(utility.Query(t, s, string1))
     logger.debug('built {} queries'.format(len(self.queries)))
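
To make the query format concrete, the strings built above take this shape; the ticker and source values are only examples.

t, s = 'AAPL', 'bloomberg'
string1 = t + '+' + s + '+' + 'stock+articles'
print(string1)  # AAPL+bloomberg+stock+articles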
Example no. 10
def gather_data():
    """call financial web scraping API with user defined parameters"""

    # rh = RequestHandler()
    # print (rh.generate_proxies())

    financeHelper = FinanceHelper()
    # stock_movers = financeHelper.get_stock_movers()
    # print (stock_movers[:10])
    # print (financeHelper.get_name_from_ticker('AAPL'))
    # print ('sector industry of SAP is: {}'.format(financeHelper.get_company_info('AAPL')))

    nasdaq100 = financeHelper.get_nasdaq_top_100()
    snp500 = financeHelper.get_snp_500()
    nyse = financeHelper.get_nyse()
    stocks = [
        'SAP', 'AAPL', 'GOOG', 'GPRO', 'TSLA', 'APRN', 'FB', 'NVDA', 'SNAP',
        'SPY', 'NFLX', 'AMZN', 'AMD'
    ]
    markets = nyse + snp500 + nasdaq100
    random.shuffle(markets)
    random.shuffle(stocks)
    tickers = stocks + markets

    sources = utility.get_valid_sources()

    worker = Stocker(tickers, sources, configpath='credentials.json')

    flags = {
        'date_checker': False,
        'depth': 1,
        'validate_url': False,
        'length_check': True,
        'min_length': 100,
    }
    worker.stock(flags=flags)
    print('\n\nFinished Process Successfully.')
Example no. 11
    def stock(self,
              gui=True,
              json_output=True,
              csv_output=True,
              depth=1,
              query=True,
              shuffle=False,
              flags={}):
        """main function for the class. Begins the worker to get the information based on the queries given"""
        if query:
            self.build_queries(depth=depth)

        total = len(self.queries)
        if total == 0:
            return None

        trange_args = {
            'total': total,
            'unit': 'query',
            'desc': self.queries[0].ticker,
            'postfix': {
                'source': self.queries[0].source
            },
            'dynamic_ncols': True,
            'leave': True,
            'miniters': 1
        }

        if gui: t = trange(total, **trange_args)
        else: t = range(total)

        nodes = []  # ensure a defined return value even if every query is skipped
        for i in t:
            query = self.queries[i]
            if self.verbose:
                utility.sysprint('Processing query: {}'.format(query.string))
            logger.debug('Processing query: {}'.format(query.string))

            if gui:
                t.set_description(query.ticker.upper())
                t.set_postfix(source=query.source)
                t.update()

            urls = self.get_urls(query)
            if not urls:
                logger.debug('No URLs found.')
                continue

            urls_found = len(urls)
            logger.debug('urls are {}'.format(json.dumps(urls, indent=4)))

            nodes, err = self.build_nodes(query, urls, flags)
            if err:
                logger.error(
                    'Error raised in node building phase: {}'.format(err))

            if len(nodes):
                self.update_stocker_stats(urls_found, query.source, len(nodes))
                self.update_data_file(nodes, query)
                self.update_parsed_urls(urls, query)
            else:
                logger.debug(
                    'Node list is empty, continuing to next iteration.')

            utility.sysprint('Finished gathering data for query: {}'.format(
                query.string))

        if gui:
            t.close()
        return nodes