def parse_yahoo_stk_hist(url):
    try:
        name = url.split("=")[1].split(".")[0]
        # print("Name = ", name)
        bs = h.parse_url(url)
        if bs:
            table = bs.find('div', {'class': "Pb(10px) Ovx(a) W(100%)"}).find_all(
                "table", {"class": "W(100%) M(0)"})[0]
            data = [[td.string.strip() for td in tr.find_all('td') if td.string]
                    for tr in table.find_all('tr')[2:]][:-1]
            # print(data)
            # data.insert(0, ['STK_DATE', 'OPEN', 'HIGH', 'LOW', 'CLOSE', 'ACLOSE', 'VOLUME'])
            df = get_data_frame(data, name)
            if len(df) > 0:
                df_columns = list(df)
                table = "STK_INFO_HISTORY"
                constraint = ', '.join(['STK_DATE', 'NSE_CODE'])
                values = "to_date(%s, 'DD-MON-YYYY'), %s, %s, %s, %s, %s, %s"
                insert_stmt = h.create_update_query(table, df_columns, values, constraint)
                conn = psycopg2.connect(database="trading", user="******", password="******")
                conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
                cursor = conn.cursor()
                execute_batch(cursor, insert_stmt, df.values)
                conn.commit()
                db.close_connection(conn, cursor)
    except Exception as err:
        traceback.print_exc()
        print("Exception = ", str(err))
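# NOTE: h.create_update_query() is called above but not defined in this section.
# A minimal sketch of what such an upsert-statement builder might look like,
# assuming it targets PostgreSQL's INSERT ... ON CONFLICT ... DO UPDATE syntax
# and that `values` is the pre-built placeholder string passed by the caller.
# This is an illustrative assumption, not the repository's actual helper.
def create_update_query_sketch(table, columns, values, constraint):
    """Build an upsert statement that updates every non-key column on conflict."""
    key_cols = [col.strip() for col in constraint.split(",")]
    updates = ", ".join(
        "{col} = EXCLUDED.{col}".format(col=col) for col in columns if col not in key_cols)
    return (
        "INSERT INTO {t} ({cols}) VALUES ({vals}) "
        "ON CONFLICT ({constraint}) DO UPDATE SET {updates};"
    ).format(t=table, cols=", ".join(columns), vals=values,
             constraint=constraint, updates=updates)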
def get_shares_details(stock_url, thread_count):
    # Variables declaration
    results_que = Queue()
    failed_que = Queue()
    start = time.time()
    # Get the shares from money control
    shares = get_shrs_from_mnctl(stock_url)
    log.info("Total number of shares returned = {}".format(len(shares)))
    # shares = {k: shares[k] for k in list(shares)[:50]}
    if shares and len(shares) > 0:
        # put into Queue
        url_que = get_shares_category(shares)
        log.info("Shares added to Queue to process...")
        for i in range(thread_count):
            t = threading.Thread(target=process_queue, args=(url_que, results_que, failed_que))
            t.daemon = True
            t.start()
        url_que.join()
        log.info("Failed url count = {}".format(failed_que.qsize()))
        log.info("Success url count = {}".format(results_que.qsize()))
        while not failed_que.empty():
            log.warning("Failed URL details = {}".format(failed_que.get()))
        final_data = {}
        while not results_que.empty():
            # final_data.append(results_que.get())
            tmp_dict = results_que.get()
            key = tmp_dict.get("CATEGORY")
            h.upd_dic_with_sub_list(key, tmp_dict, final_data)
        pd.set_option('display.max_columns', 15)
        for category in final_data:
            cat_up = category.upper()
            print("CATEGORY = {} and count = {}".format(cat_up, len(final_data[category])))
            df = pd.DataFrame(final_data[category])
            df = df.set_index("NAME")
            # Slice it as needed
            sliced_df = df.loc[:, ['MARKET CAP (Rs Cr)', 'EPS (TTM)', 'P/E', 'INDUSTRY P/E',
                                   'BOOK VALUE (Rs)', 'FACE VALUE (Rs)', 'DIV YIELD.(%)']]
            sliced_df = sliced_df.apply(pd.to_numeric, errors='ignore')
            # Sort by EPS (TTM) and P/E
            sorted_df = sliced_df.sort_values(by=['EPS (TTM)', 'P/E'], ascending=[False, False])
            writer_orig = pd.ExcelWriter(os.path.join(commons.get_prop('base-path', 'output'),
                                                      cat_up + '_Listings.xlsx'),
                                         engine='xlsxwriter')
            sorted_df.to_excel(writer_orig, index=True, sheet_name='report')
            writer_orig.save()
    print("Execution time = {0:.5f}".format(time.time() - start))
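# NOTE: process_queue() is the thread worker started above but not shown in this
# section. A minimal sketch of the expected shape, assuming each queue item is a
# (company_name, company_url, category) tuple and that parsed results and failures
# are routed to the two output queues. Illustrative assumption only.
def process_queue_sketch(url_que, results_que, failed_que):
    while not url_que.empty():
        item = url_que.get()
        try:
            cmp_name, cmp_url, category = item
            details = mny_ctr_shr_frm_url(cmp_name, cmp_url, category)
            if details:
                results_que.put(details)
            else:
                failed_que.put(item)
        except Exception:
            failed_que.put(item)
        finally:
            url_que.task_done()  # required so url_que.join() can return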
def get_ipo_details_perf(url):
    ipo_details = {}
    try:
        bs = h.parse_url(url)
        print("URL = ", url)
        if bs:
            table = bs.find("table", {"class": "table table-bordered table-condensed"})
            rows = table.findAll("tr")
            row = 1
            while row < len(rows):
                if row % 2 == 1:
                    cols = rows[row].find_all('td')
                    i = 0
                    ele_list = []
                    __key = None
                    for ele in cols:
                        i += 1
                        if i == 1:
                            __key = ele.text.strip()
                        __val = ele.text.strip()
                        if i == 4:
                            # date massage
                            day_list = __val.split(" ")
                            if len(day_list) == 3:
                                if len(day_list[1].strip(",")) == 1:
                                    day_list[1] = '0' + day_list[1].strip(",")
                                else:
                                    day_list[1] = day_list[1].strip(",")
                                __val = "-".join(day_list)
                            else:
                                __val = "-".join(['Jan', '01', '1900'])
                        if not (i == 2 or i == 3):
                            ele_list.append(__val)
                    ipo_details[__key] = ele_list
                elif row % 2 == 0:
                    divs = rows[row].find_all('div')[0]
                    sub_div = divs.descendants
                    i = 0
                    for div in sub_div:
                        if i == 5:
                            day_data = get_listed_details(div)
                            h.upd_dic_with_sub_list_ext(__key, day_data, ipo_details)
                        i += 1
                row += 1
    except Exception as err:
        traceback.print_exc()
        print("Exception =", str(err))
    return ipo_details
def mc_get_perf_stk_details(bs):
    comp_details = {}
    try:
        std_data = bs.find('div', {'id': 'mktdet_1'})
        for each_div in std_data.findAll('div', attrs={'class': 'PA7 brdb'}):
            sub_div = each_div.descendants
            __tag_name, __tag_value = None, None
            for cd in sub_div:
                if cd.name == 'div' and cd.get('class', '') == ['FL', 'gL_10', 'UC']:
                    __tag_name = cd.text
                if cd.name == 'div' and cd.get('class', '') == ['FR', 'gD_12']:
                    __tag_value = cd.text
                # print("BFR tag Name = {} and value = {}".format(__tag_name, __tag_value))
                if __tag_name and __tag_value and __tag_name in c.STK_RATIO_CON:
                    __tag_name = c.STK_RATIO_CON[__tag_name]
                    if __tag_name not in ['NAME', 'CATEGORY', 'SUB_CATEGORY']:
                        __tag_value = h.extract_float(__tag_value)
                    # print("AFR tag Name = {} and value = {}".format(__tag_name, __tag_value))
                    comp_details[__tag_name] = __tag_value
                    __tag_name, __tag_value = None, None
        # print("COMP DETAILS =", comp_details)
    except Exception as err:
        print("While parsing PERF DETAILS {}".format(err))
        traceback.print_exc()
    return comp_details
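# NOTE: h.extract_float() is used above to turn moneycontrol value strings such as
# "1,234.56 (3.2%)" into plain numbers. The real helper lives elsewhere in the
# repository; the sketch below is an assumed, illustrative equivalent.
import re

def extract_float_sketch(text):
    """Return the first numeric token in the string as a float, or 0.0."""
    match = re.search(r'-?\d+(?:\.\d+)?', text.replace(',', ''))
    return float(match.group()) if match else 0.0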
def mny_ctr_shr_frm_url(cmp_name, cmp_url):
    comp_details = {}
    try:
        comp_details['NAME'] = cmp_name
        elements = cmp_url.split("/")
        if len(elements) > 5:
            key = elements[5]
            comp_details['CATEGORY'] = key
        bs = h.parse_url(cmp_url)
        if bs:
            std_data = bs.find('div', {'id': 'mktdet_1'})
            for each_div in std_data.findAll('div', attrs={'class': 'PA7 brdb'}):
                sub_div = each_div.descendants
                __tag_name, __tag_value = None, None
                for cd in sub_div:
                    if cd.name == 'div' and cd.get('class', '') == ['FL', 'gL_10', 'UC']:
                        __tag_name = cd.text
                    if cd.name == 'div' and cd.get('class', '') == ['FR', 'gD_12']:
                        __tag_value = cd.text
                    if __tag_name and __tag_value:
                        comp_details[__tag_name] = __tag_value
                        __tag_name, __tag_value = None, None
            # print("COMP DETAILS =", comp_details)
    except Exception as err:
        # log.error("mny_ctr_shr_frm_url ERROR = ", str(err))
        raise err
    return comp_details
def mny_ctr_shr_frm_url(cmp_name, cmp_url, cat):
    with print_lock:
        # print("\nStarting thread {}".format(threading.current_thread().name))
        try:
            dict_comp = {"NAME": cmp_name}
            bs = h.parse_url(cmp_url)
            if bs:
                print("bs coming")
                std_data = bs.find('div', {'id': 'mktdet_1'})
                for each_div in std_data.findAll('div', attrs={'class': 'PA7 brdb'}):
                    sub_div = each_div.descendants
                    __tag_name, __tag_value = None, None
                    for cd in sub_div:
                        if cd.name == 'div' and cd.get('class', '') == ['FL', 'gL_10', 'UC']:
                            __tag_name = cd.text
                        if cd.name == 'div' and cd.get('class', '') == ['FR', 'gD_12']:
                            __tag_value = cd.text
                        if __tag_name and __tag_value:
                            dict_comp[__tag_name] = __tag_value
                            __tag_name, __tag_value = None, None
                return dict_comp
        except Exception as err:
            print("Error is = ", str(err))
            raise err
        return
def get_yahoo_fin_urls():
    NO_URL_COUNT = 1500
    yahoo = "https://in.finance.yahoo.com"
    i = 0
    url_list = []
    while i < NO_URL_COUNT:
        url = "https://in.finance.yahoo.com/most-active?offset=" + str(i) + "&count=25"
        url_list.append(url)
        i += 25
    urls = []
    for url in url_list:
        try:
            bs = h.parse_url(url)
            if bs:
                std_data = bs.find('div', {'class': "Ovx(s)"}).find_all(
                    "table", {"class": "W(100%)"})[0].find_all('tr')
                for tr in std_data:
                    link = tr.find('a', href=True, text=True)['href']
                    url = (yahoo + link).replace('?p', '/history?p')
                    if url not in urls:
                        urls.append(url)
        except Exception as err:
            print("Exception = ", str(err))
    print("No of URLs = ", len(urls))
    return urls
def load_stk_ratio():
    # Variables declaration
    start = time.time()
    file_path = os.path.join(commons.get_prop('base-path', 'ratio-input'))
    files = [os.path.join(file_path, fn) for fn in next(os.walk(file_path))[2]]
    all_pages = []
    try:
        for file in files:
            read_lines = h.read_list_from_json_file(file)
            all_pages.extend(read_lines)
        # Total number of links to process
        print("No of urls to process", len(all_pages))
        page_bins = h.chunks(THREAD_COUNT, all_pages)
        # use all available cores, otherwise specify the number you want as an argument
        pool = ThreadPool(processes=THREAD_COUNT)
        for link_array in page_bins:
            pool.apply_async(process_pages, args=(link_array,), callback=log_result)
        pool.close()
        pool.join()
        for df_frames in result_list:
            try:
                result = pd.concat(df_frames, ignore_index=True)
                if len(result) > 0:
                    df_columns = list(result)
                    table = "STK_PERF_HISTORY"
                    values = "to_date(%s, 'MONYY'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s"
                    constraint = ', '.join(['NAME', 'NSE_CODE', 'STK_YEAR'])
                    # create INSERT INTO table (columns) VALUES('%s',...)
                    insert_stmt = h.create_update_query(table, df_columns, values, constraint)
                    curr, con = db.get_connection()
                    execute_batch(curr, insert_stmt, result.values)
                    con.commit()
                    db.close_connection(con, curr)
            except Exception as err:
                print("Exception while inserting data into table ", str(err))
    except Exception as err:
        print(str(err))
    print("Execution time = {0:.5f}".format(time.time() - start))
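# NOTE: h.chunks() and the log_result() callback used above are defined elsewhere
# in the repository. Minimal sketches of their assumed behaviour: chunks() splits
# the url list into n roughly equal bins, and log_result() is the apply_async
# callback that appends each worker's DataFrames to the module-level result_list
# consumed above. Illustrative assumptions, not the actual helpers.
def chunks_sketch(n, items):
    """Split items into n roughly equal-sized bins."""
    size = max(1, -(-len(items) // n))  # ceiling division
    return [items[i:i + size] for i in range(0, len(items), size)]

result_list_sketch = []

def log_result_sketch(result):
    """apply_async callback: collect the DataFrames returned by each worker."""
    if result:
        result_list_sketch.append(result)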
def get_shares_details(stock_url, process_cnt):
    # Variables declaration
    failed_data = []
    start = time.time()
    # Get the shares from money control
    page_list = get_list_of_share_links(stock_url)
    page_list = page_list[:10]
    print("Total Process count = {}".format(process_cnt))
    print("Total URL count = {}".format(len(page_list)))
    # use all available cores, otherwise specify the number you want as an argument
    pool = multi.Pool(processes=process_cnt)
    results = [pool.apply_async(process_queue, args=(link,)) for link in page_list]
    pool.close()
    pool.join()
    print(results)
    print("Total SUCCESS URL count = {}".format(len(results)))
    log.warning("Total FAILURE URL Count = {}".format(len(failed_data)))
    final_data = {}
    for ele in results:
        tmp_dict = ele.get()
        key = tmp_dict.get("CATEGORY")
        h.upd_dic_with_sub_list(key, tmp_dict, final_data)
    pd.set_option('display.max_columns', 15)
    for category in final_data:
        cat_up = category.upper()
        print("CATEGORY = {} and count = {}".format(cat_up, len(final_data[category])))
        df = pd.DataFrame(final_data[category])
        df = df.set_index("NAME")
        # Slice it as needed
        sliced_df = df.loc[:, ['MARKET CAP (Rs Cr)', 'EPS (TTM)', 'P/E', 'INDUSTRY P/E',
                               'BOOK VALUE (Rs)', 'FACE VALUE (Rs)', 'DIV YIELD.(%)']]
        sliced_df = sliced_df.apply(pd.to_numeric, errors='ignore')
        sorted_df = sliced_df.sort_values(by=['EPS (TTM)', 'P/E'], ascending=[False, False])
        writer_orig = pd.ExcelWriter(os.path.join(commons.get_prop('base-path', 'output'),
                                                  cat_up + '_Listings.xlsx'),
                                     engine='xlsxwriter')
        sorted_df.to_excel(writer_orig, index=True, sheet_name='report')
        writer_orig.save()
    print("Execution time = {0:.5f}".format(time.time() - start))
def parse_yahoo_stk_hist(url):
    try:
        name = url.split("=")[1].split(".")[0]
        # print("Name = ", name)
        bs = h.parse_url(url)
        if bs:
            table = bs.find('div', {'class': "Pb(10px) Ovx(a) W(100%)"}).find_all(
                "table", {"class": "W(100%) M(0)"})[0]
            data = [[td.string.strip() for td in tr.find_all('td') if td.string]
                    for tr in table.find_all('tr')[2:]][:-1]
            # print(data)
            # data.insert(0, ['STK_DATE', 'OPEN', 'HIGH', 'LOW', 'CLOSE', 'ACLOSE', 'VOLUME'])
            # Set pandas options
            pd.set_option('display.max_columns', None)
            pd.set_option('display.expand_frame_repr', False)
            pd.set_option('max_colwidth', 0)
            df = pd.DataFrame(data, columns=['STK_DATE', 'OPEN', 'HIGH', 'LOW',
                                             'CLOSE', 'ACLOSE', 'VOLUME'])
            df = df.assign(NSE_CODE=Series(name, index=df.index))
            df = df.drop(columns='ACLOSE')
            cols = ['OPEN', 'HIGH', 'LOW', 'CLOSE', 'VOLUME']
            df[cols] = df[cols].replace({'\$': '', ',': ''}, regex=True)
            # Drop a row by condition
            df = df[df['OPEN'].notnull()]
            drop_cols = ['STK_DATE', 'NSE_CODE']
            cols = df.columns.drop(drop_cols)
            df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')
            df = df.fillna(0)
            if len(df) > 0:
                df_columns = list(df)
                table = "STK_INFO_HISTORY"
                constraint = ', '.join(['STK_DATE', 'NSE_CODE'])
                values = "to_date(%s, 'DD-MON-YYYY'), %s, %s, %s, %s, %s, %s"
                insert_stmt = create_update_query(table, df_columns, values, constraint)
                conn = psycopg2.connect(database="trading", user="******", password="******")
                conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
                cursor = conn.cursor()
                execute_batch(cursor, insert_stmt, df.values)
                conn.commit()
                cursor.close()
    except Exception as err:
        traceback.print_exc()
        print("Exception = ", str(err))
def parse_url():
    # Dictionary keyed by ratio name, holding the year-wise values scraped below
    ratio_elements = {}
    url = "https://www.moneycontrol.com/financials/yesbank/ratiosVI/YB#YB"
    bs = h.parse_url(url)
    if bs:
        std_data = bs.find('div', {'class': 'PB10'}).find('div', {'class': 'FL gry10'})
        nse_code = (std_data.text.split("|")[1]).split(":")[1].strip()
        print("NSE_CODE", nse_code)
        data = [[[td.string.strip() for td in tr.find_all('td') if td.string]
                 for tr in table.find_all('tr')[2:]]
                for table in bs.find_all("table", {"class": "table4"})[2:]]
        ele_list = data[0]
        ratio_elements['STK_YEAR'] = data[0][0]
        i = 2
        while i < len(ele_list) - 4:
            arr = ele_list[i]
            if len(arr) > 5:
                key = ratio_con.get(arr[0])
                val = arr[1:]
                if key:
                    ratio_elements[key] = val
            i += 1
        print("STK RATIO = ", ratio_elements)
        # Set pandas options
        pd.set_option('display.max_columns', None)
        pd.set_option('display.expand_frame_repr', False)
        pd.set_option('max_colwidth', 0)
        df = pd.DataFrame(ratio_elements)
        df = df.apply(pd.to_numeric, errors='ignore')
        df = df.assign(NSE_CODE=Series(nse_code, index=df.index))
        if len(df) > 0:
            df_columns = list(df)
            table = "STK_PERF_HISTORY"
            values = "to_date(%s, 'MONYY'), %s, %s, %s, %s, %s, %s, %s, %s, %s"
            constraint = ', '.join(['NAME', 'NSE_CODE', 'STK_YEAR'])
            # create INSERT INTO table (columns) VALUES('%s',...)
            insert_stmt = create_update_query(table, df_columns, values, constraint)
            print("PERF HIST= ", insert_stmt)
            conn = psycopg2.connect(database="trading", user="******", password="******")
            conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
            cursor = conn.cursor()
            execute_batch(cursor, insert_stmt, df.values)
            conn.commit()
            cursor.close()
def mny_ctr_shr_frm_url(cmp_name, cmp_url):
    comp_details = {}
    try:
        bs = h.parse_url(cmp_url)
        if bs:
            base_data = bs.find('div', {'class': 'FL gry10'})
            if base_data:
                bs_txt_arr = base_data.text.split("|")
                bse_code = bs_txt_arr[0].split(":")[1]
                nse_code = bs_txt_arr[1].split(":")[1].strip()
                isin_code = bs_txt_arr[2].split(":")[1].strip()
                sector = bs_txt_arr[3].split(":")[1].strip()
                stk_result = {}
                if nse_code:
                    stk_result = mc_get_day_stk_details(bs, 'content_nse', cmp_url)
                    # print("STK DAY details NSE = ", stk_result)
                if not stk_result and isin_code:
                    stk_result = mc_get_day_stk_details(bs, 'content_bse', cmp_url)
                    # print("STK DAY details BSE = ", stk_result)
                if stk_result:
                    comp_details = mc_get_perf_stk_details(bs)
                    comp_details['NAME'] = cmp_name
                    category = 'N/A'
                    sub_category = 'N/A'
                    if sector:
                        cat_list = sector.split("-")
                        if len(cat_list) > 1:
                            category = cat_list[0]
                            sub_category = cat_list[1:]
                        else:
                            category = sector
                    comp_details['CATEGORY'] = category
                    comp_details['SUB_CATEGORY'] = sub_category
                    nse_code = nse_code if nse_code else isin_code
                    comp_details['NSE_CODE'] = nse_code
                    comp_details['URL'] = cmp_url
                    comp_details.update(stk_result)
                    # print("STK complete details = ", comp_details)
                else:
                    print("COMP {} not listed or errored".format(cmp_url))
            else:
                print("COMP {} not listed or errored".format(cmp_url))
    except Exception as err:
        print("CMP URL = {} with error = {}".format(cmp_url, err))
        # raise err
    return comp_details
def mc_get_day_stk_details(bs, id, cmp_url):
    data_dict = {}
    try:
        # Get date, stock current price and traded volume
        if bs.find('div', {'id': id}).find('div', {'class': 'brdb PB5'}):
            bse_data = bs.find('div', {'id': id}).find('div', {'class': 'brdb PB5'}).findAll('div')
            if bse_data:
                year = time.strftime("%Y")
                bse_dt = bse_data[3].text.split(",")[0].strip()
                if bse_dt and len(bse_dt) > 5:
                    data_dict["STK_DATE"] = year + ' ' + bse_dt
                data_dict["CURR_PRICE"] = bse_data[4].text.strip()
                bse_st_vol = h.alpnum_to_num(bse_data[6].text.strip().split("\n")[0])
                data_dict["TRADED_VOLUME"] = bse_st_vol.strip()
        # Get previous and open price of the share
        if bs.find('div', {'id': id}).find('div', {'class': 'brdb PA5'}):
            bse_data = bs.find('div', {'id': id}).find('div', {'class': 'brdb PA5'}).findAll('div')
            stk_prc = 0
            for ele in bse_data:
                if ele.get("class") == ['gD_12', 'PB3']:
                    if stk_prc == 0:
                        data_dict["PREV_PRICE"] = ele.text.strip()
                    elif stk_prc == 1:
                        data_dict["OPEN_PRICE"] = ele.text.strip()
                    stk_prc += 1
        # Get low, high and 52 week prices
        if bs.find('div', {'id': id}).find('div', {'class': "PT10 clearfix"}):
            bse_data = bs.find('div', {'id': id}).find('div', {'class': "PT10 clearfix"}).findAll('div')
            stk_p = 0
            for ele in bse_data:
                if ele.get('class') == ["PB3", "gD_11"]:
                    ele_li = ele.text.strip().split("\n")
                    if stk_p == 0:
                        data_dict["LOW_PRICE"] = ele_li[1]
                        data_dict["HIGH_PRICE"] = ele_li[3]
                    elif stk_p == 1:
                        data_dict["LOWEST_PRICE"] = ele_li[1]
                        data_dict["HIGEST_PRICE"] = ele_li[3]
                    stk_p += 1
        # Discard partial results: all nine fields must be present
        if not (len(data_dict)) == 9:
            data_dict = {}
    except Exception as err:
        print("While parsing for day stocks = {}".format(err))
    return data_dict
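# NOTE: h.alpnum_to_num() is used above to normalise traded-volume strings such as
# "1.25m" or "43.2k" before they are stored. The helper is defined elsewhere; this
# is an assumed, illustrative equivalent that returns the expanded number as a
# string (the caller above strips it again).
def alpnum_to_num_sketch(text):
    multipliers = {'k': 1_000, 'm': 1_000_000, 'b': 1_000_000_000}
    cleaned = text.replace(',', '').strip()
    suffix = cleaned[-1:].lower()
    if suffix in multipliers:
        return str(int(float(cleaned[:-1]) * multipliers[suffix]))
    return cleaned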
def get_shrs_from_mnctl(url):
    """
    Parse the money control listing page and collect company links.
    :param url: home page url
    :return: dictionary with the company name as key and its detail-page url as value
    """
    shares = {}
    try:
        bs = h.parse_url(url)
        if bs:
            table = bs.find("table", {"class": "pcq_tbl MT10"})
            for row in table.findAll("tr"):
                for link in row.findAll("a"):
                    shares[link.get("title")] = link.get('href')
    except Exception as err:
        log.exception("ERROR in get_shr_from_mnctl = {}".format(err))
    return shares
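# Example usage (illustrative sketch only; the listing URL below is an assumption,
# the real entry point for this scraper lives elsewhere in the repository):
def demo_get_shrs_from_mnctl():
    listing_url = "https://www.moneycontrol.com/india/stockpricequote/"  # assumed URL
    shares = get_shrs_from_mnctl(listing_url)
    for company, link in list(shares.items())[:5]:
        print(company, "->", link)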
def get_listed_ipo(stock_url):
    jobs = []
    spipe_list = []
    # Variables declaration
    start = time.time()
    # Get the shares from chitt
    urls = get_list_of_urls(stock_url)
    # urls = ['http://www.chittorgarh.com/ipo/ipo_perf_tracker.asp', 'http://www.chittorgarh.com/ipo/ipo_perf_tracker.asp?FormIPOPT_Page=2', 'http://www.chittorgarh.com/ipo/ipo_perf_tracker.asp?FormIPOPT_Page=3', 'http://www.chittorgarh.com/ipo/ipo_perf_tracker.asp?FormIPOPT_Page=4']
    print(urls)
    cpdus = multi.cpu_count()
    page_bins = h.chunks(cpdus, urls)
    for cpdu in range(cpdus):
        recv_end, send_end = multi.Pipe(False)
        worker = multi.Process(target=process_page, args=(page_bins[cpdu], send_end))
        worker.daemon = True
        jobs.append(worker)
        spipe_list.append(recv_end)
        worker.start()
    for job in jobs:
        job.join(timeout=10)
    print("All jobs completed......")
    try:
        ipo_size = 0
        result_list = [x.recv() for x in spipe_list]
        curr, con = db.get_connection()
        statement = create_update_query('IPO_STK_DETAILS')
        for results in result_list:
            for data in results:
                values = [data[k] for k in data]
                ipo_size += len(values)
                df = get_data_frame(values)
                records = df.to_dict(orient='records')
                print(records)
                execute_batch(curr, statement, df.values)
                con.commit()
        db.close_connection(con, curr)
        print("IPOs listed so far = {}".format(ipo_size))
    except Exception as err:
        traceback.print_exc()
        print(str(err))
    print("Execution time = {0:.5f}".format(time.time() - start))
def get_stk_history():
    jobs = []
    # Variables declaration
    start = time.time()
    # Get the shares from yahoo
    urls = get_yahoo_fin_urls()
    # urls = ["https://in.finance.yahoo.com/quote/3IINFOTECH.NS/history?p=3IINFOTECH.NS", "https://in.finance.yahoo.com/quote/3IINFOTECH.NS/history?p=3IINFOTECH.NS", "https://in.finance.yahoo.com/quote/3IINFOTECH.NS/history?p=3IINFOTECH.NS", "https://in.finance.yahoo.com/quote/3IINFOTECH.NS/history?p=3IINFOTECH.NS"]
    print("No of URLs = ", len(urls))
    cpdus = multi.cpu_count()
    page_bins = h.chunks(cpdus, urls)
    for chunk in page_bins:
        worker = multi.Process(target=process_page, args=(chunk,))
        worker.daemon = True
        jobs.append(worker)
        worker.start()
    for job in jobs:
        job.join(timeout=10)
    print("All jobs completed......")
    print("Execution time = {0:.5f}".format(time.time() - start))
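# NOTE: process_page() is the multiprocessing worker launched above but defined
# elsewhere in the repository. A minimal sketch of its assumed shape for this
# Yahoo-history flow: each worker walks its chunk of URLs and parses/stores them
# one by one via parse_yahoo_stk_hist(). Illustrative assumption only.
def process_page_sketch(url_chunk):
    for url in url_chunk:
        try:
            parse_yahoo_stk_hist(url)
        except Exception as err:
            print("Failed to process {} = {}".format(url, err))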
def get_ipo_day_info(link):
    ipo_details = []
    try:
        bs = h.parse_url(link)
        print("URL DAY=", link)
        if bs:
            divs = bs.find("div", {'class': 'col-lg-12 col-md-12 col-sm-12 main'}).findAll('div')
            nse = divs[2].text.split(":")[1].strip()
            face_val = divs[3].text.split(":")[1].strip()
            isin = divs[6].text.split(":")[1].strip()
            if nse and len(nse) > 0:
                ipo_details.append(nse)
            else:
                ipo_details.append(isin)
            ipo_details.append(face_val)
            table = bs.find("table", {"class": "table table-condensed table-bordered table-striped table-nonfluid"})
            rows = table.findAll("tr")
            # curr_price = rows[1].findAll("td")[0].find('span').text.strip()
            open_price = rows[2].findAll("td")[1].text.strip()
            hl = rows[3].findAll("td")[1].text.split("-")
            high, low = hl[0].strip(), hl[1].strip()
            prev_price = rows[4].findAll("td")[1].text.strip()
            turn_over = rows[8].findAll("td")[1].text.strip()
            # ipo_details.append(curr_price)
            ipo_details.append(open_price)
            ipo_details.append(high)
            ipo_details.append(low)
            ipo_details.append(turn_over)
            ipo_details.append(prev_price)
            if len(ipo_details) == 8:
                return ipo_details
    except Exception as err:
        traceback.print_exc()
        print("Exception =", str(err))
    return ipo_details
def get_nse_code(url):
    nse_code, isin_code = None, None
    try:
        bs = h.parse_url(url)
        if bs:
            divs = bs.findAll("div", {'class': "panel panel-default"})[2].find(
                'div', {'class': 'panel-body'})
            sub_div = divs.descendants
            for div in sub_div:
                if div.name == "li":
                    value = div.text
                    if "NSE Symbol:" in value:
                        nse_code = value.split(":")[1].strip()
                    elif "ISIN:" in value:
                        isin_code = value.split(":")[1].strip()
            if not nse_code:
                nse_code = isin_code
    except Exception as err:
        print("While parsing for NSE code", str(err))
    return nse_code
def get_shares_details(all_pages, first_time_process):
    # Variables declaration
    jobs = []
    spipe_list = []
    failed_que = multi.Queue()
    start = time.time()
    cpdus = multi.cpu_count()
    print("Total Process count = {}".format(cpdus))
    print("Total URL count = {}".format(len(all_pages)))
    page_bins = chunks(cpdus, all_pages)
    for cpdu in range(cpdus):
        recv_end, send_end = multi.Pipe(False)
        worker = multi.Process(target=process_page, args=(page_bins[cpdu], send_end, failed_que,))
        worker.daemon = True
        jobs.append(worker)
        spipe_list.append(recv_end)
        worker.start()
    # end_at = time.time() + (5)
    # while jobs:
    #     job = jobs.pop()
    #     delta = end_at - time.time()
    #     if delta > 0:
    #         job.join(timeout=delta)
    #     job.terminate()
    #     job.join()
    for job in jobs:
        job.join(timeout=10)
    print("All jobs completed......")
    # if first_time_process:
    #     result_list = [x.recv() for x in spipe_list]
    #     failed_pages = []
    #     while not failed_que.empty():
    #         failed_pages.append(failed_que.get())
    #     print("Parsing failed page count = {}".format(len(failed_pages)))
    #     get_shares_details(failed_pages, False)
    try:
        result_list = [x.recv() for x in spipe_list]
        final_data = {}
        ratio_links = []
        print("FAILED URL COUNT = {}".format(failed_que.qsize()))
        for results in result_list:
            print("size of the results array from result_list = ", len(results))
            for tmp_dict in results:
                key = tmp_dict.get("CATEGORY")
                link = tmp_dict.get("URL")
                ratio_links.append(define_mc_ratio_link(link))
                h.upd_dic_with_sub_list(key, tmp_dict, final_data)
        if ratio_links and len(ratio_links) > 0:
            print("Size of the RATIO array = ", len(ratio_links))
            h.write_list_to_json_file(os.path.join(commons.get_prop('base-path', 'output'),
                                                   "5yrs_stk_ratio.txt"), ratio_links)
        # Set pandas options
        pd.set_option('display.max_columns', None)
        pd.set_option('display.expand_frame_repr', False)
        pd.set_option('max_colwidth', 0)
        for category in final_data:
            df = pd.DataFrame(final_data[category])
            cols = df.columns.drop(['STK_DATE', 'NSE_CODE', 'NAME', 'CATEGORY', 'SUB_CATEGORY', 'URL'])
            df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')
            df = df.fillna(0)
            # print(df)
            if len(df) > 0:
                try:
                    df_columns = list(df)
                    table = "STK_DETAILS"
                    columns = ",".join(df_columns)
                    print("Batch started with count {} to insert into DB".format(len(df.values)))
                    values = "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, " \
                             "%s, %s, %s, %s, %s, %s, to_date(%s, 'YYYMONDD'), %s, %s, %s);"
                    # create INSERT INTO table (columns) VALUES('%s',...)
                    insert_stmt = "INSERT INTO {} ({}) VALUES {}".format(table, columns, values)
                    curr, con = db.get_connection()
                    execute_batch(curr, insert_stmt, df.values)
                    con.commit()
                    db.close_connection(con, curr)
                    print("Batch inserted into DB successfully")
                except Exception as err:
                    print("While inserting data into DB exception = {}".format(err))
    except Exception as err:
        print("Exception in get_share_details function = {}".format(err))
    print("Execution time = {0:.5f}".format(time.time() - start))
def get_ratios_from_mc(url):
    # url = "https://www.moneycontrol.com/financials/abhinavleasingfinance/ratiosVI/ALF03#ALF03"
    data_frame = None
    try:
        bs = h.parse_url(url)
        if bs:
            cmp_name = None
            title_data = bs.find('div', {'id': 'nChrtPrc'}).find('h1', {'class': 'b_42 PT20'})
            if title_data:
                cmp_name = title_data.text.strip()
            std_data = bs.find('div', {'class': 'PB10'}).find('div', {'class': 'FL gry10'})
            if std_data:
                header_parts = std_data.text.split("|")
                nse_code, isin, sector = None, None, None
                for part in header_parts:
                    name, val = part.split(":")[0].strip(), part.split(":")[1].strip()
                    if name == 'ISIN':
                        isin = val
                    if name == 'NSE':
                        nse_code = val
                    if name == 'SECTOR':
                        sector = val
                # print("nse code = {}, isin = {}, sector = {}".format(nse_code, isin, sector))
                if not nse_code:
                    nse_code = isin
                data = [[[td.string.strip() for td in tr.find_all('td') if td.string]
                         for tr in table.find_all('tr')[2:]]
                        for table in bs.find_all("table", {"class": "table4"})[2:]]
                if data and len(data) > 0:
                    STK_RATIO_ELEMENTS = {}
                    ele_list = data[0]
                    STK_RATIO_ELEMENTS['STK_YEAR'] = data[0][0]
                    ini_stk_ratio_dic(STK_RATIO_ELEMENTS, len(STK_RATIO_ELEMENTS['STK_YEAR']))
                    i = 2
                    while i < len(ele_list) - 4:
                        arr = ele_list[i]
                        if len(arr) > 5:
                            key = c.STK_RATIO_CON.get(arr[0])
                            val = arr[1:]
                            if key:
                                STK_RATIO_ELEMENTS[key] = val
                        i += 1
                    print("STK RATIO = {} of Processing URL = {}".format(STK_RATIO_ELEMENTS, url))
                    data_frame = get_data_frame(nse_code, sector, cmp_name, STK_RATIO_ELEMENTS)
                    data_frame.drop_duplicates(subset=['STK_YEAR', 'NSE_CODE'], inplace=True)
                else:
                    print("Key ratios are not listed for {}".format(url))
    except Exception as err:
        print("Error while parsing URL in get_ratios_from_mc function = ", str(err))
    return data_frame