def dump_feeds(json_paths):
    print('extracting feeds from JSON files to csv...')
    csv = open('./output/json_feeds.csv', 'w+', encoding="utf-8")
    csv.write('filePath\tcustomer_id\tcustomer_name\tfeed_id\tfeed_name\tfeed_property\tfeed_key\tfeed_format\tfeed_link\n')
    for filePath in json_paths:
        with open(filePath) as json_file:
            try:
                data = json.load(json_file)
                print(filePath + ': OK')
            except Exception as ex:
                print(filePath + ': ' + str(ex))
                continue
            idCustomer = ''
            nameCustomer = ''
            for item in data:
                if item is None:
                    continue
                if 'nameCustomer' in item:
                    idCustomer = commons.get_prop('idCustomer', item)
                    nameCustomer = commons.get_prop('nameCustomer', item)
                elif 'feedID' in item:
                    if 'publications' in item:
                        publications = item['publications']
                        if 'http' in publications:
                            http = publications['http']
                            for (http_prop, http_val) in http.items():
                                csv.write(filePath + '\t' + idCustomer + '\t' + nameCustomer + '\t'
                                          + item['feedID'] + '\t' + item['feedName'] + '\t'
                                          + http_prop + '\t' + http_val['key'] + '\t'
                                          + http_val['format'] + '\t' + http_val['ecodeLink'] + '\n')
                        else:
                            csv.write(filePath + '\t' + idCustomer + '\t' + nameCustomer + '\t'
                                      + item['feedID'] + '\t' + item['feedName'] + '\t\t\t\t\n')
                    else:
                        csv.write(filePath + '\t' + idCustomer + '\t' + nameCustomer + '\t'
                                  + item['feedID'] + '\t' + item['feedName'] + '\t\t\t\t\n')
    csv.close()

def dump_newsletters(json_paths):
    print('extracting newsletters from JSON files to csv...')
    default_headers = 'file_path\tidCustomer\tname_customer\tnormalized_customer_name\tnewsletter_id\tnewsletter_name\tnewsletter_subject\tnewsletter_design_format\tnewsletter_design_title\tlogo_url\tprimary_color\tnewsletter_hour\tnewsletter_min\tnewsletter_hour2\tnewsletter_min2\tnewsletter_valuation_to_show\tnewsletter_order_by\tnewsletter_grouping\tnewsletter_num_mentions\tnewsletter_email_to\tnewsletter_email_remitent\tnewsletter_selection\tnewsletter_name_remitent\tnewsletter_charset\tnewsletter_type\tnewsletter_days\tnewsletter_nb_list_to\tnewsletter_list_to'
    csv = open('./output/json_newsletters.csv', 'w+', encoding="utf-8")
    csv.write(default_headers + '\tfeed_id\tfeed_valuation_to_show\tfeed_order_by\tfeed_selection\tfeed_grouping\tfeed_feedName\tnormalized_feedName\tfeed_num_mentions\n')
    for filePath in json_paths:
        with open(filePath) as json_file:
            try:
                data = json.load(json_file)
                print(filePath + ': OK')
            except Exception as ex:
                print(filePath + ': ' + str(ex))
                continue
            for item in data:
                if item is None:
                    continue
                if 'module' in item:
                    module = item['module']
                    if 'newsletter' in module:
                        newsletters = module['newsletter']
                        nameCustomer = commons.get_prop('nameCustomer', item)
                        idCustomer = commons.get_prop('idCustomer', item)
                        dump_newsletter(filePath, idCustomer, nameCustomer, newsletters, csv)
    csv.close()

def get_connection():
    conn = None
    cursor = None
    try:
        user_name = commons.get_prop('postgres', 'user-name')
        password = commons.get_prop('postgres', 'password')
        db_name = commons.get_prop('postgres', 'database')
        conn = psycopg2.connect(database=db_name, user=user_name, password=password)
        conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
        cursor = conn.cursor()
    except psycopg2.DatabaseError as e:
        print("Error while connecting to database: %s" % e)
    return cursor, conn

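# Usage sketch (illustrative, not part of the original module): callers elsewhere in this
# project unpack the pair as `curr, con = db.get_connection()` and later call
# `db.close_connection(con, curr)`; the close_connection helper is assumed to live in this
# module, and the query below is a placeholder.
def _example_run_query():
    cursor, conn = get_connection()
    if cursor is None:
        return
    cursor.execute("SELECT 1")
    print(cursor.fetchone())
    close_connection(conn, cursor)
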
def get_shares_details(stock_url, thread_count):
    # Variables declaration
    results_que = Queue()
    failed_que = Queue()
    start = time.time()
    # Get the shares from money control
    shares = get_shrs_from_mnctl(stock_url)
    log.info("Total number of shares returned = {}".format(len(shares)))
    # shares = {k: shares[k] for k in list(shares)[:50]}
    if shares and len(shares) > 0:
        # put into Queue
        url_que = get_shares_category(shares)
        log.info("Shares added to Queue to process...")
        for i in range(thread_count):
            t = threading.Thread(target=process_queue, args=(url_que, results_que, failed_que))
            t.daemon = True
            t.start()
        url_que.join()
        log.info("Failed url count = {}".format(failed_que.qsize()))
        log.info("Success url count = {}".format(results_que.qsize()))
        while not failed_que.empty():
            log.warning("Failed URL details = {}".format(failed_que.get()))
        final_data = {}
        while not results_que.empty():
            # final_data.append(results_que.get())
            tmp_dict = results_que.get()
            key = tmp_dict.get("CATEGORY")
            h.upd_dic_with_sub_list(key, tmp_dict, final_data)
        pd.set_option('display.max_columns', 15)
        for category in final_data:
            cat_up = category.upper()
            print("CATEGORY = {} and count = {}".format(cat_up, len(final_data[category])))
            df = pd.DataFrame(final_data[category])
            df = df.set_index("NAME")
            # Slice it as needed
            sliced_df = df.loc[:, ['MARKET CAP (Rs Cr)', 'EPS (TTM)', 'P/E', 'INDUSTRY P/E',
                                   'BOOK VALUE (Rs)', 'FACE VALUE (Rs)', 'DIV YIELD.(%)']]
            sliced_df = sliced_df.apply(pd.to_numeric, errors='ignore')
            # Sort by P/E
            sorted_df = sliced_df.sort_values(by=['EPS (TTM)', 'P/E'], ascending=[False, False])
            writer_orig = pd.ExcelWriter(os.path.join(commons.get_prop('base-path', 'output'),
                                                      cat_up + '_Listings.xlsx'),
                                         engine='xlsxwriter')
            sorted_df.to_excel(writer_orig, index=True, sheet_name='report')
            writer_orig.save()
    print("Execution time = {0:.5f}".format(time.time() - start))

def load_stk_ratio():
    # Variables declaration
    start = time.time()
    file_path = os.path.join(commons.get_prop('base-path', 'ratio-input'))
    files = [os.path.join(file_path, fn) for fn in next(os.walk(file_path))[2]]
    all_pages = []
    try:
        for file in files:
            read_lines = h.read_list_from_json_file(file)
            all_pages.extend(read_lines)
        # Total number of links to process
        print("No of urls to process", len(all_pages))
        page_bins = h.chunks(THREAD_COUNT, all_pages)
        # use all available cores, otherwise specify the number you want as an argument
        pool = ThreadPool(processes=THREAD_COUNT)
        for link_array in page_bins:
            pool.apply_async(process_pages, args=(link_array, ), callback=log_result)
        pool.close()
        pool.join()
        for df_frames in result_list:
            try:
                result = pd.concat(df_frames, ignore_index=True)
                if len(result) > 0:
                    df_columns = list(result)
                    table = "STK_PERF_HISTORY"
                    values = "to_date(%s, 'MONYY'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s"
                    constraint = ', '.join(['NAME', 'NSE_CODE', 'STK_YEAR'])
                    # create INSERT INTO table (columns) VALUES('%s',...)
                    insert_stmt = h.create_update_query(table, df_columns, values, constraint)
                    curr, con = db.get_connection()
                    execute_batch(curr, insert_stmt, result.values)
                    con.commit()
                    db.close_connection(con, curr)
            except Exception as err:
                print("Exception while inserting data into table ", str(err))
    except Exception as err:
        print(str(err))
    print("Execution time = {0:.5f}".format(time.time() - start))

def get_log_info():
    """
    Function to get all log variables from the config file
    :return: user defined log variables
    """
    log_path = commons.get_prop('common', 'log-path')
    log_size = int(commons.get_prop('common', 'log-size'))
    log_format = commons.get_prop('common', 'log-format')
    log_backup = int(commons.get_prop('common', 'log-backup'))
    log_level = commons.get_prop('common', 'log-level')
    date_format = commons.get_prop('common', 'date-format')
    return log_path, log_size, log_format, log_backup, log_level, date_format

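# A minimal sketch (not part of the original module) of how the values returned by
# get_log_info() could configure a rotating file handler; the project's own logger.init()
# may wire this differently, and log_level is assumed to be a standard name like "INFO".
def _example_setup_rotating_logger():
    import logging
    from logging.handlers import RotatingFileHandler

    log_path, log_size, log_format, log_backup, log_level, date_format = get_log_info()
    handler = RotatingFileHandler(log_path, maxBytes=log_size, backupCount=log_backup)
    handler.setFormatter(logging.Formatter(log_format, datefmt=date_format))
    root = logging.getLogger()
    root.addHandler(handler)
    root.setLevel(log_level)
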
def get_shares_details(stock_url, process_cnt):
    # Variables declaration
    failed_data = []
    start = time.time()
    # Get the shares from money control
    page_list = get_list_of_share_links(stock_url)
    page_list = page_list[:10]
    print("Total Process count = {}".format(process_cnt))
    print("Total URL count = {}".format(len(page_list)))
    # use all available cores, otherwise specify the number you want as an argument
    pool = multi.Pool(processes=process_cnt)
    results = [pool.apply_async(process_queue, args=(link,)) for link in page_list]
    pool.close()
    pool.join()
    print(results)
    print("Total SUCCESS URL count = {}".format(len(results)))
    log.warning("Total FAILURE URL Count = {}".format(len(failed_data)))
    final_data = {}
    for ele in results:
        tmp_dict = ele.get()
        key = tmp_dict.get("CATEGORY")
        h.upd_dic_with_sub_list(key, tmp_dict, final_data)
    pd.set_option('display.max_columns', 15)
    for category in final_data:
        cat_up = category.upper()
        print("CATEGORY = {} and count = {}".format(cat_up, len(final_data[category])))
        df = pd.DataFrame(final_data[category])
        df = df.set_index("NAME")
        # Slice it as needed
        sliced_df = df.loc[:, ['MARKET CAP (Rs Cr)', 'EPS (TTM)', 'P/E', 'INDUSTRY P/E',
                               'BOOK VALUE (Rs)', 'FACE VALUE (Rs)', 'DIV YIELD.(%)']]
        sliced_df = sliced_df.apply(pd.to_numeric, errors='ignore')
        sorted_df = sliced_df.sort_values(by=['EPS (TTM)', 'P/E'], ascending=[False, False])
        writer_orig = pd.ExcelWriter(os.path.join(commons.get_prop('base-path', 'output'),
                                                  cat_up + '_Listings.xlsx'),
                                     engine='xlsxwriter')
        sorted_df.to_excel(writer_orig, index=True, sheet_name='report')
        writer_orig.save()
    print("Execution time = {0:.5f}".format(time.time() - start))

def get_shares_details(all_pages, first_time_process):
    # Variables declaration
    jobs = []
    spipe_list = []
    failed_que = multi.Queue()
    start = time.time()
    cpdus = multi.cpu_count()
    print("Total Process count = {}".format(cpdus))
    print("Total URL count = {}".format(len(all_pages)))
    page_bins = chunks(cpdus, all_pages)
    for cpdu in range(cpdus):
        recv_end, send_end = multi.Pipe(False)
        worker = multi.Process(target=process_page, args=(page_bins[cpdu], send_end, failed_que,))
        worker.daemon = True
        jobs.append(worker)
        spipe_list.append(recv_end)
        worker.start()
    # end_at = time.time() + (5)
    # while jobs:
    #     job = jobs.pop()
    #     delta = end_at - time.time()
    #     if delta > 0:
    #         job.join(timeout=delta)
    #     job.terminate()
    #     job.join()
    for job in jobs:
        job.join(timeout=10)
    print("All jobs completed......")
    # if first_time_process:
    #     result_list = [x.recv() for x in spipe_list]
    #     failed_pages = []
    #     while not failed_que.empty():
    #         failed_pages.append(failed_que.get())
    #     print("Parsing failed page count = {}".format(len(failed_pages)))
    #     get_shares_details(failed_pages, False)
    try:
        result_list = [x.recv() for x in spipe_list]
        final_data = {}
        ratio_links = []
        print("FAILED URL COUNT = {}".format(failed_que.qsize()))
        for results in result_list:
            print("size of the results array from result_list = ", len(results))
            for tmp_dict in results:
                key = tmp_dict.get("CATEGORY")
                link = tmp_dict.get("URL")
                ratio_links.append(define_mc_ratio_link(link))
                h.upd_dic_with_sub_list(key, tmp_dict, final_data)
        if ratio_links and len(ratio_links) > 0:
            print("Size of the RATIO array = ", len(ratio_links))
            h.write_list_to_json_file(os.path.join(commons.get_prop('base-path', 'output'),
                                                   "5yrs_stk_ratio.txt"), ratio_links)
        # Set pandas options
        pd.set_option('display.max_columns', None)
        pd.set_option('display.expand_frame_repr', False)
        pd.set_option('max_colwidth', 0)
        for category in final_data:
            df = pd.DataFrame(final_data[category])
            cols = df.columns.drop(['STK_DATE', 'NSE_CODE', 'NAME', 'CATEGORY', 'SUB_CATEGORY', 'URL'])
            df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')
            df = df.fillna(0)
            # print(df)
            if len(df) > 0:
                try:
                    df_columns = list(df)
                    table = "STK_DETAILS"
                    columns = ",".join(df_columns)
                    print("Batch started with count {} to insert into DB".format(len(df.values)))
                    values = "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, " \
                             "%s, %s, %s, %s, %s, %s, to_date(%s, 'YYYMONDD'), %s, %s, %s);"
                    # create INSERT INTO table (columns) VALUES('%s',...)
                    insert_stmt = "INSERT INTO {} ({}) VALUES {}".format(table, columns, values)
                    curr, con = db.get_connection()
                    execute_batch(curr, insert_stmt, df.values)
                    con.commit()
                    db.close_connection(con, curr)
                    print("Batch inserted into DB successfully")
                except Exception as err:
                    print("While inserting data into DB exception = {}".format(err))
    except Exception as err:
        print("Exception in get_share_details function = {}".format(err))
    print("Execution time = {0:.5f}".format(time.time() - start))

import os
import time
import logging
import multiprocessing as multi
import traceback

import pandas as pd
from psycopg2.extras import execute_batch

import commons
from commons import Helper as h
from commons import Constants as c
from commons import logger
from dao import PostgreSQLCon as db

URL = commons.get_prop('common', 'ipo-url')
PROCESS_COUNT = commons.get_prop('common', 'process_cnt')
logger.init("IPOMC Reader", c.INFO)
log = logging.getLogger("IPOMC Reader")

DATABASE_COLUMNS = [
    'NAME', 'LISTED_DATE', 'ISSUED_PRICE', 'LISTED_PRICE', 'LISTED_GAIN',
    'CURR_PRICE', 'PROFIT_LOSS', 'NSE_CODE', 'OPEN_PRICE', 'HIGH_PRICE',
    'LOW_PRICE', 'VOLUME', 'PREV_PRICE'
]


def create_update_query(table):
    """This function creates an upsert query which replaces existing data based on primary key conflicts"""
    columns = ', '.join(DATABASE_COLUMNS)
    constraint = 'NSE_CODE'
    placeholder = "%s, to_date(%s, 'MON-DD-YYYY'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s"
    updates = ', '.join([f'{col} = EXCLUDED.{col}' for col in DATABASE_COLUMNS])
    query = f"""INSERT INTO {table} ({columns})
                VALUES ({placeholder})
                ON CONFLICT ({constraint})
                DO UPDATE SET {updates};"""
    return query

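# Usage sketch, for illustration only: "IPO_DETAILS" is a hypothetical table name and `df`
# is assumed to hold exactly the DATABASE_COLUMNS in order, mirroring how the other loaders
# in this project pair db.get_connection() with psycopg2's execute_batch.
def _example_upsert(df):
    insert_stmt = create_update_query("IPO_DETAILS")
    curr, con = db.get_connection()
    execute_batch(curr, insert_stmt, df[DATABASE_COLUMNS].values)
    con.commit()
    db.close_connection(con, curr)
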
def dump_newsletter(filePath, idCustomer, nameCustomer, newsletters, csv):
    for (k, v) in newsletters.items():
        line = k
        line += '\t' + commons.get_prop('newsletter_name', v)
        line += '\t' + commons.get_prop('subject', v)
        line += '\t' + commons.get_prop('design_format', v)
        line += '\t' + commons.get_prop('design_title', v)
        line += '\t' + commons.get_prop('logoUrl', v)
        line += '\t' + commons.get_prop('primaryColor', v)
        line += '\t' + commons.get_prop('hour', v)
        line += '\t' + commons.get_prop('min', v)
        line += '\t' + commons.get_prop('hour2', v)
        line += '\t' + commons.get_prop('min2', v)
        line += '\t' + commons.get_prop('valuation_to_show', v)
        line += '\t' + commons.join_prop('orderShowSearch', v, '|')
        line += '\t' + commons.get_prop('grouping', v)
        line += '\t' + commons.get_prop('num_mentions', v)
        line += '\t' + commons.get_prop('email_to', v)
        line += '\t' + commons.get_prop('email_remitent', v)
        line += '\t' + commons.get_prop('selection', v)
        line += '\t' + commons.get_prop('name_remitent', v)
        line += '\t' + commons.get_prop('charset', v)
        line += '\t' + commons.get_prop('type', v)
        line += '\t' + commons.join_prop('days', v, '|')
        if 'email_list_to' in v:
            line += '\t' + str(len(v['email_list_to']))
        else:
            line += '\t'
        line += '\t' + commons.join_prop('email_list_to', v, ';')
        if 'feeds' not in v:
            csv.write(filePath + '\t' + idCustomer + '\t' + nameCustomer + '\t'
                      + commons.normalized(nameCustomer) + '\t' + line + '\t\t\t\t\t\t\t\t\n')
        else:
            feeds = v['feeds']
            for (ke, va) in feeds.items():
                feedLine = ke
                feedLine += '\t' + commons.get_prop('valuation_to_show', va)
                feedLine += '\t' + commons.get_prop('order_by', va)
                feedLine += '\t' + commons.get_prop('selection', va)
                feedLine += '\t' + commons.get_prop('grouping', va)
                feedName = commons.get_prop('feedName', va)
                feedLine += '\t' + feedName + '\t' + commons.normalized(feedName)
                feedLine += '\t' + commons.get_prop('num_mentions', va)
                csv.write(filePath + '\t' + idCustomer + '\t' + nameCustomer + '\t'
                          + commons.normalized(nameCustomer) + '\t' + line + '\t' + feedLine + '\n')

def extract_all_newsletters(json_paths):
    values = []
    for file_path in json_paths:
        with open(file_path) as json_file:
            try:
                data = json.load(json_file)
                for item in data:
                    if item is None:
                        continue
                    customer_id = commons.get_prop('idCustomer', item)
                    customer_name = commons.get_prop('nameCustomer', item)
                    customer_name_normalized = commons.normalized(customer_name)
                    if 'module' in item:
                        module = item['module']
                        if 'newsletter' in module:
                            counter = 0
                            newsletters = module['newsletter']
                            for (k, v) in newsletters.items():
                                if 'feeds' in v:
                                    counter += 1
                                    newsletter_id = k
                                    newsletter_name = commons.get_prop('newsletter_name', v)
                                    subject = commons.get_prop('subject', v)
                                    design_format = commons.get_prop('design_format', v)
                                    design_title = commons.get_prop('design_title', v)
                                    logoUrl = commons.get_prop('logoUrl', v)
                                    primaryColor = commons.get_prop('primaryColor', v)
                                    hour = commons.get_prop('hour', v)
                                    min1 = commons.get_prop('min', v)
                                    hour2 = commons.get_prop('hour2', v)
                                    min2 = commons.get_prop('min2', v)
                                    valuation_to_show = commons.get_prop('valuation_to_show', v)
                                    order_by = commons.join_prop('orderShowSearch', v, '|')
                                    grouping = commons.get_prop('grouping', v)
                                    num_mentions = commons.get_prop('num_mentions', v)
                                    email_to = commons.get_prop('email_to', v)
                                    email_remitent = commons.get_prop('email_remitent', v)
                                    selection = commons.get_prop('selection', v)
                                    name_remitent = commons.get_prop('name_remitent', v)
                                    charset = commons.get_prop('charset', v)
                                    newsletter_type = commons.get_prop('type', v)
                                    days = commons.join_prop('days', v, '|')
                                    if 'email_list_to' in v:
                                        nb_list_to = str(len(v['email_list_to']))
                                        email_list_to = commons.join_prop('email_list_to', v, ';')
                                    else:
                                        nb_list_to = '0'
                                        email_list_to = None
                                    feeds = v['feeds']
                                    for (ke, va) in feeds.items():
                                        feed_id = ke
                                        feed_name = commons.get_prop('feedName', va)
                                        feed_name_normalized = commons.normalized(feed_name)
                                        feed_valuation_to_show = commons.get_prop('valuation_to_show', va)
                                        feed_order_by = commons.get_prop('order_by', va)
                                        feed_selection = commons.get_prop('selection', va)
                                        feed_grouping = commons.get_prop('grouping', va)
                                        feed_num_mentions = commons.get_prop('num_mentions', va)
                                        values.append((file_path, customer_id, customer_name,
                                                       customer_name_normalized, newsletter_id,
                                                       newsletter_name, subject, design_format,
                                                       design_title, logoUrl, primaryColor, hour,
                                                       min1, hour2, min2, valuation_to_show,
                                                       order_by, grouping, num_mentions, email_to,
                                                       email_remitent, selection, name_remitent,
                                                       charset, newsletter_type, days, nb_list_to,
                                                       email_list_to, feed_id, feed_valuation_to_show,
                                                       feed_order_by, feed_selection, feed_grouping,
                                                       feed_name, feed_name_normalized,
                                                       feed_num_mentions))
                            print('extracting {0} newsletters from {1}: OK'.format(counter, file_path))
            except Exception as ex:
                print('ERROR extracting newsletters from {0}: {1}'.format(file_path, str(ex)))
    return values