Example #1
def dump_feeds(json_paths):
    print('extracting feeds from JSON files to CSV...')
    csv = open('./output/json_feeds.csv', 'w+', encoding="utf-8")
    csv.write('filePath\tcustomer_id\tcustomer_name\tfeed_id\tfeed_name\tfeed_property\tfeed_key\tfeed_format\tfeed_link\n')
    for filePath in json_paths:
        with open(filePath) as json_file:
            try:
                data = json.load(json_file)
                print(filePath + ': OK')
            except Exception as ex:
                print(filePath + ': ' + str(ex))
                continue
            
            idCustomer = ''
            nameCustomer = ''
            for item in data: 
                if item is None:
                    continue
                
                if 'nameCustomer' in item:
                    idCustomer = commons.get_prop('idCustomer', item)
                    nameCustomer = commons.get_prop('nameCustomer', item)
                elif 'feedID' in item:
                    if 'publications' in item:
                        publications = item['publications']
                        if 'http' in publications:
                            http = publications['http']
                            for (http_prop, http_val) in http.items():
                                csv.write(filePath + '\t' + idCustomer + '\t' + nameCustomer + '\t' + item['feedID'] + '\t' +  item['feedName'] + '\t' + http_prop + '\t' + http_val['key'] + '\t' + http_val['format'] + '\t' + http_val['ecodeLink'] + '\n')
                        else:
                            csv.write(filePath + '\t' + idCustomer + '\t' + nameCustomer + '\t' + item['feedID'] + '\t' +  item['feedName'] + '\t\t\t\t\n')
                    else:
                        csv.write(filePath + '\t' + idCustomer + '\t' + nameCustomer + '\t' + item['feedID'] + '\t' +  item['feedName'] + '\t\t\t\t\n')
    csv.close()
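In the JSON-dump examples (#1, #2, #10, #11), commons.get_prop(key, item) behaves like a safe dictionary lookup. The helper itself is not part of this excerpt, so the following is only a sketch under that assumption, including the empty-string fallback:

def get_prop(key, item, default=''):
    # Hypothetical stand-in for commons.get_prop as used above: return the value
    # for key if present, otherwise an empty string so callers can concatenate
    # it straight into a tab-separated line.
    value = item.get(key) if isinstance(item, dict) else None
    return str(value) if value is not None else default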
Example #2
def dump_newsletters(json_paths):
    print('extracting newsletters from JSON files to CSV...')
    default_headers = 'file_path\tidCustomer\tname_customer\tnormalized_customer_name\tnewsletter_id\tnewsletter_name\tnewsletter_subject\tnewsletter_design_format\tnewsletter_design_title\tlogo_url\tprimary_color\tnewsletter_hour\tnewsletter_min\tnewsletter_hour2\tnewsletter_min2\tnewsletter_valuation_to_show\tnewsletter_order_by\tnewsletter_grouping\tnewsletter_num_mentions\tnewsletter_email_to\tnewsletter_email_remitent\tnewsletter_selection\tnewsletter_name_remitent\tnewsletter_charset\tnewsletter_type\tnewsletter_days\tnewsletter_nb_list_to\tnewsletter_list_to'
    csv = open('./output/json_newsletters.csv', 'w+', encoding="utf-8")
    csv.write(default_headers + '\tfeed_id\tfeed_valuation_to_show\tfeed_order_by\tfeed_selection\tfeed_grouping\tfeed_feedName\tnormalized_feedName\tfeed_num_mentions\n')
  
    for filePath in json_paths:
        with open(filePath) as json_file:
            try:
                data = json.load(json_file)
                print(filePath + ': OK')
            except Exception as ex:
                print(filePath + ': ' + str(ex))
                continue
            
            for item in data:
                if item is None: 
                    continue
                if 'module' in item:
                    module = item['module']
                    if 'newsletter' in module:
                        newsletters = module['newsletter']
                        nameCustomer = commons.get_prop('nameCustomer', item)
                        idCustomer = commons.get_prop('idCustomer', item)
                        dump_newsletter(filePath,idCustomer, nameCustomer, newsletters, csv)
    csv.close()
Example #3
def get_connection():
    conn = None
    cursor = None
    try:
        user_name = commons.get_prop('postgres', 'user-name')
        password = commons.get_prop('postgres', 'password')
        db_name = commons.get_prop('postgres', 'database')
        conn = psycopg2.connect(database=db_name,
                                user=user_name,
                                password=password)
        conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
        cursor = conn.cursor()
    except psycopg2.DatabaseError as e:
        print("Error while connecting database: %s", e)
    return cursor, conn
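Examples #5 and #8 later call db.close_connection(con, curr) to release what get_connection opened; that helper is not shown here. A minimal sketch matching that argument order (the implementation is an assumption) could be:

def close_connection(conn, cursor):
    # Hypothetical counterpart to get_connection(); argument order matches the
    # db.close_connection(con, curr) calls in the later examples.
    if cursor is not None:
        cursor.close()
    if conn is not None:
        conn.close()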
Example #4
def get_shares_details(stock_url, thread_count):
    # Variables declaration
    results_que = Queue()
    failed_que = Queue()

    start = time.time()
    # Get the shares from money control
    shares = get_shrs_from_mnctl(stock_url)
    log.info("Total number of shares returned = {}".format(len(shares)))
    # shares = {k: shares[k] for k in list(shares)[:50]}
    if not shares:
        log.warning("No shares returned; nothing to process")
        return
    # put the shares into a work queue shared by the worker threads below
    url_que = get_shares_category(shares)
    log.info("Shares added to Queue to process...")

    for i in range(thread_count):
        t = threading.Thread(target=process_queue,
                             args=(url_que, results_que, failed_que))
        t.daemon = True
        t.start()

    url_que.join()
    log.info("Failed url count = {}".format(failed_que.qsize()))
    log.info("Success url count = {}".format(results_que.qsize()))

    while not failed_que.empty():
        log.warning("Failed URL details = {}".format(failed_que.get()))

    final_data = {}
    while not results_que.empty():
        # final_data.append(results_que.get())
        tmp_dict = results_que.get()
        key = tmp_dict.get("CATEGORY")
        h.upd_dic_with_sub_list(key, tmp_dict, final_data)
    pd.set_option('display.max_columns', 15)
    for category in final_data:
        cat_up = category.upper()
        print("CATEGORY = {} and count = {}".format(cat_up,
                                                    len(final_data[category])))
        df = pd.DataFrame(final_data[category])
        df = df.set_index("NAME")
        # Slice it as needed
        sliced_df = df.loc[:, [
            'MARKET CAP (Rs Cr)', 'EPS (TTM)', 'P/E', 'INDUSTRY P/E',
            'BOOK VALUE (Rs)', 'FACE VALUE (Rs)', 'DIV YIELD.(%)'
        ]]
        sliced_df = sliced_df.apply(pd.to_numeric, errors='ignore')
        sorted_df = sliced_df.sort_values(by=['EPS (TTM)', 'P/E'],
                                          ascending=[False, False])
        writer_orig = pd.ExcelWriter(os.path.join(
            commons.get_prop('base-path', 'output'),
            cat_up + '_Listings.xlsx'),
                                     engine='xlsxwriter')
        sorted_df.to_excel(writer_orig, index=True, sheet_name='report')
        writer_orig.save()

        # Sort by  P/E

    print("Execution time = {0:.5f}".format(time.time() - start))
Example #5
def load_stk_ratio():
    # Variables declaration
    start = time.time()
    file_path = os.path.join(commons.get_prop('base-path', 'ratio-input'))
    files = [os.path.join(file_path, fn) for fn in next(os.walk(file_path))[2]]
    all_pages = []
    try:
        for file in files:
            read_lines = h.read_list_from_json_file(file)
            all_pages.extend(read_lines)

        # Total number of links to process
        print("No of urls to process", len(all_pages))
        page_bins = h.chunks(THREAD_COUNT, all_pages)

        pool = ThreadPool(processes=THREAD_COUNT)
        # pool size is driven by THREAD_COUNT; each chunk is dispatched asynchronously
        for link_array in page_bins:
            pool.apply_async(process_pages,
                             args=(link_array, ),
                             callback=log_result)
        pool.close()
        pool.join()

        for df_frames in result_list:
            try:
                result = pd.concat(df_frames, ignore_index=True)
                if len(result) > 0:
                    df_columns = list(result)
                    table = "STK_PERF_HISTORY"
                    values = "to_date(%s, 'MONYY'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s"
                    constraint = ', '.join(['NAME', 'NSE_CODE', 'STK_YEAR'])
                    # create INSERT INTO table (columns) VALUES('%s',...)
                    insert_stmt = h.create_update_query(
                        table, df_columns, values, constraint)
                    curr, con = db.get_connection()
                    execute_batch(curr, insert_stmt, result.values)
                    con.commit()
                    db.close_connection(con, curr)

            except Exception as err:
                print("Exception while inserting data into table ", str(err))

    except Exception as err:
        print(str(err))
    print("Execution time = {0:.5f}".format(time.time() - start))
Example #6
def get_log_info():
    """
    Function get all log variables from config file
    :return: user defined log variables
    """
    log_path = commons.get_prop('common', 'log-path')
    log_size = int(commons.get_prop('common', 'log-size'))
    log_format = commons.get_prop('common', 'log-format')
    log_backup = int(commons.get_prop('common', 'log-backup'))
    log_level = commons.get_prop('common', 'log-level')
    date_format = commons.get_prop('common', 'date-format')

    return log_path, log_size, log_format, log_backup, log_level, date_format
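The six values returned by get_log_info map naturally onto a rotating file handler. How the project's logger.init actually wires them up is not shown, so the following setup is only an assumption:

import logging
from logging.handlers import RotatingFileHandler

def init_logger(name):
    # Hypothetical wiring of the config values above; the real logger.init may differ.
    log_path, log_size, log_format, log_backup, log_level, date_format = get_log_info()
    handler = RotatingFileHandler(log_path, maxBytes=log_size, backupCount=log_backup)
    handler.setFormatter(logging.Formatter(log_format, datefmt=date_format))
    logger = logging.getLogger(name)
    logger.setLevel(log_level)
    logger.addHandler(handler)
    return logger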
Example #7
def get_shares_details(stock_url, process_cnt):
    # Variables declaration
    failed_data = []
    start = time.time()
    # Get the shares from money control
    page_list = get_list_of_share_links(stock_url)
    # only the first 10 links are kept, so this run processes a small sample
    page_list = page_list[:10]
    print("Total Process count = {}".format(process_cnt))
    print("Total URL count = {}".format(len(page_list)))

    pool = multi.Pool(processes=process_cnt)
    # pool size is driven by process_cnt; each link is dispatched asynchronously
    results = [pool.apply_async(process_queue, args=(link,)) for link in page_list]
    pool.close()
    pool.join()
    print(results)
    print("Total SUCCESS URL count = {}".format(len(results)))
    log.warning("Total FAILURE URL Count = {}".format(len(failed_data)))

    final_data = {}
    for ele in results:
        tmp_dict = ele.get()
        key = tmp_dict.get("CATEGORY")
        h.upd_dic_with_sub_list(key, tmp_dict, final_data)
    pd.set_option('display.max_columns', 15)

    for category in final_data:
        cat_up = category.upper()
        print("CATEGORY = {} and count = {}".format(cat_up, len(final_data[category])))
        df = pd.DataFrame(final_data[category])
        df = df.set_index("NAME")
        # Slice it as needed
        sliced_df = df.loc[:, ['MARKET CAP (Rs Cr)', 'EPS (TTM)', 'P/E', 'INDUSTRY P/E', 'BOOK VALUE (Rs)',
                               'FACE VALUE (Rs)', 'DIV YIELD.(%)']]
        sliced_df = sliced_df.apply(pd.to_numeric, errors='ignore')
        sorted_df = sliced_df.sort_values(by=['EPS (TTM)', 'P/E'], ascending=[False, False])
        writer_orig = pd.ExcelWriter(os.path.join(commons.get_prop('base-path', 'output'), cat_up + '_Listings.xlsx'),
                                     engine='xlsxwriter')
        sorted_df.to_excel(writer_orig, index=True, sheet_name='report')
        writer_orig.save()
    print("Execution time = {0:.5f}".format(time.time() - start))
Example #8
def get_shares_details(all_pages, first_time_process):
    # Variables declaration
    jobs = []
    spipe_list = []
    failed_que = multi.Queue()
    start = time.time()
    cpdus = multi.cpu_count()
    print("Total Process count = {}".format(cpdus))
    print("Total URL count = {}".format(len(all_pages)))
    page_bins = chunks(cpdus, all_pages)
    for cpdu in range(cpdus):
        recv_end, send_end = multi.Pipe(False)
        worker = multi.Process(target=process_page, args=(page_bins[cpdu], send_end, failed_que,))
        worker.daemon = True
        jobs.append(worker)
        spipe_list.append(recv_end)
        worker.start()

    # end_at = time.time() + (5)
    # while jobs:
    #     job = jobs.pop()
    #     delta = end_at - time.time()
    #     if delta > 0:
    #         job.join(timeout=delta)
    #     job.terminate()
    #     job.join()
    for job in jobs:
        job.join(timeout=10)
    print("All jobs completed......")

    # if first_time_process:
    #     result_list = [x.recv() for x in spipe_list]
    #     failed_pages = []
    #     while not failed_que.empty():
    #         failed_pages.append(failed_que.get())
    #     print("Parsing failed page count = {}".format(len(failed_pages)))
    #     get_shares_details(failed_pages, False)
    try:
        result_list = [x.recv() for x in spipe_list]
        final_data = {}
        ratio_links = []
        print("FAILED URL COUNT = {}".format(failed_que.qsize()))
        for results in result_list:
            print("size of the results array from result_list = ", len(results))
            for tmp_dict in results:
                key = tmp_dict.get("CATEGORY")
                link = tmp_dict.get("URL")
                ratio_links.append(define_mc_ratio_link(link))
                h.upd_dic_with_sub_list(key, tmp_dict, final_data)
        if ratio_links and len(ratio_links) > 0:
            print("Size of the RATIO array = ", len(ratio_links))
            h.write_list_to_json_file(os.path.join(
                commons.get_prop('base-path', 'output'), "5yrs_stk_ratio.txt"), ratio_links)

        # Set pandas options
        pd.set_option('display.max_columns', None)
        pd.set_option('display.expand_frame_repr', False)
        pd.set_option('max_colwidth', 0)
        for category in final_data:
            df = pd.DataFrame(final_data[category])
            cols = df.columns.drop(['STK_DATE', 'NSE_CODE', 'NAME', 'CATEGORY', 'SUB_CATEGORY', 'URL'])
            df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')
            df = df.fillna(0)
            # print(df)
            if len(df) > 0:
                try:
                    df_columns = list(df)
                    table = "STK_DETAILS"
                    columns = ",".join(df_columns)
                    print("Batch started with count {} to insert into DB = ", len(df.values))
                    values = "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, " \
                             "%s, %s, %s, %s, %s, %s, to_date(%s, 'YYYMONDD'), %s, %s, %s);"
                    # create INSERT INTO table (columns) VALUES('%s',...)
                    insert_stmt = "INSERT INTO {} ({}) VALUES {}".format(table, columns, values)
                    curr, con = db.get_connection()
                    execute_batch(curr, insert_stmt, df.values)
                    con.commit()
                    db.close_connection(con, curr)
                    print("Batch inserted into DB successfully")

                except Exception as err:
                    print("While inserting data into DB exception = {}".format(err))

    except Exception as err:
        print("Exception in get_share_details function = {}".format(err))

    print("Execution time = {0:.5f}".format(time.time() - start))
Example #9
import pandas as pd
import commons, os
import time
import logging
from commons import Helper as h
from commons import Constants as c
from commons import logger
import multiprocessing as multi, traceback
from dao import PostgreSQLCon as db
from psycopg2.extras import execute_batch

URL = commons.get_prop('common', 'ipo-url')
PROCESS_COUNT = commons.get_prop('common', 'process_cnt')
logger.init("IPOMC Reader", c.INFO)
log = logging.getLogger("IPOMC Reader")

DATABASE_COLUMNS = [
    'NAME', 'LISTED_DATE', 'ISSUED_PRICE', 'LISTED_PRICE', 'LISTED_GAIN',
    'CURR_PRICE', 'PROFIT_LOSS', 'NSE_CODE', 'OPEN_PRICE', 'HIGH_PRICE',
    'LOW_PRICE', 'VOLUME', 'PREV_PRICE'
]


def create_update_query(table):
    """This function creates an upsert query which replaces existing data based on primary key conflicts"""
    columns = ', '.join(DATABASE_COLUMNS)
    constraint = 'NSE_CODE'
    placeholder = "%s, to_date(%s, 'MON-DD-YYYY'), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s"
    updates = ', '.join(
        [f'{col} = EXCLUDED.{col}' for col in DATABASE_COLUMNS])
    query = f"""INSERT INTO {table} ({columns}) 
Example #10
def dump_newsletter(filePath, idCustomer, nameCustomer, newsletters, csv): 
    for (k, v) in newsletters.items():
        line = k 
        line += '\t' + commons.get_prop('newsletter_name', v)
        line += '\t' + commons.get_prop('subject', v)
        line += '\t' + commons.get_prop('design_format', v)
        line += '\t' + commons.get_prop('design_title', v)
        line += '\t' + commons.get_prop('logoUrl', v)
        line += '\t' + commons.get_prop('primaryColor', v)
        line += '\t' + commons.get_prop('hour', v)
        line += '\t' + commons.get_prop('min', v)
        line += '\t' + commons.get_prop('hour2', v)
        line += '\t' + commons.get_prop('min2', v)
        line += '\t' + commons.get_prop('valuation_to_show', v)
        line += '\t' + commons.join_prop('orderShowSearch', v, '|')
        line += '\t' + commons.get_prop('grouping', v)
        line += '\t' + commons.get_prop('num_mentions', v)
        line += '\t' + commons.get_prop('email_to', v)
        line += '\t' + commons.get_prop('email_remitent', v)
        line += '\t' + commons.get_prop('selection', v)
        line += '\t' + commons.get_prop('name_remitent', v)
        line += '\t' + commons.get_prop('charset', v)
        line += '\t' + commons.get_prop('type', v)
        line += '\t' + commons.join_prop('days', v, '|')
        if 'email_list_to' in v:
            line += '\t' + str(len(v['email_list_to']))
        else:
            line += '\t'
        line += '\t' + commons.join_prop('email_list_to', v, ';')

        if 'feeds' not in v:
            csv.write(filePath + '\t' + idCustomer + '\t' + nameCustomer + '\t' + commons.normalized(nameCustomer) + '\t' + line + '\t\t\t\t\t\t\t\t\n')
        else:
            feeds = v['feeds']
            for (ke, va) in feeds.items():
                feedLine = ke   
                feedLine += '\t' + commons.get_prop('valuation_to_show', va)
                feedLine += '\t' + commons.get_prop('order_by', va)
                feedLine += '\t' + commons.get_prop('selection', va)
                feedLine += '\t' + commons.get_prop('grouping', va)
                feedName = commons.get_prop('feedName', va)         
                feedLine += '\t' + feedName + '\t' + commons.normalized(feedName)
                feedLine += '\t' + commons.get_prop('num_mentions', va)
                csv.write(filePath + '\t' + idCustomer + '\t' + nameCustomer + '\t' + commons.normalized(nameCustomer) + '\t' + line + '\t' + feedLine + '\n')
Example #11
def extract_all_newsletters(json_paths):
    values = []
    for file_path in json_paths:
        with open(file_path) as json_file:
            try:
                data = json.load(json_file)

                for item in data:
                    if item is None:
                        continue

                    customer_id = commons.get_prop('idCustomer', item)
                    customer_name = commons.get_prop('nameCustomer', item)
                    customer_name_normalized = commons.normalized(
                        customer_name)

                    if 'module' in item:
                        module = item['module']
                        if 'newsletter' in module:
                            counter = 0
                            newsletters = module['newsletter']
                            for (k, v) in newsletters.items():
                                if 'feeds' in v:
                                    counter += 1
                                    newsletter_id = k
                                    newsletter_name = commons.get_prop(
                                        'newsletter_name', v)
                                    subject = commons.get_prop('subject', v)
                                    design_format = commons.get_prop(
                                        'design_format', v)
                                    design_title = commons.get_prop(
                                        'design_title', v)
                                    logoUrl = commons.get_prop('logoUrl', v)
                                    primaryColor = commons.get_prop(
                                        'primaryColor', v)
                                    hour = commons.get_prop('hour', v)
                                    min1 = commons.get_prop('min', v)
                                    hour2 = commons.get_prop('hour2', v)
                                    min2 = commons.get_prop('min2', v)
                                    valuation_to_show = commons.get_prop(
                                        'valuation_to_show', v)
                                    order_by = commons.join_prop(
                                        'orderShowSearch', v, '|')
                                    grouping = commons.get_prop('grouping', v)
                                    num_mentions = commons.get_prop(
                                        'num_mentions', v)
                                    email_to = commons.get_prop('email_to', v)
                                    email_remitent = commons.get_prop(
                                        'email_remitent', v)
                                    selection = commons.get_prop(
                                        'selection', v)
                                    name_remitent = commons.get_prop(
                                        'name_remitent', v)
                                    charset = commons.get_prop('charset', v)
                                    newsletter_type = commons.get_prop(
                                        'type', v)
                                    days = commons.join_prop('days', v, '|')

                                    if 'email_list_to' in v:
                                        nb_list_to = str(
                                            len(v['email_list_to']))
                                        email_list_to = commons.join_prop(
                                            'email_list_to', v, ';')
                                    else:
                                        nb_list_to = '0'
                                        email_list_to = None

                                    feeds = v['feeds']
                                    for (ke, va) in feeds.items():
                                        feed_id = ke
                                        feed_name = commons.get_prop(
                                            'feedName', va)
                                        feed_name_normalized = commons.normalized(
                                            feed_name)
                                        feed_valuation_to_show = commons.get_prop(
                                            'valuation_to_show', va)
                                        feed_order_by = commons.get_prop(
                                            'order_by', va)
                                        feed_selection = commons.get_prop(
                                            'selection', va)
                                        feed_grouping = commons.get_prop(
                                            'grouping', va)
                                        feed_num_mentions = commons.get_prop(
                                            'num_mentions', va)

                                        values.append(
                                            (file_path, customer_id,
                                             customer_name,
                                             customer_name_normalized,
                                             newsletter_id, newsletter_name,
                                             subject, design_format,
                                             design_title, logoUrl,
                                             primaryColor, hour, min1, hour2,
                                             min2, valuation_to_show, order_by,
                                             grouping, num_mentions, email_to,
                                             email_remitent, selection,
                                             name_remitent, charset,
                                             newsletter_type, days, nb_list_to,
                                             email_list_to, feed_id,
                                             feed_valuation_to_show,
                                             feed_order_by, feed_selection,
                                             feed_grouping, feed_name,
                                             feed_name_normalized,
                                             feed_num_mentions))
                            print('extracting {0} newsletters from {1}: OK'.
                                  format(counter, file_path))
            except Exception as ex:
                print('ERROR extracting newsletters from {0}: {1}'.format(
                    file_path, str(ex)))
    return values
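A possible driver for extract_all_newsletters, mirroring the tab-separated ./output convention of the other examples; the input glob and output file name are assumptions:

import csv
import glob

json_paths = glob.glob('./input/*.json')  # assumed input location
rows = extract_all_newsletters(json_paths)
with open('./output/json_newsletter_feeds.csv', 'w', newline='', encoding='utf-8') as out:  # hypothetical output name
    csv.writer(out, delimiter='\t').writerows(rows)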