Beispiel #1
0
def upload(params):
    """Convert the uploaded spreadsheet(s) to CSV and load them chunk by chunk.

    Reads ``date`` and ``path`` from ``params``; the list of files to process
    and the reset flag come from ``ModelUpload.getPath``. After all files are
    loaded, indexes are generated, the issue options are saved and the
    completion callback is triggered with a ``result`` summary string.

    NOTE(review): ``chunksize``, ``save`` and ``reset`` are not defined in this
    function -- presumably module-level names; confirm against the module.
    """
    Debug = DebugManager.DebugManager()
    Debug.start()
    Debug.trace('start')
    dbManager = SharedMemoryManager.getInstance()
    db = dbManager.query()
    date = fn.getNestedElement(params, 'date')
    path = fn.getNestedElement(params, 'path')
    paths, should_reset = ModelUpload.getPath(params)

    for position, source_path in enumerate(paths):
        processed_filename = File.converExcelFileToCsv(source_path,
                                                       ignore_index=True)
        Logger.v('processed_filename', processed_filename)
        trace_message = 'convert to json : path {0}'.format(processed_filename)
        Debug.trace(trace_message)

        # The reset happens at most once, right before the first file is loaded.
        is_first_file = position == 0
        if is_first_file and should_reset:
            Logger.v('Reset Database.')
            reset(date)  # stock_issue collection
            ModelSIIntegrity.reset(date)  # stock_issue_datalog for the given date

        File.readCsvFileInChunks(processed_filename,
                                 save,
                                 params,
                                 chunksize=chunksize)
        Debug.trace('uploaded to mongo.')

    generateIndex()
    ModelSIIntegrity.generateIndex()
    Debug.trace('indexing mongo collection.')

    saveIssueOption()
    Debug.trace('save option to json.')

    # The callback receives a copy so the caller's params stay untouched.
    trigger_params = copy.deepcopy(params)
    data_count = params['data_count'][path]
    trigger_params['result'] = 'data count: {0}'.format(data_count)

    # Flush whatever bulk operations are still queued before notifying.
    dbManager.executeBulkOperations(None)
    ReportStock.triggerOnComplete(trigger_params)
    Debug.trace('trigger api on complete.')
    Debug.end()
    Debug.show('Stock.upload')
Beispiel #2
0
def stockIntegrity(params, data):
    """Return the integrity records for the retrieval window, sorted by name.

    The window spans from ``date_retrieve_limit`` days ago up to now (in the
    ``msia_tz`` timezone). ``data`` supplies the ``state`` and
    ``state_facility`` entries. The ``params`` argument itself is not read.

    Returns:
        The list produced by ``getIntegrity`` (after ``updateStateData`` has
        been applied to it), sorted ascending by each entry's ``'name'``.
    """
    Debug = DebugManager.DebugManager()
    Debug.start()

    now = DateTime.now(tzinfo=msia_tz)
    window_start = DateTime.getDaysAgo(date_retrieve_limit, datefrom=now)
    # A 24 hour offset is used so that today is included in the range.
    between = DateTime.getBetween([window_start, now],
                                  element='date',
                                  offset=24)
    durations = between['order']

    state_data = fn.getNestedElement(data, 'state')
    facility_data_by_state = fn.getNestedElement(data, 'state_facility')
    check_data = combinedFacilityList(data=facility_data_by_state)

    integrity_params = {'durations': durations}
    integrity_data = {
        'facility': facility_data_by_state,
        'state': state_data,
        'to_update': [],
        'check_data': check_data,
    }
    integrity = getIntegrity(params=integrity_params, data=integrity_data)
    updateStateData(integrity)

    ordered = sorted(integrity, key=lambda entry: entry['name'])
    Debug.end()
    Debug.show('Model.Structure.stockIntegrity')
    return ordered
Beispiel #3
0
def createSchedules(args=None):  # upid, page_type
    """Queue one crawl job per page for every enabled platform.

    Args:
        args: Optional dict. Keys read here:
            ``pages.<platform>`` -- list of pages to queue for that platform;
            ``page_type`` -- comma separated platforms to restrict to;
            ``priority`` -- queue priority, defaults to ``'daily'``;
            ``crawl_comment`` -- forwarded to the queue when not ``None``.
            Omitting it (or passing ``None``) behaves like an empty dict.

    Returns:
        ``Params.generate(True, {...})`` carrying ``pending_count`` (jobs
        queued by this call) and ``incomplete_count`` (from ``checkRemaining``).
    """
    # A fresh dict per call instead of a shared mutable default argument.
    if args is None:
        args = {}

    Debug = DebugManager.DebugManager()
    Debug.start()
    dbManager = SharedMemoryManager.getInstance()
    incomplete_task, incomplete_task_count = checkRemaining()
    new_queue_count = 0

    # Only forward the comment when the caller actually supplied one.
    crawl_comment = fn.getNestedElement(args, 'crawl_comment', None)
    extra_params = {}
    if crawl_comment is not None:
        extra_params['crawl_comment'] = crawl_comment

    # The priority does not depend on the page, so look it up once.
    priority = fn.getNestedElement(args, 'priority', 'daily')

    for platform in filter_page_type:
        if args:
            # Skip platforms that an explicit page_type filter does not list.
            requested = fn.getNestedElement(args, 'page_type',
                                            platform).split(',')
            if platform not in requested:
                Logger.v('Skip Platform:%s' % (platform))
                continue

        pages = fn.getNestedElement(args, 'pages.{0}'.format(platform), [])
        Logger.v('platform', platform)
        for page in pages:  # one queue entry per page
            Queue.create(page,
                         extra_params=extra_params,
                         priority=priority,
                         batch=True)
            new_queue_count += 1
            Logger.v('new_queue_count', new_queue_count)

    Logger.v('Incomplete:%s, New Queue: %s' %
             (incomplete_task_count, new_queue_count))

    warning_limit = int(fn.config['DEBUG_CRAWL_WARNING'])
    if (incomplete_task_count > new_queue_count * warning_limit / 100
            or incomplete_task_count > warning_limit):
        # TODO(review): the e-mail notification that used to report
        # `incomplete_task` here is disabled, so crossing the threshold
        # currently has no effect.
        pass

    result = {
        'pending_count': new_queue_count,
        'incomplete_count': incomplete_task_count
    }
    dbManager.executeBulkOperations(None)
    return Params.generate(True, result)
Beispiel #4
0
def getResult2(action, params):
    """Fetch the report data for ``action`` from the matching model.

    Args:
        action: ``'procurement'`` or ``'budget'``.
        params: Passed through to the model's ``get`` as ``params=``.

    Returns:
        Whatever ``ModelProcurement.get`` / ``ModelBudget.get`` returns.

    Raises:
        ValueError: If ``action`` is not one of the supported values.
            Previously this surfaced as an ``UnboundLocalError`` on the
            ``return`` line because ``result`` was never assigned.
    """
    if action not in ('procurement', 'budget'):
        raise ValueError('Unsupported action: {0!r}'.format(action))

    Debug = DebugManager.DebugManager()
    Debug.start()
    Debug.trace('start')
    if action == 'procurement':
        result = ModelProcurement.get(params=params)
    else:  # 'budget'
        result = ModelBudget.get(params=params)
    Debug.end()
    Debug.show('Report.getResult2')
    return result
Beispiel #5
0
def check(params):
    """Read stock issue data, aggregate it by the chosen group and return it.

    The grouping is the ``id`` of the last entry in ``params['group_by']``,
    falling back to ``'state'``. When ``params['export']`` is truthy the
    structured result is converted with ``generateExcelStructure`` first.

    Returns:
        ``Params.generate(True, payload)``; the payload is ``[]`` when there
        is no data or the structured result is an empty dict.
    """
    # NOTE: the selected grouping is written to the module-level `group_by`.
    global group_by
    Debug = DebugManager.DebugManager()
    Debug.start()
    Debug.trace('start')

    group_by_list = fn.getNestedElement(params, 'group_by', [])
    filter_quantity = fn.getNestedElement(params, 'filter.quantity', [])
    export = fn.getNestedElement(params, 'export', None)
    custom_params = copy.deepcopy(params)

    # Filter and read from the database.
    data = ModelStockIssue.get(params)
    Debug.trace('read mongo')

    # The deepest requested group wins; default to grouping by state.
    group_by = group_by_list[-1]['id'] if group_by_list else 'state'
    custom_params.update({
        'group_by_key': group_by,
        'process_order': process_order[group_by],
        'key_to_join': key_to_join[group_by],
        'item_key_to_show': item_key_to_show[group_by],
    })

    result = {}
    if data:
        temp_result = ModelStockIssue.calculateData(params=custom_params,
                                                    data=data)
        Debug.trace('calculate data')
        result = toOutputStructure(params=custom_params, data=temp_result)
        Debug.trace('structure data')

    # Callers receive a list, not a dict, when nothing was produced.
    if result == {}:
        result = []

    Debug.end()
    Debug.show('StockIssue.run')

    if export:
        payload = generateExcelStructure(params=custom_params, data=result)
    else:
        payload = result
    return Params.generate(True, payload)
def get(params):
    """Run the three report stages: fetch, calculate, restructure.

    Returns:
        The value of ``toOutputStructure`` called with the raw data under
        ``'data'`` and the aggregated data under ``'calculated_data'``.
    """
    Debug = DebugManager.DebugManager()
    Debug.start()
    Debug.trace('start')

    report_name = Report.getReportName(params)  # value is not used below

    Logger.v('-----Start Getting Data-----')
    raw = getData(params=params)
    Debug.trace('get data')

    Logger.v('-----Start Calculate Data-----')
    aggregated = calculateData(params=params, data=raw)
    Debug.trace('calculate data')

    Logger.v('-----Start Restructure Data-----')
    stage_input = {
        'data': raw,
        'calculated_data': aggregated,
    }
    structured = toOutputStructure(params=params, data=stage_input)
    Debug.trace('structure data')

    Debug.end()
    Debug.show('Model.Procurement.get')
    return structured
Beispiel #7
0
def get(action, params):
    """Dispatch a report request to the handler for ``action``.

    Args:
        action: One of ``'whitelist'``, ``'procurement'``, ``'budget'`` or
            ``'stockcheck'``.
        params: Request parameters handed to the selected handler.

    Returns:
        ``Params.generate(True, result)`` wrapping the handler's result.

    Raises:
        ValueError: If ``action`` is not supported. Previously this surfaced
            as an ``UnboundLocalError`` because ``result`` was never assigned.
    """
    Debug = DebugManager.DebugManager()
    Debug.start()
    if action == 'whitelist':
        result = getWhitelist(params)
    elif action in ('procurement', 'budget'):
        # Both report types share the same code path.
        Logger.v('action:', action)
        result = getResult2(action, params)
    elif action == 'stockcheck':
        Logger.v('action', action)
        result = Stock.run(params)
    else:
        raise ValueError('Unsupported action: {0!r}'.format(action))

    Debug.end()
    Debug.show('Report.get')
    return Params.generate(True, result)
Beispiel #8
0
def run(params):
    """Announce the crawl by mail, create the schedule and optionally crawl.

    Steps: send a start mail, record the start time, refresh the dropdown
    options, build the crawl parameters, create the schedules from them and,
    when ``schedule_params.start_crawl`` is truthy, start the crawl. The end
    time is recorded afterwards.
    """
    subject = '{0} Crawl - Store starts'.format(
        DateTime.getReadableDate(DateTime.now()))
    body = 'Start at: {0}'.format(DateTime.now(tzinfo=msia_tz))
    Mail.send(subject, body)

    recordTime(key='create_schedule_start_time')
    Debug = DebugManager.DebugManager()
    Debug.start()

    start_crawl = fn.getNestedElement(params, 'schedule_params.start_crawl',
                                      False)
    check_empty = fn.getNestedElement(params, 'schedule_params.check_empty',
                                      False)  # read but not used here

    Logger.v('Creating schedule:')
    updateDropdownOptions(params=params)

    crawl_params = generateCrawlParam(params)
    Debug.trace('Generate Crawl Params')

    schedule_args = {'pages': crawl_params}
    createSchedules(schedule_args)
    Debug.trace('Create Schedule')

    if start_crawl:
        Crawl.start(params)
        Debug.trace('crawling')

    recordTime(key='create_schedule_end_time')
    Debug.show('Run')
Beispiel #9
0
def extractSearch():
    """Upload two CSV files through ``Item.upload`` and show a sample search.

    The first file is uploaded with default options, the second with
    ``new=False``. Afterwards items matching the text ``'oxygen'`` are looked
    up with ``Item.find`` and printed via ``fn.show``.
    """
    from lib import File, Logger, DebugManager
    from Report import Item, Stock
    Debug = DebugManager.DebugManager()
    Debug.start()
    Debug.trace('start')

    # (path, extra keyword arguments) for each upload, in execution order.
    uploads = (
        ('upload/desk.csv', {}),
        ('upload/desh.csv', {'new': False}),
    )
    for csv_path, options in uploads:
        Item.upload(csv_path, **options)

    search_query = {
        'search_text': 'oxygen',
    }
    matches = Item.find(search_query)
    fn.show(matches)

    Debug.trace('end')
    Debug.end()
    Debug.show('Daniel.py')
Beispiel #10
0
def toOutputStructure(params, data):
    """Nest the flat report rows into a tree following the report's group order.

    Rows are grouped level by level using ``global_group_order[report_name]``
    (each group name is mapped to a row field via
    ``global_group_order_kepmap``). The ``budget`` report nests five levels
    deep, ``procurement`` three; any other report name yields ``{}``.

    Every node has ``id``/``name``/``code`` looked up in ``calculated_data``
    under the underscore-joined path of group values, a ``total`` from
    ``generateSummary`` and its children. Children of the deepest level come
    from ``generateChildren`` and are stored under ``'children'``; on every
    other level they are stored under the name of the next group. Nodes
    without children are dropped.

    Args:
        params: Report parameters, forwarded to the helper functions.
        data: Dict with ``'data'`` (list of row dicts) and
            ``'calculated_data'`` (lookup keyed by joined group values).

    Returns:
        ``{report_name: {'group_order': [...], 'total': ..., <first group>:
        [...]}}`` or ``{}`` when no node survived.
    """
    Debug = DebugManager.DebugManager()
    Debug.start()
    Debug.trace('start')
    calculated_data = fn.getNestedElement(data, 'calculated_data')
    report_name = Report.getReportName(params)
    group_keys = global_group_order[report_name]
    # Number of grouping levels that are expanded for each report type.
    depth = {'budget': 5, 'procurement': 3}.get(report_name, 0)
    # Work on a copy so the helpers cannot alter the caller's rows; missing
    # data is treated as "no rows".
    rows = copy.deepcopy(fn.getNestedElement(data, 'data')) or []

    def lookup(join_key, attribute):
        # Read one precomputed attribute of the node identified by join_key.
        return fn.getNestedElement(calculated_data,
                                   '{0}.{1}'.format(join_key, attribute))

    def build(level_rows, level, path):
        # Return the nodes of grouping `level` for the rows below `path`.
        field = global_group_order_kepmap[group_keys[level]]
        is_last_level = level == depth - 1
        nodes = []
        for value in sorted({row[field] for row in level_rows}):
            subset = [row for row in level_rows if row[field] == value]
            node_path = path + [value]
            join_key = '_'.join(node_path)
            if is_last_level:
                child_key = 'children'
                children = generateChildren(params=params, data=subset)
            else:
                child_key = group_keys[level + 1]
                children = build(subset, level + 1, node_path)
            if children:
                nodes.append({
                    'id': lookup(join_key, 'id'),
                    'name': lookup(join_key, 'name'),
                    'code': lookup(join_key, 'code'),
                    'total': generateSummary(params=params, data=children),
                    child_key: children,
                })
        return nodes

    result = {}
    top_level = build(rows, 0, []) if depth else []
    if top_level:
        result[report_name] = {
            'group_order': group_keys,
            'total': generateSummary(params=params, data=top_level),
            group_keys[0]: top_level,
        }
    Debug.end()
    Debug.show('toOutputStructure')
    return result
Beispiel #11
0
def calculateData(params, data):
    """Aggregate the report totals for every unique group key.

    For each key from ``generateUniqueKeys`` the rows are filtered with the
    filter function matching the key's depth (number of ``_`` separated
    parts). Groups with at least one row get an entry holding the item's
    ``id``/``name``/``code`` and the sum of every key listed in
    ``global_report_key_update[report_name]``.

    When a total key is missing from the rows, ``total_allocation`` and
    ``balance_amount`` are derived from totals computed earlier in the same
    loop; other missing keys are left out.

    Args:
        params: Report parameters; a deep copy is extended with ``split_uk``
            and passed to the helpers.
        data: Re-iterable collection of row dicts.

    Returns:
        Dict mapping each unique key to
        ``{'id', 'name', 'code', 'total': {...}}``.
    """
    Debug = DebugManager.DebugManager()
    Debug.start()
    Debug.trace('start')
    custom_params = copy.deepcopy(params)
    report_name = Report.getReportName(params)
    result = {}
    unique_keys = generateUniqueKeys(params, data)
    Debug.trace('get unique_keys')
    # The set of totals depends only on the report, not on the group key.
    total_keys = global_report_key_update[report_name]

    for uk in unique_keys:
        split_uk = uk.split('_')
        custom_params['split_uk'] = split_uk

        functions = generateFilterFunction(params=custom_params)
        function_name = 'function_{0}'.format(len(split_uk))
        filtered_data = list(filter(functions[function_name], data))
        if not filtered_data:
            continue

        item = generateItemDetail(params=custom_params, data=filtered_data)
        totals = {}
        result[uk] = {
            'id': item['id'],
            'name': item['name'],
            'code': item['code'],
            'total': totals,
        }
        for tk in total_keys:
            try:
                totals[tk] = sum(obj_[tk] for obj_ in filtered_data)
            except KeyError:
                # The rows do not carry this key; derive it from totals
                # that were summed earlier in this loop.
                if tk == 'total_allocation':
                    totals[tk] = (totals['first_allocation'] +
                                  totals['additional_allocation'])
                elif tk == 'balance_amount':
                    totals[tk] = (totals['first_allocation'] +
                                  totals['additional_allocation'] -
                                  totals['pending_amount'] -
                                  totals['liablity_amount'] -
                                  totals['utilized_amount'])

    Debug.end()
    Debug.show('Report.calculateData')
    return result
Beispiel #12
0
def generateCrawlParam(params):
    """Build the list of crawl descriptors for every requested report.

    Values read from ``params``: ``keys.report`` (default budget and
    procurement), ``interval``, ``filter.facility_code``,
    ``schedule_params.check_empty`` and ``schedule_params.today``.

    For ``budget`` one descriptor is produced per financial year and state
    (and per facility when facility filtering is on); for ``procurement`` one
    per date range and state (and facility). Duplicated descriptors are
    skipped. When the report's collection already holds documents only the
    first past date range is used, otherwise all of them.

    Each report's descriptors are also written with ``fn.writeExcelFile``
    under ``test_folder``.

    Returns:
        Dict mapping report key to its list of descriptor dicts.
    """
    Debug = DebugManager.DebugManager()
    Debug.start()
    dbManager = SharedMemoryManager.getInstance()
    db = dbManager.query()
    crawl_params = {}
    limit_for_test = 10  # at most this many facilities are expanded
    report_keys = fn.getNestedElement(params, 'keys.report',
                                      ['budget', 'procurement'])
    interval = fn.getNestedElement(params, 'interval', 1)
    filter_facility_code = fn.getNestedElement(params, 'filter.facility_code',
                                               True)
    check_empty = fn.getNestedElement(params, 'schedule_params.check_empty',
                                      False)
    today = fn.getNestedElement(
        params, 'schedule_params.today',
        DateTime.toString(DateTime.now(tzinfo=msia_tz)))

    # With check_empty the past ranges are anchored at `today`.
    past_date_options = {
        'count': pass_month_quantity,
        'duration': interval,
    }
    if check_empty:
        past_date_options['end'] = DateTime.convertDateTimeFromString(today)
    past_dates = DateTime.getPastDate(**past_date_options)

    state_codes = retrieveOption(collection_name='state',
                                 show_keys=['state_code'],
                                 hide_keys=['_id'])
    state_code = extractListByKey(data=state_codes, key='state_code')
    facility_codes = retrieveOption(collection_name='facility',
                                    show_keys=['facility_code'],
                                    hide_keys=['_id'])
    facility_code = extractListByKey(data=facility_codes, key='facility_code')

    for key in report_keys:
        Debug.trace()
        entries = crawl_params.setdefault(key, [])

        def enqueue(entry):
            # Append the descriptor unless an identical one is already queued.
            if entry not in entries:
                entries.append(entry)

        # An empty collection gets every past range, otherwise only the first.
        existing = list(db[key].find({}, {}))
        if existing:
            dates = past_dates[0][:1]
        else:
            dates = past_dates[0][:]

        year = extractYear(data=dates)

        if key == 'budget':
            for y in year:
                for sc in state_code:
                    if not filter_facility_code:
                        enqueue({
                            'financial_year': y,
                            'state_code': sc,
                            'page_type': key,
                            'upid': '_'.join([sc, y]),
                            'url': api_links[key].format(sc, y, ''),
                            'start_date': today,
                            'end_date': today,
                        })
                        continue
                    for fc in facility_code[:limit_for_test]:
                        enqueue({
                            'financial_year': y,
                            'state_code': sc,
                            'page_type': key,
                            'upid': '_'.join([sc, y, fc]),
                            'facility_code': fc,
                            'url': api_links[key].format(sc, y, fc),
                            'start_date': today,
                            'end_date': today,
                        })

        elif key == 'procurement':
            for past_duration in dates:
                start_date = DateTime.toString(
                    DateTime.getDaysAgo(days_to_crawl=-1,
                                        datefrom=past_duration[0]))
                end_date = DateTime.toString(
                    DateTime.getDaysAgo(days_to_crawl=1,
                                        datefrom=past_duration[1]))
                # The URL takes the dates without dashes.
                url_start = start_date.replace('-', '')
                url_end = end_date.replace('-', '')
                for sc in state_code:
                    if not filter_facility_code:
                        enqueue({
                            'state_code': sc,
                            'start_date': start_date,
                            'end_date': end_date,
                            'page_type': key,
                            'upid': '_'.join([sc, start_date, end_date]),
                            'url': api_links[key].format(
                                sc, url_start, url_end, ''),
                        })
                        continue
                    for fc in facility_code[:limit_for_test]:
                        enqueue({
                            'state_code': sc,
                            'start_date': start_date,
                            'end_date': end_date,
                            'page_type': key,
                            'facility_code': fc,
                            'upid': '_'.join([sc, start_date, end_date, fc]),
                            'url': api_links[key].format(
                                sc, url_start, url_end, fc),
                        })

    for report, descriptors in crawl_params.items():
        fn.writeExcelFile(filename='{0}/{1}'.format(test_folder, report),
                          data=descriptors)
    Logger.v('crawl_params', len(crawl_params))
    Debug.show('Generate Crawl Params')
    return crawl_params
Beispiel #13
0
def sourceKlsescreener(params):
    """Crawl the klsescreener.com report tables for every listed stock code.

    Stock codes are read from ``reference/stock_info/stock_list.csv``. For
    each code the stock page is downloaded, six ``<div>`` sections are parsed
    with ``getQuarterData`` / ``getAnnualData``, and the parsed result is
    written to ``crawled_data/reports/<code>.json``. A per-code row count is
    collected into a summary sheet; codes whose page could not be downloaded
    are collected into an error sheet.

    Args:
        params: Not read by this function; kept for interface compatibility.

    Returns:
        True once the summary and error sheets have been written.
    """
    Debug = DebugManager.DebugManager()
    Debug.start()
    Debug.trace('start')

    # Number of sleeps-and-retries allowed after the first failed request
    # (i.e. up to max_retries + 1 attempts per stock code).
    max_retries = 4
    # Assigned before the loop so the summary/error paths below are always
    # defined, even when the stock list is empty or every download failed.
    save_dir = 'crawled_data/reports'
    # (result key, parser); the result key is also the id of the <div> that
    # holds the section on the page.
    sections = (
        ('quarter_reports', getQuarterData),
        ('annual', getAnnualData),
        ('dividends', getQuarterData),
        ('capital_changes', getQuarterData),
        ('warrants', getQuarterData),
        ('shareholding_changes', getQuarterData),
    )

    stock_list = pd.read_csv('reference/stock_info/stock_list.csv')
    stock_codes = stock_list['code'].unique().tolist()
    Debug.trace('read stock list')
    fn.ensureDirectory(save_dir)
    summary = []
    error = []
    for code in stock_codes:
        url = 'https://www.klsescreener.com/v2/stocks/view/{0}'.format(code)
        Logger.v('crawling:', url)

        # page stays None until a request succeeds; it is still None after
        # the loop when the retry budget has been used up.
        page = None
        error_count = 0
        while page is None:
            try:
                page = requests.get(url)
            except requests.exceptions.RequestException:
                if error_count >= max_retries:
                    break
                print("Connection refused by the server..")
                print("Let me sleep for 5 seconds")
                print("ZZzzzz...")
                time.sleep(5)
                print("Was a nice sleep, now let me continue...")
                error_count += 1

        if page is None:
            # Download failed for good: record the code and move on instead
            # of trying to parse a missing response.
            error.append({'code': code})
            continue

        # Pause between successful requests before parsing the page.
        time.sleep(3)
        soup = BeautifulSoup(page.text, 'html.parser')
        result = {}
        for section, parser in sections:
            result[section] = parser(data=soup.find_all('div', id=section))
        Debug.trace('crawl report')

        save_file = '{0}/{1}.json'.format(save_dir, code)
        fn.writeJSONFile(save_file, result)

        # One summary row per stock: how many entries each section yielded.
        row = {'code': code}
        for section, _ in sections:
            row[section] = len(result[section])
        summary.append(row)
        Debug.trace('save report')

    summary_file = '{0}/summary/klsescreener_report'.format(save_dir)
    fn.writeExcelFile(filename=summary_file, data=summary)
    Debug.trace('save summary')
    error_file = '{0}/error/klsescreener_report'.format(save_dir)
    fn.writeExcelFile(filename=error_file, data=error)
    Debug.trace('save error')
    Debug.end()
    Debug.show('sourceKlsescreener')
    return True
Beispiel #14
0
def sourceBursaMalaysia(params):
    """Crawl the Bursa Malaysia equities price listing into a CSV file.

    The landing page supplies the total page count and the date / time
    labels (the text of the first ``<div>`` whose class list matches each
    expected list exactly). Every listing page is then fetched, the price
    table rows are turned into dicts keyed by the table headers plus
    ``date`` and ``time``, and the rows are written to
    ``crawled_data/price/<date>_<time>_price.csv`` and handed to
    ``saveStockList``.

    Args:
        params: Not read by this function.

    Returns:
        True after the CSV and the stock list have been saved.
    """
    Debug = DebugManager.DebugManager()
    Debug.start()
    Debug.trace('start')

    # Exact class lists used to recognise the relevant elements.
    date_classes = ['col', 'bg-white', 'mr-1', 'mb-0', 'h5', 'bold', 'p-2']
    time_classes = ['col', 'bg-white', 'h5', 'mb-0', 'bold', 'p-2']
    price_table_classes = [
        'table', 'datatable-striped', 'text-center', 'equity_prices_table',
        'datatable-with-sneak-peek', 'js-anchor-price-table'
    ]

    url = 'https://www.bursamalaysia.com/market_information/equities_prices'
    landing = requests.get(url)
    landing_soup = BeautifulSoup(landing.text, 'html.parser')
    total_page = int(landing_soup.find('li', id='total_page').get('data-val'))
    limit_page = None  # slice bound for the page range; None means no limit
    Logger.v('total pages', total_page)

    # Keep only the first match for each label.
    price_date = ''
    price_time = ''
    for div in landing_soup.find_all('div'):
        div_classes = div.get('class')
        if price_date == '' and div_classes == date_classes:
            price_date = div.get_text().strip()
        if price_time == '' and div_classes == time_classes:
            price_time = div.get_text().strip()
    Logger.v('date', price_date, 'time', price_time)

    rows = []
    Debug.trace('get detail')
    for page_no in range(1, total_page + 1)[:limit_page]:
        fn.printProgressBar(iteration=page_no,
                            total=total_page + 1,
                            prefix='Page:{0}'.format(page_no))
        url_with_page = 'https://www.bursamalaysia.com/market_information/equities_prices?page={0}'.format(
            page_no)
        listing = requests.get(url_with_page)
        listing_soup = BeautifulSoup(listing.text, 'html.parser')
        for table in listing_soup.find_all('table'):
            # Only the price table is of interest; skip everything else.
            if table.get('class') != price_table_classes:
                continue
            headers = getTableHeader(data=table)
            headers += ['date', 'time']
            for row in getTableContent(data=table):
                row += [price_date, price_time]
                rows.append(dict(zip(headers, row)))

    Debug.trace('crawl price')
    save_dir = 'crawled_data/price'
    filename = '{0}_{1}_price.csv'.format(price_date, price_time)
    fn.ensureDirectory(save_dir)
    save_file = '{0}/{1}'.format(save_dir, filename)
    pd.DataFrame(rows).to_csv(save_file, index=False)
    Logger.v('saved {0}'.format(save_file))
    Debug.trace('save price')

    saveStockList(data=rows)
    Debug.trace('save stock list')
    Debug.end()
    Debug.show('sourceBursaMalaysia')
    return True
Beispiel #15
0
def calculateData(params, data):
    """Build a nested result dict following ``params['process_order']``.

    ``data`` is loaded into a string-typed DataFrame, passed through
    ``preprocessDataframe`` and ``groupDataframe``, and then grouped
    cumulatively by the first 1, 2, ... columns of ``process_order``.

    * Level 0 creates ``result[group_key] = {'id', 'name', 'code'}``.
    * Each deeper level stores the ``obj_`` returned by ``insertNthChild``
      at ``parent[level][po][code]``, where ``level`` / ``code`` are the last
      two components of the group key and ``parent`` is the node reached by
      walking ``result`` through the earlier key components.
    * For the last column in ``process_order`` the values named in
      ``params['item_key_to_show']`` are copied from the returned ``info``.

    Args:
        params: Settings; ``process_order`` and ``item_key_to_show`` are read
            here, and a deep copy is forwarded to the helper functions.
        data: Sliceable sequence of records accepted by ``pd.DataFrame``.

    Returns:
        The nested ``result`` dict described above.
    """
    global naming_keymap
    Debug = DebugManager.DebugManager()
    Debug.start()
    Debug.trace('start')
    item_key_to_show = fn.getNestedElement(params, 'item_key_to_show')
    process_order = fn.getNestedElement(params, 'process_order')
    custom_params = copy.deepcopy(params)
    result = {}

    df = pd.DataFrame(data[:]).astype(str)
    df = preprocessDataframe(params=custom_params, data=df)
    summation_df = groupDataframe(params=custom_params, data=df)
    Debug.trace('dataframe process')

    for idx, po in enumerate(process_order):
        # Group by this column together with every column before it.
        group_po = list(process_order[:idx + 1])
        custom_params['po'] = {
            'po': po,
            'naming_keymap': naming_keymap,
        }
        grouped_df = df.groupby(group_po).groups
        Logger.v('len', idx, 'th', len(grouped_df.keys()))
        if idx == 0:
            for gk in grouped_df.keys():
                code = gk.split('|')[-1]
                name = df[df[po] == gk][naming_keymap[po]].unique().tolist()[0]
                result[gk] = {
                    'id': fn.convertToSnakecase(gk),
                    'name': name,
                    'code': code,
                }
        else:
            # Hoisted: whether this is the last column does not change
            # while iterating over the group keys.
            is_last = process_order[-1] == po
            for gk in grouped_df.keys():
                level = gk[-2]
                code = gk[-1]

                if idx == 1:
                    temp_result = result
                else:
                    # Walk down to the node that owns `level`: alternate
                    # between a group-key component and the column name
                    # under which its children are stored. Deriving the path
                    # from idx (instead of hard-coding depths 2 and 3) keeps
                    # the lookup correct for any number of levels.
                    path_parts = []
                    for depth in range(idx - 1):
                        path_parts.append(str(gk[depth]))
                        path_parts.append(str(process_order[depth + 1]))
                    temp_result = fn.getNestedElement(result,
                                                      '.'.join(path_parts))

                if po not in temp_result[level]:
                    temp_result[level][po] = {}
                if code not in temp_result[level][po]:
                    temp_result[level][po][code] = {}

                custom_params['po'].update({
                    'gk': code,
                })
                if is_last:
                    last_child_data = insertNthChild(params=custom_params,
                                                     data=summation_df,
                                                     is_last=True)
                    info = last_child_data['info']
                    temp_result[level][po][code] = last_child_data['obj_']
                    # add extra info by group_by
                    for ik in item_key_to_show:
                        temp_result[level][po][code].update({
                            ik:
                            info[ik].values[0],
                        })
                else:
                    last_child_data = insertNthChild(params=custom_params,
                                                     data=summation_df)
                    temp_result[level][po][code] = last_child_data['obj_']
        Debug.trace('{0}th'.format(idx))

    Debug.end()
    Debug.show('Model.Stock.calculateData')
    return result