def upload(params):
    Debug = DebugManager.DebugManager();
    Debug.start();
    Debug.trace('start');
    dbManager = SharedMemoryManager.getInstance();
    db = dbManager.query();
    date = fn.getNestedElement(params, 'date');
    path = fn.getNestedElement(params, 'path');
    # url = fn.getNestedElement(params, 'callback_url'); # required params to handle callback_url
    paths, should_reset = ModelUpload.getPath(params);
    for idx in range(0, len(paths)):
        p = paths[idx];
        processed_filename = File.converExcelFileToCsv(p, ignore_index=True);
        Logger.v('processed_filename', processed_filename);
        Debug.trace('convert to json : path {0}'.format(processed_filename));
        if idx == 0 and should_reset: # reset once at the beginning
            Logger.v('Reset Database.');
            reset(date); # reset stock_issue collection
            ModelSIIntegrity.reset(date); # reset stock_issue_datalog by date given
        File.readCsvFileInChunks(processed_filename, save, params, chunksize=chunksize);
        Debug.trace('uploaded to mongo.');
    generateIndex();
    ModelSIIntegrity.generateIndex();
    Debug.trace('indexing mongo collection.');
    saveIssueOption();
    Debug.trace('save option to json.');
    trigger_params = copy.deepcopy(params);
    trigger_params['result'] = 'data count: {0}'.format(params['data_count'][path]);
    # Logger.v('trigger_params', trigger_params);
    dbManager.executeBulkOperations(None); # Insert all the remaining jobs at once.
    ReportStock.triggerOnComplete(trigger_params);
    Debug.trace('trigger api on complete.');
    Debug.end();
    Debug.show('Stock.upload');
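# A minimal usage sketch for upload(); the keys mirror the commented Stock.upload() call
# in extractSearch(), while 'data_count' is an assumption about the per-path row count
# that the caller is expected to provide before the completion callback fires.
# upload({
#     'action': 'upload_stock',
#     'id': 1,
#     'date': '2020-03-30',
#     'path': 'upload/desk.csv',             # staging filepath
#     'data_count': {'upload/desk.csv': 0},  # assumed shape, used in the callback message
# })  # requires a live Mongo connection and a reachable callback endpoint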
def stockIntegrity(params, data):
    Debug = DebugManager.DebugManager()
    Debug.start()
    global msia_tz, date_retrieve_limit
    result = []
    today = DateTime.now(tzinfo=msia_tz)
    start_date = DateTime.getDaysAgo(date_retrieve_limit, datefrom=today)
    durations = DateTime.getBetween([start_date, today], element='date', offset=24)['order']  # offset 24 hours to include today
    state_data = fn.getNestedElement(data, 'state')
    facility_data_by_state = fn.getNestedElement(data, 'state_facility')
    check_data = combinedFacilityList(data=facility_data_by_state)
    result = getIntegrity(params={
        'durations': durations,
    }, data={
        'facility': facility_data_by_state,
        'state': state_data,
        'to_update': result,
        'check_data': check_data,
    })
    updateStateData(result)
    result = list(sorted(result, key=lambda k: k['name'], reverse=False))
    Debug.end()
    Debug.show('Model.Structure.stockIntegrity')
    return result
def createSchedules(args={}):  # upid, page_type
    global filter_page_type
    Debug = DebugManager.DebugManager()
    Debug.start()
    dbManager = SharedMemoryManager.getInstance()
    # crawl_duration = fn.getNestedElement(fn.config,'CRAWL_DURATION', 12);
    incomplete_task, incomplete_task_count = checkRemaining()
    new_queue_count = 0
    # Logger.v(incomplete_task_count, incomplete_task, filter_page_type);
    extra_params = {
        'crawl_comment': fn.getNestedElement(args, 'crawl_comment', None)
    }
    extra_params = {k: v for k, v in extra_params.items() if v is not None}
    for platform in filter_page_type:
        if args and platform not in fn.getNestedElement(args, 'page_type', platform).split(','):
            Logger.v('Skip Platform:%s' % (platform))
            continue  # skip when page_type appears and is not the same
        pages = fn.getNestedElement(args, 'pages.{0}'.format(platform), [])
        Logger.v('platform', platform)
        # Logger.v('page', args['pages']['budget']);
        for page in pages:  # create a queue for each page
            # Logger.v('page', page);
            Queue.create(page, extra_params=extra_params,
                         priority=fn.getNestedElement(args, 'priority', 'daily'),
                         batch=True)
            new_queue_count += 1
    Logger.v('new_queue_count', new_queue_count)
    # Debug.trace();
    Logger.v('Incomplete:%s, New Queue: %s' % (incomplete_task_count, new_queue_count))
    if incomplete_task_count > (new_queue_count * int(fn.config['DEBUG_CRAWL_WARNING']) / 100) \
            or incomplete_task_count > int(fn.config['DEBUG_CRAWL_WARNING']):
        # Mail.send('[%s]Incomplete Crawl [%s], Current Schedule: [%s]'%(DateTime.getReadableDate(DateTime.now()),
        #     incomplete_task_count, new_queue_count),
        #     fn.dumps(incomplete_task, encode=False)
        # );
        pass
    result = {
        'pending_count': new_queue_count,
        'incomplete_count': incomplete_task_count
    }
    dbManager.executeBulkOperations(None)
    # Debug.show('Create Schedule');
    return Params.generate(True, result)
def getResult2(action, params):
    Debug = DebugManager.DebugManager()
    Debug.start()
    Debug.trace('start')
    if action == 'procurement':
        result = ModelProcurement.get(params=params)
        # result = Budget.get(params=params);
    elif action == 'budget':
        result = ModelBudget.get(params=params)
        # result = Budget.get(params=params);
    Debug.end()
    Debug.show('Report.getResult2')
    return result
def check(params):
    Debug = DebugManager.DebugManager()
    Debug.start()
    Debug.trace('start')
    global process_order
    global key_to_join
    global group_by
    group_by_list = fn.getNestedElement(params, 'group_by', [])
    filter_quantity = fn.getNestedElement(params, 'filter.quantity', [])
    export = fn.getNestedElement(params, 'export', None)
    custom_params = copy.deepcopy(params)
    result = {}
    # filter and read mongo db
    data = ModelStockIssue.get(params)
    Debug.trace('read mongo')
    # filtering by group
    if group_by_list:
        group_by = group_by_list[-1]['id']
    else:
        group_by = 'state'
    custom_params['group_by_key'] = group_by
    custom_params['process_order'] = process_order[group_by]
    custom_params['key_to_join'] = key_to_join[group_by]
    custom_params['item_key_to_show'] = item_key_to_show[group_by]
    # processing data
    if data:
        temp_result = ModelStockIssue.calculateData(params=custom_params, data=data)
        Debug.trace('calculate data')
        result = toOutputStructure(params=custom_params, data=temp_result)
        Debug.trace('structure data')
    if result == {}:
        result = []
    Debug.end()
    Debug.show('StockIssue.run')
    if export:
        export_result = generateExcelStructure(params=custom_params, data=result)
        return Params.generate(True, export_result)
    else:
        return Params.generate(True, result)
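# Illustrative call for check(), inferred from the fn.getNestedElement() lookups above;
# the group_by id 'state' matches the default used when no group_by is supplied, while the
# other values are placeholders rather than real request data.
# check({
#     'group_by': [{'id': 'state'}],
#     'filter': {'quantity': []},
#     'export': None,  # pass a truthy value to receive the Excel export structure instead
# })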
def get(params):
    Debug = DebugManager.DebugManager()
    Debug.start()
    Debug.trace('start')
    report_name = Report.getReportName(params)
    Logger.v('-----Start Getting Data-----')
    data = getData(params=params)
    Debug.trace('get data')
    Logger.v('-----Start Calculate Data-----')
    calculated_data = calculateData(params=params, data=data)
    Debug.trace('calculate data')
    Logger.v('-----Start Restructure Data-----')
    result = toOutputStructure(params=params, data={
        'data': data,
        'calculated_data': calculated_data
    })
    Debug.trace('structure data')
    Debug.end()
    Debug.show('Model.Procurement.get')
    return result
def get(action, params):
    Debug = DebugManager.DebugManager()
    Debug.start()
    if action == 'whitelist':
        result = getWhitelist(params)
    elif action == 'procurement':
        Logger.v('action:', action)
        # result = getResult(action, params); # XXX
        result = getResult2(action, params)  # XXX
    elif action == 'budget':
        Logger.v('action:', action)
        # result = getResult(action, params); # XXX
        result = getResult2(action, params)  # XXX
    elif action == 'stockcheck':
        Logger.v('action', action)
        result = Stock.run(params)
    Debug.end()
    Debug.show('Report.get')
    return Params.generate(True, result)
def run(params):
    Mail.send('{0} Crawl - Store starts'.format(DateTime.getReadableDate(DateTime.now())),
              'Start at: {0}'.format(DateTime.now(tzinfo=msia_tz)))
    recordTime(key='create_schedule_start_time')
    Debug = DebugManager.DebugManager()
    Debug.start()
    start_crawl = fn.getNestedElement(params, 'schedule_params.start_crawl', False)
    check_empty = fn.getNestedElement(params, 'schedule_params.check_empty', False)
    Logger.v('Creating schedule:')
    updateDropdownOptions(params=params)
    crawl_params = generateCrawlParam(params)
    Debug.trace('Generate Crawl Params')
    createSchedules({'pages': crawl_params})
    Debug.trace('Create Schedule')
    if start_crawl:
        Crawl.start(params)
        Debug.trace('crawling')
    recordTime(key='create_schedule_end_time')
    Debug.show('Run')
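# Illustrative params for run(), inferred from the fn.getNestedElement() lookups in run()
# and generateCrawlParam(); the values are placeholders, not defaults taken from config.
# run({
#     'keys': {'report': ['budget', 'procurement']},
#     'interval': 1,
#     'filter': {'facility_code': True},
#     'schedule_params': {
#         'start_crawl': False,   # set True to kick off Crawl.start() right after scheduling
#         'check_empty': False,
#         'today': '2020-03-30',  # optional; falls back to DateTime.now(tzinfo=msia_tz)
#     },
# })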
def extractSearch():
    from lib import File, Logger, DebugManager
    from Report import Item, Stock
    Debug = DebugManager.DebugManager()
    Debug.start()
    Debug.trace('start')
    # filename = '2020-03-30.xlsx';
    filename = 'upload/desk.csv'
    # fn.ensureDirectory('tests');
    data = Item.upload(filename)
    filename = 'upload/desh.csv'
    # fn.ensureDirectory('tests');
    data = Item.upload(filename, new=False)
    # Stock.upload({
    #     'action': 'upload_stock',
    #     'id': 1,
    #     'date': '2020-03-30',
    #     'path': filename, # staging filepath
    # });
    # Item.save(data);
    result = Item.find({
        'search_text': 'oxygen',
        # 'item_code': 'Y1690580002.00,Y1690130024.00'
        'group_by': 'sub_group_desc'
    })
    fn.show(result)
    # for d in result['data']:
    #     Logger.v(d);
    # File.writeJSONFile('search.json', data);
    Debug.trace('end')
    Debug.end()
    Debug.show('Daniel.py')
def toOutputStructure(params, data):
    Debug = DebugManager.DebugManager()
    Debug.start()
    Debug.trace('start')
    new_data = fn.getNestedElement(data, 'data')
    calculated_data = fn.getNestedElement(data, 'calculated_data')
    report_name = Report.getReportName(params)
    min_purchase_amount = fn.getNestedElement(params, 'filter.min_purchase_amount', 0)
    if type(min_purchase_amount) == str:
        min_purchase_amount = int(min_purchase_amount)
    total_keys = global_report_key_update[report_name]
    result = {}
    all_keys = global_group_order[report_name]
    keys = {}
    for idx in range(0, len(all_keys)):
        ak = all_keys[idx]
        i = str(idx)
        keys['key' + i] = ak
    new_data_0 = copy.deepcopy(new_data)
    portion_0 = []
    unique_values0 = sorted(list(set(obj_[global_group_order_kepmap[keys['key0']]] for obj_ in new_data_0)))
    for uv0 in unique_values0:
        new_data_1 = list(filter(lambda d: d[global_group_order_kepmap[keys['key0']]] == uv0, new_data_0))
        portion_1 = []
        unique_values1 = sorted(list(set(obj_[global_group_order_kepmap[keys['key1']]] for obj_ in new_data_1)))
        join_key_1 = '_'.join([uv0])
        for uv1 in unique_values1:
            new_data_2 = list(filter(lambda d: d[global_group_order_kepmap[keys['key1']]] == uv1, new_data_1))
            portion_2 = []
            unique_values2 = sorted(list(set(obj_[global_group_order_kepmap[keys['key2']]] for obj_ in new_data_2)))
            join_key_2 = '_'.join([uv0, uv1])
            if report_name == 'budget':
                for uv2 in unique_values2:
                    new_data_3 = list(filter(lambda d: d[global_group_order_kepmap[keys['key2']]] == uv2, new_data_2))
                    portion_3 = []
                    unique_values3 = sorted(list(set(obj_[global_group_order_kepmap[keys['key3']]] for obj_ in new_data_3)))
                    join_key_3 = '_'.join([uv0, uv1, uv2])
                    for uv3 in unique_values3:
                        new_data_4 = list(filter(lambda d: d[global_group_order_kepmap[keys['key3']]] == uv3, new_data_3))
                        portion_4 = []
                        unique_values4 = sorted(list(set(obj_[global_group_order_kepmap[keys['key4']]] for obj_ in new_data_4)))
                        join_key_4 = '_'.join([uv0, uv1, uv2, uv3])
                        for uv4 in unique_values4:  # last key
                            new_data_5 = list(filter(lambda d: d[global_group_order_kepmap[keys['key4']]] == uv4, new_data_4))
                            join_key_5 = '_'.join([uv0, uv1, uv2, uv3, uv4])
                            children = generateChildren(params=params, data=new_data_5)
                            if children:
                                portion_4.append({
                                    'id': fn.getNestedElement(calculated_data, '{0}.id'.format(join_key_5)),
                                    'name': fn.getNestedElement(calculated_data, '{0}.name'.format(join_key_5)),
                                    'code': fn.getNestedElement(calculated_data, '{0}.code'.format(join_key_5)),
                                    'total': generateSummary(params=params, data=children),
                                    'children': children,
                                })
                        if portion_4:
                            portion_3.append({
                                'id': fn.getNestedElement(calculated_data, '{0}.id'.format(join_key_4)),
                                'name': fn.getNestedElement(calculated_data, '{0}.name'.format(join_key_4)),
                                'code': fn.getNestedElement(calculated_data, '{0}.code'.format(join_key_4)),
                                'total': generateSummary(params=params, data=portion_4),
                                keys['key4']: portion_4,
                            })
                    if portion_3:
                        portion_2.append({
                            'id': fn.getNestedElement(calculated_data, '{0}.id'.format(join_key_3)),
                            'name': fn.getNestedElement(calculated_data, '{0}.name'.format(join_key_3)),
                            'code': fn.getNestedElement(calculated_data, '{0}.code'.format(join_key_3)),
                            'total': generateSummary(params=params, data=portion_3),
                            keys['key3']: portion_3,
                        })
                if portion_2:
                    portion_1.append({
                        'id': fn.getNestedElement(calculated_data, '{0}.id'.format(join_key_2)),
                        'name': fn.getNestedElement(calculated_data, '{0}.name'.format(join_key_2)),
                        'code': fn.getNestedElement(calculated_data, '{0}.code'.format(join_key_2)),
                        'total': generateSummary(params=params, data=portion_2),
                        keys['key2']: portion_2,
                    })
            elif report_name == 'procurement':
                for uv2 in unique_values2:
                    new_data_3 = list(filter(lambda d: d[global_group_order_kepmap[keys['key2']]] == uv2, new_data_2))
                    join_key_3 = '_'.join([uv0, uv1, uv2])
                    children = generateChildren(params=params, data=new_data_3)
                    if children:
                        portion_2.append({
                            'id': fn.getNestedElement(calculated_data, '{0}.id'.format(join_key_3)),
                            'name': fn.getNestedElement(calculated_data, '{0}.name'.format(join_key_3)),
                            'code': fn.getNestedElement(calculated_data, '{0}.code'.format(join_key_3)),
                            'total': generateSummary(params=params, data=children),
                            'children': children,
                        })
                if portion_2:
                    portion_1.append({
                        'id': fn.getNestedElement(calculated_data, '{0}.id'.format(join_key_2)),
                        'name': fn.getNestedElement(calculated_data, '{0}.name'.format(join_key_2)),
                        'code': fn.getNestedElement(calculated_data, '{0}.code'.format(join_key_2)),
                        'total': generateSummary(params=params, data=portion_2),
                        keys['key2']: portion_2,
                    })
        if portion_1:
            portion_0.append({
                'id': fn.getNestedElement(calculated_data, '{0}.id'.format(join_key_1)),
                'name': fn.getNestedElement(calculated_data, '{0}.name'.format(join_key_1)),
                'code': fn.getNestedElement(calculated_data, '{0}.code'.format(join_key_1)),
                'total': generateSummary(params=params, data=portion_1),
                keys['key1']: portion_1,
            })
    if portion_0:
        result[report_name] = {
            'group_order': global_group_order[report_name],
            'total': generateSummary(params=params, data=portion_0),
            keys['key0']: portion_0,
        }
    Debug.end()
    Debug.show('toOutputStructure')
    return result
def calculateData(params, data):
    Debug = DebugManager.DebugManager()
    Debug.start()
    Debug.trace('start')
    custom_params = copy.deepcopy(params)
    report_name = Report.getReportName(params)
    group_order = global_group_order[report_name]
    result = {}
    unique_keys = generateUniqueKeys(params, data)
    Debug.trace('get unique_keys')
    for uk in unique_keys:
        # Logger.v('uk', uk);
        split_uk = uk.split('_')
        # Logger.v('split_uk', split_uk);
        # Logger.v('length split_uk', len(split_uk));
        custom_params['split_uk'] = split_uk
        functions = generateFilterFunction(params=custom_params)
        function_name = 'function_{0}'.format(len(split_uk))
        # Logger.v('functions', functions);
        filtered_data = list(filter(functions[function_name], data))
        # Logger.v('filtered_data', filtered_data);
        # Logger.v('sum of first allocation', sum(list(obj_['first_allocation'] for obj_ in filtered_data)));
        total_keys = global_report_key_update[report_name]
        # Debug.trace('get filtered_data');
        if filtered_data:
            item = generateItemDetail(params=custom_params, data=filtered_data)
            if uk not in result:
                result[uk] = {}
            result[uk] = {
                'id': item['id'],
                'name': item['name'],
                'code': item['code'],
                'total': {},
            }
            for tk in total_keys:
                try:
                    result[uk]['total'][tk] = sum(list(obj_[tk] for obj_ in filtered_data))
                except Exception:  # typically a KeyError when tk is missing from the raw rows
                    # Logger.v('Report.calculateData: {0} not found, sum up after data cleaning process.'.format(tk));
                    if tk == 'total_allocation':
                        result[uk]['total'][tk] = result[uk]['total']['first_allocation'] \
                            + result[uk]['total']['additional_allocation']
                    elif tk == 'balance_amount':
                        result[uk]['total'][tk] = result[uk]['total']['first_allocation'] \
                            + result[uk]['total']['additional_allocation'] \
                            - result[uk]['total']['pending_amount'] \
                            - result[uk]['total']['liablity_amount'] \
                            - result[uk]['total']['utilized_amount']
                # Logger.v('tk', tk);
        # Debug.trace('calculating data');
    # Logger.v('result', result);
    Debug.end()
    Debug.show('Report.calculateData')
    return result
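# Reference for the fallback totals computed in Report.calculateData() when a column is
# missing from the raw rows (see the except branch above); plain arithmetic only:
#   total_allocation = first_allocation + additional_allocation
#   balance_amount   = first_allocation + additional_allocation
#                      - pending_amount - liablity_amount - utilized_amount
# ('liablity_amount' keeps the spelling used by the underlying data keys.)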
def generateCrawlParam(params):
    Debug = DebugManager.DebugManager()
    Debug.start()
    global pass_month_quantity
    dbManager = SharedMemoryManager.getInstance()
    db = dbManager.query()
    crawl_params = {}
    limit_for_test = 10
    report_keys = fn.getNestedElement(params, 'keys.report', ['budget', 'procurement'])
    interval = fn.getNestedElement(params, 'interval', 1)
    filter_facility_code = fn.getNestedElement(params, 'filter.facility_code', True)
    check_empty = fn.getNestedElement(params, 'schedule_params.check_empty', False)
    today = fn.getNestedElement(params, 'schedule_params.today',
                                DateTime.toString(DateTime.now(tzinfo=msia_tz)))
    # Logger.v('filter_facility_code', filter_facility_code);
    if check_empty:
        # past_dates = DateTime.getPastDate(count=pass_month_quantity, duration=interval);
        past_dates = DateTime.getPastDate(count=pass_month_quantity, duration=interval,
                                          end=DateTime.convertDateTimeFromString(today))
        # Logger.v('past_dates', past_dates);
        # exit();
    else:
        past_dates = DateTime.getPastDate(count=pass_month_quantity, duration=interval)
        # Logger.v('past_dates', past_dates);
    state_codes = retrieveOption(collection_name='state', show_keys=['state_code'], hide_keys=['_id'])
    state_code = extractListByKey(data=state_codes, key='state_code')
    facility_codes = retrieveOption(collection_name='facility', show_keys=['facility_code'], hide_keys=['_id'])
    facility_code = extractListByKey(data=facility_codes, key='facility_code')
    for key in report_keys:
        # Logger.v('collection', key, past_dates[0]);
        Debug.trace()
        if key not in crawl_params:
            crawl_params[key] = []
        mongo_data = list(db[key].find({}, {}))
        if len(mongo_data) == 0:
            dates = past_dates[0][:]
        else:
            dates = past_dates[0][:1]
        year = extractYear(data=dates)
        # Logger.v('year', year);
        # Logger.v('filter_facility_code', filter_facility_code);
        if key == 'budget':
            if not filter_facility_code:
                iteration = 0
                total = len(year) * len(state_code)
                # fn.printProgressBar(iteration=iteration, total=total);
                for y in year:
                    for sc in state_code:
                        obj_ = {
                            'financial_year': y,
                            'state_code': sc,
                            'page_type': key,
                            'upid': '_'.join([sc, y]),
                            'url': api_links[key].format(sc, y, ''),
                            'start_date': today,
                            'end_date': today,
                        }
                        if obj_ not in crawl_params[key]:
                            crawl_params[key].append(obj_)
                        # Logger.v('len(crawl_param])', len(crawl_params[key]));
                        iteration += 1
                        # fn.printProgressBar(iteration=iteration, total=total);
            else:
                iteration = 0
                total = len(year) * len(state_code) * len(facility_code[:limit_for_test])
                # fn.printProgressBar(iteration=iteration, total=total);
                for y in year:
                    for sc in state_code:
                        for fc in facility_code[:limit_for_test]:
                            obj_ = {
                                'financial_year': y,
                                'state_code': sc,
                                'page_type': key,
                                'upid': '_'.join([sc, y, fc]),
                                'facility_code': fc,
                                'url': api_links[key].format(sc, y, fc),
                                'start_date': today,
                                'end_date': today,
                            }
                            if obj_ not in crawl_params[key]:
                                crawl_params[key].append(obj_)
                            # Logger.v('len(crawl_param])', len(crawl_params[key]));
                            iteration += 1
                            # fn.printProgressBar(iteration=iteration, total=total);
        elif key == 'procurement':
            if not filter_facility_code:
                for past_duration in dates:
                    start_date = DateTime.toString(DateTime.getDaysAgo(days_to_crawl=-1, datefrom=past_duration[0]))
                    end_date = DateTime.toString(DateTime.getDaysAgo(days_to_crawl=1, datefrom=past_duration[1]))
                    for sc in state_code:
                        obj_ = {
                            'state_code': sc,
                            'start_date': start_date,
                            'end_date': end_date,
                            'page_type': key,
                            'upid': '_'.join([sc, start_date, end_date]),
                            'url': api_links[key].format(sc, start_date.replace('-', ''), end_date.replace('-', ''), ''),
                        }
                        if obj_ not in crawl_params[key]:
                            crawl_params[key].append(obj_)
                        # Logger.v('len(crawl_param])', len(crawl_params[key]));
            else:
                for past_duration in dates:
                    start_date = DateTime.toString(DateTime.getDaysAgo(days_to_crawl=-1, datefrom=past_duration[0]))
                    end_date = DateTime.toString(DateTime.getDaysAgo(days_to_crawl=1, datefrom=past_duration[1]))
                    for sc in state_code:
                        for fc in facility_code[:limit_for_test]:
                            obj_ = {
                                'state_code': sc,
                                'start_date': start_date,
                                'end_date': end_date,
                                'page_type': key,
                                'facility_code': fc,
                                'upid': '_'.join([sc, start_date, end_date, fc]),
                                'url': api_links[key].format(sc, start_date.replace('-', ''), end_date.replace('-', ''), fc)
                            }
                            if obj_ not in crawl_params[key]:
                                crawl_params[key].append(obj_)
                            # Logger.v('len(crawl_param])', len(crawl_params[key]));
    for c in crawl_params:
        # Logger.v('crawl_params', c, len(crawl_params[c]));
        fn.writeExcelFile(filename='{0}/{1}'.format(test_folder, c), data=crawl_params[c])
    Logger.v('crawl_params', len(crawl_params))
    Debug.show('Generate Crawl Params')
    return crawl_params
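# Shape note (inferred from the usage in generateCrawlParam(), not from DateTime's own API):
# DateTime.getPastDate() appears to return a structure whose first element is a list of
# [start, end] date pairs; every pair is crawled when the target Mongo collection is empty,
# otherwise only the most recent pair is re-crawled.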
def sourceKlsescreener(params):
    Debug = DebugManager.DebugManager()
    Debug.start()
    Debug.trace('start')
    stock_list = pd.read_csv('reference/stock_info/stock_list.csv')
    stock_codes = stock_list['code'].unique().tolist()
    Debug.trace('read stock list')
    summary = []
    error = []
    for code in stock_codes:
        result = {}
        url = 'https://www.klsescreener.com/v2/stocks/view/{0}'.format(code)
        Logger.v('crawling:', url)
        page = ''
        error_count = 0
        while page == '':
            try:
                page = requests.get(url)
                error_count = 0
                break
            except:
                if error_count > 3:
                    page = 'None'
                    error_count = 0
                    break
                print("Connection refused by the server..")
                print("Let me sleep for 5 seconds")
                print("ZZzzzz...")
                time.sleep(5)
                print("Was a nice sleep, now let me continue...")
                error_count += 1
                continue
        if page != 'None':  # page is a requests.Response here; 'None' marks repeated failures
            time.sleep(3)
            soup = BeautifulSoup(page.text, 'html.parser')
            # print(soup.prettify());
            # print(soup.find_all('div', id='quarter_reports'));
            # Logger.v('Quarter Report');
            quarter_reports_div = soup.find_all('div', id='quarter_reports')
            result['quarter_reports'] = getQuarterData(data=quarter_reports_div)
            # Logger.v('Annual Report'); # th in tbody
            annual_report_div = soup.find_all('div', id='annual')
            result['annual'] = getAnnualData(data=annual_report_div)
            # Logger.v('Dividend');
            dividends_div = soup.find_all('div', id='dividends')
            result['dividends'] = getQuarterData(data=dividends_div)
            # Logger.v('Capital Changes');
            capital_changes_div = soup.find_all('div', id='capital_changes')
            result['capital_changes'] = getQuarterData(data=capital_changes_div)
            # Logger.v('Warrants'); # every row got new tbody
            warrants_div = soup.find_all('div', id='warrants')
            result['warrants'] = getQuarterData(data=warrants_div)
            # Logger.v('Shareholding Changes'); # every row got new tbody
            shareholding_changes_div = soup.find_all('div', id='shareholding_changes')
            result['shareholding_changes'] = getQuarterData(data=shareholding_changes_div)
            Debug.trace('crawl report')
            save_dir = 'crawled_data/reports'
            fn.ensureDirectory(save_dir)
            save_file = '{0}/{1}.json'.format(save_dir, code)
            fn.writeJSONFile(save_file, result)
            summary.append({
                'code': code,
                'quarter_reports': len(result['quarter_reports']),
                'annual': len(result['annual']),
                'dividends': len(result['dividends']),
                'capital_changes': len(result['capital_changes']),
                'warrants': len(result['warrants']),
                'shareholding_changes': len(result['shareholding_changes']),
            })
            Debug.trace('save report')
        elif page == 'None':
            error.append({'code': code})
    # Logger.v('summary', summary);
    summary_file = '{0}/summary/klsescreener_report'.format(save_dir)
    fn.writeExcelFile(filename=summary_file, data=summary)
    Debug.trace('save summary')
    error_file = '{0}/error/klsescreener_report'.format(save_dir)
    fn.writeExcelFile(filename=error_file, data=error)
    Debug.trace('save error')
    Debug.end()
    Debug.show('sourceKlsescreener')
    return True
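# The inline retry loop in sourceKlsescreener() could be factored into a small helper; the
# sketch below is illustrative only (fetchWithRetry is not defined or used anywhere else in
# this codebase) and mirrors the same retry-then-give-up behaviour, returning None instead
# of the 'None' sentinel string.
def fetchWithRetry(url, max_retries=3, wait_seconds=5):
    import time
    import requests
    for attempt in range(max_retries + 1):
        try:
            return requests.get(url)
        except requests.exceptions.RequestException:
            if attempt == max_retries:
                return None  # give up after repeated connection failures
            time.sleep(wait_seconds)  # back off before the next attempt
    return None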
def sourceBursaMalaysia(params):
    Debug = DebugManager.DebugManager()
    Debug.start()
    Debug.trace('start')
    url = 'https://www.bursamalaysia.com/market_information/equities_prices'
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    total_page = int(soup.find('li', id='total_page').get('data-val'))
    limit_page = None
    Logger.v('total pages', total_page)
    # print(soup.find_all('h2'));
    date = ''
    time = ''
    for div in soup.find_all('div'):
        if div.get('class') == ['col', 'bg-white', 'mr-1', 'mb-0', 'h5', 'bold', 'p-2'] and date == '':
            # print(div.get_text().strip());
            date = div.get_text().strip()
        if div.get('class') == ['col', 'bg-white', 'h5', 'mb-0', 'bold', 'p-2'] and time == '':
            # print(div.get_text().strip());
            time = div.get_text().strip()
    Logger.v('date', date, 'time', time)
    result = []
    Debug.trace('get detail')
    for page in range(1, total_page + 1)[:limit_page]:
        fn.printProgressBar(iteration=page, total=total_page + 1, prefix='Page:{0}'.format(page))
        url_with_page = 'https://www.bursamalaysia.com/market_information/equities_prices?page={0}'.format(page)
        page = requests.get(url_with_page)
        soup = BeautifulSoup(page.text, 'html.parser')
        tables = soup.find_all('table')
        for table in tables:
            if table.get('class') == ['table', 'datatable-striped', 'text-center', 'equity_prices_table',
                                      'datatable-with-sneak-peek', 'js-anchor-price-table']:
                # only take the first table (price)
                headers = getTableHeader(data=table)
                headers += ['date', 'time']
                content = getTableContent(data=table)
                for row in content:
                    row += [date, time]
                    result.append(dict(zip(headers, row)))
                # Logger.v('headers', headers)
                # Logger.v('result', len(result), result);
    Debug.trace('crawl price')
    save_dir = 'crawled_data/price'
    filename = '{0}_{1}_price.csv'.format(date, time)
    fn.ensureDirectory(save_dir)
    save_file = '{0}/{1}'.format(save_dir, filename)
    df = pd.DataFrame(result)
    df.to_csv(save_file, index=False)
    Logger.v('saved {0}'.format(save_file))
    Debug.trace('save price')
    saveStockList(data=result)
    Debug.trace('save stock list')
    Debug.end()
    Debug.show('sourceBursaMalaysia')
    return True
def calculateData(params, data):
    Debug = DebugManager.DebugManager()
    Debug.start()
    Debug.trace('start')
    global naming_keymap, crawl_folder
    item_key_to_show = fn.getNestedElement(params, 'item_key_to_show')
    process_order = fn.getNestedElement(params, 'process_order')
    custom_params = copy.deepcopy(params)
    result = {}
    main_po = {}
    df = pd.DataFrame(data[:]).astype(str)
    df = preprocessDataframe(params=custom_params, data=df)
    summation_df = groupDataframe(params=custom_params, data=df)
    # to check summation result
    # output_file = '{0}/output/find_result.xlsx'.format(crawl_folder);
    # df.to_excel(output_file);
    Debug.trace('dataframe process')
    for idx in range(0, len(process_order)):
        main_po[idx] = {
            'po': process_order[idx],
            'group_po': [],
        }
        for idx1 in range(0, idx + 1):
            main_po[idx]['group_po'].append(process_order[idx1])
    for idx in range(0, len(process_order)):
        po = main_po[idx]['po']
        group_po = main_po[idx]['group_po']
        custom_params['po'] = {
            'po': po,
            'naming_keymap': naming_keymap,
        }
        grouped_df = df.groupby(group_po).groups
        Logger.v('len', idx, 'th', len(grouped_df.keys()))
        if idx == 0:
            for gk in grouped_df.keys():
                code = gk.split('|')[-1]
                name = df[df[po] == gk][naming_keymap[po]].unique().tolist()[0]
                result[gk] = {
                    'id': fn.convertToSnakecase(gk),
                    'name': name,
                    'code': code,
                }
        else:
            for gk in grouped_df.keys():
                level = gk[-2]
                code = gk[-1]
                if idx == 1:
                    temp_result = result
                elif idx == 2:
                    # Logger.v('gk0', gk[0], main_po[idx-1]['po'], 'level', level, 'code', code);
                    temp_result = fn.getNestedElement(result, '{0}.{1}'.format(gk[0], main_po[idx - 1]['po']))
                elif idx == 3:
                    # Logger.v('gk0', gk[0], main_po[idx-2]['po'], gk[1], main_po[idx-1]['po'], 'level', level, 'code', code);
                    temp_result = fn.getNestedElement(result, '{0}.{1}.{2}.{3}'.format(
                        gk[0], main_po[idx - 2]['po'], gk[1], main_po[idx - 1]['po']))
                if po not in temp_result[level]:
                    temp_result[level][po] = {}
                if code not in temp_result[level][po]:
                    temp_result[level][po][code] = {}
                custom_params['po'].update({
                    'gk': code,
                })
                # when this is the last element in process_order
                if process_order[-1] == po:
                    last_child_data = insertNthChild(params=custom_params, data=summation_df, is_last=True)
                    info = last_child_data['info']
                    temp_result[level][po][code] = last_child_data['obj_']
                    # add extra info by group_by
                    for ik in item_key_to_show:
                        temp_result[level][po][code].update({
                            ik: info[ik].values[0],
                        })
                else:
                    last_child_data = insertNthChild(params=custom_params, data=summation_df)
                    temp_result[level][po][code] = last_child_data['obj_']
        Debug.trace('{0}th'.format(idx))
    Debug.end()
    Debug.show('Model.Stock.calculateData')
    return result