def main():
    """Entry point: build a Rythu Bharosa village dataframe and download
    payment reports for it.

    Requires either ``locationCode`` (a block code) or ``sampleName``
    (a tag) on the command line to select the villages.
    """
    args = args_fetch()
    logger = logger_fetch(args.get('log_level'))
    if args['test']:
        logger.info("Testing phase")
        location_code = args.get('locationCode', None)
        sample_name = args.get("sampleName", None)
        rb_crawler = RBCrawler(logger)
        # Build the village dataframe from whichever selector was supplied.
        if location_code is not None:
            village_df = rb_crawler.get_crawl_df(logger,
                                                 block_code=location_code)
        elif sample_name is not None:
            village_df = rb_crawler.get_crawl_df(logger,
                                                 tag_name=sample_name)
        else:
            logger.info("Either of location Code or Sample Name input is required")
            logger.info("Exiting!!!")
            # FIX: missing required input is a usage error — exit non-zero
            # (previously exit(0), which signalled success to the shell).
            exit(1)
        my_crawler = Crawler()
        my_crawler.download_payment_report(logger, village_df,
                                           sample_name=sample_name)
        logger.info("...END PROCESSING")
def main():
    """Entry point: download the AP suspended-payments report for every
    hard-coded ITDA block.

    ``test`` merely logs; ``download`` iterates the ITDA block codes,
    resolves the location class and report method by name, and runs the
    download for each block.
    """
    args = args_fetch()
    logger = logger_fetch(args.get('log_level'))
    if args['test']:
        logger.info("Testing phase")
    if args['download']:
        # FIX: corrected log-message typo ("Dowloading" -> "Downloading").
        logger.info("Downloading Data")
        sample_name = "appi_ap_nrega_itda"
        force_download = True
        location_class = "APBlock"
        # Block codes of the ITDA blocks to fetch.
        itda_blocks = [
            '0203006', '0203005', '0203012', '0203004', '0203011',
            '0203013', '0203003', '0203014', '0203001', '0203010',
            '0203002'
        ]
        report_types = ["ap_suspended_payments_r14_5"]
        for report_type in report_types:
            for location_code in itda_blocks:
                # Dynamic dispatch: class looked up on `models`, report
                # method looked up on the instance.
                my_location = getattr(models, location_class)(
                    logger=logger,
                    location_code=location_code,
                    force_download=force_download,
                    sample_name=sample_name)
                method_to_call = getattr(my_location, report_type)
                method_to_call(logger)
    logger.info("...END PROCESSING")
def runTestSuite():
    """Smoke-test the webdriver stack: open args['url'] and log the result.

    Optionally dumps cookies before and after the fetch when
    ``cookie_dump`` is set.
    """
    args = argsFetch()
    logger = logger_fetch(args.get('log_level'))
    logger.info('args: %s', str(args))
    logger.info("BEGIN PROCESSING...")
    display = displayInitialize(args['visible'])
    # Mynk to use personal profile.
    # FIX: the driver was previously initialized twice and the first
    # instance leaked (never finalized); keep only the profile-backed one.
    driver = driverInitialize(
        browser=args['browser'],
        path='/home/mayank/.mozilla/firefox/4s3bttuq.default/')
    if args['cookie_dump']:
        cookieDump(driver)
    logger.info("Fetching [%s]" % driver.current_url)
    logger.info(wdTest(driver, args['url']))
    logger.info("Fetched [%s]" % driver.current_url)
    if args['cookie_dump']:
        cookieDump(driver)
    driverFinalize(driver)
    displayFinalize(display)
    '''
    display = vDisplayInitialize(visible)
    driver = driverInitialize(browser)
    logger.info(wdTest(driver))
    driverFinalize(driver)
    vDisplayFinalize(display)
    '''
    logger.info("...END PROCESSING")
def setUp(self):
    """Per-test fixtures: logger, virtual display, webdriver, target URLs."""
    log = logger_fetch('info')
    log.info('BEGIN PROCESSING...')
    self.logger = log
    self.display = displayInitialize(isDisabled=True, isVisible=True)
    self.driver = driverInitialize(timeout=3)
    base = "https://fadeb121.ngrok.io"
    self.baseURL = base
    self.listURL = f"{base}/apartments"
def main():
    """Entry point: probe the NIC server per state and maintain running
    success/fail/total counters in a CSV that is re-uploaded to S3.
    """
    args = args_fetch()
    logger = logger_fetch(args.get('log_level'))
    if args['test']:
        logger.info("Testing phase")
    if args['execute']:
        logger.info("Executing state status")
        # 'code' read as object so leading zeros in state codes survive.
        df = pd.read_csv(
            "https://libtech-india-data.s3.ap-south-1.amazonaws.com/temp_archives/stateStatus.csv",
            dtype={'code': object})
        for index, row in df.iterrows():
            state_code = row.get("code")
            success = row.get("success")
            fail = row.get("fail")
            total = row.get("total")
            logger.info(state_code)
            status = nic_server_status(logger, state_code)
            # FIX: idiomatic truthiness check instead of '== True'.
            if status:
                success = success + 1
            else:
                fail = fail + 1
            total = total + 1
            df.loc[index, "success"] = success
            df.loc[index, "fail"] = fail
            df.loc[index, "total"] = total
            # Checkpoint after every state so progress survives a crash.
            df.to_csv("stateStatus.csv", index=False)
        # FIX: plain string literal — the f-string had no placeholders.
        filename = "temp_archives/stateStatus.csv"
        file_url = upload_s3(logger, filename, df)
        logger.info(file_url)
    logger.info("...END PROCESSING")
def main():
    """Entry point: parse saved Facebook profile HTML into post JSON
    (``test``) or crawl profiles live (``crawl``).
    """
    args = args_fetch()
    logger = logger_fetch(args.get('log_level'))
    if args['test']:
        input_dir = args['inputDir']
        if input_dir is None:
            input_dir = "data/profiles/"
        output_dir = args['outputDir']
        if output_dir is None:
            output_dir = "data/json/"
        # Every regular file in input_dir is treated as a saved profile page.
        file_list = [
            f for f in listdir(input_dir) if isfile(join(input_dir, f))
        ]
        logger.info(file_list)
        posts_dict = {}
        for each_file in file_list:
            filename = f"{input_dir}{each_file}"
            with open(filename, "rb") as f:
                myhtml = f.read()
            mysoup = BeautifulSoup(myhtml, "lxml")
            my_dict = extract_posts(logger, mysoup)
            posts_dict[each_file] = my_dict
        with open(f'{output_dir}profile_posts_data.json', 'w',
                  encoding='utf8') as json_file:
            json.dump(posts_dict, json_file, indent=4, ensure_ascii=False)
        exit(0)
        # NOTE(review): everything below this exit(0) is unreachable —
        # scratch code from earlier experiments, kept as-is.
        logger.info("Testing phase")
        with open("himanta_cleaned.html", "rb") as f:
            myhtml = f.read()
        mysoup = BeautifulSoup(myhtml, "lxml")
        extract_posts(logger, mysoup)
        exit(0)
        with open("himanta.html", "rb") as f:
            myhtml = f.read()
        mysoup = BeautifulSoup(myhtml, "lxml")
        with open("himanta_cleaned.html", "w") as f:
            f.write(mysoup.prettify())
    if args['crawl']:
        logger.info("Crawling facebook")
        base_url = 'https://mobile.facebook.com'
        session = requests.session()
        # Extracts credentials for the login and all of the profiles URL to scrape
        credentials = json_to_obj('credentials.json')
        profiles_urls = json_to_obj('profiles_urls.json')
        make_login(session, base_url, credentials)
        posts_data = None
        for profile_url in profiles_urls:
            posts_data = crawl_profile(logger, session, base_url,
                                       profile_url, 25)
        # FIX: use the configured logger (these two lines previously
        # went through the bare module-level `logging` root logger).
        logger.info('[!] Scraping finished. Total: {}'.format(
            len(posts_data)))
        logger.info('[!] Saving.')
        save_data(posts_data)
    logger.info("...END PROCESSING")
def main():
    """Entry point: run a named process, optionally pinned to one task id."""
    args = args_fetch()
    logger = logger_fetch(args.get('log_level'))
    if args['test']:
        logger.info("Testing phase")
        logger.info("...END PROCESSING")
    if args['execute']:
        logger.info("Executing task")
        execute_task(logger,
                     process_name=args.get("processName", "default"),
                     task_id=args.get("taskID", None))
def main():
    """Entry point: push a locally staged villages CSV to S3 (``test``)
    or crawl one block's village dataframe (``crawl``)."""
    args = args_fetch()
    logger = logger_fetch(args.get('log_level'))
    if args['test']:
        logger.info("Testing")
        # Destination key on S3; the source is a local scratch CSV.
        filename = 'data/locations/rayatu_barosa/vskp_villages.csv'
        upload_s3(logger, filename, pd.read_csv("~/thrash/v.csv"))
    if args['crawl']:
        crawler = RBCrawler(logger)
        frame = crawler.get_crawl_df(logger,
                                     block_code=args['locationCode'])
        logger.info(frame.head())
    logger.info("...END PROCESSING")
def __init__(self, logger=None, directory=None):
    """Set up the NIC muster-roll scraper session.

    Args:
        logger: existing logger to reuse; a fresh 'info' logger is
            created when omitted.
        directory: output directory for downloaded data
            (defaults to '../Data/NIC'); created if missing.
    """
    if logger:
        self.logger = logger
    else:
        logger = self.logger = logger_fetch('info')
    logger.info(f'Constructor({type(self).__name__})')
    self.dir = '../Data/NIC'
    if directory:
        self.dir = directory
    if not os.path.exists(self.dir):
        os.makedirs(self.dir)
    self.base_url = 'http://nregasp2.nic.in/netnrega/Citizen_html/Musternew.aspx'
    self.session = requests.Session()
    # Warm-up GET so the server populates session cookies used by later
    # requests.  FIX: dropped the unused `response` local — only the
    # side effect (cookies) matters here.
    self.session.get(self.base_url)
    self.cookies = self.session.cookies
    # ASP.NET form state, filled in by later page fetches.
    self.view_state = ''
    self.event_validation = ''
def main():
    """Command-line dispatcher for the NE Magazine scraper.

    ``test`` parses one saved page, ``download`` fetches every post in a
    category, ``crawl`` scrapes a category listing.
    """
    args = args_fetch()
    logger = logger_fetch(args.get('log_level'))
    if args['test']:
        logger.info("Testing phase")
        parse_page(logger, args['testInput1'])
    if args['download']:
        logger.info("Downloading NE Magazine")
        if args['category'] is not None:
            download_all_posts(logger, args['category'])
    if args['crawl']:
        logger.info("Crawling NE Magazine")
        if args['category'] is not None:
            scrape_ne(logger, args['category'])
    logger.info("...END PROCESSING")
def main():
    """Long-running worker: poll the Record queue for the configured
    record type and mark entries in progress.

    Logs to a per-record-type file under ``loggingDir`` (default
    ``~/logs``).  ``execute`` loops forever, picking the oldest record
    that is either not yet downloaded or due for a recurring re-download.
    """
    args = args_fetch()
    logging_dir = args.get("loggingDir", None)
    record_type = args.get("recordType", None)
    if record_type is None:
        record_type = "worker_register"
    home_dir = str(Path.home())
    if logging_dir is None:
        logging_dir = f"{home_dir}/logs"
    if not os.path.exists(logging_dir):
        os.makedirs(logging_dir)
    log_file_path = f"{logging_dir}/{record_type}.log"
    # FIX: the file logger was previously clobbered by an immediate
    # second `logger = logger_fetch(...)` call, silently discarding the
    # configured log file.  The dead store is removed.
    logger = file_logger_fetch(args.get('log_level'), filepath=log_file_path)
    if args['test']:
        logger.debug("Testing phase")
        # Stress the file logger with a burst of messages.
        for _ in range(40000):
            logger.debug("Hello, world!")
    if args['execute']:
        logger.debug("Executing ")
        while True:
            # Oldest matching record: never downloaded, or recurring and
            # last downloaded more than threshold_days ago.
            # (threshold_days is assumed to be a module-level setting —
            # TODO confirm.)
            # FIX: order_by() must precede first(); the original chained
            # .first().order_by("updated"), which fails at runtime
            # because first() returns an instance (or None), not a
            # queryset.
            obj = djmodels.Record.objects.filter(
                Q(
                    Q(is_downloaded=False, record_type=record_type)
                    | Q(is_recurring_download=True,
                        record_type=record_type,
                        download_date__lte=datetime.now() -
                        timedelta(days=threshold_days)))).order_by(
                            "updated").first()
            if obj is None:
                # Nothing pending; back off before polling again.
                time.sleep(5)
                continue
            logger.info(obj.id)
            obj.in_progress = True
            obj.save()
            time.sleep(2)
            obj.in_progress = False
            obj.save()
    logger.info("...END PROCESSING")
def main():
    """Main Module of this program"""
    # CLI dispatcher: each flag below (createBundle / populate / insert /
    # verify / test / debug) selects an independent workflow.
    args = args_fetch()
    logger = logger_fetch(args.get('log_level'))
    if args["createBundle"]:
        # Bundle the requested reports for a sample/location into a zip
        # and log the resulting URL.
        location_sample = args.get("location_sample", None)
        location_type = args.get("locationType", None)
        location_code = args.get("locationCode", None)
        report_types = args.get("reportTypes", None)
        report_format = args.get("reportFormat", 'both')
        filename = args.get("zipfilename", None)
        bundle_title = args.get("bundleTitle", None)
        my_sample = LibtechSample(logger,
                                  sample_type=location_type,
                                  tag_name=location_sample,
                                  parent_location_code=location_code)
        url = my_sample.create_bundle(logger,
                                      report_types,
                                      filename=filename,
                                      title=bundle_title,
                                      report_format=report_format)
        logger.info(url)
    if args['populate']:
        # Queue the given function for every location in the sample.
        func_name = args.get('func_name', None)
        is_not_nic = args.get('notnic', False)
        is_nic = not is_not_nic
        location_sample = args.get("location_sample", None)
        location_type = args.get("locationType", None)
        location_code = args.get("locationCode", None)
        priority = args.get("priority", 100)
        logger.info(location_code)
        my_sample = LibtechSample(logger,
                                  sample_type=location_type,
                                  tag_name=location_sample,
                                  parent_location_code=location_code,
                                  is_nic=is_nic)
        logger.info(my_sample.sample_location_codes)
        my_sample.populate_queue(logger, func_name, priority=priority)
    if args['insert']:
        logger.info("Inserting in Crawl Queue")
        location_code = args.get('locationCode', None)
        is_not_nic = args.get('notnic', None)
        func_name = args.get('func_name', None)
        location_type = args.get('locationType', 'panchayat')
        location_codes = []
        # A 7-character panchayat code is treated as a block code and
        # expanded to all its panchayats; otherwise the code is used as-is.
        if location_type == 'block':
            location_codes.append(location_code)
        elif location_type == 'panchayat':
            if len(location_code) == 7:
                block_location = getattr(models, BLOCK_CLASS)(
                    logger=logger, location_code=location_code)
                location_codes = block_location.get_all_panchayats(logger)
            else:
                location_codes = [location_code]
        else:
            location_codes.append(location_code)
        data = {
            'report_type': func_name,
        }
        # NOTE(review): the same dict is mutated and re-passed on every
        # iteration — assumes create_task copies/consumes it immediately.
        for location_code in location_codes:
            data['location_code'] = location_code
            data['location_class'] = LOCATION_CLASS
            create_task(logger, data)
    if args['verify']:
        # Probe NIC server status for every state and dump a CSV summary.
        state_codes = api_get_child_locations(logger, 0)
        csv_array = []
        columns = ["state", "code", "server status"]
        for state_code in state_codes:
            logger.info(state_code)
            ldict = get_location_dict(logger, location_code=state_code)
            state_name = ldict.get("name", "")
            status = nic_server_status(logger, state_code)
            a = [state_name, state_code, status]
            csv_array.append(a)
        logger.info(csv_array)
        df = pd.DataFrame(csv_array, columns=columns)
        df.to_csv('/tmp/stateStatus.csv')
    if args['test']:
        location_sample = args.get("location_sample", None)
        location_type = args.get("locationType", None)
        location_code = args.get("locationCode", None)
        logger.info(location_code)
        my_sample = LibtechSample(logger,
                                  sample_type=location_type,
                                  tag_name=location_sample,
                                  parent_location_code=location_code)
        logger.info(my_sample.sample_location_codes)
        exit(0)
        # NOTE(review): everything below in this branch is unreachable —
        # the exit(0) above always fires.  Looks like scratch code from
        # earlier experiments; kept as-is.
        zipfilename = args.get("zipfilename", "zzz")
        tempDir = args.get("tempDir", "/tmp")
        if location_sample is not None:
            my_sample = getattr(samplemodels, location_sample)(logger)
        else:
            my_sample = LibtechSample(logger,
                                      sample_type='panchayat',
                                      parent_location_code="3406007")
        report_types = [
            "worker_register", "nic_stats", "work_payment",
            "jobcard_transactions", "block_rejected_transactions"
        ]
        # Successive reassignments: only the last list takes effect.
        report_types = ["nic_stats"]
        report_types = ["block_rejected_transactions", "nic_r4_1"]
        report_types = ["worker_register"]
        report_types = ["block_rejected_transactions"]
        logger.info(tempDir)
        logger.info(zipfilename)
        zip_file_name = tempDir + "/" + zipfilename
        # Timestamped scratch dir so concurrent runs don't collide.
        download_dir = tempDir + "/" + str(datetime.datetime.now().timestamp())
        file_url = my_sample.create_bundle(logger,
                                           report_types,
                                           download_dir=download_dir,
                                           zip_file_name=zip_file_name)
        logger.info(f"file_url is {file_url}")
        # data={}
        # data['location_code'] = district_code
        # data['location_class'] = "NREGADistrict"
        # data['report_type'] = "nic_stat_urls"
        # create_task(logger, data)
        exit(0)
        # NOTE(review): indentation reconstructed from a collapsed source;
        # this one-off muster-roll scrape appears to be further dead test
        # scratch after the exit(0) above — confirm against VCS.
        url = "http://mnregaweb2.nic.in/Netnrega/placeHolder1/placeHolder2/../../citizen_html/musternew.aspx?lflag=&id=1&state_name=CHHATTISGARH&district_name=JASHPUR&block_name=Manora&panchayat_name=Alori&block_code=3307016&msrno=5603&finyear=2016-2017&workcode=3307016001%2fWC%2f81094155&dtfrm=27%2f02%2f2017&dtto=05%2f03%2f2017&wn=Laghu+Sichai+Talab+Nirman+Pushani+%2fRengashu+(1.60+Lakhs)&Digest=nTMkfSq3BkT80yXpUwcuFw"
        extract_dict = {}
        extract_dict['pattern'] = f"CH-"
        extract_dict['table_id_array'] = [
            "ctl00_ContentPlaceHolder1_grdShowRecords",
            "ContentPlaceHolder1_grdShowRecords"
        ]
        extract_dict['split_cell_array'] = [1]
        cookies = None
        dataframe = get_dataframe_from_url(logger, url, mydict=extract_dict,
                                           cookies=cookies)
        logger.info(dataframe.head())
    if args['debug']:
        logger.info("Debug phase")
        if args['forceDownload']:
            force_download = True
        else:
            force_download = False
        location_code = args.get('locationCode', None)
        func_name = args.get('func_name', None)
        location_type = args.get('locationType', 'panchayat')
        if args['notnic']:
            is_nic = False
        else:
            is_nic = True
        report_name = func_name
        sample_name = args.get('sample_name', "on_demand")
        if sample_name is None:
            sample_name = "on_demand"
        # `result` is never used; only the download side effect matters.
        result = download_report(logger, location_code, location_type,
                                 report_name, is_nic=is_nic,
                                 force_download=force_download)
        exit(0)
        # NOTE(review): unreachable below — the exit(0) above always fires.
        logger.info(f"in debug sample name is {sample_name}")
        location_codes = []
        location_class = get_location_class(logger, location_type,
                                            args['notnic'])
        if location_type == 'block':
            location_codes.append(location_code)
        elif location_type == 'panchayat':
            if len(location_code) == 7:
                block_location = getattr(models, BLOCK_CLASS)(
                    logger=logger, location_code=location_code)
                location_codes = block_location.get_all_panchayats(logger)
            else:
                location_codes = [location_code]
        else:
            location_codes.append(location_code)
        for location_code in location_codes:
            my_location = getattr(models, location_class)(
                logger=logger,
                location_code=location_code,
                force_download=force_download,
                sample_name=sample_name)
            # my_location.muster_list(logger)
            method_to_call = getattr(my_location, func_name)
            method_to_call(logger)
    logger.info("...END PROCESSING")
def setUp(self):
    """Initialise the shared logger before each test."""
    log = logger_fetch('info')
    log.info('BEGIN PROCESSING...')
    self.logger = log