Example 1
def main():
    """Main Module of this program"""
    args = args_fetch()
    logger = logger_fetch(args.get('log_level'))
    if args['test']:
        logger.info("Testing phase")
        location_code = args.get('locationCode', None)
        sample_name = args.get("sampleName", None)
        # This is how you would save the report:
        #rb_location = RBLocation(logger, village_code)
        #rb_location.save_report(logger, report_type, village_df)
        rb_crawler = RBCrawler(logger)
        if location_code is not None:
            village_df = rb_crawler.get_crawl_df(logger, block_code=location_code)
        elif sample_name is not None:
            village_df = rb_crawler.get_crawl_df(logger, tag_name=sample_name)
        else:
            logger.info("Either of location Code or Sample Name input is required")
            logger.info("Exiting!!!")
            exit(0)
        # village_df = pd.read_csv("~/thrash/village_df.csv")
        my_crawler = Crawler()
        #filename = "rayatu_barosa_district_block.csv"
        my_crawler.download_payment_report(logger, village_df,
                                           sample_name=sample_name)
        #my_crawler.read_census_parquet(logger)
        #my_crawler.teardown_method()

    logger.info("...END PROCESSING")
Example 2
def main():
    """Main Module of this program"""
    args = args_fetch()
    logger = logger_fetch(args.get('log_level'))
    if args['test']:
        logger.info("Testing phase")
    if args['download']:
        logger.info("Dowloading Data")
        sample_name = "appi_ap_nrega_itda"
        force_download = True
        location_class = "APBlock"
        itda_blocks = [
            '0203006', '0203005', '0203012', '0203004', '0203011', '0203013',
            '0203003', '0203014', '0203001', '0203010', '0203002'
        ]
        report_types = ["ap_suspended_payments_r14_5"]
        for report_type in report_types:
            for location_code in itda_blocks:
                my_location = getattr(models, location_class)(
                    logger=logger,
                    location_code=location_code,
                    force_download=force_download,
                    sample_name=sample_name)
                method_to_call = getattr(my_location, report_type)
                method_to_call(logger)

    logger.info("...END PROCESSING")
Example 3
def runTestSuite():
  args = argsFetch()
  logger = logger_fetch(args.get('log_level'))
  logger.info('args: %s', str(args))

  logger.info("BEGIN PROCESSING...")

  display = displayInitialize(args['visible'])
  driver = driverInitialize(args['browser'])
  # Mynk: to use a personal profile, driver = driverInitialize(browser=args['browser'], path='/home/mayank/.mozilla/firefox/4s3bttuq.default/')

  if args['cookie_dump']:
    cookieDump(driver)

  logger.info("Fetching [%s]" % driver.current_url)
  logger.info(wdTest(driver, args['url']))
  logger.info("Fetched [%s]" % driver.current_url)

  if args['cookie_dump']:
    cookieDump(driver)

  driverFinalize(driver)
  displayFinalize(display)

  '''
  display = vDisplayInitialize(visible)
  driver = driverInitialize(browser)

  logger.info(wdTest(driver))

  driverFinalize(driver)
  vDisplayFinalize(display)
  '''

  logger.info("...END PROCESSING")     
Example 4
def setUp(self):
    self.logger = logger_fetch('info')
    self.logger.info('BEGIN PROCESSING...')
    self.display = displayInitialize(isDisabled=True, isVisible=True)
    self.driver = driverInitialize(timeout=3)
    self.baseURL = "https://fadeb121.ngrok.io"
    self.listURL = f"{self.baseURL}/apartments"
Example 5
def main():
    """Main Module of this program"""
    args = args_fetch()
    logger = logger_fetch(args.get('log_level'))
    if args['test']:
        logger.info("Testing phase")
    if args['execute']:
        logger.info("Executing state status")
        df = pd.read_csv(
            "https://libtech-india-data.s3.ap-south-1.amazonaws.com/temp_archives/stateStatus.csv",
            dtype={'code': object})
        for index, row in df.iterrows():
            state_code = row.get("code")
            success = row.get("success")
            fail = row.get("fail")
            total = row.get("total")
            logger.info(state_code)
            status = nic_server_status(logger, state_code)
            if status:
                success += 1
            else:
                fail += 1
            total += 1
            df.loc[index, "success"] = success
            df.loc[index, "fail"] = fail
            df.loc[index, "total"] = total
        df.to_csv("stateStatus.csv", index=False)
        filename = f"temp_archives/stateStatus.csv"
        file_url = upload_s3(logger, filename, df)
        logger.info(file_url)

    logger.info("...END PROCESSING")
Example 6
def main():
    """Main Module of this program"""
    args = args_fetch()
    logger = logger_fetch(args.get('log_level'))
    if args['test']:
        input_dir = args['inputDir']
        if input_dir is None:
            input_dir = "data/profiles/"
        output_dir = args['outputDir']
        if output_dir is None:
            output_dir = "data/json/"
        file_list = [
            f for f in listdir(input_dir) if isfile(join(input_dir, f))
        ]
        logger.info(file_list)
        posts_dict = {}
        for each_file in file_list:
            filename = f"{input_dir}{each_file}"
            with open(filename, "rb") as f:
                myhtml = f.read()
            mysoup = BeautifulSoup(myhtml, "lxml")
            my_dict = extract_posts(logger, mysoup)
            posts_dict[each_file] = my_dict
        with open(f'{output_dir}profile_posts_data.json', 'w',
                  encoding='utf8') as json_file:
            json.dump(posts_dict, json_file, indent=4, ensure_ascii=False)
        exit(0)
        # Unreachable scratch code below, kept from the original.
        logger.info("Testing phase")
        with open("himanta_cleaned.html", "rb") as f:
            myhtml = f.read()
        mysoup = BeautifulSoup(myhtml, "lxml")
        extract_posts(logger, mysoup)
        exit(0)

        with open("himanta.html", "rb") as f:
            myhtml = f.read()
        mysoup = BeautifulSoup(myhtml, "lxml")
        with open("himanta_cleaned.html", "w") as f:
            f.write(mysoup.prettify())
    if args['crawl']:
        logger.info("Crawling facebook")
        base_url = 'https://mobile.facebook.com'
        session = requests.session()

        # Extract the login credentials and all of the profile URLs to scrape
        credentials = json_to_obj('credentials.json')
        profiles_urls = json_to_obj('profiles_urls.json')

        make_login(session, base_url, credentials)
        posts_data = None
        for profile_url in profiles_urls:
            posts_data = crawl_profile(logger, session, base_url, profile_url,
                                       25)
            logger.info('[!] Scraping finished. Total: {}'.format(
                len(posts_data)))
            logger.info('[!] Saving.')
            save_data(posts_data)

    logger.info("...END PROCESSING")
Example 7
def main():
    """Main Module of this program"""
    args = args_fetch()
    logger = logger_fetch(args.get('log_level'))
    if args['test']:
        logger.info("Testing phase")
    if args['execute']:
        logger.info("Executing task")
        process_name = args.get("processName", "default")
        task_id = args.get("taskID", None)
        execute_task(logger, process_name=process_name, task_id=task_id)
    logger.info("...END PROCESSING")
Example 8
def main():
    """Main Module of this program"""
    args = args_fetch()
    logger = logger_fetch(args.get('log_level'))
    if args['test']:
        logger.info("Testing")
        #village_df_url = "https://libtech-india-data.s3.ap-south-1.amazonaws.com/data/locations/rayatu_barosa/vskp_villages.csv"
        #village_df = pd.read_csv(village_df_url, index_col=0)
        filename = 'data/locations/rayatu_barosa/vskp_villages.csv'
        village_df = pd.read_csv("~/thrash/v.csv")
        upload_s3(logger, filename, village_df)
    if args['crawl']:
        location_code = args['locationCode']
        rb_crawler = RBCrawler(logger)
        dataframe = rb_crawler.get_crawl_df(logger, block_code=location_code)
        logger.info(dataframe.head())
    logger.info("...END PROCESSING")
Example 9
    def __init__(self, logger=None, directory=None):
        if logger:
            self.logger = logger
        else:
            logger = self.logger = logger_fetch('info')
        logger.info(f'Constructor({type(self).__name__})')

        self.dir = '../Data/NIC'
        if directory:
            self.dir = directory
        if not os.path.exists(self.dir):
            os.makedirs(self.dir)

        self.base_url = 'http://nregasp2.nic.in/netnrega/Citizen_html/Musternew.aspx'
        self.session = requests.Session()
        # Prime the session so the server sets its cookies.
        self.session.get(self.base_url)
        self.cookies = self.session.cookies
        self.view_state = ''
        self.event_validation = ''
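
The constructor above initializes view_state and event_validation to empty strings; Musternew.aspx is an ASP.NET page, so those hidden fields normally have to be scraped from the landing page and echoed back on each postback. A sketch of how they might be filled in, using the standard ASP.NET field ids (this is an assumption about code not shown here):

from bs4 import BeautifulSoup

def fetch_aspnet_state(session, url):
    # ASP.NET round-trips these hidden inputs with every postback.
    response = session.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    view_state = soup.find('input', {'id': '__VIEWSTATE'})['value']
    event_validation = soup.find('input', {'id': '__EVENTVALIDATION'})['value']
    return view_state, event_validation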
Example 10
def main():
    """Main Module of this program"""
    args = args_fetch()
    logger = logger_fetch(args.get('log_level'))
    if args['test']:
        logger.info("Testing phase")
        test_input = args['testInput1']
        parse_page(logger, test_input)
    if args['download']:
        logger.info("Downloading NE Magazine")
        category = args['category']
        if category is not None:
            download_all_posts(logger, category)

    if args['crawl']:
        logger.info("Crawling NE Magazine")
        category = args['category']
        if category is not None:
            scrape_ne(logger, category)
    logger.info("...END PROCESSING")
Example 11
def main():
    """Main Module of this program"""
    args = args_fetch()
    logging_dir = args.get("loggingDir", None)
    record_type = args.get("recordType", None)
    if record_type is None:
        record_type = "worker_register"
    home_dir = str(Path.home())
    if logging_dir is None:
        logging_dir = f"{home_dir}/logs"
    if not os.path.exists(logging_dir):
        os.makedirs(logging_dir)

    log_file_path = f"{logging_dir}/{record_type}.log"
    logger = file_logger_fetch(args.get('log_level'), filepath=log_file_path)
    # logger = logger_fetch(args.get('log_level'))  # leftover override that would discard the file logger
    if args['test']:
        logger.debug("Testing phase")
        for _ in range(40000):
            logger.debug("Hello, world!")
    if args['execute']:
        logger.debug("Executing ")
        while True:
            obj = djmodels.Record.objects.filter(
                Q(
                    Q(is_downloaded=False, record_type=record_type)
                    | Q(is_recurring_download=True,
                        record_type=record_type,
                        download_date__lte=datetime.now() - timedelta(
                            days=threshold_days)))).order_by("updated").first()
            if obj is None:
                time.sleep(5)
                continue
            logger.info(obj.id)
            obj.in_progress = True
            obj.save()
            time.sleep(2)
            obj.in_progress = False
            obj.save()

    logger.info("...END PROCESSING")
Example 12
def main():
    """Main Module of this program"""
    args = args_fetch()
    logger = logger_fetch(args.get('log_level'))
    if args["createBundle"]:
        location_sample = args.get("location_sample", None)
        location_type = args.get("locationType", None)
        location_code = args.get("locationCode", None)
        report_types = args.get("reportTypes", None)
        report_format = args.get("reportFormat", 'both')

        filename = args.get("zipfilename", None)
        bundle_title = args.get("bundleTitle", None)
        my_sample = LibtechSample(logger,
                                  sample_type=location_type,
                                  tag_name=location_sample,
                                  parent_location_code=location_code)
        url = my_sample.create_bundle(logger,
                                      report_types,
                                      filename=filename,
                                      title=bundle_title,
                                      report_format=report_format)
        logger.info(url)

    if args['populate']:
        func_name = args.get('func_name', None)
        is_not_nic = args.get('notnic', False)
        is_nic = not is_not_nic
        location_sample = args.get("location_sample", None)
        location_type = args.get("locationType", None)
        location_code = args.get("locationCode", None)
        priority = args.get("priority", 100)
        logger.info(location_code)
        my_sample = LibtechSample(logger,
                                  sample_type=location_type,
                                  tag_name=location_sample,
                                  parent_location_code=location_code,
                                  is_nic=is_nic)
        logger.info(my_sample.sample_location_codes)
        my_sample.populate_queue(logger, func_name, priority=priority)
    if args['insert']:
        logger.info("Inserting in Crawl Queue")
        location_code = args.get('locationCode', None)
        is_not_nic = args.get('notnic', None)
        func_name = args.get('func_name', None)
        location_type = args.get('locationType', 'panchayat')
        location_codes = []
        if location_type == 'block':
            location_codes.append(location_code)
        elif location_type == 'panchayat':
            if len(location_code) == 7:
                block_location = getattr(models, BLOCK_CLASS)(
                    logger=logger, location_code=location_code)
                location_codes = block_location.get_all_panchayats(logger)
            else:
                location_codes = [location_code]
        else:
            location_codes.append(location_code)

        data = {
            'report_type': func_name,
        }
        for location_code in location_codes:
            data['location_code'] = location_code
            data['location_class'] = LOCATION_CLASS
            create_task(logger, data)
    if args['verify']:
        state_codes = api_get_child_locations(logger, 0)
        csv_array = []
        columns = ["state", "code", "server status"]
        for state_code in state_codes:
            logger.info(state_code)
            ldict = get_location_dict(logger, location_code=state_code)
            state_name = ldict.get("name", "")
            status = nic_server_status(logger, state_code)
            a = [state_name, state_code, status]
            csv_array.append(a)
        logger.info(csv_array)
        df = pd.DataFrame(csv_array, columns=columns)
        df.to_csv('/tmp/stateStatus.csv')
    if args['test']:
        location_sample = args.get("location_sample", None)
        location_type = args.get("locationType", None)
        location_code = args.get("locationCode", None)
        logger.info(location_code)
        my_sample = LibtechSample(logger,
                                  sample_type=location_type,
                                  tag_name=location_sample,
                                  parent_location_code=location_code)
        logger.info(my_sample.sample_location_codes)
        exit(0)
        # Unreachable scratch code below, kept from the original.
        zipfilename = args.get("zipfilename", "zzz")
        tempDir = args.get("tempDir", "/tmp")
        if location_sample is not None:
            my_sample = getattr(samplemodels, location_sample)(logger)
        else:
            my_sample = LibtechSample(logger,
                                      sample_type='panchayat',
                                      parent_location_code="3406007")
        report_types = [
            "worker_register", "nic_stats", "work_payment",
            "jobcard_transactions", "block_rejected_transactions"
        ]
        report_types = ["nic_stats"]
        report_types = ["block_rejected_transactions", "nic_r4_1"]
        report_types = ["worker_register"]
        report_types = ["block_rejected_transactions"]
        logger.info(tempDir)
        logger.info(zipfilename)
        zip_file_name = tempDir + "/" + zipfilename
        download_dir = tempDir + "/" + str(datetime.datetime.now().timestamp())
        file_url = my_sample.create_bundle(logger,
                                           report_types,
                                           download_dir=download_dir,
                                           zip_file_name=zip_file_name)
        logger.info(f"file_url is {file_url}")
        # data = {}
        # data['location_code'] = district_code
        # data['location_class'] = "NREGADistrict"
        # data['report_type'] = "nic_stat_urls"
        # create_task(logger, data)
        exit(0)
        url = "http://mnregaweb2.nic.in/Netnrega/placeHolder1/placeHolder2/../../citizen_html/musternew.aspx?lflag=&id=1&state_name=CHHATTISGARH&district_name=JASHPUR&block_name=Manora&panchayat_name=Alori&block_code=3307016&msrno=5603&finyear=2016-2017&workcode=3307016001%2fWC%2f81094155&dtfrm=27%2f02%2f2017&dtto=05%2f03%2f2017&wn=Laghu+Sichai+Talab+Nirman+Pushani+%2fRengashu+(1.60+Lakhs)&Digest=nTMkfSq3BkT80yXpUwcuFw"
        extract_dict = {}
        extract_dict['pattern'] = "CH-"
        extract_dict['table_id_array'] = [
            "ctl00_ContentPlaceHolder1_grdShowRecords",
            "ContentPlaceHolder1_grdShowRecords"
        ]
        extract_dict['split_cell_array'] = [1]
        cookies = None

        dataframe = get_dataframe_from_url(logger,
                                           url,
                                           mydict=extract_dict,
                                           cookies=cookies)
        logger.info(dataframe.head())
    if args['debug']:
        logger.info("Debug phase")
        force_download = bool(args['forceDownload'])
        location_code = args.get('locationCode', None)
        func_name = args.get('func_name', None)
        location_type = args.get('locationType', 'panchayat')
        is_nic = not args['notnic']
        report_name = func_name
        sample_name = args.get('sample_name', "on_demand")
        if sample_name is None:
            sample_name = "on_demand"
        result = download_report(logger,
                                 location_code,
                                 location_type,
                                 report_name,
                                 is_nic=is_nic,
                                 force_download=force_download)
        exit(0)
        logger.info(f"in debug sample name is {sample_name}")
        location_codes = []
        location_class = get_location_class(logger, location_type,
                                            args['notnic'])
        if location_type == 'block':
            location_codes.append(location_code)
        elif location_type == 'panchayat':
            if len(location_code) == 7:
                block_location = getattr(models, BLOCK_CLASS)(
                    logger=logger, location_code=location_code)
                location_codes = block_location.get_all_panchayats(logger)
            else:
                location_codes = [location_code]
        else:
            location_codes.append(location_code)

        for location_code in location_codes:
            my_location = getattr(models, location_class)(
                logger=logger,
                location_code=location_code,
                force_download=force_download,
                sample_name=sample_name)
            # my_location.muster_list(logger)
            method_to_call = getattr(my_location, func_name)
            method_to_call(logger)

    logger.info("...END PROCESSING")
Example 13
def setUp(self):
    self.logger = logger_fetch('info')
    self.logger.info('BEGIN PROCESSING...')