def runTestSuite():
    """Fetch one CSV export per entry in ``versions`` and file it under ``CSVs/``.

    For each version the page ``url_base + str(version)`` is loaded in the
    browser. If ``csv_file`` exists afterwards it is renamed to
    ``CSVs/<version>.csv``; otherwise the miss is logged as an error.
    ``versions``, ``url_base``, ``csv_file`` and ``timeout`` are read from
    module scope.
    """
    logger = loggerFetch("info")
    logger.info("BEGIN PROCESSING...")

    display = displayInitialize(1)
    driver = driverInitialize(path='/home/mayank/.mozilla/firefox/4s3bttuq.default/', timeout=timeout)
    try:
        driver.set_page_load_timeout(timeout)

        for version in versions:
            url = url_base + str(version)
            # Computed before the existence check so the "missed file"
            # message can always name the target of *this* iteration.
            version_file = 'CSVs/' + str(version) + '.csv'

            try:
                logger.info('Fetching URL[%s]' % url)
                driver.get(url)
                logger.info('After Fetch[%s]' % url)
            except Exception as e:
                # Best effort: a failed or timed-out page load is only logged;
                # the file check below decides whether the fetch succeeded.
                logger.info("Warning %s", e)

            # NOTE(review): the check is assumed to run after every fetch
            # attempt, not only after a failed one - confirm.
            if os.path.exists(csv_file):
                logger.info('Writing %s' % version_file)
                os.rename(csv_file, version_file)
            else:
                logger.error('Missed file[%s] from URL[%s]' % (version_file, url))

            logger.info("CSV Fetched From [%s]" % url)
    finally:
        # Release the browser and the display even when an iteration raises.
        driverFinalize(driver)
        displayFinalize(display)

    logger.info("...END PROCESSING")
def main():
    """Template entry point: open the DB and a browser, fetch a page, print it.

    Reads ``log_level``, ``year``, ``visible`` and ``browser`` from the parsed
    arguments, prints the HTML of the fetched page and exits with status 0.
    """
    args = argsFetch()
    logger = loggerFetch(args.get('log_level'))
    logger.info('args: %s', str(args))

    logger.info("BEGIN PROCESSING...")

    db = dbInitialize(db="biharPDS", charset="utf8")  # The rest is updated automatically in the function
    try:
        cur = db.cursor()
        db.autocommit(True)
        # Query to set up Database to read Hindi Characters
        query = "SET NAMES utf8"
        cur.execute(query)

        inyear = args['year']
        logger.info(inyear)

        display = displayInitialize(args['visible'])
        driver = driverInitialize(args['browser'])
        try:
            # Start Program here
            url = "http://www.google.com"
            driver.get(url)
            myhtml = driver.page_source
            # Function-call form works on both Python 2 and Python 3.
            print(myhtml)
            # End program here
        finally:
            driverFinalize(driver)
            displayFinalize(display)
    finally:
        # Runs on every exit path, including errors.
        dbFinalize(db)

    logger.info("...END PROCESSING")
    exit(0)
def runTestSuite():
    """Generate one report per mandal and save it as ``./mandals/<mandal>.csv``.

    ``mandals``, ``districts`` and ``final_report`` are read from module scope.
    """
    log = loggerFetch("info")
    log.info("BEGIN PROCESSING...")

    virtual_display = displayInitialize(0)
    browser = driverInitialize()

    for mandal in mandals:
        district = districts[mandal]
        report = generate_report(log, browser, mandal, district)
        log.info('Finally: \n%s' % report)

        filename = './mandals/' + mandal + '.csv'
        with open(filename, 'wb') as out:
            log.info("Writing to [%s]" % filename)
            encoded = report.encode('utf-8')
            out.write(encoded)

        # NOTE(review): `rows` is never read afterwards and `final_report` is
        # not defined in this function; the nesting of this check inside the
        # loop is assumed - confirm.
        if final_report[0][0] == '':
            rows = report.split('|')

    driverFinalize(browser)
    displayFinalize(virtual_display)

    log.info("...END PROCESSING")


def main():
    """Run the suite and exit with status 0."""
    runTestSuite()
    exit(0)


if __name__ == '__main__':
    main()
def downloadJobcards(logger, db, cmd=None, directory=None, url=None, isVisible=None, isPushInfo=None, query=None, fetch_jobcard_details=None):
    '''
    Download the jobcard HTML for every jobcard returned by ``query``.

    logger                -- logger used for progress messages
    db                    -- open database connection (``cursor()`` is called on it)
    cmd                   -- label used in the BEGIN/END log lines (default "Downloading")
    directory             -- base output directory; one sub-directory per panchayat
                             (default "./jobcards")
    url                   -- only logged by this function
    isVisible             -- passed to displayInitialize (default 0)
    isPushInfo            -- when true, downloaded HTML is handed to pushMusterInfo
    query                 -- SQL yielding (jobcard, panchayat name, panchayat code) rows;
                             defaults to jobcards whose downloadDate is at least a day old
    fetch_jobcard_details -- forwarded to pushMusterInfo; also enables the push step

    The browser is started only when the query returns rows and is always
    released afterwards.
    '''
    # Resolve the label first so the BEGIN line never reads "BEGIN None...".
    if cmd is None:
        cmd = "Downloading"
    logger.info("BEGIN %s..." % cmd)

    if directory is None:
        directory = "./jobcards"
    if url is None:
        url = 'http://www.nrega.telangana.gov.in/Nregs/FrontServlet?requestType=HouseholdInf_engRH&actionVal=SearchJOB&JOB_No='
    if isVisible is None:
        isVisible = 0
    if isPushInfo is None:
        isPushInfo = False

    logger.info("Command[%s] Directory[%s] URL[%s]" % (cmd, directory, url))

    if not query:
        # Mynk - when b.name is not all 'Ghattu', select b.name instead of
        # p.panchayatCode (and drop the downloadDate filter and ordering).
        query = 'select j.jobcard, p.name, p.panchayatCode from jobcardRegister j, panchayats p, blocks b where j.blockCode=p.blockCode and j.panchayatCode=p.panchayatCode and j.blockCode=b.blockCode and DATE_SUB(NOW(), INTERVAL 1 DAY) >= downloadDate order by j.downloadDate'

    logger.info('Executing query: [%s]', query)
    cur = db.cursor()
    cur.execute(query)
    jobcard_details = cur.fetchall()

    if jobcard_details:
        logger.debug("Jobcard Details [%s]" % str(jobcard_details))

        display = displayInitialize(isVisible)
        driver = driverInitialize()
        try:
            for (jobcard, panchayat, panchayat_code) in jobcard_details:
                logger.info("jobcard[%s] panchayat[%s] panchayat_code[%s]" % (jobcard, panchayat, panchayat_code))
                dirname = directory + '/' + panchayat
                html_source = downloadJobcardHTML(logger, driver, db, jobcard, dirname)

                if isPushInfo or fetch_jobcard_details:
                    # NOTE(review): the `else` is assumed to pair with
                    # `if html_source` (nothing downloaded -> reset the flag so
                    # the jobcard is picked up again) - confirm.
                    if html_source:
                        pushMusterInfo(logger, db, html_source, jobcard, panchayat_code, fetch_jobcard_details)
                    else:
                        query = 'update jobcardRegister set isDownloaded=0 where jobcard="%s"' % (jobcard)  # Mynk
                        logger.info('Executing query: [%s]', query)
                        cur = db.cursor()
                        cur.execute(query)
        finally:
            # Release the browser and display even if a download raises.
            driverFinalize(driver)
            displayFinalize(display)

    logger.info("...END %s" % cmd)
def main():
    """Download pending musters and attach a cleaned HTML copy to each record.

    Selects up to ``limit`` (default 1) ``Muster`` rows that are either not yet
    downloaded, or incomplete with a download attempt older than
    ``musterTimeThreshold``. Each muster page is validated; on success the
    stripped summary and detail tables are wrapped into a page and saved to
    ``musterFile``, otherwise the validation error is stored on the record.
    """
    args = argsFetch()
    logger = loggerFetch(args.get('log_level'))
    logger.info('args: %s', str(args))
    logger.info("BEGIN PROCESSING...")

    limit = int(args['limit']) if args['limit'] else 1

    display = displayInitialize(args['visible'])
    driver = driverInitialize(args['browser'])
    try:
        # Open the state home page before requesting individual musters.
        url = "http://nrega.nic.in/netnrega/sthome.aspx"
        driver.get(url)

        myMusters = Muster.objects.filter(
            Q(isDownloaded=False)
            | Q(musterDownloadAttemptDate__lt=musterTimeThreshold, isComplete=0)
        )[:limit]

        for eachMuster in myMusters:
            logger.info(eachMuster.musterURL)
            logger.info("Processing musterNo: %s FullblockCode: %s " % (eachMuster.musterNo, eachMuster.block.fullBlockCode))

            # The page is requested twice, as in the original flow
            # (presumably the first load does not render fully - confirm).
            driver.get(eachMuster.musterURL)
            driver.get(eachMuster.musterURL)
            myhtml = driver.page_source

            error, musterTable, musterSummaryTable = validateMusterHTML(eachMuster, myhtml)
            if error is None:
                outhtml = ''
                outhtml += stripTableAttributes(musterSummaryTable, "musterSummary")
                outhtml += stripTableAttributes(musterTable, "musterDetails")
                title = "Muster: %s state:%s District:%s block:%s finyear:%s " % (
                    eachMuster.musterNo,
                    eachMuster.block.district.state.name,
                    eachMuster.block.district.name,
                    eachMuster.block.name,
                    getFullFinYear(eachMuster.finyear),
                )
                logger.info(title)
                # `align` was misspelled `aling`, which browsers ignore.
                outhtml = htmlWrapperLocal(title=title, head='<h1 align="center">' + title + '</h1>', body=outhtml)
                try:
                    outhtml = outhtml.encode("UTF-8")
                except (UnicodeError, AttributeError) as e:
                    # Keep the content as-is when it cannot be (re-)encoded,
                    # but leave a trace instead of swallowing everything.
                    logger.info("Could not encode muster HTML as UTF-8: %s", e)

                filename = "%s.html" % (eachMuster.musterNo)
                eachMuster.musterFile.save(filename, ContentFile(outhtml))
                eachMuster.musterDownloadAttemptDate = datetime.now()
                eachMuster.isDownloaded = True
                eachMuster.save()
            else:
                logger.info("Muster Download Erorr: %s " % (error))
                eachMuster.musterDownloadAttemptDate = datetime.now()
                eachMuster.downloadError = error
                eachMuster.save()
    finally:
        # Release the browser and display even when a muster fails.
        driverFinalize(driver)
        displayFinalize(display)

    logger.info("...END PROCESSING")
    exit(0)
def main():
    """Dispatch one PDS task (prev / parse / work_allocation / fetch / default).

    A browser is started for every mode except ``parse``. When ``directory``
    is given, downloads go to ``<directory>/<Month-Year>``, which is created
    if missing. The database, browser and display are released on every exit
    path.
    """
    args = argsFetch()
    logger = loggerFetch(args.get('log_level'))
    logger.info('args: %s', str(args))

    logger.info("BEGIN PROCESSING...")

    db = dbInitialize(db="surguja", charset="utf8")

    # Bound up front so the names exist in every branch below.
    display = driver = None
    browser_started = False
    try:
        if not args['parse']:
            display = displayInitialize(args['visible'])
            driver = driverInitialize(args['browser'])
            browser_started = True

        download_dir = args['directory']
        if download_dir:
            download_dir = download_dir + '/' + strftime('%B-%Y')
            logger.info('download_dir[%s]' % download_dir)
            # Only touch the filesystem when a directory was actually given;
            # os.path.exists(None) / os.makedirs(None) raise TypeError.
            if not os.path.exists(download_dir):
                os.makedirs(download_dir)

        if args['prev']:
            pdsFetchPrev(logger, driver, db, download_dir, args['month'], args['year'])
        elif args['parse']:
            pdsReportParse(logger, db, download_dir)
        elif args['work_allocation']:
            downloadWorkAllocationHTML(driver, db, logger)  # Mynk Fix Order
        elif args['fetch']:
            pdsFetchReports(logger, driver, db, download_dir)
        else:
            pdsFetch(logger, driver, db, download_dir)
    finally:
        if browser_started:
            driverFinalize(driver)
            displayFinalize(display)
        dbFinalize(db)

    logger.info("...END PROCESSING")
    exit(0)
def runTestSuite():
    """Save the page behind every panchayat link of ``url`` to ``/tmp``.

    The listing at module-level ``url`` is loaded twice, its ``normalRow`` /
    ``alternateRow`` table rows are parsed, and for each row the link whose
    text equals the second cell is clicked and the resulting page written to
    ``/tmp/<panchayat>.html``.
    """
    log = loggerFetch("info")
    log.info("BEGIN PROCESSING...")

    virtual_display = displayInitialize(1)
    browser = driverInitialize(path='/opt/firefox/')

    browser.get(url)
    log.info("Fetching...[%s]" % url)
    browser.get(url)  # A double refresh required for the page to load
    log.info("Refreshing...[%s]" % url)

    # Declare the charset explicitly in the captured markup.
    charset_head = '<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
    html_source = browser.page_source.replace('<head>', charset_head)
    log.debug("HTML Fetched [%s]" % html_source)

    soup = BeautifulSoup(html_source, "html.parser")
    row_classes = ['normalRow', 'alternateRow']
    listing_rows = soup.findAll('tr', attrs={'class': row_classes})
    log.debug(str(listing_rows))

    for listing_row in listing_rows:
        # The panchayat name lives in the cell following the first one.
        first_cell = listing_row.find('td')
        name_cell = first_cell.findNext('td')
        panchayat = name_cell.text.strip()
        log.info("Panchayat[%s]", panchayat)

        browser.find_element_by_link_text(panchayat).click()

        filename = "/tmp/%s.html" % panchayat
        with open(filename, 'wb') as page_file:
            log.info("Writing [%s]" % filename)
            page_file.write(browser.page_source.encode('utf-8'))

        # Return to the listing for the next row.
        browser.back()

    driverFinalize(browser)
    displayFinalize(virtual_display)

    log.info("...END PROCESSING")
def runTestSuite():
    """Generate a single report and write it, UTF-8 encoded, to ``filename``.

    ``filename`` is read from module scope.
    """
    log = loggerFetch("info")
    log.info("BEGIN PROCESSING...")

    virtual_display = displayInitialize(0)
    browser = driverInitialize()

    report_text = generate_report(log, browser)
    log.info('Finally: \n%s' % report_text)

    with open(filename, 'wb') as out:
        log.info("Writing to [%s]" % filename)
        payload = report_text.encode('utf-8')
        out.write(payload)

    driverFinalize(browser)
    displayFinalize(virtual_display)

    log.info("...END PROCESSING")
def main():
    """Fetch the HTML of one jobcard and log it.

    Uses the ``jobcard`` argument when it is set, otherwise a built-in
    sample jobcard number.
    """
    options = argsFetch()
    log = loggerFetch(options.get('log_level'))
    log.info('args: %s', str(options))

    log.info("BEGIN PROCESSING...")

    virtual_display = displayInitialize(options['visible'])
    browser = driverInitialize(options['browser'])

    # Fall back to the sample jobcard when none was supplied.
    jobcard = options['jobcard'] or '141975701001010679'

    log.info("Fetching Jobcard[%s]..." % jobcard)
    page_html = downloadJobcardHTML(log, browser, jobcard)
    log.info(page_html)

    driverFinalize(browser)
    displayFinalize(virtual_display)

    log.info("...END PROCESSING")
    exit(0)
def main():
    """Walk the classes listed in ``./z.csv`` and log a download command per video.

    Each CSV row is ``(title, url)``. For every class page the ``session-item``
    elements are clicked in turn, the ``src`` of the ``vjs-tech`` video element
    is extracted and a ``curl`` command for it is logged. The command itself is
    not executed (the ``os.system`` call is disabled).
    """
    args = argsFetch()
    logger = loggerFetch(args.get('log_level'))
    logger.info('args: %s', str(args))

    logger.info("BEGIN PROCESSING...")

    display = displayInitialize(args['visible'])
    driver = driverInitialize(browser=args['browser'], path='/home/mayank/.mozilla/firefox/4s3bttuq.default/')
    try:
        base_url = "https://www.skillshare.com/login"
        driver.get(base_url)
        logger.info('Fetching URL[%s]' % base_url)

        try:
            driver.find_element_by_name("LoginForm[email]").clear()
            driver.find_element_by_name("LoginForm[email]").send_keys("*****@*****.**")
            driver.find_element_by_name("LoginForm[password]").clear()
            time.sleep(100)  # If you want to manually log in
        except Exception as e:
            # The login form is absent when a session already exists.
            logger.info('Already signed in [%s]', e)
        # NOTE(review): this pause is assumed to run on both paths - confirm.
        time.sleep(10)

        filename = "./z.csv"
        # Read all rows up front so the file handle is closed before the
        # long-running crawl starts.
        with open(filename, 'r') as csv_fh:
            content = list(csv.reader(csv_fh, delimiter=',', quotechar='"'))

        for (title, url) in content:
            logger.info('Fetching URL[%s]' % url)
            driver.get(url)
            time.sleep(10)

            escaped_title = re.sub(r"[^A-Za-z 0-9]+", '', title).replace(' ', '_')
            dirname = 'SkillsShare/' + escaped_title
            # Create the target directory without spawning a shell.
            if not os.path.isdir(dirname):
                logger.info('Creating directory [%s]' % dirname)
                os.makedirs(dirname)

            els = driver.find_elements_by_class_name("session-item")
            for i, el in enumerate(els):
                logger.debug(str(el))
                bs = BeautifulSoup(el.get_attribute('innerHTML'), "html.parser")
                p = bs.find('p')
                name = p.text
                name = "%02d" % (i + 1) + '_' + re.sub(r"[^A-Za-z 0-9]+", '', name).replace(' ', '_') + '.mp4'
                logger.info(str(p) + name)

                el.click()
                time.sleep(10)

                html_source = driver.page_source
                bs = BeautifulSoup(html_source, "html.parser")
                html = bs.findAll('video', attrs={'class': ['vjs-tech']})
                str_html = str(html)
                logger.info(str_html)

                src_at = str_html.find("src=")
                if src_at < 0:
                    # Without this guard find() == -1 would yield a bogus slice.
                    logger.error('No video source found for [%s]' % name)
                    continue
                # Skip past `src="`, then drop the query string from "?pubId" on
                # (the whole remainder is kept when "?pubId" is absent).
                video_url = str_html[src_at + 5:]
                fetch_url = video_url.split("?pubId", 1)[0]
                logger.info(fetch_url)

                if os.path.exists(dirname + '/' + name):
                    continue

                cmd = 'cd %s && curl -s %s -o %s' % (dirname, fetch_url, name)
                logger.info(cmd)
                # os.system(cmd)
    finally:
        # Release the browser and display even when the crawl fails.
        driverFinalize(driver)
        displayFinalize(display)

    logger.info("...END PROCESSING")
    exit(0)
def main():
    # Entry point with two independent phases selected by command-line flags:
    #   downloadWagelists -- fetch wagelist pages through the browser, store
    #                        them and register the FTO numbers they mention
    #   downloadFTOs      -- fetch pending FTO pages via getFTO/alterFTO and
    #                        store the resulting HTML
    # The optional `limit`, `district` and `finyear` arguments narrow the rows
    # selected in both phases.
    # NOTE(review): the source arrived with its indentation flattened; the
    # nesting below is reconstructed from the data flow and should be
    # confirmed against the original file.
    args = argsFetch()
    logger = loggerFetch(args.get('log_level'))
    logger.info('args: %s', str(args))
    logger.info("BEGIN PROCESSING...")
    db = dbInitialize(db=nregaDB, charset="utf8") # The rest is updated automatically in the function
    cur=db.cursor()
    db.autocommit(True)
    #Query to set up Database to read Hindi Characters
    query="SET NAMES utf8"
    cur.execute(query)
    # Row limit for both SELECTs; 50000 when no limit argument is given.
    if args['limit']:
        limit = int(args['limit'])
    else:
        limit =50000
    limitString=" limit %s " % str(limit)
    # Extra WHERE clauses, spliced verbatim into the SELECTs below.
    # NOTE(review): argument values are interpolated into SQL without escaping.
    additionalFilters=''
    if args['district']:
        additionalFilters+= " and b.districtName='%s' " % args['district']
    if args['finyear']:
        additionalFilters+= " and finyear='%s' " % args['finyear']
    if args["downloadWagelists"]:
        display = displayInitialize(args['visible'])
        driver = driverInitialize(args['browser'])
        url="http://164.100.129.6/netnrega/nregasearch1.aspx"
        driver.get(url)
        time.sleep(22)
        htmlsource = driver.page_source
        #writeFile("/tmp/a.html",htmlsource)
        # Hard-coded sample search, run once through the search form before
        # the per-row downloads below.
        stateName="CHHATTISGARH"
        districtName="SURGUJA"
        wagelistNo="3305002062WL005160"
        try:
            maintab = driver.current_window_handle
            Select(driver.find_element_by_id("ddl_search")).select_by_visible_text("WageList")
            driver.find_element_by_css_selector("option[value=\"WageList\"]").click()
            Select(driver.find_element_by_id("ddl_state")).select_by_visible_text(stateName.upper())
            #myvalue='value="%s"' % stateCode
            driver.find_element_by_css_selector("option[value=\"33\"]").click()
            # driver.find_element_by_css_selector("option[%s]" % myvalue).click()
            Select(driver.find_element_by_id("ddl_district")).select_by_visible_text(districtName.upper())
            #myvalue='value="%s"' % (stateCode+districtCode)
            driver.find_element_by_css_selector("option[value=\"3305\"]").click()
            #driver.find_element_by_css_selector("option[%s]" % myvalue).click()
            driver.find_element_by_id("txt_keyword2").clear()
            driver.find_element_by_id("txt_keyword2").send_keys(wagelistNo)
            driver.find_element_by_id("btn_go").click()
            time.sleep(30)
            #logger.info("Currently the number of active tabs are %s" % str(len(driver.window_handles)))
            error=0
        # NOTE(review): bare except hides every failure of the form
        # interaction, and `error` is never read afterwards.
        except:
            error=1
        # Fetch the sample wagelist directly by URL and keep a copy in tempDir.
        wurl="http://164.100.129.6/netnrega/srch_wg_dtl.aspx?state_code=&district_code=3305&state_name=CHHATTISGARH&district_name=SURGUJA&block_code=3305002&wg_no=3305002062WL005160&short_name=CH&fin_year=2016-2017&mode=wg"
        driver.get(wurl)
        htmlsource = driver.page_source
        filename="%s/b.html" % tempDir
        writeFile(filename,htmlsource)
        # Wagelists that were never downloaded, or are incomplete and were
        # last attempted more than 48 hours ago.
        query="select w.id,w.wagelistNo,b.rawBlockName,b.fullBlockCode,b.blockCode,b.districtCode,b.stateCode,b.stateShortCode,w.finyear,b.stateName,b.districtName from wagelists w,blocks b where w.fullBlockCode=b.fullBlockCode and ( (w.isDownloaded=0) or (w.isComplete=0 and TIMESTAMPDIFF(HOUR, w.downloadAttemptDate, now()) > 48 )) %s order by w.isDownloaded %s " % (additionalFilters,limitString)
        cur.execute(query)
        results=cur.fetchall()
        for row in results:
            [rowid,wagelistNo,blockName,fullBlockCode,blockCode,districtCode,stateCode,stateShortCode,finyear,stateName,districtName] = row
            fullfinyear=getFullFinYear(finyear)
            logger.info(" RowID : %s, wagelistNo: %s " % (str(rowid),wagelistNo))
            # A page is accepted only if it contains this "<state>-<district>-" prefix.
            jobcardPrefix="%s-%s-" % (stateShortCode,districtCode)
            #logger.info("Jobcard Prefix : %s " % jobcardPrefix)
            fullDistrictCode=stateCode+districtCode
            if wagelistNo != '':
                #logger.info(wagelistNo)
                wurl="http://%s/netnrega/srch_wg_dtl.aspx?state_code=&district_code=%s&state_name=%s&district_name=%s&block_code=%s&wg_no=%s&short_name=%s&fin_year=%s&mode=wg" % (searchIP,fullDistrictCode,stateName.upper(),districtName.upper(),fullBlockCode,wagelistNo,stateShortCode,fullfinyear)
                logger.info("URL: %s " % wurl)
                driver.get(wurl)
                htmlsource = driver.page_source
                # Declare the charset explicitly in the stored copy.
                htmlsource=htmlsource.replace('<head>','<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>')
                success=0
                isComplete=0
                if ("WageList Agency Code" in htmlsource) and (jobcardPrefix in htmlsource):
                    filepath=nregaRawDataDir.replace("stateName",stateName.upper()).replace("districtName",districtName.upper())
                    filename=filepath+blockName.upper()+"/WAGELISTS/"+fullfinyear+"/"+wagelistNo+".html"
                    # filename=filepath+blockName.upper()+"/WAGELIST/"+fullfinyear+"/"+wagelistNo+".html"
                    # logger.info(filename)
                    writeFile(tempDir+wagelistNo+".html",htmlsource)
                    writeFile(filename,htmlsource)
                    success=1
                    isComplete=1
                    htmlsoup=BeautifulSoup(htmlsource)
                    tables=htmlsoup.findAll('table')
                    for table in tables:
                        #logger.info("Found the Table")
                        rows=table.findAll("tr")
                        # NOTE(review): this inner loop rebinds `row`, the
                        # outer loop variable (already unpacked above).
                        for row in rows:
                            cols=row.findAll("td")
                            # NOTE(review): raises IndexError for any row with
                            # fewer than 13 <td> cells.
                            ftoNo=cols[12].text
                            if ftoNo != "FTO No.":
                                #logger.info("FTO No : %s " % ftoNo)
                                # A cell without the state short code marks the
                                # wagelist as incomplete.
                                if stateShortCode not in ftoNo:
                                    isComplete=0
                                else:
                                    # Register the FTO unless it is already known.
                                    query="select * from ftos where finyear='%s' and fullBlockCode='%s' and ftoNo='%s'" % (finyear,fullBlockCode,ftoNo)
                                    #logger.info(query)
                                    cur.execute(query)
                                    if cur.rowcount == 0:
                                        query="insert into ftos (finyear,ftoNo,fullBlockCode,stateCode,districtCode,blockCode) values ('%s','%s','%s','%s','%s','%s') " % (finyear,ftoNo,fullBlockCode,stateCode,districtCode,blockCode)
                                        #logger.info(query)
                                        cur.execute(query)
                # Record the outcome and the attempt time for this wagelist.
                query="update wagelists set isDownloaded=%s,isComplete=%s,downloadAttemptDate=NOW() where id=%s" %(str(success),str(isComplete),str(rowid))
                #logger.info(query)
                cur.execute(query)
                # driver.find_element_by_tag_name('body').send_keys(Keys.CONTROL + 'w')
                # driver.switch_to_window(maintab)
        driverFinalize(driver)
        displayFinalize(display)
    if args["downloadFTOs"]:
        #additionalFilters=''
        #limitString= "limit 1 "
        # Incomplete FTOs that were never downloaded, never attempted, or last
        # attempted more than 48 hours ago.
        query="select f.id,f.ftoNo,f.blockCode,f.finyear,f.fullBlockCode,b.rawBlockName,b.districtName,b.stateName,b.stateCode from ftos f,blocks b where f.fullBlockCode=b.fullBlockCode and f.isComplete=0 and ( (f.isDownloaded=0) or (TIMESTAMPDIFF(HOUR, f.downloadAttemptDate, now()) > 48 ) or f.downloadAttemptDate is NULL ) %s order by f.downloadAttemptDate,finyear %s " % (additionalFilters,limitString)
        logger.info(query)
        cur.execute(query)
        results=cur.fetchall()
        for row in results:
            [rowid,ftoNo,blockCode,finyear,fullBlockCode,blockName,districtName,stateName,stateCode]=row
            logger.info("districtName: %s, blockName: %s finyear: %s ftoNo: %s " % (districtName,blockName,finyear,ftoNo))
            fullfinyear=getFullFinYear(finyear)
            if ftoNo != '':
                # filepath=nregaRawDataDir.replace("stateName",stateName.upper()).replace("districtName",districtName.upper())
                filepath=nregaWebDir.replace("stateName",stateName.upper()).replace("districtName",districtName.upper())
                filename=filepath+blockName.upper()+"/FTOs/"+fullfinyear+"/"+ftoNo+".html"
                logger.info("Downloading FTO: %s " % ftoNo)
                htmlresponse,htmlsource = getFTO(fullfinyear,stateCode,ftoNo)
                logger.info("Response = %s " % htmlresponse)
                success=0
                isPopulatedString=''
                if htmlresponse['status'] == '200':
                    logger.info("Status is 200")
                    # On a successful fetch the update below also resets isPopulated.
                    isPopulatedString="isPopulated=0,"
                    success,outhtml=alterFTO(cur,logger,htmlsource,stateName,districtName,blockName,ftoNo,rowid)
                    #htmlsource=htmlsource.replace('<head>','<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>')
                    if not os.path.exists(os.path.dirname(filename)):
                        os.makedirs(os.path.dirname(filename))
                    # NOTE(review): the file object is never closed explicitly.
                    myfile = open(filename, "wb")
                    myfile.write(outhtml.encode("UTF-8"))
                    logger.info(filename)
                    #writeFile(filename,htmlsource)
                    #writeFile3("/home/libtech/webroot/nreganic.libtech.info/temp/"+ftoNo+".html",htmlsource)
                # Record the outcome and the attempt time for this FTO.
                query="update ftos set isDownloaded=%s,%sdownloadAttemptDate=NOW() where id=%s" %(str(success),str(isPopulatedString),str(rowid))
                logger.info(query)
                cur.execute(query)
    dbFinalize(db) # Make sure you put this if there are other exit paths or errors
    logger.info("...END PROCESSING")
    exit(0)
def _fetchPageSource(logger, driver, url):
    """Load ``url`` twice and return the page source, or None if loading fails."""
    try:
        driver.get(url)
        driver.get(url)
        return driver.page_source
    except Exception as e:
        # Failures used to be swallowed silently; keep going but leave a trace.
        logger.error("Failed to fetch [%s]: %s", url, e)
        return None


def _sortableLinks(html):
    """Return the href of every link inside the table with id "sortable"."""
    table = BeautifulSoup(html, "html.parser").find('table', id="sortable")
    if table is None:
        return []
    print("Found")
    return [anchor['href'] for anchor in table.find_all('a')]


def _crawlBlockVillages(logger, driver, myBlock):
    """Create or update a Village row for every village listed under ``myBlock``."""
    site = "http://www.nrega.telangana.gov.in"
    url = "http://www.nrega.telangana.gov.in/Nregs/FrontServlet?requestType=SmartCardreport_engRH&actionVal=MobnumberStatus&id=%s&Retype=null&type=null&file=%s" % (myBlock.tcode, myBlock.name)
    logger.info(url)

    myhtml = _fetchPageSource(logger, driver, url)
    if myhtml is None:
        return
    logger.info("No Error")

    for panchayatHref in _sortableLinks(myhtml):
        logger.info(panchayatHref)
        panchayatLink = site + panchayatHref
        # The panchayat name is whatever follows "file=" in the link.
        panchayatName = panchayatLink.split("file=")[1]
        logger.info(panchayatName + panchayatLink)

        myPanchayat = Panchayat.objects.filter(block=myBlock, name=panchayatName).first()
        if myPanchayat is None:
            continue

        phtml = _fetchPageSource(logger, driver, panchayatLink)
        if phtml is None:
            continue
        logger.info("No Error")

        for villageHref in _sortableLinks(phtml):
            logger.info(villageHref)
            villageLink = site + villageHref
            par = parse_qs(urlparse(villageLink).query)
            villageID = str(par['id'][0]).strip()
            villageName = str(par['file'][0])
            logger.info(villageName + villageID)
            logger.info(par)
            logger.info(len(villageID))

            myVillage = Village.objects.filter(tcode=villageID).first()
            if myVillage is None:
                # create() returns the saved instance; no second query needed.
                myVillage = Village.objects.create(tcode=villageID)
            myVillage.name = villageName
            myVillage.code = villageID
            myVillage.panchayat = myPanchayat
            myVillage.save()


def main():
    """Populate Village rows for the block identified by ``nicBlockCode``.

    The block page lists its panchayats and each panchayat page lists its
    villages, both in a table with id "sortable". Panchayats that are not in
    the database are skipped. Exits with status 1 when the block is unknown,
    0 otherwise.
    """
    args = argsFetch()
    logger = loggerFetch(args.get('log_level'))
    logger.info('args: %s', str(args))

    display = displayInitialize(args['visible'])
    driver = driverInitialize(args['browser'])
    status = 0
    try:
        nicBlockCode = args["nicBlockCode"]
        myBlock = Block.objects.filter(code=nicBlockCode).first()
        if myBlock is None:
            # Previously this crashed with AttributeError on None.
            logger.error("No block found for nicBlockCode[%s]", nicBlockCode)
            status = 1
        else:
            _crawlBlockVillages(logger, driver, myBlock)
    finally:
        # Release the browser and display even when the crawl fails.
        driverFinalize(driver)
        displayFinalize(display)

    logger.info("...END PROCESSING")
    exit(status)
def fetchMusterDetails(logger, db, cmd=None, directory=None, url=None, is_parse_info=None, is_push_info=None, is_visible=None):
    '''
    Fetch the Muster Details for specified parameters in the specified directory

    logger        -- logger used for progress messages
    db            -- database connection, forwarded to pushInfoIntoDB
    cmd           -- label for the BEGIN/END log lines (default "FETCH MUSTER DETAILS")
    directory     -- output directory, created if missing (default "./Downloads")
    url           -- page to fetch (default: a sample Musternew.aspx URL)
    is_parse_info -- when true, also save the page behind every panchayat link
    is_push_info  -- when true, hand over to pushInfoIntoDB afterwards
    is_visible    -- passed to displayInitialize (default 0)

    The fetched page is always written to ``<directory>/test.html``. When the
    expected element does not appear, the function returns right after
    writing it. The browser and display are released on every exit path.
    '''
    if not cmd:
        cmd = "FETCH MUSTER DETAILS"
    logger.info("BEGIN %s..." % cmd)

    if not directory:
        directory = "./Downloads"
    if not url:
        url = 'http://164.100.112.66/netnrega/Citizen_html/Musternew.aspx?id=2&lflag=eng&ExeL=GP&fin_year=2015-2016&state_code=33&district_code=3305&block_code=3305007&panchayat_code=3305007038&State_name=CHHATTISGARH&District_name=SURGUJA&Block_name=BATAULI&panchayat_name=Govindpur'
    if not is_visible:
        is_visible = 0  # Set to 1 for debugging selenium
    if not is_parse_info:
        is_parse_info = False
    if not is_push_info:
        is_push_info = False

    # The part below could be moved to a function downloadMusterDetails() to make it reusable
    filename = directory + '/' + 'test.html'  # Use your naming logic + blockName + '_' + panchayat + '_' + shopCode + '.html'
    logger.info('filename[%s]' % filename)
    filepath = os.path.dirname(filename)
    if not os.path.exists(filepath):
        logger.info('Creating direcotry [%s] as it does not exist' % filepath)
        os.makedirs(filepath)

    display = displayInitialize(is_visible)
    driver = driverInitialize()
    try:
        logger.error("Current URL [%s] Title [%s]" % (driver.current_url, driver.title))
        logger.error("Current URL [%s] Title [%s]" % (driver.current_url, driver.title))

        logger.info("Fetching...[%s]" % url)
        driver.get(url)
        logger.error("Current URL [%s] Title [%s]" % (driver.current_url, driver.title))
        # A second driver.get(url) can be added here if the page needs a
        # double refresh to load, as on the AP sites.

        el = waitUntilID(logger, driver, 'ctl00_ContentPlaceHolder1_ddlwork', 10)
        if el:
            logger.info("Found El[%s]" % str(el))
            html_source = driver.page_source.replace('<head>', '<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>')
            logger.debug("HTML Fetched [%s]" % html_source)
        else:
            logger.error("Failed to fetch the page [%s]" % driver.current_url)
            logger.error("Current URL [%s] Title [%s]" % (driver.current_url, driver.title))
            html_source = driver.page_source
            logger.info("HTML Fetched [%s]" % html_source)

        # The page is stored in both cases, so a failed fetch can be inspected.
        with open(filename, "wb") as html_file:
            logger.info("Writing [%s]" % filename)
            html_file.write(html_source.encode('UTF-8'))

        if not el:
            return  # Error condition to be dealt with

        # If you have information to parse using Beautiful Soup
        if is_parse_info:
            bs = BeautifulSoup(html_source, "html.parser")
            tr_list = bs.findAll('tr', attrs={'class': ['normalRow', 'alternateRow']})
            logger.debug(str(tr_list))
            for tr in tr_list:
                td = tr.find('td')
                td = td.findNext('td')
                panchayat = td.text.strip()
                logger.info("Panchayat[%s]", panchayat)
                elem = driver.find_element_by_link_text(panchayat)
                elem.click()

                page_filename = "/tmp/%s.html" % panchayat
                # Binary mode plus explicit encoding, like the write above;
                # text mode fails on non-ASCII page content under Python 2.
                with open(page_filename, 'wb') as html_file:
                    logger.info("Writing [%s]" % page_filename)
                    html_file.write(driver.page_source.encode('UTF-8'))
                driver.back()
    finally:
        driverFinalize(driver)
        displayFinalize(display)

    # If you want to push the information to the Database
    if is_push_info:
        # NOTE(review): `jobcard` is not defined in this function; presumably
        # it is expected at module level - confirm before enabling this path.
        query = 'select j.jobcard, p.name, p.panchayatCode from jobcardRegister j, panchayats p, blocks b where j.blockCode=p.blockCode and j.panchayatCode=p.panchayatCode and j.blockCode=b.blockCode and j.jobcard="%s"' % jobcard
        # `directory` replaces the builtin `dir`, which was passed by mistake.
        logger.info("Command[%s] Directory[%s] URL[%s] jobcard[%s]" % (cmd, directory, url, jobcard))
        pushInfoIntoDB(logger, db, "POPULATE_DATABASE", directory, url, is_visible, is_push_info, query)  # So that function can be shared

    logger.info("...END %s" % cmd)
def main():
    """Download the worker list (with Aadhar numbers) of every required panchayat.

    Navigates state -> district on the NREGA home page, then for each selected
    panchayat opens block -> panchayat -> "List of Worker with Aadhar No.(UID
    No.)", stamps ``jobcardDownloadDate`` and stores the page as
    ``.../<BLOCK>/<PANCHAYAT>/jobcardRegister/workerList.html``.
    The optional ``blockCode`` and ``limit`` arguments narrow the selection.
    """
    args = argsFetch()
    logger = loggerFetch(args.get('log_level'))
    logger.info('args: %s', str(args))

    logger.info("BEGIN PROCESSING...")

    districtName = args['district']
    logger.info("DistrictName " + districtName)

    limitString = ''
    if args['limit']:
        limitString = " limit %s " % args['limit']
    additionalFilters = ''
    if args['blockCode']:
        additionalFilters = " and b.blockCode='%s' " % args['blockCode']

    db = dbInitialize(db=districtName.lower(), charset="utf8")  # The rest is updated automatically in the function
    try:
        cur = db.cursor()
        db.autocommit(True)
        # Query to set up Database to read Hindi Characters
        cur.execute("SET NAMES utf8")

        crawlIP, stateName, stateCode, stateShortCode, districtCode = getDistrictParams(cur, districtName)

        display = displayInitialize(args['visible'])
        driver = driverInitialize(args['browser'])
        try:
            logger.info("crawlIP " + crawlIP)
            logger.info("State Name " + stateName)
            jcReportRawFilePath = nregaRawDir.replace("districtName", districtName.lower()) + districtName.upper() + "/"

            # Start Program here
            url = "http://nrega.nic.in/netnrega/sthome.aspx"
            driver.get(url)
            elem = driver.find_element_by_link_text(stateName)
            elem.send_keys(Keys.RETURN)
            time.sleep(1)
            elem = driver.find_element_by_link_text(districtName.upper())
            elem.send_keys(Keys.RETURN)
            time.sleep(1)

            # Query to get all the blocks.
            # The filter must precede ORDER BY; appended after it, the
            # "and b.blockCode=..." text became part of the sort expression
            # and no longer restricted the rows.
            query = "select b.blockCode,b.name,p.panchayatCode,p.name from blocks b,panchayats p where b.blockCode=p.blockCode and p.isRequired=1 and jobcardCrawlDate is not NULL %s order by jobcardDownloadDate %s" % (additionalFilters, limitString)
            cur.execute(query)
            results = cur.fetchall()

            for row in results:
                blockCode, blockName, panchayatCode, panchayatName = row[0], row[1], row[2], row[3]
                panchayatNameOnlyLetters = re.sub(r"[^A-Za-z]+", '', panchayatName)

                # Drill down three levels: block, panchayat, report.
                elem = driver.find_element_by_link_text(blockName)
                elem.send_keys(Keys.RETURN)
                elem = driver.find_element_by_link_text(panchayatName)
                elem.send_keys(Keys.RETURN)
                elem = driver.find_element_by_link_text("List of Worker with Aadhar No.(UID No.)")
                elem.send_keys(Keys.RETURN)
                time.sleep(15)

                query = "update panchayats set jobcardDownloadDate=now() where blockCode='%s' and panchayatCode='%s' " % (blockCode, panchayatCode)
                cur.execute(query)

                jcsource = driver.page_source

                # Go back the same three levels for the next panchayat.
                driver.back()
                time.sleep(5)
                driver.back()
                time.sleep(5)
                driver.back()

                rawhtml = jcsource.replace('<head>', '<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>')
                jcfilename = jcReportRawFilePath + blockName.upper() + "/" + panchayatNameOnlyLetters.upper() + "/jobcardRegister/workerList.html"
                logger.info(jcfilename)
                writeFile(jcfilename, rawhtml)
        finally:
            driverFinalize(driver)
            displayFinalize(display)
    finally:
        # Runs on every exit path, including errors.
        dbFinalize(db)

    logger.info("...END PROCESSING")
    exit(0)
def _enumerateFPSStatus(logger):
    """Ensure an FPSStatus row per shop and month since Jan 2016, plus village links."""
    startYear = 2016
    now = datetime.now()

    for eachFPSShop in FPSShop.objects.all():
        for eachYear in range(startYear, now.year + 1):
            # The current year only runs up to the current month.
            maxMonth = now.month if eachYear == now.year else 12
            for eachMonth in range(1, maxMonth + 1):
                logger.info("%d-%d" % (eachMonth, eachYear))
                myShop = FPSStatus.objects.filter(fpsShop=eachFPSShop, fpsMonth=eachMonth, fpsYear=eachYear).first()
                if myShop is None:
                    # create() returns the saved instance; no second query needed.
                    myShop = FPSStatus.objects.create(fpsShop=eachFPSShop, fpsMonth=eachMonth, fpsYear=eachYear)
                    logger.info("Created object")

                for eachVillage in FPSVillage.objects.filter(fpsShop=eachFPSShop):
                    myVillageFPSStatus = VillageFPSStatus.objects.filter(fpsVillage=eachVillage, fpsStatus=myShop).first()
                    if myVillageFPSStatus is None:
                        VillageFPSStatus.objects.create(fpsVillage=eachVillage, fpsStatus=myShop)


def _crawlFPSShops(logger, args):
    """Create an FPSShop row for every shop option offered for each required block."""
    display = displayInitialize(args['visible'])
    driver = driverInitialize(args['browser'])
    try:
        # Start Program here
        base_url = "http://sfc.bihar.gov.in/"
        driver.get("http://sfc.bihar.gov.in/login.htm")
        driver.get(base_url + "/fpshopsSummaryDetails.htm")

        for eachBlock in Block.objects.filter(fpsRequired=True):
            # The two values were logged under each other's label before.
            logger.info("District Name: %s Block Name: %s " % (eachBlock.district.name, eachBlock.name))
            districtCode = eachBlock.district.fpsCode
            blockCode = eachBlock.fpsCode

            # The pauses give the dependent dropdowns time to repopulate.
            Select(driver.find_element_by_id("district_id")).select_by_value(districtCode)
            time.sleep(10)
            Select(driver.find_element_by_id("block_id")).select_by_value(blockCode)
            time.sleep(10)

            fps_box = driver.find_element_by_id("fpshop_id")
            for fpsElement in fps_box.find_elements_by_tag_name("option"):
                fpsCode = fpsElement.get_attribute("value")
                # This assignment was commented out, so creating a shop
                # raised NameError on the undefined name.
                fpsName = fpsElement.get_attribute("text")
                myFPSShop = FPSShop.objects.filter(fpsCode=fpsCode).first()
                if myFPSShop is None:
                    FPSShop.objects.create(fpsCode=fpsCode, name=fpsName, block=eachBlock)
    finally:
        # NOTE(review): driverFinalize was commented out in the original,
        # leaving the browser running; restored here - confirm it does not
        # hang in this environment.
        driverFinalize(driver)
        displayFinalize(display)


def main():
    """Run the ``enumerate`` and/or ``crawl`` phase, as selected by the arguments.

    enumerate -- make sure FPSStatus / VillageFPSStatus rows exist for every
                 shop and every month from January 2016 to the current month
    crawl     -- read the shop dropdown for each block with ``fpsRequired``
                 set and create the missing FPSShop rows
    """
    args = argsFetch()
    logger = loggerFetch(args.get('log_level'))
    logger.info('args: %s', str(args))

    logger.info("BEGIN PROCESSING...")

    if args['enumerate']:
        _enumerateFPSStatus(logger)

    if args['crawl']:
        _crawlFPSShops(logger, args)

    logger.info("...END PROCESSING")
    exit(0)
def tearDown(self):
    """Release the browser and the display, then log the end of the run."""
    try:
        driverFinalize(self.driver)
    finally:
        # The display must be released even if closing the driver fails.
        displayFinalize(self.display)
    self.logger.info('...END PROCESSING')
#errorfile.write(errormessage) continue driver.back() driver.back() time.sleep(5) driver.back() time.sleep(5) # url="http://www.google.com" # driver.get(url) # myhtml=driver.page_source # print myhtml # End program here driverFinalize(driver) displayFinalize(display) dbFinalize(db) # Make sure you put this if there are other exit paths or errors logger.info("...END PROCESSING") exit(0) if __name__ == '__main__': main()
def _saveSaatBaaraPage(logger, driver, sno, filename):
    """Validate the window currently focused by *driver* and save its HTML.

    The page is accepted only when its title is the expected one and the
    third cell of the nested table inside the third <tbody> equals *sno*.
    Returns True when *filename* was written, False when the page was rejected.
    """
    html_source = driver.page_source.replace(
        '<head>',
        '<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
    ).encode('utf-8')
    logger.debug("HTML Fetched [%s]" % html_source)

    if driver.title != '७/१२':
        logger.error(driver.title)
        return False

    bs = BeautifulSoup(html_source, "html.parser")
    body = bs.find('tbody')
    # Step forward twice to reach the third <tbody>; stop as soon as one is missing.
    for _ in range(2):
        if body is None:
            break
        body = body.findNext('tbody')
    if body is None:
        logger.error('Empty body for [%s]' % sno)
        return False
    logger.debug(body)

    td = body.find('td')
    cells = td.findAll('td') if td is not None else []
    if len(cells) < 3:
        logger.error('Empty body for [%s]' % sno)
        return False

    logger.info("Checking [%s]" % cells[2].text)
    if sno != cells[2].text:
        logger.error('sno[%s] != td.text[%s]' % (sno, cells[2].text))
        return False

    with open(filename, 'wb') as html_file:
        logger.info('Writing [%s]' % filename)
        html_file.write(html_source)
    return True


def runTestSuite():
    """Download the result page for every survey number of every gat.

    For each entry of the module-level gat_list the search form at url is
    filled in (district dn, taluka tn, village vn, gat number) and submitted.
    Every survey number offered by the resulting dropdown is then opened in
    its popup window and saved as HTML, unless the target file already exists.
    """
    logger = loggerFetch("info")
    logger.info("BEGIN PROCESSING...")

    display = displayInitialize(1)
    driver = driverInitialize()

    for gat in gat_list:
        logger.info('Fetching gat[%s]...' % gat)
        driver.get(url)
        try:
            driver.find_element_by_xpath(
                "//form[@id='aspnetForm']/div[3]/div/div/div[3]/a[3]/p").click()
        except Exception:
            logger.error('Cant find element for [%s]' % gat)
            continue

        Select(driver.find_element_by_id("distSelect")).select_by_visible_text(dn)
        Select(driver.find_element_by_id("talSelect")).select_by_visible_text(tn)
        Select(driver.find_element_by_id("vilSelect")).select_by_visible_text(vn)
        driver.find_element_by_id("rbsryno").click()
        driver.find_element_by_xpath("//input[@type='number']").clear()
        driver.find_element_by_xpath("//input[@type='number']").send_keys(gat)
        driver.find_element_by_css_selector("input[type=\"button\"]").click()

        # An extra window handle right after the search is treated as a dialog:
        # accept it and move on to the next gat.
        if len(driver.window_handles) > 1:
            logger.info("Dialog Box Window [" + str(driver.window_handles) + "]")
            driver.switch_to_alert().accept()
            continue

        html_source = driver.page_source.replace(
            '<head>',
            '<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
        )
        logger.debug("HTML Fetched [%s]" % html_source)

        soup = BeautifulSoup(driver.page_source, "html.parser")
        sno_select = soup.find("select", {"ng-model": "selectedSno"})
        if sno_select is None:
            logger.error('Cant find element for [%s]' % gat)
            continue
        snos = [sno_option.text for sno_option in sno_select.findAll("option")]
        logger.debug("Found [%s]" % str(snos))
        snos = snos[1:]  # the first option is skipped
        logger.info("SNO List [%s]" % str(snos))

        for sno in snos:
            logger.info('Processing [%s]' % sno)
            filename = '/home/mayank/wd/SaatBaara/remaining/%s.html' % sno.replace('/', '_')
            if os.path.exists(filename):
                continue  # already downloaded on an earlier run

            Select(
                driver.find_element_by_xpath(
                    "//form[@id='aspnetForm']/div[3]/div/div/div[3]/div/div[3]/table/tbody/tr[3]/td/select"
                )).select_by_visible_text(sno)
            driver.find_element_by_css_selector(
                "td.last-rows > input[type=\"button\"]").click()
            time.sleep(5)

            parent_handle = driver.current_window_handle
            logger.info("Handles : %s Number : %s" %
                        (driver.window_handles, len(driver.window_handles)))
            if len(driver.window_handles) != 2:
                logger.error("Handlers gone wrong [" + str(driver.window_handles) + "]")
                driver.save_screenshot('z.png')
                # No popup was focused, so there is nothing to read; closing
                # here would close the parent window itself.
                continue

            driver.switch_to_window(driver.window_handles[-1])
            try:
                saved = _saveSaatBaaraPage(logger, driver, sno, filename)
            finally:
                # Always close the popup and return to the search window.
                driver.close()
                driver.switch_to_window(parent_handle)
            if saved:
                time.sleep(1)
        time.sleep(1)

    driverFinalize(driver)
    displayFinalize(display)

    logger.info("...END PROCESSING")
def main():
    """Crawl the registers of every panchayat that is due and store its jobcards.

    Panchayats are read from the `panchayats`/`panchayatStatus` tables (required
    ones whose jobcardCrawlDate is NULL or older than 7 days, optionally limited
    by args['district'] and args['limit']). For each one the site is navigated
    state -> district -> block -> panchayat, four reports are saved with
    writeFile, and every jobcard listed in the "Job card/Employment Register"
    that is not yet in the `jobcards` table is inserted.
    """
    args = argsFetch()
    logger = loggerFetch(args.get('log_level'))
    logger.info('args: %s', str(args))

    logger.info("BEGIN PROCESSING...")

    limitString = ''
    if args['limit']:
        # Cast to int so only a number can reach the SQL text.
        limitString = " limit %d " % int(args['limit'])
    additionalFilters = ''
    queryParams = []
    if args['district']:
        # Left as a driver placeholder; the value is passed separately.
        additionalFilters = " and p.districtName=%s "
        queryParams.append(args['district'])

    db = dbInitialize(db=nregaDB, charset="utf8")  # The rest is updated automatically in the function
    cur = db.cursor()
    db.autocommit(True)
    # Query to set up Database to read Hindi Characters
    cur.execute("SET NAMES utf8")

    display = displayInitialize(args['visible'])
    driver = driverInitialize(args['browser'])
    url = "http://nrega.nic.in/netnrega/sthome.aspx"
    driver.get(url)

    query = "select p.stateCode,p.districtCode,p.blockCode,p.panchayatCode,p.stateName,p.districtName,p.rawBlockName,p.panchayatName,p.fullPanchayatCode,p.stateShortCode,p.crawlIP from panchayats p,panchayatStatus ps where p.fullPanchayatCode=ps.fullPanchayatCode and p.isRequired=1 and ( (TIMESTAMPDIFF(DAY, ps.jobcardCrawlDate, now()) > 7) or ps.jobcardCrawlDate is NULL) %s order by ps.jobcardCrawlDate,fullPanchayatCode %s" % (additionalFilters, limitString)
    if queryParams:
        cur.execute(query, queryParams)
    else:
        cur.execute(query)
    results = cur.fetchall()

    metaHead = '<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
    # (link text on the panchayat page, suffix of the saved file)
    accountReports = (
        ("Download Panchayatwise MGNREGA Bank A/C Detail", "bankAccounts"),
        ("Download Panchayatwise MGNREGA Post Office Account Detail", "postAccounts"),
        ("Download Panchayat Wise MGNREGA Co-operative Bank A/C Detail", "cobankAccounts"),
    )

    for row in results:
        [stateCode, districtCode, blockCode, panchayatCode, stateName, districtName, blockName,
         panchayatName, fullPanchayatCode, stateShortCode, crawlIP] = row
        filepath = nregaWebDir.replace("stateName", stateName.upper()).replace("districtName", districtName.upper())
        panchayatDir = filepath + blockName.upper() + "/%s/" % panchayatName.upper()
        filename = panchayatDir + "%s_jobcardRegister.html" % panchayatName.upper()
        logger.info(filename)
        jobcardPrefix = "%s-" % (stateShortCode)
        logger.info("Processing %s-%s-%s-%s " % (stateName, districtName, blockName, panchayatName))

        # Home -> state -> district -> block.
        for linkText in (stateName, districtName.upper(), blockName):
            elem = driver.find_element_by_link_text(linkText)
            elem.send_keys(Keys.RETURN)
            time.sleep(1)

        # The panchayat is located by its code in the link target, not by name.
        compareText = "Panchayat_Code=%s" % fullPanchayatCode
        panchayatLink = None
        for elem in driver.find_elements_by_xpath("//a[@href]"):
            if compareText in str(elem.get_attribute("href")):
                logger.info("Found the Code")
                panchayatLink = elem
                break

        if panchayatLink is not None:
            panchayatLink.send_keys(Keys.RETURN)
            time.sleep(1)

            # Before the jobcards, download the application register.
            elem = driver.find_element_by_link_text("Registration Application Register")
            elem.send_keys(Keys.RETURN)
            time.sleep(5)
            writeFile(filename, driver.page_source)
            driver.back()

            for linkText, suffix in accountReports:
                elem = driver.find_element_by_link_text(linkText)
                elem.send_keys(Keys.RETURN)
                time.sleep(15)
                rawhtml = driver.page_source.replace('<head>', metaHead)
                writeFile(panchayatDir + "%s_%s.html" % (panchayatName.upper(), suffix), rawhtml)
                driver.back()

            elem = driver.find_element_by_link_text("Job card/Employment Register")
            elem.send_keys(Keys.RETURN)
            time.sleep(5)
            htmlsoup = BeautifulSoup(driver.page_source, "html.parser")
            table = htmlsoup.find('table', align="center")
            rows = table.findAll('tr') if table is not None else None
            status = 1 if rows is not None else 0

            query = "update panchayatStatus set jobcardCrawlDate=NOW() where fullPanchayatCode=%s"
            logger.info("%s [%s]" % (query, fullPanchayatCode))
            cur.execute(query, (fullPanchayatCode,))
            logger.info("Status is " + str(status))

            if status == 1:
                for tr in rows:
                    cols = tr.findAll('td')
                    if len(cols) <= 2:
                        continue
                    jcno = "".join(cols[1].text.split())
                    # Quotes are still stripped so names match rows stored earlier.
                    headOfFamily = cols[2].text.replace("'", "").strip()
                    logger.info("%s-%s" % (jcno, jobcardPrefix))
                    if jobcardPrefix not in jcno:
                        continue
                    logger.info(jcno)
                    cur.execute("select * from jobcards where jobcard=%s ", (jcno,))
                    if cur.rowcount == 0:
                        query = "insert into jobcards (name,jobcard,stateCode,districtCode,blockCode,panchayatCode,fullPanchayatCode) values (%s,%s,%s,%s,%s,%s,%s)"
                        insertParams = (headOfFamily, jcno, stateCode, districtCode, blockCode,
                                        panchayatCode, fullPanchayatCode)
                        logger.info("%s %s" % (query, str(insertParams)))
                        cur.execute(query, insertParams)

            # Register page -> panchayat page -> block page.
            driver.back()
            driver.back()

        # Block page -> district page -> state page -> home, found or not.
        driver.back()
        driver.back()
        driver.back()

    driverFinalize(driver)
    displayFinalize(display)
    dbFinalize(db)  # Make sure you put this if there are other exit paths or errors
    logger.info("...END PROCESSING")
    exit(0)
def main():
    """Walk the district/block/shop dropdowns and mirror them into `pdsShops`.

    After selecting the year args['year'], every district option, every block
    option of that district and every shop option of that block is read. Each
    combination whose text does not contain "Select" is inserted into
    `pdsShops` when missing and its names are then updated.
    """
    args = argsFetch()
    logger = loggerFetch(args.get('log_level'))
    logger.info('args: %s', str(args))

    logger.info("BEGIN PROCESSING...")

    db = dbInitialize(db="biharPDS", charset="utf8")  # The rest is updated automatically in the function
    cur = db.cursor()
    db.autocommit(True)
    # Query to set up Database to read Hindi Characters
    cur.execute("SET NAMES utf8")
    inyear = args['year']

    logger.info(inyear)
    display = displayInitialize(args['visible'])
    driver = driverInitialize(args['browser'])

    # Start Program here
    base_url = "http://sfc.bihar.gov.in/"
    driver.get("http://sfc.bihar.gov.in/login.htm")
    driver.get(base_url + "/fpshopsSummaryDetails.htm")
    Select(driver.find_element_by_id("year")).select_by_visible_text(inyear)
    time.sleep(10)

    # Values are passed to the driver separately, so names containing quotes
    # cannot break the statements.
    whereClause = "where fpsCode=%s and blockCode=%s and distCode=%s "

    select_box = driver.find_element_by_id("district_id")
    for element in select_box.find_elements_by_tag_name("option"):
        distCode = element.get_attribute("value")
        distName = element.get_attribute("text")
        logger.info("District Code: %s District Name: %s " % (distCode, distName))
        Select(driver.find_element_by_id("district_id")).select_by_value(distCode)
        time.sleep(10)  # fixed pause before reading the block dropdown

        block_box = driver.find_element_by_id("block_id")
        for blockElement in block_box.find_elements_by_tag_name("option"):
            blockCode = blockElement.get_attribute("value")
            blockName = blockElement.get_attribute("text")
            logger.info("distCode:%s distName:%s blockCode:%s blockName:%s " % (distCode, distName, blockCode, blockName))
            Select(driver.find_element_by_id("block_id")).select_by_value(blockCode)
            time.sleep(10)  # fixed pause before reading the shop dropdown

            fps_box = driver.find_element_by_id("fpshop_id")
            for fpsElement in fps_box.find_elements_by_tag_name("option"):
                fpsCode = fpsElement.get_attribute("value")
                fpsName = fpsElement.get_attribute("text")
                myString = distCode + ',' + distName + ',' + blockCode + ',' + blockName + ',' + fpsCode + ',' + fpsName
                logger.info(myString)
                if "Select" in myString:
                    # Any level still showing "Select" in its text is skipped.
                    logger.info("This will not be entered into Database")
                    continue

                fpsName1 = cleanFPSName(fpsName)
                keyParams = (fpsCode, blockCode, distCode)
                cur.execute("select * from pdsShops %s " % whereClause, keyParams)
                if cur.rowcount == 0:
                    cur.execute("insert into pdsShops (fpsCode,blockCode,distCode) values (%s,%s,%s) ", keyParams)
                query = "update pdsShops set distName=%s,blockName=%s,fpsName=%s " + whereClause
                updateParams = (distName, blockName, fpsName1) + keyParams
                logger.info("%s %s" % (query, str(updateParams)))
                cur.execute(query, updateParams)

    # End program here
    driverFinalize(driver)
    displayFinalize(display)
    dbFinalize(db)  # Make sure you put this if there are other exit paths or errors
    logger.info("...END PROCESSING")
    exit(0)
def main():
    """Download pending wagelist pages for one district and financial year.

    Reads args['district'], args['finyear'], args['limit'] and args['blockCode'],
    opens the search page, submits one search through the form and, when that
    search opens a second tab containing links, fetches every pending wagelist
    by direct URL. Pages that contain both "WageList Agency Code" and the
    block's jobcard prefix are written with writeFile; the `wagelists` row is
    updated after every attempt.
    """
    args = argsFetch()
    logger = loggerFetch(args.get('log_level'))
    logger.info('args: %s', str(args))

    logger.info("BEGIN PROCESSING...")
    districtName=args['district']
    finyear=args['finyear']

    logger.info("DistrictName "+districtName)
    limitString=''
    if args['limit']:
        limitString=" limit %s " % args['limit']
    # The database is named after the district, in lower case.
    db = dbInitialize(db=districtName.lower(), charset="utf8")  # The rest is updated automatically in the function
    cur=db.cursor()
    db.autocommit(True)
    additionalFilters = ''
    if args['blockCode']:
        additionalFilters=" and b.blockCode='%s' " % args['blockCode']
    #Query to set up Database to read Hindi Characters
    query="SET NAMES utf8"
    cur.execute(query)
    crawlIP,stateName,stateCode,stateShortCode,districtCode=getDistrictParams(cur,districtName)

    filepath=nregaRawDataDir.replace("districtName",districtName.lower())
    fullfinyear=getFullFinYear(finyear)
    fullDistrictCode=stateCode+districtCode
    display = displayInitialize(args['visible'])
    driver = driverInitialize(args['browser'])
    url="http://164.100.129.6/netnrega/nregasearch1.aspx"
    driver.get(url)
    time.sleep(22)
    htmlsource = driver.page_source
    # Snapshot of the search page as first loaded.
    writeFile("/home/libtech/webroot/nreganic.libtech.info/temp/a.html",htmlsource)
    # NOTE(review): this first query is overwritten by the next statement before
    # it is ever executed; only the `w.id=1 ... limit 1` query drives the outer
    # loop. The same "pending wagelists" query is rebuilt and run further down.
    query="select w.id,w.wagelistNo,b.name from wagelists w,blocks b where w.blockCode=b.blockCode and ( (w.isDownloaded=0) or (w.isComplete=0 and TIMESTAMPDIFF(HOUR, w.downloadAttemptDate, now()) > 48 )) and finyear='%s' %s order by w.isDownloaded %s " % (finyear,additionalFilters,limitString)
    query="select w.id,w.wagelistNo,b.name from wagelists w,blocks b where w.blockCode=b.blockCode and w.id=1 and finyear='%s' %s order by w.isDownloaded limit 1 " % (finyear,additionalFilters)
    logger.info(query)
    cur.execute(query)
    results=cur.fetchall()
    for row in results:
        rowid=str(row[0])
        wagelistNo=row[1]
        blockName=row[2]
        logger.info("Same WagelistNo %s " % wagelistNo)
        if wagelistNo != '':
            # logger.info(wagelistNo)
            # Remember the search tab so it can be re-selected afterwards.
            maintab = driver.current_window_handle
            Select(driver.find_element_by_id("ddl_search")).select_by_visible_text("WageList")
            driver.find_element_by_css_selector("option[value=\"WageList\"]").click()
            Select(driver.find_element_by_id("ddl_state")).select_by_visible_text(stateName.upper())
            myvalue='value="%s"' % stateCode
            #driver.find_element_by_css_selector("option[value=\"33\"]").click()
            driver.find_element_by_css_selector("option[%s]" % myvalue).click()
            Select(driver.find_element_by_id("ddl_district")).select_by_visible_text(districtName.upper())
            myvalue='value="%s"' % (stateCode+districtCode)
            #driver.find_element_by_css_selector("option[value=\"3305\"]").click()
            driver.find_element_by_css_selector("option[%s]" % myvalue).click()
            driver.find_element_by_id("txt_keyword2").clear()
            driver.find_element_by_id("txt_keyword2").send_keys(wagelistNo)
            driver.find_element_by_id("btn_go").click()
            time.sleep(30)
            logger.info("Currently the number of active tabs are %s" % str(len(driver.window_handles)))
            if len(driver.window_handles) > 1:
                logger.info("There are multiple tabs")
                driver.switch_to_window(driver.window_handles[1])
                #htmlsource = driver.page_source
                #logger.info(htmlsource)
                # ERROR: Caught exception [ERROR: Unsupported command [waitForPopUp | | 30000]]
                # ERROR: Caught exception [ERROR: Unsupported command [selectWindow | null | ]]
                elems = driver.find_elements_by_xpath("//a[@href]")
                # The downloads below only run when the second tab has at least one link.
                if len(elems) > 0:
                    query="select w.id,w.wagelistNo,b.name,b.blockCode from wagelists w,blocks b where w.blockCode=b.blockCode and ( (w.isDownloaded=0) or (w.isComplete=0 and TIMESTAMPDIFF(HOUR, w.downloadAttemptDate, now()) > 48 )) and finyear='%s' %s order by w.isDownloaded %s " % (finyear,additionalFilters,limitString)
                    cur.execute(query)
                    results1=cur.fetchall()
                    for row1 in results1:
                        # These names shadow the outer loop's rowid/wagelistNo/blockName.
                        rowid=str(row1[0])
                        wagelistNo=row1[1]
                        blockName=row1[2]
                        blockCode=row1[3]
                        jobcardPrefix="%s-%s-%s-" % (stateShortCode,districtCode,blockCode)
                        logger.info("Jobcard Prefix : %s " % jobcardPrefix)
                        fullBlockCode=stateCode+districtCode+blockCode
                        if wagelistNo != '':
                            logger.info(wagelistNo)
                            # NOTE(review): searchIP is not assigned anywhere in this
                            # function (crawlIP is); presumably a module-level name - verify.
                            wurl="http://%s/netnrega/srch_wg_dtl.aspx?state_code=&district_code=%s&state_name=%s&district_name=%s&block_code=%s&wg_no=%s&short_name=%s&fin_year=%s&mode=wg" % (searchIP,fullDistrictCode,stateName.upper(),districtName.upper(),fullBlockCode,wagelistNo,stateShortCode,fullfinyear)
                            logger.info("URL: %s " % wurl)
                            driver.get(wurl)
                            htmlsource = driver.page_source
                            htmlsource=htmlsource.replace('<head>','<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>')
                            success=0
                            isPopulatedString=''
                            # Both markers must be present for the page to count as downloaded.
                            if ("WageList Agency Code" in htmlsource) and (jobcardPrefix in htmlsource):
                                filename=filepath+blockName.upper()+"/WAGELIST/"+fullfinyear+"/"+wagelistNo+".html"
                                logger.info(filename)
                                writeFile("/home/libtech/webroot/nreganic.libtech.info/temp/"+wagelistNo+".html",htmlsource)
                                writeFile(filename,htmlsource)
                                success=1
                                isPopulatedString="isProcessed=0,"
                            # The attempt date is recorded on success and on failure alike.
                            query="update wagelists set isDownloaded=%s,%sdownloadAttemptDate=NOW() where id=%s" %(str(success),isPopulatedString,str(rowid))
                            logger.info(query)
                            cur.execute(query)
                # An earlier, commented-out variant opened each wagelist through its
                # link text in this tab instead of building the URL directly.
                # Sends Ctrl+W to the page body, then re-selects the search tab.
                driver.find_element_by_tag_name('body').send_keys(Keys.CONTROL + 'w')
                driver.switch_to_window(maintab)

    driverFinalize(driver)
    displayFinalize(display)
    dbFinalize(db)  # Make sure you put this if there are other exit paths or errors
    logger.info("...END PROCESSING")
    exit(0)
def main():
    """Download and rewrite the jobcard pages of every required panchayat.

    For each required panchayat (oldest jobcardCrawlDate first, optionally
    limited by args['limit']) the site is navigated block -> panchayat ->
    "Job card/Employment Register". Up to 50 jobcards of that panchayat with
    isDownloaded=0 are opened; the raw page and a rewritten summary page are
    saved with writeFile and the jobcard is marked as downloaded.

    NOTE(review): stateName, stateCode, stateShortCode, districtCode and crawlIP
    are not assigned in this function; presumably module-level names - verify.
    """
    args = argsFetch()
    logger = loggerFetch(args.get('log_level'))
    logger.info('args: %s', str(args))

    logger.info("BEGIN PROCESSING...")
    districtName = args['district']

    logger.info("DistrictName " + districtName)
    limitString = ''
    if args['limit']:
        # Cast to int so only a number can reach the SQL text.
        limitString = " limit %d " % int(args['limit'])
    db = dbInitialize(db=districtName.lower(), charset="utf8")  # The rest is updated automatically in the function
    cur = db.cursor()
    db.autocommit(True)
    # Query to set up Database to read Hindi Characters
    cur.execute("SET NAMES utf8")

    display = displayInitialize(args['visible'])
    driver = driverInitialize(args['browser'])

    logger.info("crawlIP " + crawlIP)
    logger.info("State Name " + stateName)
    jcReportFilePath = nregaDownloadsDir.replace("districtName", districtName.lower()) + districtName.upper() + "/"
    jcReportRawFilePath = nregaRawDownloadsDir.replace("districtName", districtName.lower()) + districtName.upper() + "/"

    # Start Program here
    url = "http://nrega.nic.in/netnrega/sthome.aspx"
    driver.get(url)
    elem = driver.find_element_by_link_text(stateName)
    elem.send_keys(Keys.RETURN)
    time.sleep(1)
    elem = driver.find_element_by_link_text(districtName.upper())
    elem.send_keys(Keys.RETURN)
    time.sleep(1)

    # Query to get all the blocks
    query = "select b.blockCode,b.name,p.panchayatCode,p.name from blocks b,panchayats p where b.blockCode=p.blockCode and p.isRequired=1 order by jobcardCrawlDate %s" % (limitString)
    cur.execute(query)
    results = cur.fetchall()
    for blockCode, blockName, panchayatCode, panchayatName in (row[:4] for row in results):
        elem = driver.find_element_by_link_text(blockName)
        elem.send_keys(Keys.RETURN)
        elem = driver.find_element_by_link_text(panchayatName)
        elem.send_keys(Keys.RETURN)
        elem = driver.find_element_by_link_text("Job card/Employment Register")
        elem.send_keys(Keys.RETURN)
        time.sleep(5)

        # Values go to the driver as parameters instead of being concatenated.
        cur.execute(
            "update panchayats set jobcardCrawlDate=now() where blockCode=%s and panchayatCode=%s ",
            (blockCode, panchayatCode))
        cur.execute(
            "select jobcard from jobcardRegister where isDownloaded=0 and stateCode=%s and districtCode=%s and blockCode=%s and panchayatCode=%s limit 50",
            (stateCode, districtCode, blockCode, panchayatCode))
        jcresults = cur.fetchall()

        registerDir = blockName.upper() + "/" + panchayatName.upper() + "/jobcardRegister/"
        for jcrow in jcresults:
            jobcard = jcrow[0]
            logger.info("blockName %s panchayatName: %s jobcard: %s" % (blockName, panchayatName, jobcard))
            elem = driver.find_element_by_link_text(jobcard)
            elem.send_keys(Keys.RETURN)
            jcsource = driver.page_source
            driver.back()  # return to the register for the next jobcard

            rawhtml = jcsource.replace('<head>', '<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>')
            jcBaseName = jobcard.replace("/", "_") + ".html"
            jcfilename = jcReportRawFilePath + registerDir + jcBaseName
            logger.info(jcfilename)
            writeFile(jcfilename, rawhtml)

            category, isBPL = getCategoryBPL(rawhtml)
            logger.info("Category : %s " % category)
            logger.info("isBPL : %s " % isBPL)

            htmlSoup = BeautifulSoup(rawhtml, "lxml")
            myhtml = ''
            myhtml += getSpans(htmlSoup, rawhtml)
            myhtml += rewriteTable(htmlSoup, "Family Details", "GridView4")
            myhtml += rewriteTable(htmlSoup, "Requested Period of Employment", "GridView1")
            myhtml += rewriteTable(htmlSoup, "Period and Work on which Employment Offered", "GridView2")
            myhtml += rewriteTable(htmlSoup, "Period and Work on which Employment Given", "GridView3")
            # "aling" in the heading attribute was a typo for "align".
            myhtml = htmlWrapperLocal(title="Jobcard Details", head='<h1 align="center">' + jobcard + '</h1>', body=myhtml)
            jcfilename = jcReportFilePath + registerDir + jcBaseName
            logger.info(jcfilename)
            writeFile(jcfilename, myhtml)

            cur.execute("update jobcardRegister set isDownloaded=1 where jobcard=%s", (jobcard,))

        # Register -> panchayat -> block -> district page, so the next row can
        # find its block link again.
        driver.back()
        driver.back()
        driver.back()

    driverFinalize(driver)
    displayFinalize(display)
    dbFinalize(db)  # Make sure you put this if there are other exit paths or errors
    logger.info("...END PROCESSING")
    exit(0)
def fetchMusterDetails(logger, db, cmd=None, directory=None, url=None, is_parse_info=None, is_push_info=None, is_visible=None):
    '''
    Fetch the Muster Details for specified parameters in the specified directory

    logger        -- logger used for all progress and error messages
    db            -- database handle, only passed on to pushInfoIntoDB
    cmd           -- label used in the BEGIN/END log lines
    directory     -- download directory, "./Downloads" when not given
    url           -- page to fetch, a fixed muster page when not given
    is_parse_info -- when true, visit every panchayat listed in the page and
                     save it as /tmp/<panchayat>.html
    is_push_info  -- when true, call pushInfoIntoDB afterwards
    is_visible    -- passed to displayInitialize (set to 1 for debugging selenium)

    The fetched page is always written to <directory>/test.html. When the
    expected element does not show up the function returns right after that,
    without parsing or pushing. The browser and display are finalized on
    every exit path.
    '''
    if not cmd:
        cmd = "FETCH MUSTER DETAILS"
    logger.info("BEGIN %s..." % cmd)

    if not directory:
        directory = "./Downloads"

    if not url:
        url = 'http://164.100.112.66/netnrega/Citizen_html/Musternew.aspx?id=2&lflag=eng&ExeL=GP&fin_year=2015-2016&state_code=33&district_code=3305&block_code=3305007&panchayat_code=3305007038&State_name=CHHATTISGARH&District_name=SURGUJA&Block_name=BATAULI&panchayat_name=Govindpur'

    if not is_visible:
        is_visible = 0  # Set to 1 for debugging selenium

    if not is_parse_info:
        is_parse_info = False

    if not is_push_info:
        is_push_info = False

    # The part below could be moved to a function downloadMusterDetails() to make it reusable
    filename = directory + '/' + 'test.html'  # Use your naming logic + blockName + '_' + panchayat + '_' + shopCode + '.html'
    logger.info('filename[%s]' % filename)

    filepath = os.path.dirname(filename)
    if not os.path.exists(filepath):
        logger.info('Creating direcotry [%s] as it does not exist' % filepath)
        os.makedirs(filepath)

    display = displayInitialize(is_visible)
    driver = driverInitialize()
    try:
        logger.error("Current URL [%s] Title [%s]" % (driver.current_url, driver.title))
        logger.error("Current URL [%s] Title [%s]" % (driver.current_url, driver.title))

        logger.info("Fetching...[%s]" % url)
        driver.get(url)
        logger.error("Current URL [%s] Title [%s]" % (driver.current_url, driver.title))

        el = waitUntilID(logger, driver, 'ctl00_ContentPlaceHolder1_ddlwork', 10)
        if el:
            logger.info("Found El[%s]" % str(el))
            html_source = driver.page_source.replace('<head>',
                                                     '<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>')
            logger.debug("HTML Fetched [%s]" % html_source)
            with open(filename, "wb") as html_file:
                logger.info("Writing [%s]" % filename)
                html_file.write(html_source.encode('UTF-8'))
        else:
            logger.error("Failed to fetch the page [%s]" % driver.current_url)
            logger.error("Current URL [%s] Title [%s]" % (driver.current_url, driver.title))
            # The page is still saved as-is so the failure can be inspected.
            html_source = driver.page_source
            logger.info("HTML Fetched [%s]" % html_source)
            with open(filename, "wb") as html_file:
                logger.info("Writing [%s]" % filename)
                html_file.write(html_source.encode('UTF-8'))
            return  # Error condition to be dealt with

        # If you have information to parse using Beautiful Soup
        if is_parse_info:
            bs = BeautifulSoup(html_source, "html.parser")
            tr_list = bs.findAll('tr', attrs={'class': ['normalRow', 'alternateRow']})
            logger.debug(str(tr_list))
            for tr in tr_list:
                # The panchayat name is read from the second cell of the row.
                td = tr.find('td')
                td = td.findNext('td')
                panchayat = td.text.strip()
                logger.info("Panchayat[%s]", panchayat)
                elem = driver.find_element_by_link_text(panchayat)
                elem.click()
                filename = "/tmp/%s.html" % panchayat
                # Written as UTF-8 bytes, like the main page above.
                with open(filename, 'wb') as html_file:
                    logger.info("Writing [%s]" % filename)
                    html_file.write(driver.page_source.encode('UTF-8'))
                driver.back()
    finally:
        driverFinalize(driver)
        displayFinalize(display)

    # If you want to push the information to the Database
    if is_push_info:
        # NOTE(review): `jobcard` is not a parameter or local of this function;
        # presumably a module-level name - verify before enabling is_push_info.
        query = 'select j.jobcard, p.name, p.panchayatCode from jobcardRegister j, panchayats p, blocks b where j.blockCode=p.blockCode and j.panchayatCode=p.panchayatCode and j.blockCode=b.blockCode and j.jobcard="%s"' % jobcard
        # `directory` is what is meant here; `dir` is the Python builtin.
        logger.info("Command[%s] Directory[%s] URL[%s] jobcard[%s]" % (cmd, directory, url, jobcard))
        pushInfoIntoDB(logger, db, "POPULATE_DATABASE", directory, url, is_visible, is_push_info, query)  # So that function can be shared

    logger.info("...END %s" % cmd)
def _encodeUtf8BestEffort(text):
    """Return *text* encoded as UTF-8, or *text* unchanged when that fails."""
    try:
        return text.encode("UTF-8")
    except (UnicodeError, AttributeError):
        return text


def main():
    """Fetch the worker list of each village and store it as HTML and CSV.

    Processes the first args['limit'] villages (1 when no limit is given).
    For every village whose page loads and passes validateWorkerList, the
    table is saved twice through saveVillageReport: as a wrapped HTML page
    and as CSV, both for the current financial year.
    """
    args = argsFetch()
    logger = loggerFetch(args.get('log_level'))
    logger.info('args: %s', str(args))
    display = displayInitialize(args['visible'])
    driver = driverInitialize(args['browser'])

    limit = int(args['limit']) if args['limit'] else 1

    for eachVillage in Village.objects.all()[:limit]:
        logger.info(eachVillage.name)
        eachPanchayat = eachVillage.panchayat
        stateName = eachPanchayat.block.district.state.name
        districtName = eachPanchayat.block.district.name
        blockName = eachPanchayat.block.name
        panchayatName = eachPanchayat.name
        villageName = eachVillage.name
        url = "http://www.nrega.telangana.gov.in/Nregs/FrontServlet?requestType=SmartCardreport_engRH&actionVal=MobnumberStatus&id=%s&Retype=null&type=null&file=%s" % (eachVillage.tcode, eachVillage.name)
        logger.info(url)

        try:
            # The page is requested twice in a row, as before.
            # NOTE(review): presumably the first load alone is not enough - confirm.
            driver.get(url)
            driver.get(url)
            myhtml = driver.page_source
        except Exception:
            # The village is skipped, but the reason is no longer lost.
            logger.exception("Could not fetch [%s]" % url)
            continue

        logger.info("No Error")
        error1, myTable = validateWorkerList(myhtml)
        if error1 is not None:
            continue

        logger.info("Worker List found")
        title = "WorkerList state:%s District:%s block:%s panchayat: %s vilage:%s " % (stateName, districtName, blockName, panchayatName, villageName)
        # "aling" in the heading attribute was a typo for "align".
        outhtml = htmlWrapperLocal(title=title, head='<h1 align="center">' + title + '</h1>',
                                   body=stripTableAttributes(myTable, "myTable"))
        outhtml = _encodeUtf8BestEffort(outhtml)
        outcsv = _encodeUtf8BestEffort(table2csv(myTable))

        filename = eachVillage.slug + "_tjr.html"
        filenamecsv = eachVillage.slug + "_tjr.csv"
        finyear = getCurrentFinYear()
        saveVillageReport(logger, eachVillage, finyear, "telJobcardRegisterHTML", filename, outhtml)
        saveVillageReport(logger, eachVillage, finyear, "telJobcardRegisterCSV", filenamecsv, outcsv)

    driverFinalize(driver)
    displayFinalize(display)

    logger.info("...END PROCESSING")
    exit(0)
def _popup_survey_number(logger, html_source):
    """Return the text of the third nested cell of the third <tbody>, or None.

    None means the markup does not have the expected table layout.
    """
    body = BeautifulSoup(html_source, "html.parser").find('tbody')
    # The cell of interest lives two <tbody> elements after the first one.
    for _ in range(2):
        if body is None:
            return None
        body = body.findNext('tbody')
    if body is None:
        return None
    logger.debug(body)

    outer_cell = body.find('td')
    if outer_cell is None:
        return None
    cells = outer_cell.findAll('td')
    if len(cells) < 3:
        return None
    return cells[2].text


def _save_record_popup(logger, driver, sno, filename):
    """Validate the window currently focused and write its HTML to *filename*.

    Returns True when the file was written, False when the window was
    rejected (wrong title, unexpected layout or mismatching survey number).
    The caller is responsible for closing the window.
    """
    html_source = driver.page_source.replace('<head>', '<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>').encode('utf-8')
    logger.debug("HTML Fetched [%s]" % html_source)

    if driver.title != '७/१२':
        logger.error(driver.title)
        return False

    found_sno = _popup_survey_number(logger, html_source)
    if found_sno is None:
        logger.error('Empty body for [%s]' % sno)
        return False

    logger.info("Checking [%s]" % found_sno)
    if sno != found_sno:
        logger.error('sno[%s] != td.text[%s]' % (sno, found_sno))
        return False

    with open(filename, 'wb') as html_file:
        logger.info('Writing [%s]' % filename)
        html_file.write(html_source)
    return True


def _close_other_windows(driver, parent_handle):
    """Close every window except *parent_handle* and focus the parent again."""
    for handle in driver.window_handles:
        if handle != parent_handle:
            driver.switch_to_window(handle)
            driver.close()
    driver.switch_to_window(parent_handle)


def _process_gat(logger, driver, gat):
    """Search for one gat number and save the record of every listed survey number.

    ``url``, ``dn``, ``tn`` and ``vn`` are read from the enclosing module
    scope (they are not defined in this block).
    """
    logger.info('Fetching gat[%s]...' % gat)
    driver.get(url)
    try:
        driver.find_element_by_xpath("//form[@id='aspnetForm']/div[3]/div/div/div[3]/a[3]/p").click()
    except Exception:
        logger.error('Cant find element for [%s]' % gat)
        return

    Select(driver.find_element_by_id("distSelect")).select_by_visible_text(dn)
    Select(driver.find_element_by_id("talSelect")).select_by_visible_text(tn)
    Select(driver.find_element_by_id("vilSelect")).select_by_visible_text(vn)

    driver.find_element_by_id("rbsryno").click()
    # The input is looked up again after clear(), exactly as before, so a
    # re-rendered element cannot go stale between the two calls.
    driver.find_element_by_xpath("//input[@type='number']").clear()
    driver.find_element_by_xpath("//input[@type='number']").send_keys(gat)
    driver.find_element_by_css_selector("input[type=\"button\"]").click()

    if len(driver.window_handles) > 1:
        # An extra window at this point is treated as a dialog: accept and move on.
        logger.info("Dialog Box Window [" + str(driver.window_handles) + "]")
        driver.switch_to_alert().accept()
        return

    page_source = driver.page_source
    logger.debug("HTML Fetched [%s]" % page_source.replace('<head>', '<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'))

    soup = BeautifulSoup(page_source, "html.parser")
    sno_select = soup.find("select", {"ng-model": "selectedSno"})
    if sno_select is None:
        logger.error('No survey number list found for [%s]' % gat)
        return

    snos = [sno_option.text for sno_option in sno_select.findAll("option")]
    logger.debug("Found [%s]" % str(snos))
    snos = snos[1:]  # the first <option> is dropped, as in the original
    logger.info("SNO List [%s]" % str(snos))

    for sno in snos:
        logger.info('Processing [%s]' % sno)
        filename = '/home/mayank/wd/SaatBaara/7-12-mahabhulekh/%s.html' % sno.replace('/', '_')
        if os.path.exists(filename):
            # Already downloaded on an earlier run.
            continue

        Select(driver.find_element_by_xpath("//form[@id='aspnetForm']/div[3]/div/div/div[3]/div/div[3]/table/tbody/tr[3]/td/select")).select_by_visible_text(sno)
        driver.find_element_by_css_selector("td.last-rows > input[type=\"button\"]").click()
        time.sleep(5)

        parent_handle = driver.current_window_handle
        handles = driver.window_handles
        logger.info("Handles : %s Number : %s" % (handles, len(handles)))

        if len(handles) != 2:
            # Without exactly one popup the focus is still on the parent
            # window; closing "the current window" here would end the whole
            # session, so clean up any strays and skip this survey number.
            logger.error("Handlers gone wrong [" + str(handles) + "]")
            driver.save_screenshot('z.png')
            _close_other_windows(driver, parent_handle)
            continue

        # Pick the popup by identity rather than by position in the list.
        popup_handle = [h for h in handles if h != parent_handle][0]
        driver.switch_to_window(popup_handle)
        try:
            saved = _save_record_popup(logger, driver, sno, filename)
        finally:
            driver.close()
            driver.switch_to_window(parent_handle)

        if saved:
            time.sleep(1)

    time.sleep(1)


def runTestSuite():
    """Download the record page of every survey number for each gat in ``gat_list``.

    A browser and a display are started, every gat number in the module-level
    ``gat_list`` is searched, and each resulting record popup is validated and
    saved as an HTML file.  Survey numbers whose file already exists are
    skipped.  The browser and the display are finalized even when processing
    raises.
    """
    logger = loggerFetch("info")
    logger.info("BEGIN PROCESSING...")

    display = displayInitialize(0)
    driver = driverInitialize()
    try:
        for gat in gat_list:
            _process_gat(logger, driver, gat)
    finally:
        driverFinalize(driver)
        displayFinalize(display)

    logger.info("...END PROCESSING")
def __del__(self):
    """Finalize the Selenium driver and display, then log the destruction.

    ``__del__`` also runs for objects whose ``__init__`` raised part-way, so
    every attribute is looked up defensively instead of assuming it exists.
    """
    if getattr(self, 'is_selenium', False):
        driver = getattr(self, 'driver', None)
        display = getattr(self, 'display', None)
        try:
            if driver is not None:
                driverFinalize(driver)
        finally:
            # The display must be released even when the driver teardown fails.
            if display is not None:
                displayFinalize(display)

    logger = getattr(self, 'logger', None)
    if logger is not None:
        logger.info(f'Destructor({type(self).__name__})')