def get_mavat_gush_json(gush_id):
    """
    Get JSON data for gush_id from the Ministry of the Interior's website
    """

    ses = requests.Session()

    # Get the base search page and save the aspx session cookie
    r = ses.get(BASE_URL)

    # Solve the bot-mitigation challenge if needed
    if 'X-AA-Challenge' in r.text:
        challenge = parse_challenge(r.text)
        r = ses.get(BASE_URL, headers={
            'X-AA-Challenge': challenge['challenge'],
            'X-AA-Challenge-ID': challenge['challenge_id'],
            'X-AA-Challenge-Result': challenge['challenge_result']
        })

        # Yet another request, this time with the BotMitigationCookie cookie
        yum = r.cookies
        r = ses.get(BASE_URL, cookies=yum)

    yum = r.cookies

    # Gather session identifiers
    html = BeautifulSoup(r.text, 'lxml', from_encoding=SITE_ENCODING)
    view_state = html('input', id='__VIEWSTATE')[0]['value']
    view_state_encrypted = html('input', id='__VIEWSTATEENCRYPTED')[0]['value']
    event_validation = html('input', id='__EVENTVALIDATION')[0]['value']

    # This is the data we send with the request. Removing seemingly-unimportant fields breaks the scraper,
    # so every parameter the browser actually sent is included, even though not all of them may be necessary
    request_data = {
        '__EVENTTARGET': '',
        '__EVENTARGUMENT': '',
        '__VIEWSTATE': view_state,
        '__VIEWSTATEENCRYPTED': view_state_encrypted,
        '__EVENTVALIDATION': event_validation,
        'ctl00$ContentPlaceHolder1$txtNumb': '',
        'ctl00$ContentPlaceHolder1$cboEntities': '-1',
        'ctl00$ContentPlaceHolder1$cboSubEntities': '-1',
        'ctl00$ContentPlaceHolder1$txtFromBlock': gush_id,
        'ctl00$ContentPlaceHolder1$txtToBlock': gush_id,
        'ctl00$ContentPlaceHolder1$txtFromParcel': '',
        'ctl00$ContentPlaceHolder1$txtToParcel': '',
        'ctl00$ContentPlaceHolder1$cboFilterDistrict': '-1',
        'ctl00$ContentPlaceHolder1$cboFilterArea': '-1',
        'ctl00$ContentPlaceHolder1$cboFilterJurst': '-1',
        'ctl00$ContentPlaceHolder1$cboFilterCity': '-1',
        'ctl00$ContentPlaceHolder1$cboFilterStreet': '-1',
        'ctl00$ContentPlaceHolder1$txtGoals': '',
        'ctl00$ContentPlaceHolder1$txtFilterFromApprovedDate': '',
        'ctl00$ContentPlaceHolder1$txtFilterToApprovedDate': '',
        'ctl00$ContentPlaceHolder1$txtFromMeetingDate': FROM_MEETING_DATE_FILTER,
        'ctl00$ContentPlaceHolder1$txtToMeetingDate': '',
        'ctl00$ContentPlaceHolder1$btnFilter.x': '49',
        'ctl00$ContentPlaceHolder1$btnFilter.y': '14',
        'ctl00$ContentPlaceHolder1$SubEntityID': '-1',
        'ctl00$ContentPlaceHolder1$SelectedPlanId': '0',
        'ctl00$ContentPlaceHolder1$SelectedPlanNumber': '0',
        'ctl00$ContentPlaceHolder1$StreetID': '-1',
        'ctl00$ContentPlaceHolder1$CityID': '-1',
        'ctl00$ContentPlaceHolder1$JurstID': '-1',
        'ctl00$ContentPlaceHolder1$AreaID': '-1',
        'ctl00$ContentPlaceHolder1$PID': '-1',
        'ctl00$ContentPlaceHolder1$JID': '-1',
        'ctl00$ContentPlaceHolder1$CCID': '-1',
        'ctl00$ContentPlaceHolder1$SLY': '-1',
        'ctl00$ContentPlaceHolder1$ButtonCode': '-1',
        'ctl00$ContentPlaceHolder1$ShowSearchResult': '0'
    }

    plans = []
    curr_page = 1

    while True:
        # When requesting any page other than the first we need to update the request parameters
        if curr_page > 1:
            event_target = 'ctl00$ContentPlaceHolder1$entitiesPaging$pagingForward'
            event_argument = html('input', id='__EVENTARGUMENT')
            if len(event_argument) > 0:
                event_argument = event_argument[0]['value']
            else:
                event_argument = ''

            # Update the actual parameters
            request_data.update({'ctl00$ContentPlaceHolder1$entitiesPaging$currentPage': str(curr_page)})
            request_data.update({'__EVENTTARGET': event_target})
            request_data.update({'__EVENTARGUMENT': event_argument})
            request_data.update({'__VIEWSTATE': view_state})
            request_data.update({'__VIEWSTATEENCRYPTED': view_state_encrypted})
            request_data.update({'__EVENTVALIDATION': event_validation})
            request_data.pop('ctl00$ContentPlaceHolder1$btnFilter.x', None)
            request_data.pop('ctl00$ContentPlaceHolder1$btnFilter.y', None)

        # Search plans by gush id
        r = ses.post(BASE_URL, headers={'Content-Type': 'application/x-www-form-urlencoded'}, cookies=yum, data=request_data)

        # On the first page, get the total number of pages if it exists
        if curr_page == 1:
            number_of_page_str = re.search(number_of_pages_pattern, r.text)
            if number_of_page_str:
                number_of_pages = int(number_of_page_str.groups()[1])
            else:
                number_of_pages = 1

        page_plans = []

        # Get the new view state and friends for future requests (the following pages)
        html = BeautifulSoup(r.text, 'lxml', from_encoding=SITE_ENCODING)
        view_state = html('input', id='__VIEWSTATE')[0]['value']
        view_state_encrypted = html('input', id='__VIEWSTATEENCRYPTED')[0]['value']
        event_validation = html('input', id='__EVENTVALIDATION')[0]['value']

        # Parse the table of plans in the result HTML
        for tr in html('tbody')[0].children:
            if isinstance(tr, Tag):
                # Cell indexes: 1 = code, 3 = machoz, 5 = yeshuv/rashut mekomit/merchav tichnun, 7 = yeshut tichnunit,
                # 9 = mispar tochnit, 11 = shem tochnit, 13 = ishur reshumot/itonim
                page_plans.append({
                    'code': tr.contents[1].string,
                    'number': tr.contents[9].string.strip().replace('/ ', '/'),
                    'files': [],
                    'meetings': []
                })

        # Get plan files and meetings for each plan
        for plan in page_plans:
            (plan['files'], plan['meetings']) = get_mavat_plan_docs_meetings_json(ses, yum, plan['code'])

        # Add the page's plans to the total plans
        plans += page_plans

        # Stop once we've reached the last page
        if curr_page >= number_of_pages:
            break
        else:
            curr_page += 1

    ses.close()

    return plans
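
# A minimal usage sketch (illustration only, not part of the original scraper): the default gush
# id is a hypothetical placeholder, and the dict keys mirror the ones built by
# get_mavat_gush_json above.
def _print_mavat_plans(gush_id='30000'):
    """Hypothetical helper: print a one-line summary for each plan found for gush_id."""
    for plan in get_mavat_gush_json(gush_id):
        print('%s: %d files, %d meetings' % (plan['number'], len(plan['files']), len(plan['meetings'])))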
def get_mmi_gush_json(gush_id):
    """
    Get JSON data for gush_id from the Minhal's website
    """

    ses = requests.Session()

    # Get the base search page and save the aspx session cookie, data source and view state
    r = ses.get('%s/iturTabot2/taba1.aspx' % BASE_URL)

    # Solve the bot-mitigation challenge if needed
    if 'X-AA-Challenge' in r.text:
        challenge = parse_challenge(r.text)
        r = ses.get('%s/iturTabot2/taba1.aspx' % BASE_URL, headers={
            'X-AA-Challenge': challenge['challenge'],
            'X-AA-Challenge-ID': challenge['challenge_id'],
            'X-AA-Challenge-Result': challenge['challenge_result']
        })

        # Yet another request, this time with the BotMitigationCookie cookie
        yum = r.cookies
        r = ses.get('%s/iturTabot2/taba1.aspx' % BASE_URL, cookies=yum)

    yum = r.cookies

    data_source = re.findall(r'tblView_[A-Za-z0-9]+', r.text)[-1]
    html = BeautifulSoup(r.text, 'lxml', from_encoding=SITE_ENCODING)
    view_state = html('input', id='__VIEWSTATE')[0]['value']

    # Tell the server which fields we are displaying
    r = ses.post('%s/iturTabot2/taba1.aspx' % BASE_URL, cookies=yum, data={
        'scriptManagerId_HiddenField': None,
        '__EVENTTARGET': None,
        '__EVENTARGUMENT': None,
        '__VIEWSTATE': view_state,
        'cpe_ClientState': None,
        'txtMsTochnit': None,
        'cmsStatusim$textBox': None,
        'txtGush': None,
        'txtwinCal1$textBox': None,
        'txtwinCal1$popupWin$time': None,
        'txtwinCal1$popupWin$mskTime_ClientState': None,
        'txtFromHelka': None,
        'txtwinCal2$textBox': None,
        'txtwinCal2$popupWin$time': None,
        'txtwinCal2$popupWin$mskTime_ClientState': None,
        'txtMakom': None,
        'cmsMerchaveiTichnun$textBox': None,
        'cmsYeudRashi$textBox': None,
        'txtMatara': None,
        'cmsYeshuvim$textBox': None,
        'cmsKodAchrai$textBox': None,
        'cmsTakanon$textBox': None,
        'txtAchrai': None,
        'cmsSug$textBox': None,
        'cmsMmg$textBox': None,
        'cmsKodMetachnen$textBox': None,
        'cmsTasrit$textBox': None,
        'txtMetachnen': None,
        '__CALLBACKID': 'scriptManagerId',
        '__CALLBACKPARAM': 'Mmi.Tashtiot.UI.AjaxComponent.TableView$#$~$#$GetData$#${"P0":"' + data_source + '","P1":0,"P2":-1,"P3":["mtysvShemYishuv","Link","Status","tbMahut","Takanon","Tasrit","Nispach","Mmg","tbMakom","tbYechidotDiur","mtmrthTirgumMerchav","mtstTargumSugTochnit","svtTargumSugVaadatTichnun","tbTochnitId","tbMsTochnit"],"P4":"~","P5":"~","P6":true,"P7":true}'
    })
    # Note and warning: other fields available for selection are: "tbMerchav", "tbMsTochnitYashan", "tbKodIshuv", "tbSug",
    # "tbTamlilSaruk", "tbMmg", "mtmhzShemMachoz", "tbTabaSruka", "mtsttKvutzatStatusim", "tbAchrai", "tbMetachnen",
    # "tbShemMetachnen", "mtkyPianuachYeud", "tbYalkut", "tbTaarichDigitation", "tUniqueID".
    # DO NOT, however, select the field "tbMatara", as it reduces the number of results in Jerusalem from ~15000 to ~1500
    # (true for June 18th 2014). Also, if fields are added here they should be added in the get_mmi_gush_json_page function as well.

    # Send a parameterized request to the server (just search for the gush)
    r = ses.post(
        '%s/iturTabot2/taba1.aspx/getNetuneiTochniotByAllParames' % BASE_URL,
        headers={'Content-Type': 'application/json'},
        cookies=yum,
        data=json.dumps({
            'IsOneRow': False, 'SourceName': data_source, 'bBProjects': False, 'conMachoz': 0,
            'iFromHelka': '-1', 'iGush': gush_id, 'iMaamadMoncipali': '-1', 'iMachoz': '-1',
            'iNumOfRows': 300, 'iToHelka': '-1', 'rtncol': 2, 'sAchrai': '~',
            'sFromTaarichStatus': '~', 'sKodAchrai': '~', 'sKodIshuv': '~', 'sKodMetachnen': '~',
            'sKvutzatStatusim': '~', 'sMakom': '~', 'sMatara': '~', 'sMerchav': '~',
            'sMetachnen': '~', 'sMisTochnit': '~', 'sMmg': '~', 'sSug': '~',
            'sTabaSruka': '~', 'sTakanon': '~', 'sTasrit': '~', 'sTik': '~',
            'sToTaarichStatus': '~', 'sVaada': '~', 'sYeudRashi': '~'
        })
    )

    result = []
    page = 0

    # Get the first page of results and extra data
    first_page = get_mmi_gush_json_page(ses, page, yum, view_state, data_source)
    result = result + json.loads(re.findall(r'\[.*?\]', first_page)[0])

    # Get the number of pages from the first page (every page has this)
    pages = int(re.findall(r'\$([\-#0-9]*)', re.findall(r'\](.*?){', first_page)[0])[7])

    # Get the rest of the pages
    while page < pages:
        page = page + 1
        result = result + json.loads('[' + re.findall(r'\[(.*?)\]', get_mmi_gush_json_page(ses, page, yum, view_state, data_source))[0] + ']')

    ses.close()

    return result
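
# A minimal end-to-end sketch (illustration only, not part of the original scrapers): fetch the
# same gush from both sources and compare the result counts. The default gush id is a
# hypothetical placeholder; real ids come from the project's gushim data.
def _demo_scrape_gush(gush_id='30000'):
    """Hypothetical demo: scrape one gush from mavat and from the Minhal and print a summary."""
    mavat_plans = get_mavat_gush_json(gush_id)
    mmi_rows = get_mmi_gush_json(gush_id)
    print('mavat returned %d plans, mmi returned %d rows for gush %s' % (len(mavat_plans), len(mmi_rows), gush_id))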