Example #1
import re

import requests
from bs4 import BeautifulSoup
from bs4.element import Tag

# Assumes module-level constants (BASE_URL, SITE_ENCODING, FROM_MEETING_DATE_FILTER,
# number_of_pages_pattern) and helpers (parse_challenge, get_mavat_plan_docs_meetings_json)
# defined elsewhere in the module


def get_mavat_gush_json(gush_id):
    """
    Get JSON data for gush_id (gush = land block) from the Ministry of the Interior's website
    """
    ses = requests.Session()

    # Get the base search page and save the aspx session cookie
    r = ses.get(BASE_URL)
    
    # Solve the challenge if needed
    if 'X-AA-Challenge' in r.text:
        challenge = parse_challenge(r.text)
        r = ses.get(BASE_URL, headers={
            'X-AA-Challenge': challenge['challenge'],
            'X-AA-Challenge-ID': challenge['challenge_id'],
            'X-AA-Challenge-Result': challenge['challenge_result']
        })
        
        # Yet another request, this time with the BotMitigationCookie cookie
        yum = r.cookies
        r = ses.get(BASE_URL, cookies=yum)
    
    yum = r.cookies

    # Gather session identifiers
    html = BeautifulSoup(r.text, 'lxml', from_encoding=SITE_ENCODING)
    view_state = html('input', id='__VIEWSTATE')[0]['value']
    view_state_encrypted = html('input', id='__VIEWSTATEENCRYPTED')[0]['value']
    event_validation = html('input', id='__EVENTVALIDATION')[0]['value']
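    # These WebForms hidden fields must be echoed back on every postback,
    # which is why they are re-read from each response later in the loop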
    
    # This is the data we send with the request. From what I saw, removing seemingly-unimportant fields breaks the scraper.
    # All of these parameters were actually sent by the browser; maybe not all of them are necessary.
    request_data = {
        '__EVENTTARGET': '',
        '__EVENTARGUMENT': '',
        '__VIEWSTATE': view_state,
        '__VIEWSTATEENCRYPTED': view_state_encrypted,
        '__EVENTVALIDATION': event_validation,
        'ctl00$ContentPlaceHolder1$txtNumb': '',
        'ctl00$ContentPlaceHolder1$cboEntities': '-1',
        'ctl00$ContentPlaceHolder1$cboSubEntities': '-1',
        'ctl00$ContentPlaceHolder1$txtFromBlock': gush_id,
        'ctl00$ContentPlaceHolder1$txtToBlock': gush_id,
        'ctl00$ContentPlaceHolder1$txtFromParcel': '',
        'ctl00$ContentPlaceHolder1$txtToParcel': '',
        'ctl00$ContentPlaceHolder1$cboFilterDistrict': '-1',
        'ctl00$ContentPlaceHolder1$cboFilterArea': '-1',
        'ctl00$ContentPlaceHolder1$cboFilterJurst': '-1',
        'ctl00$ContentPlaceHolder1$cboFilterCity': '-1',
        'ctl00$ContentPlaceHolder1$cboFilterStreet': '-1',
        'ctl00$ContentPlaceHolder1$txtGoals': '',
        'ctl00$ContentPlaceHolder1$txtFilterFromApprovedDate': '',
        'ctl00$ContentPlaceHolder1$txtFilterToApprovedDate': '',
        'ctl00$ContentPlaceHolder1$txtFromMeetingDate': FROM_MEETING_DATE_FILTER,
        'ctl00$ContentPlaceHolder1$txtToMeetingDate': '',
        'ctl00$ContentPlaceHolder1$btnFilter.x': '49',
        'ctl00$ContentPlaceHolder1$btnFilter.y': '14',
        'ctl00$ContentPlaceHolder1$SubEntityID': '-1',
        'ctl00$ContentPlaceHolder1$SelectedPlanId': '0',
        'ctl00$ContentPlaceHolder1$SelectedPlanNumber': '0',
        'ctl00$ContentPlaceHolder1$StreetID': '-1',
        'ctl00$ContentPlaceHolder1$CityID': '-1',
        'ctl00$ContentPlaceHolder1$JurstID': '-1',
        'ctl00$ContentPlaceHolder1$AreaID': '-1',
        'ctl00$ContentPlaceHolder1$PID': '-1',
        'ctl00$ContentPlaceHolder1$JID': '-1',
        'ctl00$ContentPlaceHolder1$CCID': '-1',
        'ctl00$ContentPlaceHolder1$SLY': '-1',
        'ctl00$ContentPlaceHolder1$ButtonCode': '-1',
        'ctl00$ContentPlaceHolder1$ShowSearchResult': '0'
    }

    plans = []
    curr_page = 1
    
    while True:
        # For pages after the first, update the postback parameters
        if curr_page > 1:
            event_argument = html('input', id='__EVENTARGUMENT')
            event_argument = event_argument[0]['value'] if len(event_argument) > 0 else ''

            # Update the actual parameters
            request_data.update({
                'ctl00$ContentPlaceHolder1$entitiesPaging$currentPage': str(curr_page),
                '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$entitiesPaging$pagingForward',
                '__EVENTARGUMENT': event_argument,
                '__VIEWSTATE': view_state,
                '__VIEWSTATEENCRYPTED': view_state_encrypted,
                '__EVENTVALIDATION': event_validation
            })

            # The image-button click coordinates are only sent when the filter button itself is clicked
            request_data.pop('ctl00$ContentPlaceHolder1$btnFilter.x', None)
            request_data.pop('ctl00$ContentPlaceHolder1$btnFilter.y', None)

        # Search plans by gush id
        r = ses.post(BASE_URL, headers={'Content-Type':'application/x-www-form-urlencoded'}, 
            cookies=yum, data=request_data)
        
        # On the first page, get the total number of pages if it exists
        if curr_page == 1:
            number_of_page_str = re.search(number_of_pages_pattern, r.text)
            if number_of_page_str:
                number_of_pages = int(number_of_page_str.groups()[1])
            else:
                number_of_pages = 1
        
        page_plans = []

        # Get the new view state and friends for future requests (following page)
        html = BeautifulSoup(r.text, 'lxml', from_encoding=SITE_ENCODING)
        view_state = html('input', id='__VIEWSTATE')[0]['value']
        view_state_encrypted = html('input', id='__VIEWSTATEENCRYPTED')[0]['value']
        event_validation = html('input', id='__EVENTVALIDATION')[0]['value']
        
        # Parse the table of plans in the result HTML
        for tr in html('tbody')[0].children:
            if isinstance(tr, Tag):
                # Columns: 1 = code, 3 = district (machoz), 5 = locality/local authority/planning area,
                # 7 = planning entity (yeshut tichnunit), 9 = plan number (mispar tochnit),
                # 11 = plan name (shem tochnit), 13 = approval in the official records/newspapers (ishur reshumot/itonim)
                page_plans.append({
                    'code': tr.contents[1].string,
                    'number': tr.contents[9].string.strip().replace('/ ', '/'),
                    'files': [],
                    'meetings': []
                })

        # Get plan files and meetings for each plan
        for plan in page_plans:
            (plan['files'], plan['meetings']) = get_mavat_plan_docs_meetings_json(ses, yum, plan['code'])
        
        # Add the page's plans to the total plans
        plans += page_plans
        
        # Stop once the last page has been fetched
        if curr_page >= number_of_pages:
            break
        curr_page += 1

    ses.close()
    return plans
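Both examples call a `parse_challenge` helper that is not shown here. Below is a minimal, purely hypothetical sketch of what it might look like, assuming the bot-mitigation page embeds the three values as JavaScript assignments named `challenge`, `challenge_id` and `challenge_result`; the real page format may differ.

import re


def parse_challenge(page_text):
    """
    Hypothetical sketch only: pull the bot-mitigation values out of the
    challenge page. Assumes assignments like challenge='...'; the actual
    markup of the mitigation page may be different.
    """
    fields = {}
    for name in ('challenge', 'challenge_id', 'challenge_result'):
        match = re.search(r"%s\s*=\s*'([^']*)'" % name, page_text)
        # Fall back to an empty string if the page layout changed
        fields[name] = match.group(1) if match else ''
    return fields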
Example #2
import json
import re

import requests
from bs4 import BeautifulSoup

# Assumes the module-level constants (BASE_URL, SITE_ENCODING) and helpers
# (parse_challenge, get_mmi_gush_json_page) defined elsewhere in the module


def get_mmi_gush_json(gush_id):
    """
    Get JSON data for gush_id from the Minhal's (Israel Land Administration) website
    """
    ses = requests.Session()

    # The search page URL; several requests below go to the same address
    url = '%s/iturTabot2/taba1.aspx' % BASE_URL

    # Get the base search page and save the aspx session cookie, data source and view state
    r = ses.get(url)

    # Solve the challenge if needed
    if 'X-AA-Challenge' in r.text:
        challenge = parse_challenge(r.text)
        r = ses.get(url, headers={
            'X-AA-Challenge': challenge['challenge'],
            'X-AA-Challenge-ID': challenge['challenge_id'],
            'X-AA-Challenge-Result': challenge['challenge_result']
        })

        # Yet another request, this time with the BotMitigationCookie cookie
        yum = r.cookies
        r = ses.get(url, cookies=yum)

    yum = r.cookies

    # The page embeds a data source id of the form tblView_<hash>; the last
    # occurrence is the one for the results table
    data_source = re.findall(r'tblView_[A-Za-z0-9]+', r.text)[-1]

    html = BeautifulSoup(r.text, 'lxml', from_encoding=SITE_ENCODING)
    view_state = html('input', id='__VIEWSTATE')[0]['value']

    # Tell the server which fields we are displaying
    r = ses.post(url, cookies=yum, data={
        'scriptManagerId_HiddenField':None,
        '__EVENTTARGET':None,
        '__EVENTARGUMENT':None,
        '__VIEWSTATE':view_state,
        'cpe_ClientState':None,
        'txtMsTochnit':None,
        'cmsStatusim$textBox':None,
        'txtGush':None,
        'txtwinCal1$textBox':None,
        'txtwinCal1$popupWin$time':None,
        'txtwinCal1$popupWin$mskTime_ClientState':None,
        'txtFromHelka':None,
        'txtwinCal2$textBox':None,
        'txtwinCal2$popupWin$time':None,
        'txtwinCal2$popupWin$mskTime_ClientState':None,
        'txtMakom':None,
        'cmsMerchaveiTichnun$textBox':None,
        'cmsYeudRashi$textBox':None,
        'txtMatara':None,
        'cmsYeshuvim$textBox':None,
        'cmsKodAchrai$textBox':None,
        'cmsTakanon$textBox':None,
        'txtAchrai':None,
        'cmsSug$textBox':None,
        'cmsMmg$textBox':None,
        'cmsKodMetachnen$textBox':None,
        'cmsTasrit$textBox':None,
        'txtMetachnen':None,
        '__CALLBACKID':'scriptManagerId',
        '__CALLBACKPARAM': 'Mmi.Tashtiot.UI.AjaxComponent.TableView$#$~$#$GetData$#${"P0":"'+data_source+'","P1":0,"P2":-1,"P3":["mtysvShemYishuv","Link","Status","tbMahut","Takanon","Tasrit","Nispach","Mmg","tbMakom","tbYechidotDiur","mtmrthTirgumMerchav","mtstTargumSugTochnit","svtTargumSugVaadatTichnun","tbTochnitId", "tbMsTochnit"],"P4":"~","P5":"~","P6":true,"P7":true}'
        })
    # Note and warning: other available fields for selection are: "tbMerchav","tbMsTochnitYashan","tbKodIshuv","tbSug","tbTamlilSaruk","tbMmg","mtmhzShemMachoz","tbTabaSruka","mtsttKvutzatStatusim","tbAchrai","tbMetachnen","tbShemMetachnen","mtkyPianuachYeud","tbYalkut","tbTaarichDigitation","tUniqueID".
    # DO NOT, however, select the field "tbMatara", as it reduces the number of results in Jerusalem from ~15000 to ~1500 (true as of June 18th 2014).
    # Also, if fields are added here they should be added in the get_mmi_gush_json_page function as well.

    # Send a parameterized request to the server (just search for the gush)
    r = ses.post(
        '%s/getNetuneiTochniotByAllParames' % url,
        headers={'Content-Type': 'application/json'},
        cookies=yum,
        data=json.dumps({
            'IsOneRow': False, 'SourceName': data_source, 'bBProjects': False,
            'conMachoz': 0, 'iFromHelka': '-1', 'iGush': gush_id,
            'iMaamadMoncipali': '-1', 'iMachoz': '-1', 'iNumOfRows': 300,
            'iToHelka': '-1', 'rtncol': 2, 'sAchrai': '~',
            'sFromTaarichStatus': '~', 'sKodAchrai': '~', 'sKodIshuv': '~',
            'sKodMetachnen': '~', 'sKvutzatStatusim': '~', 'sMakom': '~',
            'sMatara': '~', 'sMerchav': '~', 'sMetachnen': '~',
            'sMisTochnit': '~', 'sMmg': '~', 'sSug': '~',
            'sTabaSruka': '~', 'sTakanon': '~', 'sTasrit': '~',
            'sTik': '~', 'sToTaarichStatus': '~', 'sVaada': '~',
            'sYeudRashi': '~'
        })
    )

    result = []
    page = 0

    # Get the first page of results and extra data
    first_page = get_mmi_gush_json_page(ses, page, yum, view_state, data_source)
    result = result + json.loads(re.findall(r'\[.*?\]', first_page)[0])

    # Get the number of pages from the first page (every page carries it in the
    # metadata that follows the JSON array in the callback response)
    pages = int(re.findall(r'\$([\-#0-9]*)', re.findall(r'\](.*?){', first_page)[0])[7])

    # Get the rest of the pages
    while page < pages:
        page += 1
        page_text = get_mmi_gush_json_page(ses, page, yum, view_state, data_source)
        result = result + json.loads('[' + re.findall(r'\[(.*?)\]', page_text)[0] + ']')

    ses.close()
    return result
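A minimal usage sketch, assuming the module-level constants (`BASE_URL`, `SITE_ENCODING` and friends) and the helper functions referenced above are defined; the gush id used here is an arbitrary example value.

import json

if __name__ == '__main__':
    # Hypothetical driver: fetch all plans for one gush (land block) and
    # pretty-print the resulting JSON-compatible list
    plans = get_mmi_gush_json('30035')
    print(json.dumps(plans, indent=2, ensure_ascii=False))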