def get_TA125_by_dates(start_date, end_date):
    """Fetch bizportal historical rates for an inclusive date range.

    Headers and cookies are replayed from the second entry of the recorded
    HAR file ``HARS/bizportal_HAR.json``. The whole range is requested as a
    single page, newest deal date first.

    Args:
        start_date: First day of the range. Must expose ``year``, ``month``
            and ``day`` and be subtractable from ``end_date``.
        end_date: Last day of the range (inclusive).

    Returns:
        The ``Data`` member of the JSON reply when its ``Errors`` member is
        ``None``; otherwise ``None``.
    """
    # One row per calendar day, so the inclusive day count is the page size.
    days_count = (end_date - start_date).days + 1
    manual_requests = get_requests_from_HAR_file('HARS/bizportal_HAR.json')

    url = "http://www.bizportal.co.il/Quote/Transactions/HistoricalRates_AjaxBinding_Read/33333333?startD={s_day}%2F{s_mon}%2F{s_year}&endD={e_day}%2F{e_mon}%2F{e_year}&take={page_size}&skip=0&page=1&pageSize={page_size}&sort%5B0%5D%5Bfield%5D=DealDate&sort%5B0%5D%5Bdir%5D=desc".format(
        s_year=start_date.year,
        s_mon='{:02d}'.format(start_date.month),
        s_day='{:02d}'.format(start_date.day),
        e_year=end_date.year,
        e_mon='{:02d}'.format(end_date.month),
        e_day='{:02d}'.format(end_date.day),
        page_size=days_count)

    recorded = manual_requests[1]['request']
    my_request = {
        'url': url,
        'headers': HAR_to_dict(recorded['headers']),
        'cookies': HAR_to_dict(recorded['cookies']),
    }

    response = requests.get(url=my_request['url'],
                            headers=my_request['headers'],
                            cookies=my_request['cookies'],
                            verify=False)

    # The body is parsed as received. The former replace('"', '\"') was a
    # no-op ('\"' is the same one-character string as '"') and would raise
    # on a bytes body.
    results = json.loads(response.content)
    if results['Errors'] is None:
        return results['Data']
    return None
def get_weather_monthly_data(month, year):
    """Scrape the wunderground monthly history table for airport LLBG.

    The request is built in the module-level ``my_request`` dict, with
    headers and cookies taken from the first entry of the module-level
    ``manual_requests``.

    Args:
        month: Month number; compared against the month abbreviation shown
            in the first cell of the table's first ``tbody``.
        year: Year; compared against the first header cell of the table.

    Returns:
        A dict keyed by ``"<year>/<month>/<day>"`` whose values map the
        column names in ``fields`` to the stripped cell texts of that row,
        or ``None`` (after printing an error) when the page describes a
        different month or year.
    """
    from time import strptime

    recorded = manual_requests[0]['request']
    my_request['url'] = "https://www.wunderground.com/history/airport/LLBG/{year}/{month}/1/MonthlyHistory.html?&reqdb.zip=&reqdb.magic=&reqdb.wmo=".format(
        month=month, year=year)
    my_request['headers'] = HAR_to_dict(recorded['headers'])
    my_request['cookies'] = HAR_to_dict(recorded['cookies'])

    response = requests.get(url=my_request['url'],
                            headers=my_request['headers'],
                            cookies=my_request['cookies'])
    tree = html.fromstring(response.content)

    # Sanity check: the page must describe the month/year that was asked for.
    page_year = tree.xpath('//*[@id="obsTable"]/thead/tr/th[1]')[0].text
    page_month = tree.xpath('//*[@id="obsTable"]/tbody[1]/tr/td[1]')[0].text
    same_period = (month == strptime(page_month, '%b').tm_mon
                   and int(page_year) == year)
    if not same_period:
        print("Error. Got wrong month/year ({page_month}/{page_year}) instead of {month}/{year}".format(
            month=month, year=year, page_year=page_year,
            page_month=page_month))
        return None

    fields = [
        'Day',
        'Temp_H', 'Temp_A', 'Temp_M',
        'DewPoint_H', 'DewPoint_A', 'DewPoint_M',
        'Humidity_H', 'Humidity_A', 'Humidity_M',
        'SeaLevelPress_H', 'SeaLevelPress_A', 'SeaLevelPress_M',
        'Visibility_H', 'Visibility_A', 'Visibility_M',
        'Wind_H', 'Wind_A', 'Wind_M',
        'Precip', 'Events',
    ]
    row_xpath = '//*[@id="obsTable"]/tbody[{row}]/tr/td'

    # Day rows start at tbody[2]; the first tbody index without cells ends
    # the table.
    data = {}
    row = 2
    while True:
        cells = tree.xpath(row_xpath.format(row=row))
        if not cells:
            return data
        columns = [cell.text_content().strip() for cell in cells]
        date_key = "{year}/{month}/{day}".format(year=year, month=month,
                                                 day=columns[0])
        data[date_key] = dict(zip(fields, columns))
        row += 1
def testing_tase_data():
    """Run the TASE CSV download experiment, then terminate the process.

    This is scratch code: it prints every row of the CSV obtained by
    replaying the request recorded in ``HARS/tase_csv_request.json`` and
    then calls ``sys.exit()``.

    The HTML-scraping experiment that used to follow the ``sys.exit()``
    call was unreachable; it now lives in ``_probe_tase_history_page`` so
    it can be run on its own.
    """
    _print_tase_csv_rows()
    # Kept from the original experiment: stop right after the CSV attempt.
    sys.exit()


def _print_tase_csv_rows():
    """Replay the recorded TASE CSV request and print each parsed row."""
    import csv

    manual_requests = get_requests_from_HAR_file('HARS/tase_csv_request.json')
    recorded = manual_requests[0]['request']

    # The previous code passed the URL to open(), which treats it as a local
    # file path and cannot work; the recorded request is actually sent now.
    # NOTE(review): verify=False mirrors the request the author had
    # commented out; it disables TLS certificate verification.
    response = requests.get(url=recorded['url'],
                            headers=HAR_to_dict(recorded['headers']),
                            cookies=HAR_to_dict(recorded['cookies']),
                            verify=False)
    response.raise_for_status()

    # Python 2's csv module consumes byte strings, so the raw body is split
    # into lines rather than the decoded text.
    for row in csv.reader(response.content.splitlines()):
        print(row)


def _probe_tase_history_page(use_cache=True):
    """Load the TASE index-history page and print the probe xpath's matches.

    Args:
        use_cache: When true, parse the local ``response.html`` file;
            otherwise POST the request recorded in ``HARS/tase_HAR.json``.

    Returns:
        The list of elements matched by the probe xpath (possibly empty).
    """
    if use_cache:
        with open('response.html', 'r') as cached_page:
            html_content = cached_page.read()
    else:
        manual_requests = get_requests_from_HAR_file('HARS/tase_HAR.json')
        recorded = manual_requests[0]['request']
        response = requests.post(
            url="https://www.tase.co.il/Heb/MarketData/Indices/MarketCap/Pages/IndexHistoryData.aspx?Action=3&addTab=&IndexId=137",
            headers=HAR_to_dict(recorded['headers']),
            cookies=HAR_to_dict(recorded['cookies']),
            data=recorded['postData']['text'],
            verify=False)
        html_content = response.content

    tree = html.fromstring(html_content)

    # The probe xpath does not depend on a row index, so it is evaluated
    # once; looping on it never terminated when the element was present.
    # The per-row xpath the author intended to iterate with was:
    #   //*[@id="ctl00_SPWebPartManager1_g_54223d45_af2f_49cf_88ed_9e3db1499c51_ctl00_HistoryData1_gridHistoryData_DataGrid1"]/tbody/tr[{row}]/td[{column}]
    # (rows from 2; per the author's note column 7 is the date, 6 the base
    # index, and so on).
    matches = tree.xpath('//*[@id="u1st_Skip-links"]/span[1]/a')
    print(matches)
    return matches
def set_my_keywords(keyword=None, os_version='android'):
    """Add a keyword to a tracked app's keyword list on searchman.com.

    The POST replays headers captured from a browser session, including
    its session cookies.

    Args:
        keyword: Value sent as the ``terms`` form field.
        os_version: ``'android'`` or ``'ios'``; selects the endpoint and the
            recorded headers to replay.

    Raises:
        ValueError: If ``os_version`` is neither ``'android'`` nor ``'ios'``.
        requests.HTTPError: If the server answers with an error status.
    """
    # Validate first. The previous identity tests (``is 'android'``) depend
    # on string interning, and an unknown platform used to fall through to
    # an unbound ``response``.
    if os_version not in ('android', 'ios'):
        raise ValueError(
            "os_version must be 'android' or 'ios', got {!r}".format(
                os_version))

    # NOTE(security): the Cookie values below are a recorded browser session
    # committed to source; they belong in configuration, not in code.
    ios_headers_for_add_keyword_HAR = [{
        "name": "Cookie",
        "value": "__uvt=; _ga=GA1.2.873370462.1510860176; intercom-id-pjtwd42d=cbbd47f7-be1f-4db9-9f3a-86593757378c; mp_f9c053f6cb8aa27c2fe7abfb4847484a_mixpanel=%7B%22distinct_id%22%3A%20%2215fc65ea98426b-07d4a50c6429e1-3b3e5906-15f900-15fc65ea98510d%22%2C%22utm_source%22%3A%20%22searchman%22%2C%22%24initial_referrer%22%3A%20%22https%3A%2F%2Fsearchman.com%2Fios%2Fapp%2Fus%2F493253309%2Fen%2Fblockchain%2Fblockchain-bitcoin-wallet%2F%3Fd%3DiPhone%22%2C%22%24initial_referring_domain%22%3A%20%22searchman.com%22%7D; ag_uh=3a435e642830ef73bee7e0e0d5ed1358; ag_uhc=75b60a7055763f595e1d3e899bc578ed; ag-portfolio=%5B%22ios-886427730%22%5D; ag_bh=806719182%3AUS%2C1291851950%3AUS%2C1023123599%3AUS%2C868077558%3AUS%2Cio.voodoo.dune%3AUS%2C915637540%3AUS%2C886427730%3AUS%2C493253309%3AUS%2C; ag_lang=en; ag_public_imps=21; __utmt=1; __utma=247563269.873370462.1510860176.1511200723.1511203314.8; __utmb=247563269.6.10.1511203314; __utmc=247563269; __utmz=247563269.1510860176.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); mp_2b6156b771e3a1688ea2424a5f3e5aba_mixpanel=%7B%22distinct_id%22%3A%20%22100923%22%2C%22%24initial_referrer%22%3A%20%22https%3A%2F%2Fsearchman.com%2Fsignin%3Fnext%3D%252F%22%2C%22%24initial_referring_domain%22%3A%20%22searchman.com%22%2C%22__mps%22%3A%20%7B%7D%2C%22__mpso%22%3A%20%7B%7D%2C%22__mpus%22%3A%20%7B%7D%2C%22__mpa%22%3A%20%7B%7D%2C%22__mpu%22%3A%20%7B%7D%2C%22__mpap%22%3A%20%5B%5D%7D; uvts=6lwMqzNSEByfwHAD; __stripe_sid=a39cba9b-8469-49a0-a8fa-1c5903e0a120; __stripe_mid=2a7dca9a-3c7e-463c-ab35-80d4cb4b798e; mp_mixpanel__c=5"
    }, {
        "name": "Origin",
        "value": "https://searchman.com"
    }, {
        "name": "Accept-Encoding",
        "value": "gzip, deflate, br"
    }, {
        "name": "Host",
        "value": "searchman.com"
    }, {
        "name": "Accept-Language",
        "value": "en-US,en;q=0.9,he;q=0.8"
    }, {
        "name": "User-Agent",
        "value": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"
    }, {
        "name": "Content-Type",
        "value": "application/x-www-form-urlencoded; charset=UTF-8"
    }, {
        "name": "Accept",
        "value": "application/json, text/javascript, */*; q=0.01"
    }, {
        "name": "Referer",
        "value": "https://searchman.com/ios/my_keywords/886427730/US/"
    }, {
        "name": "X-Requested-With",
        "value": "XMLHttpRequest"
    }, {
        "name": "Connection",
        "value": "keep-alive"
    }, {
        "name": "Content-Length",
        "value": "31"
    }]
    android_headers_for_add_keyword_HAR = [{
        "name": "Cookie",
        "value": "__uvt=; _ga=GA1.2.873370462.1510860176; intercom-id-pjtwd42d=cbbd47f7-be1f-4db9-9f3a-86593757378c; mp_f9c053f6cb8aa27c2fe7abfb4847484a_mixpanel=%7B%22distinct_id%22%3A%20%2215fc65ea98426b-07d4a50c6429e1-3b3e5906-15f900-15fc65ea98510d%22%2C%22utm_source%22%3A%20%22searchman%22%2C%22%24initial_referrer%22%3A%20%22https%3A%2F%2Fsearchman.com%2Fios%2Fapp%2Fus%2F493253309%2Fen%2Fblockchain%2Fblockchain-bitcoin-wallet%2F%3Fd%3DiPhone%22%2C%22%24initial_referring_domain%22%3A%20%22searchman.com%22%7D; ag_uh=3a435e642830ef73bee7e0e0d5ed1358; ag_uhc=75b60a7055763f595e1d3e899bc578ed; ag-portfolio=%5B%22ios-886427730%22%5D; __utmt=1; ag_public_imps=23; ag_lang=en; ag_bh=com.coinbase.android%3AUS%2C806719182%3AUS%2C1291851950%3AUS%2C1023123599%3AUS%2C868077558%3AUS%2Cio.voodoo.dune%3AUS%2C915637540%3AUS%2C886427730%3AUS%2C493253309%3AUS%2C; __utma=247563269.873370462.1510860176.1511250766.1511261486.11; __utmb=247563269.7.10.1511261486; __utmc=247563269; __utmz=247563269.1510860176.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); mp_2b6156b771e3a1688ea2424a5f3e5aba_mixpanel=%7B%22distinct_id%22%3A%20%22100923%22%2C%22%24initial_referrer%22%3A%20%22https%3A%2F%2Fsearchman.com%2Fsignin%3Fnext%3D%252F%22%2C%22%24initial_referring_domain%22%3A%20%22searchman.com%22%2C%22__mps%22%3A%20%7B%7D%2C%22__mpso%22%3A%20%7B%7D%2C%22__mpus%22%3A%20%7B%7D%2C%22__mpa%22%3A%20%7B%7D%2C%22__mpu%22%3A%20%7B%7D%2C%22__mpap%22%3A%20%5B%5D%7D; uvts=6lwMqzNSEByfwHAD; __stripe_sid=fd046fe6-efa7-4687-83e3-81a441a00dc8; __stripe_mid=2a7dca9a-3c7e-463c-ab35-80d4cb4b798e; mp_mixpanel__c=6"
    }, {
        "name": "Origin",
        "value": "https://searchman.com"
    }, {
        "name": "Accept-Encoding",
        "value": "gzip, deflate, br"
    }, {
        "name": "Host",
        "value": "searchman.com"
    }, {
        "name": "Accept-Language",
        "value": "en-US,en;q=0.9,he;q=0.8"
    }, {
        "name": "User-Agent",
        "value": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"
    }, {
        "name": "Content-Type",
        "value": "application/x-www-form-urlencoded; charset=UTF-8"
    }, {
        "name": "Accept",
        "value": "application/json, text/javascript, */*; q=0.01"
    }, {
        "name": "Referer",
        "value": "https://searchman.com/android/my_keywords/com.coinbase.android/US/"
    }, {
        "name": "X-Requested-With",
        "value": "XMLHttpRequest"
    }, {
        "name": "Connection",
        "value": "keep-alive"
    }, {
        "name": "Content-Length",
        "value": "28"
    }]

    if os_version == 'android':
        url = "https://searchman.com/android/my_keywords_save/com.coinbase.android/US"
        headers = HAR_to_dict(android_headers_for_add_keyword_HAR)
    else:
        url = "https://searchman.com/ios/my_keywords_save/886427730/US"
        headers = HAR_to_dict(ios_headers_for_add_keyword_HAR)

    data = {'terms': keyword, 'term_type': '3'}
    response = requests.post(url=url, headers=headers, data=data)

    # Raise if the response is not OK.
    response.raise_for_status()
"referer", "value": "https://adwords.google.com/um/GetStarted/Home?__u=8366573661&__c=6519561002&authuser=0" }] data1 = "7|0|76|https://adwords.google.com/um/GetStarted/com.google.ads.apps.usermgmt.getstarted.client.main.Module/|22D5B3FA3CEBB378F43793C10A1E70DB|com.google.ads.api.gwt.rpc.client.BatchedInvocationService|invoke|com.google.ads.api.gwt.rpc.client.BatchedInvocationRequest/2983766987|com.google.ads.apps.common.shared.header.BatchRequestHeaderImpl/2595329959|java.util.HashMap/1797211028|com.google.ads.apps.common.shared.header.ApiHeaderType/3992732687|com.google.ads.apps.common.shared.header.BatchAdsApiRequestHeaderInfo/1561664655|com.google.ads.api.modules.request.headers.GrubbyHeader$ChangeIdMode/272930539|com.google.ads.apps.common.usagetracking.server.UsageTrackingService.logImpression|rGRQNhESwLJgUXz8hBjKCfxyUFw:1510910684674|java.util.ArrayList/4159755760|com.google.ads.apps.common.usagetracking.shared.UsageTrackingServiceGwt$ImpressionRequest/3960782202|com.google.ads.apps.common.shared.header.SingleAdsApiRequestHeader/4098801396|com.google.ads.apps.common.shared.header.ClientCacheHint/2402802613|com.google.ads.apps.common.shared.header.ServerCacheHint/3129959624|java.lang.Boolean/476441737|com.google.ads.api.modules.request.headers.ApiVersion/450371163|com.google.ads.api.modules.request.headers.GrubbyHeader$CustomerIdMode/45453300|com.google.ads.api.modules.request.headers.GrubbyHeader$DatabaseReadMode/1150601902|com.google.common.collect.RegularImmutableList/440499227|com.google.ads.common.logging.MetricEntries$ImpressionEntry/454587110|[Lcom.google.ads.common.logging.ApexExperimentMetrics;/1267822276|com.google.ads.common.logging.ApexExperimentMetrics/1204991806|TREATMENT|java.lang.String/2004016611|AWSM|AWN_INTERNALOPS|MCC|NOTIFICATIONS|UM|ADWORDS_NEXT_BILLING||CUES|ADWORDS_NEXT_MCC|ADWORDS_NEXT_INTERNALOPS|ADWORDS_NEXT_ACCESS_TO_ALL|TREATMENT_SIGNUP_FLOW_CLICKS_WITH_BADGES|CM|CM_GROWTH_MOBILE_PROMO|CT|PRIME|AWN_PRIME|AWN_CM|ADWORDS_NEXT|ADWORDS
_NEXT_NEW_CUSTOMERS|KP|ADWORDS_NEXT_KEYWORD_PLANNER|https://adwords.google.com/um/GetStarted/Home?__u=8366573661&__c=6519561002&authuser=0#oc|[Lcom.google.ads.common.logging.ExperimentMetrics;/2152658160|com.google.ads.common.logging.ExperimentMetrics/1213839764|enable-call-consent|com.google.common.collect.RegularImmutableMap/1085455152|remove-progress-bar|enable-appstore|orinoco-megablox|get-started-w-logo|show-estimated-reach-panel|expanded-text-ad|youtube-linked-accounts|show-top-ad-preview|enable-goldmine-auto-expand|u2-migration|default-opt-in|billing-all-countries|enable-auto-expand|rewire-guided-orinoco-billing-for-budgets-in-ads|mobile-compatible-orinoco|policy-certificate-expiration|target-cpa-suggestion|electrum-account-linking-ui|[Ljava.lang.String;/2600011424|Orinoco.oc.keywords-editor-keywords-input-focused|oc.keywords-editor-keywords-input-focused|com.google.ads.apps.common.uimode.shared.UiMode/4208379950|1|2|3|4|1|5|5|6|quRMuCHZo|45|7|1|8|0|9|A|10|2|GEmJsq|Hyr8hd|11|12|13|1|14|15|16|0|0|17|13|0|18|0|18|1|0|0|19|6|0|A|0|0|0|0|0|0|0|0|20|0|21|1|0|0|0|0|0|1|1|0|0|0|0|0|0|0|0|0|0|0|0|22|0|0|0|0|7|0|0|0|0|23|24|8|25|0|1|26|22|5|27|28|27|29|27|30|27|31|27|32|33|0|252|34|FUlReeu8O|25|0|1|26|22|3|-24|27|35|-28|36|0|252|34|FWguhina5|25|0|1|26|22|3|-24|-31|-28|37|0|252|34|FW9cs8QQc|25|0|1|26|22|3|-24|-31|-28|38|0|749|34|FW9cs8QQc|25|0|1|39|22|2|27|40|-28|41|0|340|34|FV4L2$Qzk|25|0|0|34|22|9|-24|-31|-28|27|42|27|43|27|44|-25|27|45|-38|46|0|252|34|FWzMJiJ_3|25|0|1|26|22|3|-24|-31|-28|47|0|252|34|FWzhhJjZK|25|0|0|34|22|3|-24|-28|27|48|49|0|252|34|FW1uiRGFm|50|0|51|19|52|53|54|0|1|1|1|52|55|-52|1|1|1|52|56|-52|1|1|1|52|57|-52|1|1|1|52|58|-52|1|1|1|52|59|-52|1|1|1|52|60|-52|1|1|1|52|61|-52|1|1|0|52|62|-52|1|1|1|52|63|-52|1|1|1|52|64|-52|1|1|1|52|65|-52|1|1|1|52|66|-52|1|1|1|52|67|-52|1|1|1|52|68|-52|1|1|1|52|69|-52|1|1|1|52|70|-52|1|1|1|52|71|-52|1|1|1|52|72|-52|1|1|1|V_JTowC|73|0|-1|74|73|1|75|76|0|" url1 = 
"https://adwords.google.com/um/GetStarted/g?authuser=0&__u=8366573661&__c=6519561002" # response = requests.post(url1,headers=HAR_to_dict(headers1),data=data1) # print "RESPOSNE 1:" # print json.dumps(response.content,indent=4) requests_dict = json.load(open("HAR_req.json", 'r')) my_requests = requests_dict['log']['entries'] responses = [] for request in my_requests: url = request['request']['url'] headers = HAR_to_dict(request['request']['headers']) method = request['request']['method'] if method == "POST": data = request['request']['postData'] responses.append(requests.post(url=url, headers=headers, data=data)) if method == "GET": responses.append(requests.get(url=url, headers=headers)) for response in responses: print "RESPONSE:" print response.content
def get_weather_custom_data(start_date, end_date):
    """Scrape the wunderground custom-range history table for airport LLBG.

    The request is built in the module-level ``my_request`` dict, with
    headers and cookies taken from the first entry of the module-level
    ``manual_requests``. Each day row is enriched with the averages
    returned by ``get_weather_daily_data`` for that day.

    Args:
        start_date: First day of the range (``year``/``month``/``day``).
        end_date: Last day of the range (``year``/``month``/``day``).

    Returns:
        A dict keyed by ``"<year>/<month>/<day>"``. Each value maps the
        column names in ``fields`` to the stripped cell texts, plus
        ``ConditionsScore`` and ``CloudsScore``; ``Events`` is replaced by
        the result of ``handle_events_string``.
    """
    start_date_str = "{year}/{month}/{day}".format(year=start_date.year,
                                                   month=start_date.month,
                                                   day=start_date.day)
    recorded = manual_requests[0]['request']
    my_request['url'] = "https://www.wunderground.com/history/airport/LLBG/{start_date_str}/CustomHistory.html?dayend={end_day}&monthend={end_month}&yearend={end_year}&req_city=&req_state=&req_statename=&reqdb.zip=&reqdb.magic=&reqdb.wmo=".format(
        start_date_str=start_date_str,
        end_year=end_date.year,
        end_month=end_date.month,
        end_day=end_date.day)
    my_request['headers'] = HAR_to_dict(recorded['headers'])
    my_request['cookies'] = HAR_to_dict(recorded['cookies'])

    response = requests.get(url=my_request['url'],
                            headers=my_request['headers'],
                            cookies=my_request['cookies'])
    tree = html.fromstring(response.content)

    fields = [
        'Day',
        'Temp_H', 'Temp_A', 'Temp_M',
        'DewPoint_H', 'DewPoint_A', 'DewPoint_M',
        'Humidity_H', 'Humidity_A', 'Humidity_M',
        'SeaLevelPress_H', 'SeaLevelPress_A', 'SeaLevelPress_M',
        'Visibility_H', 'Visibility_A', 'Visibility_M',
        'Wind_H', 'Wind_A', 'Wind_M',
        'Precip', 'Events',
    ]
    month_numbers = {
        'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
        'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
        'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12',
    }
    row_xpath = '//*[@id="obsTable"]/tbody[{row}]/tr/td'

    data = {}
    curr_month = None
    curr_year = start_date.year
    # True while the most recent month header seen was December, so the
    # next month header rolls the year forward.
    after_december = False
    row = 1
    while True:
        columns = [cell.text_content().strip()
                   for cell in tree.xpath(row_xpath.format(row=row))]
        if not columns:
            break
        row += 1
        label = columns[0]

        # A row whose first cell is a month abbreviation is a month header.
        if label in month_numbers:
            if after_december:
                curr_year = str(int(curr_year) + 1)
            after_december = (label == 'Dec')
            curr_month = month_numbers[label]
            continue

        date_key = "{year}/{month}/{day}".format(year=curr_year,
                                                 month=curr_month,
                                                 day=label)
        record = dict(zip(fields, columns))
        data[date_key] = record

        cond_data = get_weather_daily_data(date_key)
        record['ConditionsScore'] = cond_data['cond_code']
        record['CloudsScore'] = cond_data['cond_str_code']

        flattened_events = record['Events'].replace("\t", "")
        flattened_events = flattened_events.replace("\n", "")
        flattened_events = flattened_events.replace(",", ";")
        record['Events'] = handle_events_string(flattened_events)
    return data
def get_weather_daily_data(date_str):
    """Scrape one day's observations for airport LLBG and average their codes.

    The request is built in the module-level ``my_request`` dict, with
    headers and cookies taken from the first entry of the module-level
    ``manual_requests``. Table rows are consumed in pairs: the odd row holds
    one value per column title, and the second cell of the following row
    holds the free-text conditions string.

    Args:
        date_str: Date path segment inserted into the URL (callers in this
            file pass ``"<year>/<month>/<day>"``).

    Returns:
        dict: ``cond_str_code`` is the mean of the per-observation codes
        (``handle_conditions_string`` of the conditions string, or 0 when
        the ``Conditions`` column is ``"Clear"``). ``cond_code`` is the mean
        of ``handle_conditions_column_string`` over the same observations.
        Observations whose code is ``None`` are skipped. Both values are
        ``None`` when no observation could be scored; this used to raise
        ``ZeroDivisionError``.
    """
    recorded = manual_requests[0]['request']
    my_request['url'] = 'https://www.wunderground.com/history/airport/LLBG/{date_str}/DailyHistory.html'.format(
        date_str=date_str)
    my_request['headers'] = HAR_to_dict(recorded['headers'])
    my_request['cookies'] = HAR_to_dict(recorded['cookies'])
    response = requests.get(url=my_request['url'],
                            headers=my_request['headers'],
                            cookies=my_request['cookies'])
    tree = html.fromstring(response.content)

    fields_titles = [
        th.text_content().strip()
        for th in tree.xpath('//*[@id="obsTable"]/thead/tr/th')
    ]

    # key: value of the first column (the observation time),
    # value: dict of that observation's data
    daily_data = {}
    row_num = 1
    while tree.xpath('//*[@id="obsTable"]/tbody/tr[{row_num}]'.format(
            row_num=row_num)):
        data = {}
        for col, field in enumerate(fields_titles, 1):
            cell_xpath = '//*[@id="obsTable"]/tbody/tr[{row}]/td[{col}]'.format(
                row=row_num, col=col)
            # Prefer the nested span when the cell has one; otherwise fall
            # back to the cell's own text. A missing cell still raises
            # IndexError, as before.
            nested = tree.xpath(cell_xpath + '/span/span[1]')
            if nested:
                data[field] = nested[0].text
            else:
                data[field] = tree.xpath(cell_xpath)[0].text

        # The conditions string sits in the second cell of the next row.
        conditions_string = tree.xpath(
            '//*[@id="obsTable"]/tbody/tr[{row}]/td[2]'.format(
                row=row_num + 1))[0].text
        if data['Conditions'] != "Clear":
            code = handle_conditions_string(conditions_string)
        else:
            code = 0
        data['conditions_details'] = {'code': code, 'str': conditions_string}
        daily_data[data[fields_titles[0]]] = data
        row_num += 2

    # Average the codes over the observations that have one.
    sum_str_code = 0.0
    sum_cond_code = 0.0
    scored = 0
    for observation in daily_data.values():
        code = observation['conditions_details']['code']
        if code is None:
            continue
        sum_str_code += code
        sum_cond_code += handle_conditions_column_string(
            observation['Conditions'])
        scored += 1

    if not scored:
        # Empty table, or no observation produced a code.
        return {'cond_str_code': None, 'cond_code': None}
    return {
        'cond_str_code': sum_str_code / scored,
        'cond_code': sum_cond_code / scored
    }
def lambda_handler(event, context):
    """Log in to kolnoapeer.co.il and sign up for one of the target classes.

    Steps:
      1. POST the login form and read ``user_token`` from the JSON reply.
      2. POST for the weekly schedule, replaying recorded headers and
         cookies with the fresh token injected.
      3. Pick the schedule column for the day after today and look for the
         first lesson named ``Kickboxing`` or ``Hiit Trx``.
      4. POST the sign-up form for that lesson.

    Args:
        event: Unused.
        context: Unused.

    Returns:
        dict: ``lesson`` is the matched lesson name, or ``None`` when no
        target lesson is listed for that day (no sign-up request is sent).
        On a match the dict also holds ``form_data`` (the posted form body),
        ``response_content`` and ``response_status_code`` of the sign-up
        request.
    """
    # check offline or online
    auth_flag = True
    write_to_file_flag = False
    return_dict = {}
    target_lessons = ('Kickboxing', 'Hiit Trx')

    # --- authenticate -----------------------------------------------------
    auth_url = "http://www.kolnoapeer.co.il/wp-content/themes/KolnoaPeer/inc/physical/login-handle.php"
    auth_headers_HAR = [{
        "name": "Origin",
        "value": "http://www.kolnoapeer.co.il"
    }, {
        "name": "Accept-Encoding",
        "value": "gzip, deflate"
    }, {
        "name": "Host",
        "value": "www.kolnoapeer.co.il"
    }, {
        "name": "Accept-Language",
        "value": "en-US,en;q=0.9,he;q=0.8"
    }, {
        "name": "User-Agent",
        "value": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"
    }, {
        "name": "Content-Type",
        "value": "application/x-www-form-urlencoded; charset=UTF-8"
    }, {
        "name": "Accept",
        "value": "*/*"
    }, {
        "name": "Referer",
        "value": "http://www.kolnoapeer.co.il/"
    }, {
        "name": "Cookie",
        "value": "optimizelyEndUserId=oeu1512829976741r0.20758536549030615; _hjIncludedInSample=1; optimizelySegments=%7B%225493192264%22%3A%22gc%22%2C%225520510180%22%3A%22false%22%2C%225515770230%22%3A%22search%22%7D; optimizelyBuckets=%7B%7D; _ga=GA1.3.644062575.1512829977; _gid=GA1.3.783463494.1513163597; _gat=1"
    }, {
        "name": "Connection",
        "value": "keep-alive"
    }, {
        "name": "Content-Length",
        "value": "38"
    }]
    auth_headers = HAR_to_dict(auth_headers_HAR)

    # NOTE(security): account credentials are hard-coded in source; they
    # should come from configuration or a secret store.
    auth_data = "userEmail=0507609679&userPass=12345678"
    if auth_flag:
        response = requests.post(url=auth_url,
                                 headers=auth_headers,
                                 data=auth_data)
        auth_response_json = json.loads(response.content)
        session_token = auth_response_json['user_token']
    else:
        session_token = None

    # --- weekly schedule request ------------------------------------------
    # The recorded Cookie header is not listed here: it is always replaced
    # below by the template carrying the fresh session token.
    headers_HAR = [{
        "name": "Origin",
        "value": "http://www.kolnoapeer.co.il"
    }, {
        "name": "Accept-Encoding",
        "value": "gzip, deflate"
    }, {
        "name": "Host",
        "value": "www.kolnoapeer.co.il"
    }, {
        "name": "Accept-Language",
        "value": "en-US,en;q=0.9,he;q=0.8"
    }, {
        "name": "User-Agent",
        "value": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"
    }, {
        "name": "Content-Type",
        "value": "application/x-www-form-urlencoded; charset=UTF-8"
    }, {
        "name": "Accept",
        "value": "*/*"
    }, {
        "name": "Referer",
        "value": "http://www.kolnoapeer.co.il/%d7%9e%d7%a2%d7%a8%d7%9b%d7%aa-%d7%97%d7%95%d7%92%d7%99%d7%9d-%d7%a9%d7%91%d7%95%d7%a2%d7%99/"
    }, {
        "name": "X-Requested-With",
        "value": "XMLHttpRequest"
    }, {
        "name": "Proxy-Connection",
        "value": "keep-alive"
    }, {
        "name": "Content-Length",
        "value": "211"
    }]
    headers = HAR_to_dict(headers_HAR)

    # Indicates which week to take from the calendar: 0 for the current
    # week, 1 for the next one. Matters when scheduling Sunday classes.
    week_flag = 0
    url = "http://www.kolnoapeer.co.il/wp-content/themes/KolnoaPeer/inc/physical/weeklySched.php?week={week_flag}".format(
        week_flag=week_flag)

    # Form body exactly as recorded. The whole HAR ``postData`` dict
    # (mimeType/text/params) used to be passed as the form; only its
    # ``text`` member is the actual request body.
    schedule_body = "site_url=&permalink=http%3A%2F%2Fwww.kolnoapeer.co.il%2F%25d7%259e%25d7%25a2%25d7%25a8%25d7%259b%25d7%25aa-%25d7%2597%25d7%2595%25d7%2592%25d7%2599%25d7%259d-%25d7%25a9%25d7%2591%25d7%2595%25d7%25a2%25d7%2599%2F"

    recorded_cookies = [
        ("optimizelyEndUserId", "oeu1512829976741r0.20758536549030615"),
        ("_gat", "1"),
        ("_hjIncludedInSample", "1"),
        ("optimizelySegments", "%7B%225493192264%22%3A%22gc%22%2C%225520510180%22%3A%22false%22%2C%225515770230%22%3A%22search%22%7D"),
        ("optimizelyBuckets", "%7B%7D"),
        ("_ga", "GA1.3.644062575.1512829977"),
        ("_gid", "GA1.3.783463494.1513163597"),
        ("peerUserLogged", "true"),
        ("peerUserID", "14611"),
        ("peerToken", "3b5279c1775d41cabe6c0db93b0d7761"),
        ("peerUserFirstName", "%D7%90%D7%9C%D7%94"),
        ("peerUserLastName", "%D7%90%D7%9C%D7%A4%D7%A1%D7%99"),
        ("peerUserEmail", "alfasi21%40hotmail.com"),
        ("peerUserPhone", "%20"),
        ("peerUserDetails", "%7B%22status%22%3A%221%22%2C%22userid%22%3A14611%2C%22user_p_name%22%3A%22%5Cu05d0%5Cu05dc%5Cu05d4%22%2C%22user_token%22%3A%223b5279c1775d41cabe6c0db93b0d7761%22%2C%22user_l_name%22%3A%22%5Cu05d0%5Cu05dc%5Cu05e4%5Cu05e1%5Cu05d9%22%2C%22user_info%22%3A%7B%22CompanyID%22%3A201%2C%22BranchID%22%3A1%2C%22ID%22%3A14611%2C%22FirstName%22%3A%22%5Cu05d0%5Cu05dc%5Cu05d4%22%2C%22LastName%22%3A%22%5Cu05d0%5Cu05dc%5Cu05e4%5Cu05e1%5Cu05d9%22%2C%22MobilePhone%22%3A%22050-7609679%22%2C%22HomePhone%22%3A%22%20%22%2C%22WorkPhone%22%3A%22%20%22%2C%22CardNumber%22%3A0%2C%22Email%22%3A%22alfasi21%40hotmail.com%22%2C%22DateOfBirth%22%3A%221986-07-28T00%3A00%3A00%22%2C%22SignedRegulations%22%3Afalse%2C%22SignedRetulationsInt%22%3Anull%2C%22HasMedical%22%3Afalse%2C%22PaymentLeft%22%3A0%2C%22HasTrainingPlan%22%3Afalse%2C%22IsInsured%22%3Afalse%2C%22InsuranceEndDate%22%3Anull%2C%22CityName%22%3A%22%5Cu05ea%5Cu05dc%20%5Cu05d0%5Cu05d1%5Cu05d9%5Cu05d1%22%2C%22HouseNumber%22%3A0%2C%22GroupCode%22%3A0%2C%22IDNumber%22%3A21967583%2C%22ZipCode%22%3A0%2C%22NeighborhoodCode%22%3Anull%2C%22Cars%22%3A%7B%7D%7D%7D"),
    ]
    cookies_HAR = [{
        "name": name,
        "value": value,
        "expires": None,
        "httpOnly": False,
        "secure": False
    } for name, value in recorded_cookies]
    cookies = HAR_to_dict(cookies_HAR)

    # The login reply carries the user token; inject it into both the
    # cookie jar and the raw Cookie header.
    cookies['peerToken'] = session_token
    headers['Cookie'] = "optimizelyEndUserId=oeu1511636150491r0.03070219004762409; _hjIncludedInSample=1; peerUserLogged=true; peerUserID=14611; peerToken={session_token}; peerUserFirstName=%D7%90%D7%9C%D7%94; peerUserLastName=%D7%90%D7%9C%D7%A4%D7%A1%D7%99; peerUserEmail=alfasi21%40hotmail.com; peerUserPhone=%20; peerUserDetails=%7B%22status%22%3A%221%22%2C%22userid%22%3A14611%2C%22user_p_name%22%3A%22%5Cu05d0%5Cu05dc%5Cu05d4%22%2C%22user_token%22%3A%2272c424c0446f42f89bd3d5f4784fb391%22%2C%22user_l_name%22%3A%22%5Cu05d0%5Cu05dc%5Cu05e4%5Cu05e1%5Cu05d9%22%2C%22user_info%22%3A%7B%22CompanyID%22%3A201%2C%22BranchID%22%3A1%2C%22ID%22%3A14611%2C%22FirstName%22%3A%22%5Cu05d0%5Cu05dc%5Cu05d4%22%2C%22LastName%22%3A%22%5Cu05d0%5Cu05dc%5Cu05e4%5Cu05e1%5Cu05d9%22%2C%22MobilePhone%22%3A%22050-7609679%22%2C%22HomePhone%22%3A%22%20%22%2C%22WorkPhone%22%3A%22%20%22%2C%22CardNumber%22%3A0%2C%22Email%22%3A%22alfasi21%40hotmail.com%22%2C%22DateOfBirth%22%3A%221986-07-28T00%3A00%3A00%22%2C%22SignedRegulations%22%3Afalse%2C%22SignedRetulationsInt%22%3Anull%2C%22HasMedical%22%3Afalse%2C%22PaymentLeft%22%3A0%2C%22HasTrainingPlan%22%3Afalse%2C%22IsInsured%22%3Afalse%2C%22InsuranceEndDate%22%3Anull%2C%22CityName%22%3A%22%5Cu05ea%5Cu05dc%20%5Cu05d0%5Cu05d1%5Cu05d9%5Cu05d1%22%2C%22HouseNumber%22%3A0%2C%22GroupCode%22%3A0%2C%22IDNumber%22%3A21967583%2C%22NeighborhoodCode%22%3Anull%2C%22Cars%22%3A%7B%7D%7D%7D; optimizelySegments=%7B%225493192264%22%3A%22gc%22%2C%225520510180%22%3A%22false%22%2C%225515770230%22%3A%22search%22%7D; optimizelyBuckets=%7B%7D; _ga=GA1.3.462259338.1511636151; _gid=GA1.3.185880343.1511636151; _gat_UA-37156680-1=1".format(
        session_token=session_token)

    # get weekly sched
    if auth_flag:
        response = requests.post(url=url,
                                 headers=headers,
                                 data=schedule_body,
                                 cookies=cookies)
        if write_to_file_flag:
            with open('response.html', 'w') as weekly_sched_response_file:
                weekly_sched_response_file.write(response.content)

    # The reply is an HTML fragment; wrap it so the absolute xpath below
    # has an /html/body root to start from.
    html_prefix = '<!DOCTYPE html><html lang="en" dir="ltr" class="com"><head>dsd</head><body>'
    html_suffix = '</body></html>'
    if write_to_file_flag:
        with open("response.html") as weekly_sched_response_file:
            page = weekly_sched_response_file.read()
    else:
        page = response.content
    tree = html.fromstring(html_prefix + page + html_suffix)

    ###############################################################
    # xpath examples:
    #   all classes of a specific day (1-7):
    #     /html/body/div/div[2]/article[6]/div
    #   number-of-seats section of the 8th class:
    #     /html/body/div/div[2]/article[5]/div[8]/section[2]
    #   lesson name of the 2nd lesson:
    #     /html/body/div/div[2]/article[6]/div[2]/h4
    ###############################################################

    # datetime's weekday() is Monday=0; the +3 shift maps today onto the
    # 1-based column of the following day, presumably in a Sunday-first
    # schedule (Sunday=1 ... Saturday=7) - TODO confirm against the page.
    week_day = (datetime.datetime.today().weekday() + 3) % 7
    if week_day == 0:
        week_day = 7
    lessons = tree.xpath(
        '/html/body/div/div[2]/article[{week_day}]/div'.format(
            week_day=week_day))

    # Find the first target lesson in that day's column. The previous
    # unbounded loop ran off the end of the list (IndexError) when no
    # target lesson was listed.
    chosen = None
    for lesson in lessons:
        titles = lesson.xpath('h4')
        if titles and titles[0].text in target_lessons:
            chosen = lesson
            return_dict['lesson'] = titles[0].text
            break
    if chosen is None:
        return_dict['lesson'] = None
        return return_dict

    # Example lesson element:
    # <div class="one-course " data-classid="385"
    #      data-date="2017-12-15T09:00:00+02:00" data-hour="090000"
    #      data-dur="90" data-instructor="...">
    xdate = urllib.quote_plus(chosen.attrib['data-date'])
    xhour = urllib.quote_plus(chosen.attrib['data-hour'])
    xlessonID = urllib.quote_plus(chosen.attrib['data-classid'])
    xinstructor = urllib.quote_plus(
        chosen.attrib['data-instructor'].encode('UTF-8'))

    # --- sign up to class -------------------------------------------------
    # Recorded example body:
    # date=2017-12-15T09%3A00%3A00%2B02%3A00&hour=090000&lessonID=385&myInstructor=...
    url = "http://www.kolnoapeer.co.il/wp-content/themes/KolnoaPeer/inc/physical/signToClass.php"
    data = "date={date}&hour={hour}&lessonID={lessonID}&myInstructor={myInstructor}".format(
        date=xdate,
        hour=xhour,
        lessonID=xlessonID,
        myInstructor=xinstructor)
    return_dict['form_data'] = data
    response = requests.post(url=url,
                             headers=headers,
                             cookies=cookies,
                             data=data)
    return_dict['response_content'] = response.content
    return_dict['response_status_code'] = response.status_code
    return return_dict