def write_to_json_file():
    """Write the crawled data to JSON files instead of directly into the database."""
    logging.info("enter write_to_json_file...")
    # Get the largest page number to determine the crawl range
    last_page_number = get_last_web_page_number()
    # Prepare the dated output directory: <project root>/json/YYYY/MM/DD
    current_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) + "/json"
    yyyy = time.strftime("%Y")
    mm = time.strftime("%m")
    dd = time.strftime("%d")
    folder = os.path.join(current_dir, yyyy, mm, dd)
    if not os.path.exists(folder):
        os.makedirs(folder)
    os.chdir(folder)
    logging.debug("max page is :%s" % last_page_number)
    for i in range(1, last_page_number + 1):
        with open("news_%s.json" % i, "w+") as news:
            # Fetch the page
            page = get_web_page_by_url(URL.format(page_number=i))
            # Parse it and write one JSON record per line
            for item_json_data in load_and_parse(page):
                news.write(item_json_data + "\n")
    logging.info("leaving write_to_json_file...")
def get_last_web_page_number():
    """Parse the first web page and return the largest page number."""
    r = get_web_page_by_url(url=URL.format(page_number=1))
    soup = BeautifulSoup(r, "html.parser")
    max_number = int(
        soup.find("div", attrs={"id": "pageTurnning"})
        .find("span", attrs={"class": "pageinfo"})
        .strong.get_text())
    return max_number
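# Hedged usage sketch for the crawler helpers above. It assumes the module-level
# URL template (with a {page_number} placeholder) and the get_web_page_by_url /
# load_and_parse helpers are defined elsewhere in this project; the function name
# and the logging setup here are illustrative only.
def crawl_to_json_example():
    logging.basicConfig(level=logging.DEBUG)
    write_to_json_file()  # writes one news_<page>.json file per crawled page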
def get_stats(teamId=0, leagueId=0):
    url = "statistics/{leagueId}/{teamId}"
    url = URL(url.format(leagueId=leagueId, teamId=teamId))
    response = requests.get(url, headers=headers)
    response = response.json()
    api = response["api"]
    count = api["results"]
    items = api["statistics"]
    return None if not (count > 0) else items
def get_team(teamId=0):
    url = "teams/team/{teamId}"
    url = URL(url.format(teamId=teamId))
    response = requests.get(url, headers=headers)
    response = response.json()
    api = response["api"]
    count = api["results"]
    items = api["teams"]
    return None if not (count == 1) or count != len(items) else items[0]
def get_h2h(team1Id=0, team2Id=0):
    url = "fixtures/h2h/{team1Id}/{team2Id}"
    url = URL(url.format(team1Id=team1Id, team2Id=team2Id))
    response = requests.get(url, headers=headers)
    response = response.json()
    api = response["api"]
    count = api["results"]
    items = api["fixtures"]
    return None if not (count > 0) or count != len(items) else items
def get_teams(leagueId=0):
    url = "teams/league/{leagueId}"
    url = URL(url.format(leagueId=leagueId))
    response = requests.get(url, headers=headers)
    response = response.json()
    api = response["api"]
    count = api["results"]
    items = api["teams"]
    return None if not (count > 0) or count != len(items) else items
def get_leagues():
    url = "leagues"
    url = URL(url)
    response = requests.get(url, headers=headers)
    response = response.json()
    api = response["api"]
    count = api["results"]
    items = api["leagues"]
    return None if not (count > 0) or count != len(items) else items
def get_leagues_season(season=0):
    url = "leagues/season/{season}"
    url = URL(url.format(season=season))
    response = requests.get(url, headers=headers)
    response = response.json()
    api = response["api"]
    count = api["results"]
    items = api["leagues"]
    return None if not (count > 0) or count != len(items) else items
def get_stats_date(teamId=0, leagueId=0, date=None):
    date_format = '%Y-%m-%d'
    date = '' if not (type(date) == dt.date) else date.strftime(date_format)
    url = "statistics/{leagueId}/{teamId}/{date}"
    url = URL(url.format(leagueId=leagueId, teamId=teamId, date=date))
    response = requests.get(url, headers=headers)
    response = response.json()
    api = response["api"]
    count = api["results"]
    items = api["statistics"]
    return None if not (count > 0) else items
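# Hedged usage sketch chaining the API wrappers above. URL and headers are
# assumed to be the project's endpoint builder and auth headers; the helper name
# is hypothetical and the "league_id"/"team_id" field names are assumptions
# based on the shape of the "api" payload these wrappers already unpack.
def example_team_stats():
    leagues = get_leagues()
    if not leagues:
        return None
    league_id = leagues[0]["league_id"]      # assumed field name
    teams = get_teams(leagueId=league_id)
    if not teams:
        return None
    team_id = teams[0]["team_id"]            # assumed field name
    return get_stats(teamId=team_id, leagueId=league_id)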
def connect():
    if not os.path.isdir('.temp'):
        os.mkdir('.temp')
    with requests.Session() as request:
        form_data = {}
        try:
            response = request.get(URL, headers=HEADER)
            soup = BeautifulSoup(response.text, 'html.parser')
            # Bypassing Captcha
            # -----------------
            link = soup.find('img', {'id': 'imgCaptcha'})
            captcha = link.get('src')
            captchaLink = URL.split('Combine_GradeCard.aspx')[0] + captcha
            urllib.request.urlretrieve(captchaLink, '.temp/captcha.jpg')
            captchaText = pytesseract.image_to_string(
                Image.open('.temp/captcha.jpg'))
            # -----------------
            viewstate = soup.select("#__VIEWSTATE")[0]['value']
            eventValidation = soup.select("#__EVENTVALIDATION")[0]['value']
            viewstateGenerator = soup.select('#__VIEWSTATEGENERATOR')[0]['value']
            form_data = {
                '__EVENTTARGET': '',
                '__EVENTARGUMENT': '',
                '__VIEWSTATE': viewstate,
                '__VIEWSTATEGENERATOR': viewstateGenerator,
                '__EVENTVALIDATION': eventValidation,
                'captcha': "rb_captcha_image",
                'txtcaptcha': captchaText,
                'btnsearch': 'Print+Score+Card'
            }
        except:
            return form_data
    return form_data
def asking(halt, halt_id):
    # With no timeout, if internet is disabled while the requests are going out,
    # they will hang indefinitely
    response = requests.get(URL.format(halt_id), timeout=10)
    if response.status_code == 200:
        res = []
        tree = ElementTree.fromstring(response.content)
        all_stop = list(tree.iter(tag='waitingtime'))
        for each in all_stop:
            info = {i.tag: i.text for i in each}
            if info["line"] in INTERESTING_HALTS[halt][1] and info["destination"] != "ULB":
                if len(info["minutes"]) == 1:
                    info["minutes"] = " " + info["minutes"]
                res.append([info["line"], info["destination"], info["minutes"]])
        return res
    else:
        return -1
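# Hedged usage sketch for asking(). It assumes INTERESTING_HALTS maps a halt
# name to a (halt_id, collection-of-line-numbers) pair, as the [halt][1] lookup
# above suggests; the halt name and helper name here are purely illustrative.
def print_waiting_times(halt="example_halt"):
    halt_id = INTERESTING_HALTS[halt][0]     # assumed layout of INTERESTING_HALTS
    rows = asking(halt, halt_id)
    if rows == -1:
        print("request failed")
        return
    for line, destination, minutes in rows:
        print(f"{line} -> {destination}: {minutes} min")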
def write_to_mongo_db():
    """Write the crawled data directly into MongoDB rather than to JSON files."""
    logging.info("enter write_mongo_db")
    # Get the largest page number to determine the crawl range
    last_page_number = get_last_web_page_number()
    logging.debug("max page is :%s" % last_page_number)
    for i in range(1, last_page_number + 1):
        # Fetch the page
        page = get_web_page_by_url(URL.format(page_number=i))
        # Parse the page
        for item_raw_str in load_and_parse(page):
            # Insert the record into MongoDB
            item_json_data = json.loads(item_raw_str)
            mongo.insert_news(item_json_data)
    logging.info("leaving write_to_mongo_db")
def getCoursesNames():
    try:
        response = requests.get(
            URL.split('Combine_GradeCard.aspx')[0] + 'List_Of_Declared_Results.aspx',
            headers=HEADER)
        soup = BeautifulSoup(response.text, 'html.parser')
        cells = soup.find('table', attrs={'id': "gvshow_Reg"}).findAll('td')[2:]
        courses = []
        for i in range(1, len(cells), 6):
            courses.append(''.join([s for s in cells[i].text]))
        courses = sorted(set(courses))
        with open('Resources/CoursesNames.txt', 'w') as f:
            for i, name in enumerate(courses):
                f.write(f'{i+1}) {name}\n')
    except:
        # print('Error in fetching Course names.')
        return False
def getData(SELECTED_CITY):
    if not SELECTED_CITY:
        SELECTED_CITY = CITY
    resp = requests.get(URL.format(CITY=SELECTED_CITY, API_KEY=API_KEY))
    data = resp.json()
    data = [{
        'city': str(data['name']),
        'longitude': str(abs(data['coord']['lon'])) + ('° E ' if (data['coord']['lon'] >= 0) else '° W '),
        'latitude': str(abs(data['coord']['lat'])) + ('° N ' if (data['coord']['lat'] >= 0) else '° S ')
    }, {
        'weather': str(data['weather'][0]['description']),
        'temperature': str(data['main']['temp']) + '° C ',
        'pressure': str(data['main']['pressure']) + ' hPa ',
        'humidity': str(data['main']['humidity']) + ' % ',
        'wind': str(data['wind']['speed']) + ' m/s '
    }]
    return data
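# Hedged usage sketch for getData(). CITY, URL and API_KEY are assumed to be the
# module's OpenWeatherMap defaults; the helper name and city string are
# illustrative only.
def show_weather(city="London"):
    location, conditions = getData(city)
    print(location['city'], location['latitude'], location['longitude'])
    for key, value in conditions.items():
        print(f"{key}: {value}")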
def isResultOut(courseName, sem):
    try:
        if courseName == '' and sem == '':
            return True
        response = requests.get(
            URL.split('Combine_GradeCard.aspx')[0] + 'List_Of_Declared_Results.aspx',
            headers=HEADER)
        soup = BeautifulSoup(response.text, 'html.parser')
        cells = soup.find('table', attrs={'id': "gvshow_Reg"}).findAll('td')[2:]
        courseName = ''.join([s for s in courseName if s.isalnum()])
        course = []
        semester = []
        for i in range(1, len(cells), 6):
            course.append(''.join([s for s in cells[i].text.lower() if s.isalnum()]))
            semester.append(cells[i + 2].text)
        course_sem_dict = dict(zip(course, semester))
        flag = 0
        for course, semester in course_sem_dict.items():
            if (courseName.lower() in course) and (sem.lower() == semester.lower()):
                flag = 1
        if flag == 1:
            return True
        else:
            return False
    except:
        # print('Error occurred in fetching result. Retrying...')
        pass
    return False
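# Hedged sketch tying the result-scraper helpers above together: refresh the
# cached course list, check one course/semester, then fetch the captcha-solved
# form fields. The helper name and the course/semester strings are illustrative,
# not values taken from the source.
def check_result_example():
    getCoursesNames()                        # refreshes Resources/CoursesNames.txt
    if isResultOut('B.Sc. (Hons.) Computer Science', 'III'):
        form_data = connect()                # ASP.NET form fields plus OCR'd captcha text
        return bool(form_data)
    return False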
def onResponse(response):
    deferred = readBody(response)
    deferred.addCallback(get_response)
    return deferred


def get_response(body):
    soup = BeautifulSoup(body, "html.parser")
    movie = soup.find('h1').text[:-5]
    play(movie)


@implementer(IPolicyForHTTPS)
class IgnoreHTTPS:
    def creatorForNetloc(self, hostname, port):
        options = ssl.CertificateOptions(verify=False)
        return _sslverify.ClientTLSOptions(hostname.decode('ascii'),
                                           options.getContext())


agent = Agent(reactor, IgnoreHTTPS())
closedDeferredes = []
d = agent.request(b'GET', URL.encode())
d.addCallback(onResponse)
d.addErrback(onError)
closedDeferredes.append(d)
gatherResults(closedDeferredes).addCallback(lambda ignored: reactor.stop())
reactor.run()
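# The snippet above references play() and onError() without defining them. These
# stubs are assumptions added only to show the shapes such helpers would need
# (onError receives a Twisted Failure, play a title string); the real project
# presumably defines them before the agent.request call is issued.
def play(movie):
    print("would play:", movie)


def onError(failure):
    print("request failed:", failure.getErrorMessage())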