Example #1
def write_to_json_file():
  """
  爬虫数据写入json文件,
  而不是直接写入数据库
  """
  logging.info("enter write_to_json_file...")
  # Get the maximum page number to determine the crawl range
  last_page_number = get_last_web_page_number()
  # Build today's output directory and switch into it
  current_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))+"/json"
  yyyy = time.strftime("%Y")
  mm = time.strftime("%m")
  dd = time.strftime("%d")
  folder = os.path.join(current_dir, yyyy, mm, dd)
  if not os.path.exists(folder):
    os.makedirs(folder)
  os.chdir(folder)

  logging.debug("max page is :%s"%last_page_number)
  for i in range(1,last_page_number+1):
    with open("news_%s.json"%i, "w+") as news:
      # Fetch the page
      page = get_web_page_by_url(URL.format(page_number=i))
      # Parse it
      for item_json_data in load_and_parse(page):
        # Write each item to the file
        news.write(item_json_data+"\n")

  logging.info("leaving write_to_json_file...")
Example #2
def get_last_web_page_number():
  """
  Parse the web page and return the maximum page number.
  """
  r = get_web_page_by_url(url = URL.format(page_number=1))
  soup = BeautifulSoup(r, "html.parser")
  max_number = int(
    soup.find("div", attrs={"id": "pageTurnning"})
        .find("span", attrs={"class": "pageinfo"})
        .strong.get_text())
  return max_number
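
The helper get_web_page_by_url is shared by Examples #1 and #2 but is not shown here. A minimal sketch, assuming it simply fetches the page body with requests (the timeout and return type are assumptions):

import requests

def get_web_page_by_url(url):
  # Hypothetical helper: fetch the page and return its HTML text.
  response = requests.get(url, timeout=10)
  response.raise_for_status()
  return response.text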
Example #3
def get_stats(teamId=0, leagueId=0):
    url = "statistics/{leagueId}/{teamId}"
    url = URL(url.format(leagueId=leagueId, teamId=teamId))
    response = requests.get(url, headers=headers)
    response = response.json()
    api = response["api"]
    count = api["results"]
    items = api["statistics"]
    return None if not (count > 0) else items
Example #4
def get_team(teamId = 0):
	url = "teams/team/{teamId}"
	url = URL(url.format(teamId = teamId))
	response = requests.get(url, headers=headers)
	response = response.json()
	api = response["api"]
	count = api["results"]
	items = api["teams"]
	return None if not(count == 1) or count != len(items) else items[0]
Example #5
def get_h2h(team1Id=0, team2Id=0):
    url = "fixtures/h2h/{team1Id}/{team2Id}"
    url = URL(url.format(team1Id=team1Id, team2Id=team2Id))
    response = requests.get(url, headers=headers)
    response = response.json()
    api = response["api"]
    count = api["results"]
    items = api["fixtures"]
    return None if not (count > 0) or count != len(items) else items
Example #6
def get_teams(leagueId = 0):
	url = "teams/league/{leagueId}"
	url = URL(url.format(leagueId = leagueId))
	response = requests.get(url, headers=headers)
	response = response.json()
	api = response["api"]
	count = api["results"]
	items = api["teams"]
	return None if not(count > 0) or count != len(items) else items
Example #7
def get_leagues():
    url = "leagues"
    url = URL(url)
    response = requests.get(url, headers=headers)
    response = response.json()
    api = response["api"]
    count = api["results"]
    items = api["leagues"]
    return None if not (count > 0) or count != len(items) else items
Example #8
def get_leagues_season(season=0):
    url = "leagues/season/{season}"
    url = URL(url.format(season=season))
    response = requests.get(url, headers=headers)
    response = response.json()
    api = response["api"]
    count = api["results"]
    items = api["leagues"]
    return None if not (count > 0) or count != len(items) else items
Example #9
def get_stats_date(teamId=0, leagueId=0, date=None):
    fmt = '%Y-%m-%d'
    date = date.strftime(fmt) if isinstance(date, dt.date) else ''
    url = "statistics/{leagueId}/{teamId}/{date}"
    url = URL(url.format(leagueId=leagueId, teamId=teamId, date=date))
    response = requests.get(url, headers=headers)
    response = response.json()
    api = response["api"]
    count = api["results"]
    items = api["statistics"]
    return None if not (count > 0) else items
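
Examples #3 through #9 all build requests through a URL() helper and a shared headers dict, neither of which is shown. A minimal sketch under the assumption that URL() just prepends the API's base path and that headers carries an API key (the base URL and header name below are placeholders, not the real API's values):

BASE_URL = "https://example-football-api/"   # placeholder base endpoint
headers = {"X-API-KEY": "your-api-key"}      # placeholder auth header

def URL(path):
    # Hypothetical helper: join a relative path onto the base endpoint.
    return BASE_URL + path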
Example #10
def connect():

    if not os.path.isdir('.temp'):
        os.mkdir('.temp')

    with requests.Session() as request:

        form_data = {}

        try:
            response = request.get(URL, headers=HEADER)
            soup = BeautifulSoup(response.text, 'html.parser')

            #Bypassing Captcha
            #-----------------
            link = soup.find('img', {'id': 'imgCaptcha'})
            captcha = link.get('src')

            captchaLink = URL.split('Combine_GradeCard.aspx')[0] + captcha
            urllib.request.urlretrieve(captchaLink, '.temp/captcha.jpg')
            captchaText = pytesseract.image_to_string(
                Image.open('.temp/captcha.jpg'))
            #-----------------

            viewstate = soup.select("#__VIEWSTATE")[0]['value']
            eventValidation = soup.select("#__EVENTVALIDATION")[0]['value']
            viewstateGenerator = soup.select(
                '#__VIEWSTATEGENERATOR')[0]['value']

            form_data = {
                '__EVENTTARGET': '',
                '__EVENTARGUMENT': '',
                '__VIEWSTATE': viewstate,
                '__VIEWSTATEGENERATOR': viewstateGenerator,
                '__EVENTVALIDATION': eventValidation,
                'captcha': "rb_captcha_image",
                'txtcaptcha': captchaText,
                'btnsearch': 'Print+Score+Card'
            }
        except:
            return form_data

    return form_data
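
connect() only gathers the hidden ASP.NET fields and the solved captcha; a hypothetical follow-up step, assuming the form is then posted back to the same URL to fetch the score card (this POST is not part of the original example):

form_data = connect()
if form_data:
    # Submit the prepared form to the same page (assumed next step).
    result = requests.post(URL, data=form_data, headers=HEADER)
    print(result.status_code)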
Example #11
def asking(halt, halt_id):
    # With no timeout, if the internet connection drops while the requests are
    #   going out, they will hang indefinitely
    response = requests.get(URL.format(halt_id), timeout=10)
    if response.status_code == 200:
        res = []
        tree = ElementTree.fromstring(response.content)
        all_stop = list(tree.iter(tag='waitingtime'))
        for each in all_stop:
            info = {i.tag: i.text for i in each}
            if info["line"] in INTERESTING_HALTS[halt][
                    1] and info["destination"] != "ULB":
                if len(info["minutes"]) == 1:
                    info["minutes"] = " " + info["minutes"]
                res.append(
                    [info["line"], info["destination"], info["minutes"]])
        return res
    else:
        return -1
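
The INTERESTING_HALTS table is not shown; a hypothetical usage sketch, assuming it maps a halt name to a (halt_id, set-of-line-numbers) pair such as {"Flagey": ("1234", {"59", "71"})}:

rows = asking("Flagey", INTERESTING_HALTS["Flagey"][0])
if rows != -1:
    for line, destination, minutes in rows:
        print(line, destination, minutes)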
Example #12
def write_to_mongo_db():
  """
  爬虫数据写入json文件,
  而不是直接写入数据库
  """
  logging.info("enter write_mongo_db")

  #获得最大页数,确定爬虫范围
  last_page_number = get_last_web_page_number()

  logging.debug("max page is :%s"%last_page_number)
  for i in range(1,last_page_number+1):
    # Fetch the page
    page = get_web_page_by_url(URL.format(page_number=i))
    # Parse the page
    for item_raw_str in load_and_parse(page):
      # Write into the MongoDB database
      item_json_data = json.loads(item_raw_str)
      mongo.insert_news(item_json_data)

  logging.info("leaving write_to_mongo_db")
Example #13
def getCoursesNames():
    try:
        response = requests.get(URL.split('Combine_GradeCard.aspx')[0] +
                                'List_Of_Declared_Results.aspx',
                                headers=HEADER)
        soup = BeautifulSoup(response.text, 'html.parser')
        cells = soup.find('table', attrs={
            'id': "gvshow_Reg"
        }).findAll('td')[2:]

        courses = []
        for i in range(1, len(cells), 6):
            courses.append(cells[i].text)
        courses = sorted(set(courses))

        with open('Resources/CoursesNames.txt', 'w') as f:
            for i, name in enumerate(courses):
                f.write(f'{i+1}) {name}\n')

    except:
        #print('Error in fetching Course names.')
        return False
Example #14
def getData(SELECTED_CITY):
    if not SELECTED_CITY:
        SELECTED_CITY = CITY
    resp = requests.get(URL.format(CITY=SELECTED_CITY, API_KEY=API_KEY))
    data = resp.json()
    data = [{
        'city': str(data['name']),
        'longitude': str(abs(data['coord']['lon'])) +
                     ('° E ' if data['coord']['lon'] >= 0 else '° W '),
        'latitude': str(abs(data['coord']['lat'])) +
                    ('° N ' if data['coord']['lat'] >= 0 else '° S ')
    }, {
        'weather': str(data['weather'][0]['description']),
        'temperature': str(data['main']['temp']) + '° C ',
        'pressure': str(data['main']['pressure']) + ' hPa ',
        'humidity': str(data['main']['humidity']) + ' % ',
        'wind': str(data['wind']['speed']) + ' m/s '
    }]
    return data
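
A hypothetical usage of getData, assuming CITY, API_KEY and URL point at an OpenWeatherMap-style endpoint (the city name here is only an illustration):

location, conditions = getData('London')
print(location['city'], location['latitude'], location['longitude'])
print(conditions['temperature'], conditions['weather'])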
Example #15
def isResultOut(courseName, sem):
    try:
        if courseName == '' and sem == '':
            return True

        response = requests.get(URL.split('Combine_GradeCard.aspx')[0] +
                                'List_Of_Declared_Results.aspx',
                                headers=HEADER)
        soup = BeautifulSoup(response.text, 'html.parser')
        cells = soup.find('table', attrs={
            'id': "gvshow_Reg"
        }).findAll('td')[2:]
        courseName = ''.join([s for s in courseName if s.isalnum()])

        course = []
        semester = []
        for i in range(1, len(cells), 6):
            course.append(''.join(
                [s for s in cells[i].text.lower() if s.isalnum()]))
            semester.append(cells[i + 2].text)

        course_sem_dict = dict(zip(course, semester))

        flag = 0
        for course, semester in course_sem_dict.items():
            if courseName.lower() in course and sem.lower() == semester.lower():
                flag = 1
        if flag == 1:
            return True
        else:
            return False

    except:
        #print('Error occurred in fetching result. Retrying...')
        pass

    return False
Example #16
def onResponse(response):
    deferred = readBody(response)
    deferred.addCallback(get_response)
    return deferred


def get_response(body):
    soup = BeautifulSoup(body, "html.parser")
    movie = soup.find('h1').text[:-5]
    play(movie)


@implementer(IPolicyForHTTPS)
class IgnoreHTTPS:
    def creatorForNetloc(self, hostname, port):
        options = ssl.CertificateOptions(verify=False)
        return _sslverify.ClientTLSOptions(hostname.decode('ascii'),
                                           options.getContext())


agent = Agent(reactor, IgnoreHTTPS())
closedDeferredes = []
d = agent.request(b'GET', URL.encode())

d.addCallback(onResponse)
d.addErrback(onError)
closedDeferredes.append(d)
gatherResults(closedDeferredes).addCallback(lambda ignored: reactor.stop())
reactor.run()
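
The errback onError (and the play helper) are referenced but not defined in this fragment; a minimal sketch of the errback, assuming it should simply report the failure so the gatherResults call can still fire (this behaviour is an assumption):

def onError(failure):
    # Hypothetical errback: log the failure instead of letting it pass silently.
    print("Request failed:", failure.getErrorMessage())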