Ejemplo n.º 1
0
def crawlGradeFromGwgl():
    """Scrape historical course grades from the HUNNU JWGL system and push
    any matching rows to a local message bot.

    Looks for the course ids in ``courseIdToFind`` in the grade table and
    sends "id name: score" lines via the local HTTP messaging API; prints
    a not-found notice otherwise.
    """
    session = HTMLSession()
    jar = requests.cookies.RequestsCookieJar()

    # NOTE(review): hard-coded session cookies — these expire and must be
    # refreshed by logging in to the JWGL site manually.
    jar.set('JSESSIONID', '28191DA0466EDA27D69CB81417772905.node1')
    jar.set('C2RT', 'a33e03cddc0fb11b90f118ae407641dc')
    jar.set('bocms_visite_user_session', 'C816B689B1A91CC278FD5FCD7CD1CD61')
    jar.set('SERVERNAME', 'xk2')
    jar.set('GSESSIONID', '28191DA0466EDA27D69CB81417772905.node1')
    session.cookies = jar

    courseIdToFind = [
        '12160007.08', '22163171.01', '22163219.01', '22163280.01'
    ]
    result = session.get(
        'http://jwglnew.hunnu.edu.cn/eams/teach/grade/course/person!historyCourseGrade.action'
    )
    gradeRows = result.html.find('div.grid>table.gridtable>tbody tr',
                                 containing=courseIdToFind)

    if gradeRows:
        lines = []
        for item in gradeRows:
            courseId = item.find('td:nth-child(3)')[0].text
            courseName = item.find('td:nth-child(4)')[0].text
            # score sits in the 3rd column from the right of the row
            courseScore = item.find('td:nth-last-child(3)')[0].text
            lines.append(courseId + ' ' + courseName + ': ' + courseScore)
        textBuffer = '\n'.join(lines) + '\n'
        print(textBuffer)
        # Fix: pass the message through ``params`` so requests URL-encodes
        # it — raw concatenation produced an invalid query string whenever
        # the text contained newlines, spaces, or non-ASCII characters.
        requests.get(
            'http://127.0.0.1:5700/send_private_msg',
            params={'user_id': '806361380', 'message': textBuffer})
    else:
        print('[' + ','.join(courseIdToFind) + '] Not Found')
Ejemplo n.º 2
0
    def get_data(self):
        """Fetch the Zhihu hot list and return it as a list of row dicts
        ready for persistence (title, uri, json-encoded extras, source id).
        """
        source = HotspotSource.objects.get(code=1)
        uri = 'https://www.zhihu.com/hot'

        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.3",
            "Referer": "https://www.zhihu.com/"
        }

        session = HTMLSession()
        session.cookies = cookiejar.LWPCookieJar(filename='./hotspot/utils/cookies.txt')
        session.cookies.load(ignore_discard=True)

        # TODO: Zhihu requires login and it cannot be bypassed; add automatic
        # captcha recognition later. For now the cookie file is fixed on disk.
        page = session.get(url=uri, headers=headers).html
        hot_list = page.find('#TopstoryContent > div > div > div.HotList-list', first=True)

        items = []
        for entry in hot_list.find('section'):
            # the section text splits into rank / _ / description / _ / rest
            rank, _, summary, _, *_rest = entry.text.split('\n')
            link = entry.find('a', first=True)
            metric = entry.find('.HotItem-metrics', first=True).text.replace('分享', '')

            items.append({
                'title': link.attrs['title'],
                'uri': link.attrs['href'],
                'extra': json.dumps({
                    'count': metric,
                    'order': rank,
                    'desc': summary
                }),
                'hotspot_source': source.id
            })

        return items
Ejemplo n.º 3
0
    def __init__(self, **kwargs):
        '''
        Base class for common scraping tasks

        Args:
            delay (int): seconds to wait between requests, default 2
            expire_hours (int): cache expiry in hours, default 168
            cookies: cookie jar to attach to the session
            headers (dict): HTTP headers; default sets a Chrome User-Agent
            proxies (dict): proxy configuration for requests
            cache_name (str): cache path; a bare name is placed under /tmp,
                and a random /tmp name is generated when omitted
        '''
        logging.getLogger(__name__).addHandler(logging.NullHandler())
        self.urls = []

        # use requests HTML to aid parsing
        # has all same methods as requests.Session
        _s = HTMLSession()

        # delay/expire (falsy values fall back to defaults, as before)
        self.delay = kwargs.get('delay') or 2
        self.expire_hours = kwargs.get('expire_hours') or 168

        # add cookies
        if kwargs.get('cookies'):
            _s.cookies = kwargs['cookies']
        else:
            # requests_html requires Python 3, so the legacy py2 ``cookielib``
            # fallback could never succeed — use http.cookiejar directly
            import http.cookiejar
            _s.cookies = http.cookiejar.MozillaCookieJar()

        # add headers
        if kwargs.get('headers'):
            _s.headers = kwargs['headers']
        else:
            ua = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')
            _s.headers = {'User-Agent': ua}

        # add proxies
        if kwargs.get('proxies'):
            _s.proxies = kwargs['proxies']

        # add cache
        # Fix: the original raised KeyError when cache_name was omitted, and
        # never set self.cache_name at all when the name contained '/'.
        cache_name = kwargs.get('cache_name')
        if not cache_name:
            import uuid
            self.cache_name = os.path.join('/tmp', uuid.uuid4().hex)
        elif '/' not in cache_name:
            self.cache_name = os.path.join('/tmp', cache_name)
        else:
            self.cache_name = cache_name
        try:
            from cachecontrol import CacheControlAdapter
            from cachecontrol.heuristics import ExpiresAfter
            from cachecontrol.caches import FileCache
            _s.mount('http://', CacheControlAdapter(
                cache=FileCache(self.cache_name),
                cache_etags=False,
                heuristic=ExpiresAfter(hours=self.expire_hours)))
        except ImportError:
            try:
                import requests_cache
                requests_cache.install_cache(self.cache_name)
            # Fix: bare ``except:`` also swallowed KeyboardInterrupt/SystemExit
            except Exception:
                logging.exception('could not install cache')
        self.s = _s
Ejemplo n.º 4
0
    def __init__(self, **kwargs):
        """
        Generic scraper session setup.

        Keyword Args:
            delay (int): seconds to wait between requests, default 2
            expire_hours (int): cache expiry in hours, default 168
            cookies: cookie jar to attach to the session
            headers (dict): extra HTTP headers merged over the defaults
            proxies (dict): proxy configuration for requests
            cache_name (str): cache path; a bare name is placed under /tmp,
                and a random /tmp name is generated when omitted
        """
        logging.getLogger(__name__).addHandler(logging.NullHandler())
        self.urls = []

        # use requests HTML to aid parsing
        # has all same methods as requests.Session
        _s = HTMLSession()
        self.delay = kwargs.get("delay", 2)
        self.expire_hours = kwargs.get("expire_hours", 168)

        # add cookies
        if kwargs.get("cookies"):
            _s.cookies = kwargs["cookies"]
        else:
            import http.cookiejar

            _s.cookies = http.cookiejar.MozillaCookieJar()

        # add headers: randomized User-Agent plus caller-supplied overrides
        default_headers = {
            "User-Agent": random.choice(USER_AGENTS),
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "en-US,en;q=0.9",
            "accept": "application/json, text/plain, */*",
        }
        _s.headers.update(default_headers)
        if kwargs.get("headers"):
            _s.headers.update(kwargs["headers"])

        # add proxies
        if kwargs.get("proxies"):
            _s.proxies = kwargs["proxies"]

        # add cache
        if not kwargs.get("cache_name"):
            self.cache_name = os.path.join("/tmp", random_string(32))
        elif "/" not in kwargs.get("cache_name", ""):
            self.cache_name = os.path.join("/tmp", kwargs["cache_name"])
        else:
            self.cache_name = kwargs.get("cache_name")

        try:
            from cachecontrol import CacheControlAdapter
            from cachecontrol.heuristics import ExpiresAfter
            from cachecontrol.caches import FileCache

            _s.mount(
                "http://",
                CacheControlAdapter(
                    cache=FileCache(self.cache_name),
                    cache_etags=False,
                    heuristic=ExpiresAfter(hours=self.expire_hours),
                ),
            )
        except ImportError:
            try:
                import requests_cache

                requests_cache.install_cache(self.cache_name)
            # Fix: ``except BaseException`` also swallowed KeyboardInterrupt
            # and SystemExit; Exception is the right boundary for a
            # best-effort "could not install cache" fallback.
            except Exception:
                logging.exception("could not install cache")
        self.session = _s
from sklearn.metrics import mean_absolute_error
from sklearn import preprocessing

from sklearn.model_selection import GridSearchCV
from itertools import chain, combinations
from sklearn.model_selection import cross_val_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures


########## Download data
# build a session that persists the Freddie Mac login cookie across requests
session = HTMLSession()
# LWP-format cookie jar backed by the file named 'cookie' in the working dir
session.cookies = http.cookiejar.LWPCookieJar('cookie')

# headers mimicking a desktop Chrome browser for the Freddie Mac portal
headers = {'Host':'freddiemac.embs.com',
           'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
           'Referer':'https://freddiemac.embs.com/FLoan/secure/login.php?pagename=download'}
def get_cookie():
	"""Restore previously saved cookies into the module-level session.

	Prints a warning instead of raising when the cookie file is missing
	or unreadable.
	"""
	try:
		session.cookies.load(ignore_discard=True)
	except IOError:
		print('Cannot load cookie!')
		
def login(un, pwd):
	"""
	entering username and password

	Args:
		un: Freddie Mac account username
		pwd: account password
	"""
	# NOTE(review): this function appears truncated in this excerpt — only
	# the auth endpoint is defined; the POST presumably follows. TODO confirm.
	auth_url = 'https://freddiemac.embs.com/FLoan/secure/auth.php'
Ejemplo n.º 6
0
import http.cookiejar
from math import *

import requests
from scipy import integrate

s = requests.Session()  # NOTE(review): created but never used in this view
from requests_html import HTMLSession

sqrt(1)  # no-op call; presumably leftover from exercising the star import
session = HTMLSession()
# Mozilla-format cookie jar backed by the file "anything.txt"
session.cookies = http.cookiejar.MozillaCookieJar("anything.txt")
mn, mx = 0, 0  # integration bounds recorded by the int_ stub defined below


def int_(x):
    """Return a stub integrator that records its call arguments.

    The returned callable stores its second argument (default ``x``) in the
    global ``mn`` and its first argument in the global ``mx``, then always
    returns 0.
    """
    def wrapper(a, b=x):
        global mn, mx
        mn, mx = b, a
        return 0

    return wrapper


def frac(x):
    """Return a stub that divides the captured value ``x`` by its argument.

    The returned callable computes ``b / a`` where ``b`` defaults to the
    captured ``x``.
    """
    def divider(a, b=x):
        return b / a

    return divider