Example #1
import logging
import os
import pickle

import requests

# Proxy is a project-local helper that rotates through proxy IPs
# (see the interface sketch below); it is not a standard library.
def readweb():
    print('get jobdict...')
    configmap = {}
    jobnamesfile = 'dict/jobnames.pkl'
    # Reuse the cached job-name dict if it has already been scraped.
    if os.path.isfile(jobnamesfile):
        with open(jobnamesfile, 'rb') as f:
            configmap = pickle.load(f)
        return configmap
    p = Proxy()
    # Retry through the proxy pool until the front page loads.
    while True:
        proxies = p.getproxies()
        try:
            r = requests.get('http://www.lagou.com/',
                             proxies=proxies, timeout=60)
            break
        except Exception as e:
            p.nextip()
            logging.debug(str(e))
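
Both examples depend on a Proxy helper that is never shown. The following is a minimal sketch of the interface they assume, inferred only from the two calls made above (getproxies() returning a requests-style proxies dict, nextip() rotating to another address); the constructor and the address list are hypothetical.

class Proxy:
    def __init__(self, addresses=None):
        # addresses is a hypothetical list of "host:port" strings.
        self.addresses = addresses or []
        self.index = 0

    def getproxies(self):
        # Return the mapping requests expects for its proxies= argument;
        # an empty dict makes requests connect directly.
        if not self.addresses:
            return {}
        addr = self.addresses[self.index % len(self.addresses)]
        return {'http': 'http://' + addr, 'https': 'http://' + addr}

    def nextip(self):
        # Advance to the next proxy after a failed request.
        self.index += 1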
Example #2
import logging

import MySQLdb
import requests

# dbadd, user, password, and database are connection settings defined
# elsewhere in the project; Proxy is the same rotating-proxy helper as above.
def scrapy(jobname):
    # print('crawling ' + jobname + '...')
    p = Proxy()
    db = MySQLdb.connect(dbadd, user, password, database,
                         use_unicode=True, charset='utf8')
    cursor = db.cursor()
    req_url = 'http://www.lagou.com/jobs/positionAjax.json?'
    headers = {'content-type': 'application/json;charset=UTF-8'}
    # Retry through the proxy pool until the job-listing API responds.
    while True:
        proxies = p.getproxies()
        try:
            req = requests.post(req_url,
                                params={'first': 'false', 'pn': 1, 'kd': jobname},
                                headers=headers, timeout=60,
                                proxies=proxies, allow_redirects=False)
            totalCount = req.json()['content']['positionResult']['totalCount']
            pageSize = req.json()['content']['positionResult']['pageSize']
            # Floor division, as in the original; a partial last page is
            # not counted.
            maxpagenum = totalCount // pageSize
            break
        except Exception as e:
            p.nextip()
            logging.debug(str(e))
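
The snippet ends after computing maxpagenum. As a hedged sketch only: the remaining result pages can presumably be fetched by repeating the same POST with an incremented pn parameter. fetch_page and the 'result' key are assumptions, not part of the original code; the key is inferred from the 'positionResult' fields read above.

# Hypothetical helper, not in the original: fetch one result page by number.
def fetch_page(jobname, pn, proxies, headers):
    req = requests.post('http://www.lagou.com/jobs/positionAjax.json?',
                        params={'first': 'false', 'pn': pn, 'kd': jobname},
                        headers=headers, timeout=60,
                        proxies=proxies, allow_redirects=False)
    # 'result' is assumed to hold the list of position records.
    return req.json()['content']['positionResult']['result']

# Assumed usage, following the loop above:
# for pn in range(1, maxpagenum + 1):
#     positions = fetch_page(jobname, pn, proxies, headers)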