Example #1
import logging
import os
import pickle

import requests

# Proxy is a project-local helper; a sketch of its interface follows this example.

def readweb():
    print('get jobdict.....')
    configmap = {}
    jobnamesfile = 'dict/jobnames.pkl'
    # Serve the job-name dict from the pickle cache when it already exists.
    if os.path.isfile(jobnamesfile):
        with open(jobnamesfile, 'rb') as f:
            configmap = pickle.load(f)
        return configmap
    p = Proxy()
    # Keep retrying the front page, rotating to a fresh proxy IP on each failure.
    while True:
        proxies = p.getproxies()
        try:
            r = requests.get(url='http://www.lagou.com/',
                             proxies=proxies, timeout=60)
            break
        except Exception as e:
            p.nextip()
            logging.debug(str(e))
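
All of these examples lean on a project-local Proxy class that is not shown. From the way it is called, it needs a getproxies() method returning a requests-style proxies dict and a nextip() method that rotates to another IP. A minimal sketch, assuming the pool is a plain list of host:port strings (the pool source and the internals are guesses; only the two method names come from the examples):

import itertools

class Proxy(object):
    def __init__(self, iplist=None):
        # Hypothetical pool; the real class presumably loads its IPs elsewhere.
        self.iplist = iplist or ['127.0.0.1:8080']
        self._cycle = itertools.cycle(self.iplist)
        self.current = next(self._cycle)

    def getproxies(self):
        # Shape expected by the proxies= argument of requests.
        return {'http': 'http://' + self.current}

    def nextip(self):
        # Switch to the next IP after a failed request.
        self.current = next(self._cycle)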
Example #2
import logging

import MySQLdb

# dbadd, user, password, database are module-level connection settings;
# Proxy and get_company_info_byid are project-local helpers.

def get_company_description(fetchallist):
    db = MySQLdb.connect(dbadd, user, password, database,
                         use_unicode=True, charset="utf8")
    cursor = db.cursor()
    p = Proxy()
    for id in fetchallist:
        # Retry each company until its row is updated, rotating proxy IPs on failure.
        while True:
            try:
                values = get_company_info_byid(id[0], p)
                values.append(id[0])
                cursor.execute(
                    'update company set companyUrl = %s, description = %s, '
                    'fullName = %s, shortName = %s, detailPosition = %s, '
                    'industryField = %s, companySize = %s, city = %s, '
                    'financeStage = %s, profile = %s where companyId = %s',
                    values)
                db.commit()
                print("update:", id[0])
                break
            except Exception as e:
                logging.debug(str(e))
                p.nextip()
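
Example #2 also assumes a get_company_info_byid(companyId, p) helper. Its return contract can be read directly off the UPDATE statement: a list of ten values in column order, to which the caller appends the companyId. A stub making that contract explicit (the actual scraping is project code we don't see, so the body is a placeholder):

def get_company_info_byid(companyid, p):
    # Stub: the real helper scrapes the company page through proxy p and
    # must return these ten values in exactly this order.
    companyUrl = description = fullName = shortName = ''
    detailPosition = industryField = companySize = ''
    city = financeStage = profile = ''
    # ... fetch and parse the company page here ...
    return [companyUrl, description, fullName, shortName, detailPosition,
            industryField, companySize, city, financeStage, profile]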
Example #3
import logging

import MySQLdb
import requests

def scrapy(jobname):
    # print('crawling ' + jobname + '.....')
    p = Proxy()
    db = MySQLdb.connect(dbadd, user, password, database,
                         use_unicode=True, charset="utf8")
    cursor = db.cursor()
    req_url = 'http://www.lagou.com/jobs/positionAjax.json?'
    headers = {'content-type': 'application/json;charset=UTF-8'}
    # Keep retrying the position-search API, rotating proxy IPs on failure.
    while True:
        proxies = p.getproxies()
        try:
            req = requests.post(req_url,
                                params={'first': 'false', 'pn': 1, 'kd': jobname},
                                headers=headers, timeout=60,
                                proxies=proxies, allow_redirects=False)
            result = req.json()['content']['positionResult']
            totalCount = result['totalCount']
            pageSize = result['pageSize']
            # Round up so a partial last page is still counted.
            maxpagenum = -(-totalCount // pageSize)
            break
        except Exception as e:
            p.nextip()
            logging.debug(str(e))
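
The snippet ends once maxpagenum is known. A plausible follow-on (a sketch, not the author's code) walks the remaining pages by stepping the pn parameter with the same retry-and-rotate pattern, assuming each page's position list sits under positionResult['result']:

import logging

import requests

def crawl_pages(p, jobname, maxpagenum,
                req_url='http://www.lagou.com/jobs/positionAjax.json?'):
    headers = {'content-type': 'application/json;charset=UTF-8'}
    positions = []
    for pn in range(1, maxpagenum + 1):
        while True:
            proxies = p.getproxies()
            try:
                req = requests.post(req_url,
                                    params={'first': 'false', 'pn': pn, 'kd': jobname},
                                    headers=headers, timeout=60,
                                    proxies=proxies, allow_redirects=False)
                # Assumption: the per-page results live under positionResult['result'].
                positions.extend(req.json()['content']['positionResult']['result'])
                break
            except Exception as e:
                p.nextip()
                logging.debug(str(e))
    return positions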