def qichachaFromIndustry(f, t):
    """Crawl qichacha sub-industry listing pages for industry codes f..t.

    Industry codes are letters: numeric code 0 maps to 'A', 1 to 'B', etc.
    For each industry the page is fetched, its second '.filter-tag' element
    (the sub-industry link list) is located, and each sub-industry URL is
    handed to qichachaFromIndustPageUrl. Errors are logged per industry.
    """
    myLogging.info('start from %s to %s ', f, t)
    indBaseUrl = 'http://www.qichacha.com/gongsi_industry?industryCode='
    conn, csor = getComConnCsor()
    for code in range(f, t + 1):
        industCode = chr(code + 65)  # 0 -> 'A', 1 -> 'B', ...
        industOrder = code
        inductBasePageUrl = indBaseUrl + industCode + '&industryorder=' + str(
            industOrder)
        try:
            myLogging.info('start indust base pages, %s', inductBasePageUrl)
            # qichachaFromIndustPageUrl(inductBasePageUrl,conn, csor)
            myLogging.info('end indust base pages, %s', inductBasePageUrl)
            myLogging.info('start indust subIndust pages, %s',
                           inductBasePageUrl)
            pageContent = getQichachaHtml(inductBasePageUrl)
            pageSoup = getSoupByStrEncode(pageContent, 'utf-8')
            # Bug fix: the original did select('.filter-tag')[1] before
            # checking emptiness (IndexError when < 2 matches), and the
            # "no subUrls" branch logged but did not skip the iteration.
            filterTags = pageSoup.select('.filter-tag')
            if len(filterTags) < 2 or not filterTags[1]:
                myLogging.error('no subUrls, skipped, %s', inductBasePageUrl)
                continue
            subUrlTags = filterTags[1]
            for tag in subUrlTags.select('a'):
                subUri = tag['href']
                subUrl = urlparse.urljoin(indBaseUrl, subUri)
                myLogging.info('start sub indust base pages, %s', subUrl)
                qichachaFromIndustPageUrl(subUrl, conn, csor)
                myLogging.info('end sub indust base pages, %s', subUrl)
        except Exception as e:
            myLogging.error('indust error, industCode: %s url: %s; error: %s ',
                            industCode, inductBasePageUrl, e)
def insertInvestList(uid, content):
    """Store one investment list for a company.

    Uses INSERT IGNORE so re-inserting an already-stored uid is a no-op.
    Lazily (re)creates the module-level connection/cursor if missing.
    """
    global conn, csor
    if not (conn and csor):
        conn, csor = getComConnCsor()
    params = (uid, content)
    csor.execute('insert ignore com_invest (uid, investList) values (%s, %s)',
                 params)
    conn.commit()
def loadComNameByLength(nameLength):
    """Fetch all company names from com_base_copy with the given length.

    Returns the raw cursor.fetchall() result (a sequence of 1-tuples).
    Lazily (re)creates the module-level connection/cursor if missing.
    """
    global conn, csor
    if not (conn and csor):
        conn, csor = getComConnCsor()
    query = 'select companyName from com_base_copy where length(companyName) = %s '
    csor.execute(query, (nameLength, ))
    return csor.fetchall()
def getQichachaInvestDigests():
    """Build and return a bloom filter of uids already in com_invest.

    Used to skip companies whose investment lists were already crawled.
    """
    idbloom = getBloom()
    conn, csor = getComConnCsor()
    csor.execute('select uid from com_invest')
    ids = csor.fetchall()
    # Plain loop instead of the original side-effect-only list
    # comprehension, which built a throwaway list of None values.
    for row in ids:
        idbloom.add(row[0])
    # if ids[0][0] in idbloom:
    myLogging.info('load exists ids ok')
    return idbloom
def crawlBaseInfo(begin, end): print 'start from ', begin, ' to ', end baseUrl = 'http://www.tianyancha.com/IcpList/' conn, csor = getComConnCsor() seq = range(begin, end) random.shuffle(seq) for id in seq: try: dealById(baseUrl, conn, csor, id) except Exception as e: print id, ': ', e
def qichachaFromProvs(provs):
    """Crawl qichacha province listing pages for each province code given.

    Visits pages 1..500 per province; per-page errors are logged and the
    crawl continues with the next page.
    """
    myLogging.info('start: provs %s', str(provs))
    catBaseIrl = 'http://www.qichacha.com/gongsi_area_prov_'
    conn, csor = getComConnCsor()
    for prov in provs:
        provPagePrefix = catBaseIrl + prov + '_p_'
        for pageCount in range(1, 501):
            pageUrl = provPagePrefix + str(pageCount) + '.shtml'
            try:
                html = getQichachaHtml(pageUrl)
                soup = getSoupByStrEncode(html, 'utf-8')
                dealUIDsBySoup(conn, csor, pageCount, soup, prov)
            except Exception as ee:
                myLogging.error('page ' + str(pageCount) + ' error %s', ee)
def insertWithUid(conn2, csor2, prv, uid):
    # Fetch one company's base info from qichacha by uid and insert it into
    # com_base_copy, skipping uids already present in the idBloom filter.
    if uid in idBloom:
        print 'already crawled uid:', uid
        return
    # idBloom.add(uid)
    global conn, csor
    # NOTE(review): this tests the module-level conn/csor but rebinds the
    # LOCAL parameters conn2/csor2 — the passed-in connection is silently
    # replaced whenever the globals are unset. Presumably a lazy-connect
    # guard that meant to test conn2/csor2; confirm against callers.
    if not conn or (not csor):
        conn2, csor2 = getComConnCsor()
    com_base_info_str = getBaseInfoById(prv, uid)
    com_base_info_json = json.loads(com_base_info_str)
    # status != 1 means the remote API reported failure for this uid.
    if com_base_info_json['status'] != 1:
        print 'json int not succ , uid: ', uid, ' content:', com_base_info_str
        return
    data = com_base_info_json['data']['Company']
    companyType = data['EconKind']
    # webName = data['webName']
    companyName = data['Name']
    liscense = data['No']
    # Fall back to the organisation number when the licence number is empty.
    if not liscense:
        liscense = data['OrgNo']
    examineDate = ''
    if data['CheckDate']:
        examineDate = data['CheckDate'].strip()
    # webSite = ','.join(data['webSite'])
    # sql = """insert ignore into com_base (id,companyName,companyType,examineDate,liscense,source,webSite,webName) values (%s,%s,%s,%s,%s,%s,%s,%s);""" % (str(id), companyName, companyType,examineDate, liscense, "tianyacha",webSite,webName)
    global staticInsertTotolCount, staticInsertTotolTime, staticInsertCarry
    # Time the insert so statisMysqlInsert can aggregate insert latency.
    startTime = time.time()
    try:
        csor2.execute(
            """insert ignore into com_base_copy (id,companyName,companyType,examineDate,liscense,source,src_content) values (%s,%s,%s,%s,%s,%s,%s);""",
            (uid, companyName, companyType, examineDate, liscense, "qichacha",
             com_base_info_str))
        conn2.commit()
        myLogging.info('comOk, uid: %s, comName: %s', uid,
                       unicode(companyName).encode('utf-8'))
        endTime = time.time()
        thisSpentTime = endTime - startTime
        statisMysqlInsert(staticInsertCarry, thisSpentTime)
    except Exception as e:
        myLogging.error('insert error, uid: %s, error:%s', uid, e)
def fromInvestInt():
    """Start investment-list crawling for one hard-coded company row.

    Selects a single id/companyName pair from com_base_copy and feeds it
    to getInvestListByNameId; rows without a name are logged and skipped.
    """
    global conn, csor
    if not (conn and csor):
        conn, csor = getComConnCsor()
    csor.execute(
        "select id,companyName from com_base_copy where id = '6bc7e7ccdb755391651316a0227c059b' and companyName is not Null limit 10;"
    )
    for uid, cName in csor.fetchall():
        if not cName:
            myLogging.warning('no comName skip, uid: %s', uid)
            continue
        getInvestListByNameId(uid, cName)
def getQichachaDigests():
    """Return a bloom filter of already-crawled qichacha company ids.

    Tries the on-disk dump first; when absent, loads every id from
    com_base_copy into a fresh bloom filter and writes a new dump so the
    next run can skip the database scan.
    """
    idbloom = loadBloomFromFile('local/qichachaUIDs')
    if idbloom:
        myLogging.info('load bloom from file succ, no need load from db')
        # return idbloom
    else:
        myLogging.info('no dump bloom file, load from db')
        idbloom = getBloom(2000 * 10000)
        # idbloom = getBloom()
        conn, csor = getComConnCsor()
        csor.execute('select id from com_base_copy')
        # csor.execute('select id from com_base_copy limit 10')
        ids = csor.fetchall()
        # Plain loop instead of the original side-effect-only list
        # comprehension, which built a throwaway list of None values.
        for row in ids:
            idbloom.add(row[0])
        # if ids[0][0] in idbloom:
        myLogging.info('load exists ids ok, generate dump bloom file')
        dumpBloomToFile(idbloom, fileName='local/qichachaUIDs')
    return idbloom