# -*- coding: utf-8 -*-
# Imports reconstructed from what the code below actually uses. cfg is
# this project's config reader, imported/defined elsewhere in the original
# source; "xlwt as excel" is an assumption based on the Workbook()/
# add_sheet() calls in write_to_excel.
import gzip
import random
import StringIO
import time
import urllib2

from pymongo import MongoClient
import xlwt as excel


def getGeoLocation():
    # Geocode every address in D:/shequ.txt through the configured Baidu
    # endpoint and append "addr,lng,lat" rows to D:/shequloc.txt.
    url = cfg.getProperty("baiduapi", "geo_url")
    fout = open("D:/shequloc.txt", "a")
    with open('D:/shequ.txt', 'r') as f:
        loop = 0
        for line in f:
            addr = line.rstrip()
            lng, lat = geoinfo(url, addr)
            print lng, lat
            fout.write(addr + ',' + lng + ',' + lat + '\n')
            loop += 1
            # Flush every 20 rows so progress survives a crash.
            if loop % 20 == 0:
                fout.flush()
            # Randomized pause between requests to stay under rate limits.
            time.sleep(random.randint(2, 5))
    fout.close()
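# getGeoLocation() leans on a geoinfo() helper that is not part of this
# listing. The sketch below is a guess at its shape, not the original:
# the query parameters and the result['result']['location'] layout follow
# Baidu's Geocoding API, and the access key (ak) is assumed to be baked
# into geo_url already.
import json
import urllib


def geoinfo(url, addr):
    query = url + '?' + urllib.urlencode({'address': addr, 'output': 'json'})
    response = urllib2.urlopen(query)
    result = json.load(response)
    response.close()
    loc = result['result']['location']
    # Return strings so the caller can comma-join them directly.
    return str(loc['lng']), str(loc['lat'])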
def get_db():
    # Open a MongoDB handle using host/port/name from the config file.
    print cfg.getProperty("database", "DB_HOST")
    client = MongoClient(cfg.getProperty("database", "DB_HOST"),
                         int(cfg.getProperty("database", "DB_PORT")))
    db = client[cfg.getProperty("database", "DB_NAME")]
    return db
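# A quick usage sketch for get_db(). The 'house' collection name and the
# document fields are illustrative assumptions, not from the original code.
def demo_get_db():
    db = get_db()
    db.house.insert({'district': u'浦东', 'price': 35000})
    print db.house.find_one({'district': u'浦东'})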
def write_to_excel(sheet, datalist):
    # Dump three-column rows into an .xls file named after the sheet;
    # xlwt wants a unicode sheet name, hence the decode.
    fn = 'D:/shequ_' + sheet + '.xls'
    wb = excel.Workbook()
    ws = wb.add_sheet(sheet.decode('utf-8'))
    size = len(datalist)
    for i in range(size):
        for j in range(3):
            try:
                ws.write(i, j, datalist[i][j])
            except Exception:
                # Report the failing cell instead of aborting the export.
                print i, j
    wb.save(fn)
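# Usage sketch for write_to_excel(): rows must have three columns to match
# the range(3) loop above. The values here are invented for illustration.
def demo_write_to_excel():
    rows = [[u'community-A', u'address-A', 12000],
            [u'community-B', u'address-B', 15000]]
    write_to_excel('浦东', rows)  # produces D:/shequ_浦东.xls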
def get_html(url):
    # NOTE: the opening lines of this fetch helper were lost when the
    # listing was flattened. The function name and the urllib2 request
    # below are a reconstruction; everything from response.close() on is
    # the original code.
    try:
        request = urllib2.Request(url)
        request.add_header('Accept-Encoding', 'gzip')
        response = urllib2.urlopen(request)
        data = response.read()
        response.close()
        # Soufun serves gzip-compressed pages; decompress when the
        # Content-Encoding header says so.
        gzipped = response.headers.get('Content-Encoding')
        if gzipped:
            data = StringIO.StringIO(data)
            gzipper = gzip.GzipFile(fileobj=data)
            html = gzipper.read()
            gzipper.close()
        else:
            html = data
        # Pages come back GBK-encoded; normalize to UTF-8.
        html = html.decode('GBK').encode('utf-8')
        return html
    except Exception, e:
        print "No content for ", url, e
        return None


def test():
    # Scratch checks: random sampling and GBK/UTF-8 round-tripping.
    for i in range(3):
        print random.randint(1, 5)
    d = '浦东'
    print d.decode('GBK').encode('utf-8')
    print d.decode('utf-8')


if __name__ == '__main__':
    # crawlerjob/crawler_pages are the crawl drivers defined elsewhere
    # in the original project.
    url = cfg.getProperty("soufun", "url")
    crawlerjob(url)
    # crawler_pages(u'浦东', 'http://esf.sh.soufun.com/housing/25__0_0_0_0_1_0_0/')
    # test()
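# The gzip branch in get_html() can be sanity-checked in isolation:
# compress a string into an in-memory buffer, then unwrap it exactly the
# way the crawler does (GzipFile over a StringIO of the raw bytes).
def demo_gzip_roundtrip():
    buf = StringIO.StringIO()
    gz = gzip.GzipFile(fileobj=buf, mode='wb')
    gz.write('hello, soufun')
    gz.close()
    raw = buf.getvalue()

    gz = gzip.GzipFile(fileobj=StringIO.StringIO(raw))
    print gz.read()  # -> hello, soufun
    gz.close()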