def sort():
    """Note: in the mongo JavaScript shell the syntax is sort({"field1": 1, "field2": -1}),
    where 1 means ascending and -1 means descending.
    """
    ppt(list(users.find().sort("profile.enthnicity", pymongo.ASCENDING)))
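# A hedged sketch of sorting on multiple fields: unlike the shell, pymongo takes
# a list of (field, direction) tuples. The "age" field is reused from the other
# examples in this collection.
def sort_multiple_fields():
    ppt(list(users.find().sort([
        ("profile.enthnicity", pymongo.ASCENDING),
        ("age", pymongo.DESCENDING),
    ])))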
def dot_notation():
    """Because documents can be nested, you can use field.sub_field.sub_sub_field ...
    to access child fields.
    """
    ppt(list(users.find({"profile.enthnicity": "asian"})))
def unit_test():
    spider = Crawler()
    url = 'http://www.walgreens.com/locator/walgreens-424+sycolin+rd+se-leesburg-va-20175/id=15085'
    # url = 'http://www.walgreens.com//locator/walgreens-900+illinois+ave.-stevens+point-wi-54481/id=13074'
    html = spider.html(url)
    if html:
        soup = bs4.BeautifulSoup(html, "html.parser")
        services = dict()
        # extract services
        for div in soup.findAll('div', attrs={'class': 'padTop5px wid220 float-left'}):
            for li in div.findAll('li'):
                services.setdefault('shop', list())
                services['shop'].append(li.text.strip())
        for div in soup.findAll('div', attrs={'class': 'wid220 float-left'}):
            for li in div.findAll('li'):
                services.setdefault('pharmacy', list())
                services['pharmacy'].append(li.text.strip())
        for div in soup.findAll('div', attrs={'class': 'mrgTop10px mrgBtm20px'}):
            for a in div.findAll('a', href=re.compile(r'http://photo.walgreens.com/walgreens/storepage/[\s\S]*')):
                services.setdefault('photo', list())
                services['photo'].append(a.text.strip())
        ppt(services)
def include_computed_fields():
    """Output documents after computing new values from them. This example splits
    the isbn into several sub-fields.
    For the string aggregation operators, see:
    http://docs.mongodb.org/manual/reference/operator/aggregation-string/
    """
    res = col.aggregate([
        {
            "$project": {
                "title": 1,
                "isbn": {
                    "prefix": {"$substr": ["$isbn", 0, 3]},
                    "group": {"$substr": ["$isbn", 3, 2]},
                    "publisher": {"$substr": ["$isbn", 5, 4]},
                    "title": {"$substr": ["$isbn", 9, 3]},
                    "checkDigit": {"$substr": ["$isbn", 12, 1]},
                },
                "lastName": "$author.last",
                "copiesSold": "$copies",
            }
        }
    ])
    for doc in res:
        ppt(doc)
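# A hedged note: since MongoDB 3.4, $substr is deprecated in favor of the
# byte-based $substrBytes (and the code-point-based $substrCP). On newer
# servers the first sub-field above could be written, for example, as:
#     "prefix": {"$substrBytes": ["$isbn", 0, 3]}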
def example2():
    '''How to use the GoogleV3 API (as of 2014-09-04)

    Official introduction = https://developers.google.com/maps/documentation/geocoding/
    User console = https://code.google.com/apis/console/

    1. Log in to your google account & google developer account.
    2. Go to the google developer console; if there's no project, create one with any name.
    3. Go to "Service"; it's on the left hand panel.
    4. Activate "Geocoding API".
    5. Go to "API Access"; it's on the left hand panel.
    6. Scroll down to "Simple API access", click "Create new server key", and
       leave it empty (this allows access from all IPs).
    7. The API key is then shown there.

    Note: regular users only need the API key; business users need Client_id and Secret_key.
    '''
    from geopy.geocoders import GoogleV3
    from pprint import pprint as ppt
    ## Cache the query result locally, to save the API access limit while testing
    try:
        LOC = pickle.load(open('location.p', 'rb'))
    except Exception:
        print('Warning! Calling Google Geocoding API')
        engine = GoogleV3('AIzaSyBq-NZmY8G6Tm7Fzpx4dAR55Uk0n-5AIDQ')
        location = engine.geocode("238 MCMECHEN STREET BALTIMORE MD, 21217")
        LOC = location.raw  # <=== it's a good idea to save this
        pickle.dump(LOC, open('location.p', 'wb'))
    ppt(LOC)
def upsert_example():
    """upsert means: first try to update; if the document is not found, insert it.
    """
    users.update({"name": "obama"}, {"$set": {"name": "obama"}}, upsert=True)
    fmter.tpl._straightline("after", 100)
    for doc in users.find({"name": "obama"}):
        ppt(doc)
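# A hedged sketch: in pymongo 3.x, Collection.update() is deprecated; the same
# upsert is written with update_one().
def upsert_example_pymongo3():
    users.update_one({"name": "obama"}, {"$set": {"name": "obama"}}, upsert=True)
    for doc in users.find({"name": "obama"}):
        ppt(doc)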
def multiUpdate():
    # users.update({}, {"name": "obama"})  # only one document is updated
    users.update({}, {"$set": {"name": "obama"}}, multi=True)  # all documents matching the where clause are updated
    for doc in users.find():
        ppt(doc)

# multiUpdate()
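# A hedged sketch: in pymongo 3.x the multi=True form is replaced by update_many().
def multi_update_pymongo3():
    users.update_many({}, {"$set": {"name": "obama"}})
    for doc in users.find():
        ppt(doc)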
def test():
    api_keys = GOOGLE_API_KEYS[:3]
    googlegeocoder = geomate.GoogleGeocoder(api_keys=api_keys)
    googlegeocoder.check_usable()
    ppt(googlegeocoder.geocode("1400 S Joyce St"))
    ppt(googlegeocoder.reverse((38.860, -77.066)))
def covered_query_example():
    """Because we ask for only the a, b, c fields and exclude _id, this is a
    covered query:
    - executionTimeMillis is very small, close to 0
    - totalDocsExamined equals 0, because only the index is used, never the
      documents themselves
    """
    cursor = col.find({"a": 7, "b": 7, "c": 7},
                      {"_id": False, "a": True, "b": True, "c": True})
    print("this is a covered query")
    ppt(cursor.explain()["executionStats"])
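# A hedged sketch: the query above is only covered if a compound index
# containing a, b and c exists; it is presumably created elsewhere in the
# original script, but would look like this:
def create_covering_index():
    col.create_index([("a", 1), ("b", 1), ("c", 1)])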
def pushAll():
    """Append several items on the right, like list = list + another_list.
    """
    fmter.tpl._straightline("before", 100)
    ppt(users.find({"_id": 1})[0])
    users.update({"_id": 1}, {"$pushAll": {"skillset": ["data visualization", "R"]}})
    fmter.tpl._straightline("after", 100)
    ppt(users.find({"_id": 1})[0])
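# A hedged note: $pushAll was deprecated in MongoDB 2.4 and later removed; the
# modern equivalent is $push with the $each modifier.
def push_each():
    users.update({"_id": 1},
                 {"$push": {"skillset": {"$each": ["data visualization", "R"]}}})
    ppt(users.find({"_id": 1})[0])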
def unset():
    """To remove a key, use "$unset".
    """
    fmter.tpl._straightline("before", 100)
    ppt(users.find({"_id": 1})[0])
    users.update({"_id": 1}, {"$unset": {"profile": 1}})
    fmter.tpl._straightline("after", 100)
    ppt(users.find({"_id": 1})[0])
def pop():
    """Remove one item from the right end, like list.pop().
    (Use -1 instead of 1 to pop from the left end.)
    """
    fmter.tpl._straightline("before", 100)
    ppt(users.find({"_id": 1})[0])
    users.update({"_id": 1}, {"$pop": {"skillset": 1}})
    fmter.tpl._straightline("after", 100)
    ppt(users.find({"_id": 1})[0])
def pullall():
    """Remove several items, like: for i in list, if i in [item1, item2, ...], list.remove(i).
    """
    fmter.tpl._straightline("before", 100)
    ppt(users.find({"_id": 1})[0])
    users.update({"_id": 1}, {"$pullAll": {"skillset": ["python", "cSharp", "R"]}})
    fmter.tpl._straightline("after", 100)
    ppt(users.find({"_id": 1})[0])
def pull():
    """Remove every occurrence of an item, like: for i in list, if i == pull_item, list.remove(i).
    """
    users.update({"_id": 1}, {"$push": {"skillset": "python"}})  # push one more "python" first, to create a duplicate on purpose
    fmter.tpl._straightline("before", 100)
    ppt(users.find({"_id": 1})[0])
    users.update({"_id": 1}, {"$pull": {"skillset": "python"}})
    fmter.tpl._straightline("after", 100)
    ppt(users.find({"_id": 1})[0])  # every "python", duplicates included, is removed
def push():
    """Append one item on the right, like list.append(item).
    """
    fmter.tpl._straightline("before", 100)
    ppt(users.find({"_id": 1})[0])
    users.update({"_id": 1}, {"$push": {"skillset": "data visualization"}})
    fmter.tpl._straightline("after", 100)
    ppt(users.find({"_id": 1})[0])
def main_app():
    ppt(get_tk(host))
    print(login(host, login_url))
    print(get_csrf(csrf_url))
    data = get_app_data(visit_url)
    for dt in data:
        save_app_data(dt)
        time.sleep(1)
        print(dt)
    print('Spider finished!')
def addToSet():
    """Treat the array as a set and perform set.add(item).
    """
    users.update({"_id": 1}, {"$push": {"skillset": "python"}})  # push one more "python" first, to create a duplicate on purpose
    fmter.tpl._straightline("before", 100)
    ppt(users.find({"_id": 1})[0])
    users.update({"_id": 1}, {"$addToSet": {"skillset": "R"}})
    fmter.tpl._straightline("after", 100)
    ppt(users.find({"_id": 1})[0])  # the array is only treated as a set at insertion time; it is never automatically converted to a set
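# A hedged sketch: $addToSet also accepts the $each modifier to add several
# distinct values at once (the "scala" value here is made up for illustration).
def add_to_set_each():
    users.update({"_id": 1},
                 {"$addToSet": {"skillset": {"$each": ["R", "scala"]}}})
    ppt(users.find({"_id": 1})[0])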
def test_get_all_state():
    # Get test data
    url = urlencoder.state_listpage()
    path = get_path("all_state.html")
    if not exists(path):
        with ChromeSpider(executable_path) as spider:
            html = spider.get_html(url)
            write(html, path)
    html = read(path)
    ppt(htmlparser.get_all_state(html))
def update_without_set():
    """If instead of collection.update(query, {"$set": {key: value}}) you use:
    collection.update(query, new_document),
    every matched document is replaced entirely by new_document.
    """
    fmter.tpl._straightline("before", 100)
    ppt(users.find({"_id": 1})[0])
    users.update({"_id": 1}, {"_id": 1})  # replace the whole document with the new one
    fmter.tpl._straightline("after", 100)
    ppt(users.find({"_id": 1})[0])
def main(begin, end):
    for anchor_id in range(begin, end):
        print('anchor_id: {0}'.format(anchor_id))
        data = get_data(url, anchor_id)
        if data:
            try:
                nick_byte = data['nickname'].encode()
                nick_name = nick_byte.decode('unicode_escape')
            except Exception:
                nick_name = data['nickname']
            save_data(data, nick_name)
            ppt(data)
def get_object(self, key):
    """Fetch an object from the bucket and return its body as bytes.
    """
    obj = self.bucket.Object(key=key)
    res = obj.get()
    ppt(res)
    content = res["Body"].read()
    return content
def find_one():
    """Instead of returning a cursor object, find_one() returns a single document.
    So when you look up a document by its _id (the _id field is always unique),
    use the find_one() method.
    """
    fmter.tpl._straightline("one document", 100)
    result = users.find_one({})
    print(type(result))
    ppt(result)

    fmter.tpl._straightline("none result", 100)
    result = users.find_one({"_id": 100})
    print(type(result))
    ppt(result)
def insept_example():
    """"insept" means: first try to insert; if that hits a duplicate _id, update
    instead. The same logic can be implemented with upsert. Note: sometimes the
    document does not contain an _id field at all.
    """
    doc = {"_id": 1, "name": "obama", "new_field": 999}
    try:
        users.insert(doc)
    except Exception:  # typically pymongo.errors.DuplicateKeyError
        _id = doc["_id"]
        del doc["_id"]
        users.update({"_id": _id}, {"$set": doc}, upsert=True)
    ppt(users.find({"name": "obama"})[0])

# insept_example()
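# A hedged sketch: with pymongo 3.x the same insert-or-update logic is a single
# replace_one() call with upsert=True.
def insept_example_pymongo3():
    doc = {"_id": 1, "name": "obama", "new_field": 999}
    users.replace_one({"_id": doc["_id"]}, doc, upsert=True)
    ppt(users.find_one({"_id": 1}))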
def test_get_one():
    print("{:=^100}".format("movie"))
    ppt(movie.get_one(1))
    print("{:=^100}".format("maker"))
    ppt(maker.get_one(1))
    print("{:=^100}".format("genre"))
    ppt(genre.get_one(1))
    print("{:=^100}".format("person"))
    ppt(person.get_one(1))
    print("{:=^100}".format("role"))
    ppt(role.get_one(1))
def example3():
    """Reverse Geocoding
    """
    from geopy.geocoders import GoogleV3
    from pprint import pprint as ppt
    ## Cache the query result locally, to save the API access limit while testing
    try:
        LOC = pickle.load(open('location.p', 'rb'))
    except Exception:
        print('Warning! Calling Google Geocoding API')
        engine = GoogleV3('AIzaSyAuzs8xdbysdYZO1wNV3vVw1AdzbL_Dnpk')
        location = engine.reverse("44.44, -104.51")[0]  # <== reverse lookup from coordinates
        print(location, dir(location), location.raw)
        LOC = location.raw  # <=== it's a good idea to save this
        pickle.dump(LOC, open('location.p', 'wb'))
    ppt(LOC)
def test_pool(self, pool):
    print('\nBegin to test whether `proxy_pool` can be used, you will wait about [{} minutes].\n'
          .format(math.trunc((len(pool) * 5) / 60)))
    url_pool = [
        'https://hao.360.cn/',
        'https://www.baidu.com/',
        'https://www.taobao.com/',
        'https://www.jd.com/',
        'http://www.weibo.com/',
        'http://www.toutiao.com/',
    ]
    UA = {'User-Agent': ('Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 '
                         '(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6')}
    sss = requests.Session()
    final_proxy_pool = []
    start = time.time()
    N = 0
    for pxy in pool:
        sss.headers.update(UA)
        sss.proxies.update(pxy)
        url = random.choice(url_pool)
        try:
            r = sss.get(url, timeout=5)
            if r.status_code == requests.codes.ok:
                final_proxy_pool.append(pxy)
                # print('Success num[{0}], [{1}].'.format(len(final_proxy_pool), pxy))
        except Exception:
            pass
        finally:
            sss.close()
            N += 1
            print('Tested [{0}%] -------- Wait [{1} minutes]'.format(
                math.trunc((N / len(pool)) * 100),
                math.trunc(-1 + (len(pool) * 5 - (time.time() - start)) / 60)))
    print('\n[{0}] proxies will be loaded.\n'.format(len(final_proxy_pool)))
    ppt(final_proxy_pool)
    # print('--------------------test_time:{} seconds. Instantiation end.---------------------'.format(math.trunc(time.time()-start)))
    print('\n---------------------- YOU CAN BEGIN TO USE PROXY ----------------------\n')
    return final_proxy_pool
def path(maze, begin, end):
    i, j = begin
    ei, ej = end
    s = [(i, j)]
    maze[i][j] = 1
    while s:
        i, j = s[-1]
        if (i, j) == (ei, ej):
            break
        for di, dj in [(0, 1), (1, 0), (-1, 0), (0, -1)]:
            if maze[i + di][j + dj] == 0:
                maze[i + di][j + dj] = 1
                s.append((i + di, j + dj))
                print('a: ', s)
                break
        else:
            s.pop()
            print('b: ', s)
    ppt(maze)
    return s
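# A hedged usage sketch: the neighbor checks index maze[i+di][j+dj] directly,
# so the maze is assumed to have a border wall of 1s that keeps the search
# inside the grid. 0 = open cell, 1 = wall (or visited).
def path_demo():
    maze = [
        [1, 1, 1, 1, 1],
        [1, 0, 0, 0, 1],
        [1, 1, 1, 0, 1],
        [1, 0, 0, 0, 1],
        [1, 1, 1, 1, 1],
    ]
    # expected route: (1,1) -> (1,2) -> (1,3) -> (2,3) -> (3,3) -> (3,2) -> (3,1)
    print(path(maze, (1, 1), (3, 1)))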
def test_get_all_county_and_zipcode():
    state_list = ["MD", "VA", "PA"]
    path_list = [get_path("state_%s.html" % state) for state in state_list]

    # Get test data
    need_download_flag = False
    for state, path in zip(state_list, path_list):
        if not exists(path):
            need_download_flag = True

    if need_download_flag:
        with ChromeSpider(executable_path) as spider:
            for state, path in zip(state_list, path_list):
                if not exists(path):
                    url = urlencoder.county_and_zipcode_listpage(state)
                    html = spider.get_html(url)
                    write(html, path)

    for state, path in zip(state_list, path_list):
        html = read(path)
        all_county, all_zipcode = htmlparser.get_all_county_and_zipcode(html)
        ppt(all_zipcode)
def absolute_update():
    """The collection.update() syntax has two parts: the first locates the
    documents to modify, the second sets the values.

    Note: "$set": {key: value} only modifies the key part. If you instead use
    users.update({"_id": 1}, {"name": "obama"}), the whole document is replaced
    by {"name": "obama"}.
    """
    fmter.tpl._straightline("before", 100)
    ppt(users.find({"_id": 1})[0])
    ppt(users.find({"_id": 2})[0])

    users.update({"_id": 1}, {"$set": {"name": "obama",  # update name field
                                       "profile.enthnicity": "african american"}})  # access child field
    users.update({"name": "michael"}, {"age": 100})  # replace whole document, only _id is kept

    fmter.tpl._straightline("after", 100)
    ppt(users.find({"_id": 1})[0])
    ppt(users.find({"_id": 2})[0])
def test_get_one():
    print("{:=^100}".format("user"))
    ppt(user.get_one(1))
    print("{:=^100}".format("post"))
    ppt(post.get_one(1))
    print("{:=^100}".format("tag"))
    ppt(tag.get_one(1))
def test_UPDATE(self):
    connect = sqlite3.connect(":memory:")
    cursor = connect.cursor()

    # http://www.w3schools.com/sql/sql_create_table.asp
    create_table_sql = \
    """
    CREATE TABLE employee
    (
        _id INTEGER PRIMARY KEY NOT NULL,
        role TEXT,
        name TEXT,
        profile BLOB
    )
    """
    cursor.execute(create_table_sql)

    data = [(1, "coder", "John", None), (2, "sales", "Mike", None)]
    cursor.executemany("INSERT INTO employee VALUES (?,?,?,?)", data)

    cursor.execute("UPDATE employee SET role = ?, profile = ? WHERE _id = ?",
                   ("manager", pickle.dumps({"age": 32}), 2))
    ppt(cursor.execute("SELECT * FROM employee").fetchall())
def test_post_all():
    all_data = json.load(open("all_data.json", "r"))
    for data in all_data["maker_data"]:
        res = maker.post_one(data)
        ppt(res)
    for data in all_data["genre_data"]:
        res = genre.post_one(data)
        ppt(res)
    for data in all_data["person_data"]:
        res = person.post_one(data)
        ppt(res)
    for data in all_data["movie_data"]:
        res = movie.post_one(data)
        ppt(res)
def relative_update():
    """SQL supports relative updates like: SET column_name = column_name + 1.
    In mongodb there are two ways to do this:
    1. Use operators such as $inc and $mul:
       http://docs.mongodb.org/manual/reference/operator/update-field/
    2. find() the document first, modify the document object, then save the
       change with collection.save(document).
    """
    fmter.tpl._straightline("before", 100)
    ppt(users.find({"_id": 1})[0])
    doc = users.find_one({"_id": 1})  # find the document
    doc["age"] += 30  # do some change to the document
    users.save(doc)  # save changes into the collection
    fmter.tpl._straightline("after", 100)
    ppt(users.find({"_id": 1})[0])

    fmter.tpl._straightline("before", 100)
    ppt(users.find({"_id": 2})[0])
    users.update({"_id": 2}, {"$inc": {"age": 30}})
    fmter.tpl._straightline("after", 100)
    ppt(users.find({"_id": 2})[0])
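# A hedged note: the find() / modify / save() pattern above is not atomic; a
# concurrent writer can change the document between the read and the write.
# $inc is applied atomically on the server, and pymongo 3.x also offers
# find_one_and_update() to update and fetch in one step.
def relative_update_atomic():
    from pymongo import ReturnDocument
    doc = users.find_one_and_update(
        {"_id": 2},
        {"$inc": {"age": 30}},
        return_document=ReturnDocument.AFTER,
    )
    ppt(doc)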
def discover_index():
    """Every database in mongodb has a reserved collection called system.indexes,
    which stores the index information of every collection in that database. You
    can inspect all the metadata with db.system.indexes.find(), or look at the
    indexes of one collection with
    db.system.indexes.find({"ns": #db_name.collection_name}).

    mongodb automatically builds a primary-key index on the special _id field of
    every collection, just like a relational database.
    """
    print("\nshow that the two fields _id and a are indexed")
    ppt(list(db.system.indexes.find({"ns": "test.col"})))  # shows that the two fields _id and a are indexed

    print("\nreturn the list of all indexes")
    ppt(list(db.col.list_indexes()))  # pymongo api, returns the list of all indexes

    print("\nreturn the information of all indexes in the collection")
    ppt(db.col.index_information())  # pymongo api, returns information on all indexes in the collection
def main():
    try:
        database.connect()
        database.execute_sql('PRAGMA foreign_keys = ON;')
        logger.info("Querying the database")
        query = (Job.select())
        ppt('{:25} | {:10} | {:10} | {:20}'.format(
            'Job Name', 'Duration', 'Name', 'Department'))
        ppt('-' * 75)
        for job in query:
            ppt('{:25} | {:10} | {:10} | {:20}'.format(
                job.job_name, job.duration, str(job.person_employed),
                job.job_department.dpt_name))
    except Exception as e:
        logger.info(e)
    finally:
        database.close()
How to query in mongodb.
http://docs.mongodb.org/manual/reference/operator/query/
"""

from tt00_connect import client, db, users
from angora.GADGET.pytimer import Timer
from angora.STRING.formatmaster import fmter
from pprint import pprint as ppt
from datetime import datetime, date
import re

timer = Timer()

# comparison operators: $gt, $gte, $lt, $lte, $ne, $eq = {key: value}
fmter.tpl._straightline("""users.find({"age": {"$gt": 30}})""", 150)
ppt(list(users.find({"age": {"$gt": 30}})))

fmter.tpl._straightline("""users.find({"age": {"$lt": 30}})""", 150)
ppt(list(users.find({"age": {"$lt": 30}})))

# compare datetime; in the java script shell, the command to create a datetime object is:
# db.test.insert({"Time" : new ISODate("2012-01-11T03:34:54Z") });
fmter.tpl._straightline("""users.find({"enroll_date": {"$gt": datetime(2014, 6, 1)}}""", 150)
ppt(list(users.find({"enroll_date": {"$gt": datetime(2014, 6, 1)}})))

# $in, $nin, in and not in
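# A hedged sketch of $in / $nin, continuing the examples above (the age values
# are made up for illustration):
fmter.tpl._straightline("""users.find({"age": {"$in": [25, 30, 35]}})""", 150)
ppt(list(users.find({"age": {"$in": [25, 30, 35]}})))

fmter.tpl._straightline("""users.find({"age": {"$nin": [25, 30, 35]}})""", 150)
ppt(list(users.find({"age": {"$nin": [25, 30, 35]}})))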
    for i, j in attackGroup:
        M[i][j] = M[j][i] = 1
    return M

def division(M, n=None):
    if not n:
        n = len(M)  # if some animals attack nobody, the total animal count is usually given explicitly
    res = []
    q = deque(range(n))
    pre = n
    while q:
        cur = q.popleft()
        if pre >= cur:
            res.append([])
        for i in res[-1]:
            if M[cur][i] == 1:
                q.append(cur)
                break
        else:
            res[-1].append(cur)
        pre = cur
    return res

if __name__ == '__main__':
    M = initM()
    ppt(M)
    print(division(M))
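# A hedged usage sketch with a hand-built, symmetric 0/1 attack matrix:
# animal 1 attacks animals 0 and 2, so it must land in its own group.
def division_demo():
    M_demo = [
        [0, 1, 0],
        [1, 0, 1],
        [0, 1, 0],
    ]
    print(division(M_demo, 3))  # expected: [[0, 2], [1]]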
    if mode == 'departure':
        origin = airport_code
    else:  # mode == 'arrival'
        destination = airport_code

    with open(fname, 'rb') as f:
        html = f.read()
    html = html.replace('a0:', '')
    soup = BS4(html)

    c = itertools.count(0)
    records = list()
    for tr in soup.find_all('tr', attrs={}):
        try:
            # tag2row is likely to raise an error here; when it does, the tag
            # contains no data and can simply be skipped
            if mode == 'departure':
                (destination, flight, airline, scheduled_time, actual_time,
                 Terminal_Gate, status, equip) = tag2row(tr, valid_airport_code, date)
            else:  # mode == 'arrival'
                (origin, flight, airline, scheduled_time, actual_time,
                 Terminal_Gate, status, equip) = tag2row(tr, valid_airport_code, date)
            if c.next() >= 2:  # the real flight rows only start at the 3rd tr tag
                if (origin in valid_airport_code) and (destination in valid_airport_code):
                    GUID = md5_obj((origin, destination, flight, scheduled_time))
                    records.append((GUID, origin, destination, flight, airline,
                                    scheduled_time, actual_time, Terminal_Gate,
                                    status, equip))
        except Exception:
            pass
    return records

if __name__ == '__main__':
    records = records_from_file(
        r'C:\HSH\Workspace\py27_projects\EFA-on-going-projects\TSA real-time query\04_database\2014-09-11 LAX 08-09.html',
        mode='departure')
    from pprint import pprint as ppt
    ppt(records)
def query_with_index():
    """Query on the array-like field tags.
    """
    st = time.clock()
    ppt(list(companys.find({"tags": {"$all": [1, 2, 3]}})))
    print("query WITH index cost %.4f seconds." % (time.clock() - st,))
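# A hedged sketch: the index assumed above would be an index on "tags"; MongoDB
# automatically makes it a multikey index when the field holds arrays.
def create_tags_index():
    companys.create_index("tags")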
I haven't fully figured out the details of how to use it yet.

Official document: http://docs.mongodb.org/manual/core/index-ttl/
"""

from introduction import client, db
from pprint import pprint as ppt
from datetime import datetime
import pymongo
import time

col = db.col
col.drop()

def initialize_collection():
    """Initialize the demo collection.

    Note: a TTL index only expires documents whose indexed field holds a BSON
    date, so a datetime field is stored here; indexing a non-date field such as
    an int would never expire anything.
    """
    col.insert([{"created_at": datetime.utcnow(), "value": i} for i in range(3)])

def create_TTL_index():
    col.create_index([("created_at", pymongo.ASCENDING)], expireAfterSeconds=3)

if __name__ == "__main__":
    initialize_collection()
    print("at begin, we have:")
    ppt(list(col.find()))

    create_TTL_index()
    time.sleep(65)  # the TTL monitor only runs about once every 60 seconds
    print("wait for 65 seconds, we have:")
    ppt(list(col.find()))

    client.close()
    port=3306,  # assumption: the default MySQL port; the value was masked in the source
    user='******', password='******',
    database='db_xxx', charset='utf8mb4')
    # cursorclass=pymysql.cursors.DictCursor)
cur = conn.cursor()
cur.execute(
    "CREATE TABLE IF NOT EXISTS db_xxx.xxx__umeng_app_yesterday_channel "
    "(app_name text,app_id text,deadline timestamp,channel_name text,secret_id text,"
    "active_user int,duration text,launch int,install int,total_install int,"
    "total_install_rate float,channel_url text);")

session = requests.Session()
print(login(host, login_url))

app_data = get_app_data(visit_url)
for item in appid_list(app_data):
    ppt(item)
    channel_url = csrf_url + item[1] + '/channels/load_table_data?'
    channel_data = get_channel_data(channel_url)
    # ppt(channel_data)
    for dt in channel_data:
        ppt(dt)
        save_channel_data(dt, item)
    time.sleep(10)

session.close()
cur.close()
conn.close()
print('All session & db_connection closed.')
def test_extract(self):
    ppt(processor.extract(readfromfile(r"testdata\pos.html")))
    ppt(processor.extract(readfromfile(r"testdata\neg.html")))
    ppt(processor.extract(readfromfile(r"testdata\netrual.html")))
    ppt(processor.extract(readfromfile(r"testdata\netrual_pos.html")))
    ppt(processor.extract(readfromfile(r"testdata\netrual_neg.html")))
def test_process(self):
    ppt(processor.process("It helps me saving money"))