def __save_customize_page(self, themeInfo):
    # HTTP POST is done with MIME type 'application/json'
    postData = jsonlib.dumps(themeInfo)
    postHandler = HTTPPostHandler('application/json')
    debug("Editing Theme HTML...")
    opener = self._make_opener()
    opener.add_handler(postHandler)
    opener.addheaders.append(('Referer', 'http://www.tumblr.com/customize/%s' % self.blogname))
    opener.addheaders.append(('Accept', 'application/json, text/javascript, */*; q=0.01'))
    opener.addheaders.append(('Accept-Charset', 'UTF-8,*;q=0.5'))
    opener.addheaders.append(('X-Requested-With', 'XMLHttpRequest'))
    opener.addheaders.append(('Origin', 'http://www.tumblr.com'))
    opener.addheaders.append(('Pragma', 'no-cache'))
    opener.addheaders.append(('Cache-Control', 'no-cache'))
    try:
        resp = opener.open('http://www.tumblr.com/customize_api/blog/%s' % self.blogname,
                           data=postData)
    except Exception as e:
        debug(" !! Failed to edit HTML")
        return None
    newThemeInfo = resp.fp.read()
    newThemeInfo = jsonlib.loads(newThemeInfo)
    debug(" <3 Theme Saved.")
    return newThemeInfo
def more(self):
    trs = [T.tr['']]
    response = yield fetch("http://bang:8007/commands",
                           headers={"X-Foaf-Agent": [str(self.user)]})
    if not response:
        raise ValueError('-H "X-Foaf-Agent: %s" http://bang:8007/commands failed' %
                         str(self.user))
    cmds = jsonlib.loads(response.body)
    belowZero = []
    for (cmd, score) in cmds:
        cmd = URIRef(cmd)
        if score < 0:
            belowZero.append((cmd, score))
            continue
        if len(trs[-1].children) >= 1 + columns:
            trs.append(T.tr[''])
        trs[-1].children.append(T.td["\n", self._buttonForm(cmd, score)])
    trs.append(T.tr[T.td(colspan=columns)])
    for (cmd, score) in belowZero:
        trs[-1].children[-1][self._buttonForm(cmd, score)]
    returnValue(T.table[trs])
def __get_customize_page(self):
    debug("Fetching Customize Page [%s]" % self.blogname)
    opener = self._make_opener()
    resp = opener.open('http://www.tumblr.com/customize/%s' % self.blogname)
    if resp.code != 200:
        debug(" !! Failed to fetch '/customize/%s': Error [%s]" % (self.blogname, resp.code))
        return None
    html = resp.fp.read()
    # find the "user_form_key"
    m = re.search(r'Tumblr\.Customize\.user_form_key\s?=\s?[\'\"]([^\'\"]+)[\'\"];', html)
    if not m:
        debug(" !! Failed to parse Theme: Could not find [user_form_key]")
        return None
    userFormKey = m.group(1)
    m = re.search(r'Tumblr\.Customize\.blog\.set\((.+)(?=\);\n)', html)
    if not m:
        debug(" !! Failed to parse Theme: Could not find JSON object in Tumblr.Customize.blog.set()")
        return None
    themeInfo = m.group(1)
    themeInfo = jsonlib.loads(themeInfo)
    themeInfo['user_form_key'] = userFormKey
    themeInfo['id'] = themeInfo['name']
    debug(' <3 Theme parsed')
    return themeInfo
def parseJsonResults(jsonResults):
    """returns the same as parseSparqlResults. Takes json string like this:
    { 'head': { 'link': [], 'vars': ['p', 'o'] },
      'results': { 'distinct': false, 'ordered': true, 'bindings': [
        { 'p': { 'type': 'uri', 'value': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' },
          'o': { 'type': 'uri', 'value': 'http://fantasyfamegame.com/2006/01/User' }},
        { 'p': { 'type': 'uri', 'value': 'http://fantasyfamegame.com/2006/01/username' },
          'o': { 'type': 'literal', 'value': 'drewp' }},
        { 'p': { 'type': 'uri', 'value': 'http://fantasyfamegame.com/2006/01/passwordSHA' },
          'o': { 'type': 'literal', 'value': '23fa12c6b4e9e3805a5e9d5dded3e78665fc1899' }},
        ...
    """
    # for speed, don't let jsonlib poke at the string to figure out the encoding
    jsonResults = jsonResults.decode('utf8')
    if jsonResults in ['true', 'false']:
        # this would have been made explicit in the Content-type
        # header, but I don't have that handy.
        return jsonResults == 'true'
    ret = []
    for row in jsonlib.loads(jsonResults)['results']['bindings']:
        outRow = {}
        for k, v in row.items():
            outRow[k] = parseJsonTerm(v)
        ret.append(outRow)
    return ret
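# Usage sketch for parseJsonResults. The input below is a hypothetical
# SPARQL JSON result; parseJsonTerm (defined elsewhere in this module)
# maps each {'type': ..., 'value': ...} binding dict to a term object.
raw = ('{"head": {"vars": ["o"]}, "results": {"bindings": '
       '[{"o": {"type": "literal", "value": "drewp"}}]}}')
rows = parseJsonResults(raw)
# rows is a list of dicts, one per binding row: [{'o': <parsed term>}]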
def decode(data):
    """
    A proxy method for BSON.decode
    TODO: This will block if a lot of data has been received!
    """
    try:
        return jsonlib.loads(data)
    except Exception as e:
        msg = 'Invalid JSON Data, got: %s:%s' % (e.__class__.__name__, e)
        return Fault(NOT_WELLFORMED_ERROR, msg)
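# Sketch of decode's contract: parsed data on success, a Fault on
# malformed input (Fault and NOT_WELLFORMED_ERROR come from the
# surrounding RPC module and are assumed here).
ok = decode('{"a": 1}')   # -> {'a': 1}
bad = decode('{oops')     # -> Fault(NOT_WELLFORMED_ERROR, 'Invalid JSON Data, ...')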
def post_iter(self):
    """
    Return an iterator over all the posts in the dataset. The ordering
    of the posts follows the order of posts in the dataset file.
    """
    fh = gzip.open(self._posts_fname, 'r')
    for line in fh:
        post = jsonlib.loads(line)
        yield post
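# Usage sketch: stream posts one at a time without loading the whole
# gzipped file. `dataset` and the per-post field names are assumptions
# about the surrounding code, not part of this module.
for post in dataset.post_iter():
    print post.get('text')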
def GetPage(newPage):
    url = ('http://sj.qq.com/myapp/cate/appList.htm?orgame=1&categoryId=0'
           '&pageSize=20&pageContext=' + str(newPage))
    page = urllib.urlopen(url)
    data = page.read()
    jsonData = jsonlib.loads(data)
    if jsonData['count'] == 0:
        print "do not have more"
        driver.close()
    else:
        print "have more"
        GetJson(newPage)
def GetJson(page):
    url = ('http://sj.qq.com/myapp/cate/appList.htm?orgame=1&categoryId=0'
           '&pageSize=20&pageContext=' + str(page))
    pageData = urllib.urlopen(url)
    data = pageData.read()
    jsonData = jsonlib.loads(data)
    for newLine in jsonData['obj']:
        print newLine[u'appName'] + '\t' + newLine['pkgName'] + '\t' + newLine['apkUrl']
        IfOfficial(newLine['pkgName'])
    page += 20
    print page
    GetPage(page)
def user_iter(self):
    """
    Return an iterator over all posts in the dataset grouped by user.
    Each user is represented by a list of their posts - so any metadata
    about the user must be aggregated from the posts it produced.
    """
    fh = gzip.open(self._users_fname, 'r')
    for line in fh:
        user = jsonlib.loads(line)
        yield user
def default(request):
    """ """
    logger = logging.getLogger("default")
    ip_client_string = get_client_ip(request)
    ip_client_value = ip_convert.ipv4_from_string(ip_client_string)
    logger.debug("from " + ip_client_string)
    if COOKIE_QUERY_HISTORY in request.COOKIES:
        new_query_history = request.COOKIES[COOKIE_QUERY_HISTORY]
        try:
            new_query_history = json.loads(new_query_history)
        except json.ReadError:
            new_query_history = []
    return render_to_response("ipinfo.html", locals())
def api_json_to_worksheet(sheet_json):
    sheet_values = jsonlib.loads(sheet_json)
    worksheet = Worksheet()
    worksheet.name = sheet_values.get('name', 'Untitled')
    for key, value in sheet_values.iteritems():
        if key == "usercode_error":
            worksheet._usercode_error = value
        elif isinstance(value, dict):
            rows = value
            col = int(key)
            for row, value in rows.iteritems():
                row = int(row)
                worksheet[col, row].value = value
    return worksheet
def posts2mention_network(posts_fname, extract_user_id,
                          extract_mentions, working_dir=None):
    """
    This method builds a valid `mention_network.elist` file from the
    `posts.json.gz` file specified. Unless indicated otherwise, the
    directory containing the posts file will be used as the working and
    output directory for the construction process.

    `extract_user_id` is a function that accepts a post and returns a
    string user_id.

    `extract_mentions` is a function that accepts a post and returns a
    list of string user_ids mentioned in the post.
    """
    G = zen.DiGraph()

    # figure out the working dir
    if not working_dir:
        working_dir = os.path.dirname(posts_fname)

    # bin the user data
    logging.info('building the network')
    fh = gzip.open(posts_fname, 'r')
    for line in fh:
        post = jsonlib.loads(line)
        uid = extract_user_id(post)
        mentions = extract_mentions(post)
        for m in mentions:
            if G.has_edge(uid, m):
                G.set_weight(uid, m, G.weight(uid, m) + 1)
            else:
                G.add_edge(uid, m, weight=1)

    # save the graph
    logging.info('writing network')
    # TODO: Add compression to this...
    zen.io.edgelist.write(G, os.path.join(working_dir, 'mention_network.elist'),
                          use_weights=True)

    # done
    return
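# Usage sketch with Twitter-style extractors. This post schema
# (user.id_str, entities.user_mentions) matches the objects that
# load_user below reconstructs, but it is an assumption about the
# dataset at hand, not something this function requires.
def tw_user_id(post):
    return post['user']['id_str']

def tw_mentions(post):
    return [m['id_str'] for m in post['entities']['user_mentions']]

posts2mention_network('posts.json.gz', tw_user_id, tw_mentions)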
def getAudioUrl(name):
    if name.find("http") == -1:
        print("Search: " + name)
        res = YoutubeSearch(name, max_results=10).to_json()
        data = json.loads(res)
        videoUrl = "http://youtube.com" + data["videos"][0]["url_suffix"]
        video = pafy.new(videoUrl)
        return {
            "title": data["videos"][0]['title'],
            "url": video.audiostreams[0].url,
            "thumbnail": data["videos"][0]['thumbnails'][0]
        }
    else:
        video = pafy.new(name)
        return {
            "title": video.title,
            "url": video.getbestaudio().url,
            "thumbnail": video.thumb
        }
def query_by_ipv4_inner(request, ipv4):
    """ """
    logger = logging.getLogger("query_by_ipv4_inner")
    ip_infos = models.Ipv4Info.objects.filter_by_ip(ipv4)[:5]
    ip_string = ip_convert.ipv4_to_string(ipv4)
    ip_value = ip_convert.ipv4_int2readable(ipv4)
    ip_client_string = get_client_ip(request)
    ip_client_value = ip_convert.ipv4_from_string(ip_client_string)
    logger.debug("from " + ip_client_string + " query " + ip_string +
                 " return " + str(ip_infos.count()) + " results")
    new_query_history = []
    if ip_infos.count() > 0:
        new_query_history.append([ip_string, unicode(ip_infos[0])])
    if COOKIE_QUERY_HISTORY in request.COOKIES:
        old_query_history = request.COOKIES[COOKIE_QUERY_HISTORY]
        try:
            old_query_history = json.loads(old_query_history)
        except json.ReadError:
            old_query_history = []
        old_query_history = uniq(old_query_history)
        new_query_history.extend(old_query_history)
    new_query_history = uniq(new_query_history)[:MAX_QUERY_HISTORY]
    response = render_to_response("ipinfo.html", locals())
    try:
        new_query_history_str = json.dumps(new_query_history)
        response.set_cookie(
            key=COOKIE_QUERY_HISTORY,
            value=new_query_history_str,
            max_age=86400,
            expires=None,
            path="/",
            domain=None,
            secure=None,
        )
    except json.WriteError:
        response.delete_cookie(key=COOKIE_QUERY_HISTORY)
        print "write error: "
        print new_query_history
    except json.UnknownSerializerError:
        response.delete_cookie(key=COOKIE_QUERY_HISTORY)
        print "error"
    return response
def POST(self):
    data = jsonlib.loads(web.data())
    if data.get('action') == 'arrive':
        snd = ('/my/music/entrance/%s.wav' %
               data['name'].replace(' ', '_').replace(':', '_'))
        if not os.path.exists(snd):
            snd = None
        soundOut(speech="new %s: %s" % (sensorWords[data['sensor']],
                                        data['name']),
                 postSound=snd)
        return 'ok'
    if data.get('action') == 'leave':
        soundOut(preSound='/my/music/entrance/leave.wav',
                 speech="lost %s. %s" % (sensorWords[data['sensor']],
                                         data['name']))
        return 'ok'
    return "nothing to do"
def get_all_min_position_id(self):
    opener = urt.build_opener()  # build a URL opener with urllib
    # Set request headers: User-agent, Cookie
    opener.addheaders = [('User-agent', self.user_agent)]
    opener.addheaders = [('Cookie', self.cookie)]
    count = 0
    flag = True
    while flag:
        page = opener.open(self.url, data=None, timeout=1000)
        content = page.read()  # read the response body
        # print("content:", content)
        # Parse the JSON response into a Python dict
        data = jsonlib.loads(content)
        # print("data:", data)
        # Pull individual fields (key: value) out of the dict
        min_position = int(data['min_position'])  # int
        mobile_id = str(min_position - 1)
        min_position = str(min_position)
        print("min_po:", min_position, ", mo_id:", mobile_id)
        # has_more_items = data['has_more_items']  # int
        new_latent_count = data['new_latent_count']  # boolean
        # print("next_cursor:", min_position)
        if not (new_latent_count > 0):
            flag = False
        else:
            # Build the next URL and fetch the next JSON page
            next_url = "https://twitter.com/i/profiles/show/poke/timeline/with_replies?" \
                       "include_available_features=1" \
                       "&include_entities=1&max_position=" + min_position + "&reset_error_state=false"
            self.url = next_url
            # Write each URL to the text file as soon as it is produced
            mobile_next_url = "https://mobile.twitter.com/i/rw/profile/timeline?" \
                              "max_id=" + mobile_id + "&screen_name=poke&type=tweets"
            self.input_text(mobile_next_url)
            count += 1
            print("count:", count, ", next_cursor:", min_position)
            print("next_url:", self.url)
            print("\n")
def to_cells(self, start, end):
    start_col, start_row = start
    end_col, end_row = end
    strings_dict = jsonlib.loads(self.contents_json)
    for col in xrange(0, end_col - start_col + 1):
        for row in xrange(0, end_row - start_row + 1):
            clip_loc = col % self.width, row % self.height
            clip_cell = strings_dict['%s,%s' % clip_loc]
            dest_cell = Cell()
            if clip_cell['formula']:
                column_offset, row_offset = self._get_offset(
                    col, row, start_col, start_row)
                dest_cell.formula = rewrite_formula(
                    clip_cell['formula'], column_offset, row_offset,
                    self.is_cut, self.source_range)
            dest_cell.formatted_value = clip_cell['formatted_value']
            dest_loc = col + start_col, row + start_row
            yield (dest_loc, dest_cell)
def main():
    updates = []
    url = "http://search.twitter.com/search.json"
    params = {"q": HASH_TAG, "rpp": 30, "page": 1}
    while True:
        req_data = urllib.urlencode(params)
        print >>sys.stderr, "Fetching %s?%s..." % (url, req_data)
        req = urllib2.Request(url, req_data)
        response = urllib2.urlopen(req)
        contents = response.read()
        info = json.loads(contents)
        updates += [update for update in info['results']]
        if not len(info['results']):
            break
        else:
            params['page'] += 1
            req_data = urllib.urlencode(params)
    all_words = []
    banned = [HASH_TAG]
    allowed_re = re.compile(ALLOWED_REGEX)
    for result in updates:
        words = result['text'].split(" ")
        all_words += [word.lower() for word in words
                      if (word not in banned and allowed_re.match(word)
                          and len(word) > MIN_LENGTH)]
    print("")
    print(" ".join(sorted(all_words)))
# Set request headers: User-agent, Cookie
opener.addheaders = [('User-agent',
                      "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                      "AppleWebKit/537.36 (KHTML, "
                      "like Gecko) Chrome/55.0.2883.87 Safari/537.36")]
opener.addheaders = [(
    'Cookie',
    "m-b=\"Xb0jVMw7nHR4ALk0Bu1nFQ\075\075\"; m-s=\"be36VoAlhhlQ7bjYB5sGlg\075\075\";"
    " m-css_v=b6a9d4fb55602580; m-early_v=83471c69fad5a4ed; m-tz=-480; m-wf-loaded=q-ico"
    "ns-q_serif; _ga=GA1.2.1160086400.1486206954")]
page = opener.open(
    "https://tch170417.tch.quora.com/up/chan32-8888/updates?min_seq=4274796479&channel=ma"
    "in-w-dep3505-2944002901738499632&hash=11835125263837804143&callback=jsonp15"
    "a09a3bcc6416ca304392bd",
    data=None, timeout=100)
content = page.read()  # read the response body
print("content:", content)
# Parse the JSON response into a Python dict
data = jsonlib.loads(content)
print("data:", data)
# Pull individual fields (key: value) out of the dict,
# converting str to int where needed
min_position = int(data['min_position'])
mobile_id = min_position - 1
mobile_id = str(mobile_id)
print("mobile_id:", mobile_id, ", type:", type(mobile_id))
print("min_position:", min_position, ", type:", type(min_position))
print("has_more_items:", data['has_more_items'])
print("new_latent_count:", data['new_latent_count'])
# print("items_html:", data['items_html'])
def jsonRowCount(jsonResults):
    """given a json string like parseJsonTerm takes, just count the rows"""
    return len(jsonlib.loads(jsonResults)['results']['bindings'])
def posts2users(posts_fname, extract_user_id,
                working_dir=None, max_open_temp_files=256):
    """
    This method builds a valid `users.json.gz` file from the
    `posts.json.gz` file specified. Unless indicated otherwise, the
    directory containing the posts file will be used as the working and
    output directory for the construction process.

    `extract_user_id` is a function that accepts a post and returns a
    string user_id.
    """
    # figure out the working dir
    if not working_dir:
        working_dir = os.path.dirname(posts_fname)

    # bin the user data
    logger.info('binning user posts')
    curr_temp_file_idx = -1

    # A dict from a user-id to the file handle-id
    user_assignments = {}
    # A dict from the file handle-id to the actual file handle
    file_handles = {}

    # Counters for sanity-checking that we read and write all the data.
    posts_seen = 0
    user_posts_written = 0

    fh = gzip.open(posts_fname, 'r')
    for line in fh:
        post = jsonlib.loads(line)
        uid = extract_user_id(post)
        posts_seen += 1
        if uid not in user_assignments:
            # Get the temp file this user should be in.
            # Assume that user-ids are randomly distributed
            # in some range such that the last three
            # digits of the id serve as a uniformly
            # distributed hash
            tmp_file_assignment = long(uid) % max_open_temp_files
            if not tmp_file_assignment in file_handles:
                # Write the temp files as gzipped files
                # because this splitting process gets
                # very expensive when processing large
                # datasets
                tmp_fname = os.path.join(working_dir,
                                         'tmp-%03d.json.gz' % tmp_file_assignment)
                logger.debug('creating temp file %s' % tmp_fname)
                tmp_fh = gzip.open(tmp_fname, 'w')
                file_handles[tmp_file_assignment] = tmp_fh
            user_assignments[uid] = tmp_file_assignment
        file_handles[user_assignments[uid]].write(line)

    for idx, tmp_fh in file_handles.items():
        tmp_fh.close()

    # aggregate the users
    logger.info('aggregating user data')
    user_fh = gzip.open(os.path.join(working_dir, 'users.json.gz'), 'w')
    for i in range(max_open_temp_files):
        logging.debug('processing file %d' % i)
        tmp_fname = os.path.join(working_dir, 'tmp-%03d.json.gz' % i)
        tmp_fh = gzip.open(tmp_fname, 'r')

        # aggregate data by tweets
        user_posts = {}
        for line in tmp_fh:
            post = jsonlib.loads(line)
            uid = extract_user_id(post)
            if uid not in user_posts:
                user_posts[uid] = []
            user_posts[uid].append(post)

        # write out the tweets by user
        for uid, posts in user_posts.items():
            user_fh.write('%s\n' % jsonlib.dumps({'user_id': uid, 'posts': posts}))
            user_posts_written += len(posts)

        # delete the temporary file
        tmp_fh.close()
        os.remove(tmp_fname)

    # done
    user_fh.close()
    logger.debug("Read %s posts, wrote %s posts to users.json.gz" %
                 (posts_seen, user_posts_written))
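# Companion usage sketch: bin the same posts file by user, reusing the
# Twitter-style tw_user_id extractor assumed earlier. Note posts2users
# calls long(uid), so the extractor must return a numeric id string.
posts2users('posts.json.gz', tw_user_id, max_open_temp_files=128)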
def read():
    for line in open(sys.argv[1]):
        yield line.strip().split("\t")[-1]

def timeit(label, seq):
    t0 = time.time()
    for x in seq:
        pass
    t1 = time.time()
    print label, "%.2f sec" % (t1 - t0)

if False and __name__ == "__main__":
    timeit("read", read())
    timeit("simplejson.load", (simplejson.loads(json) for json in read()))
    timeit("jsonlib.load", (jsonlib.loads(json) for json in read()))
    timeit("simplejson.load-dump",
           (simplejson.dumps(simplejson.loads(json)) for json in read()))
    timeit("jsonlib.load-dump",
           (jsonlib.dumps(jsonlib.loads(json)) for json in read()))

def bench(count, f, *args):
    times = []
    for _ in range(count):
        t0 = time.time()
        f(*args)
        times.append(time.time() - t0)
    times = sorted(times)
    return "avg %.5f med %.5f max %.5f min %.5f" % (
        sum(times) / float(len(times)),
        times[len(times) // 2],
        times[-1],
        times[0])
import jsonlib, http.client

connection = http.client.HTTPConnection('18.217.240.250', 80)
connection.connect()
connection.request(
    'GET',
    '/parse/classes/water_consumption',
    "",
    {
        "X-Parse-Application-Id": "0bfc45c8be2b2e93f018041ff949fe6d09233c0a",
        "X-Parse-REST-API-Key": "avbs",
        "Content-Type": "application/json"
    })
result = jsonlib.loads(connection.getresponse().read())
print(result)
def getTicket(self):
    t = jsonlib.loads(open(self.ticketFile).read(), use_float=True)
    if t['expires'] < time.time():
        raise ValueError("access ticket expired")
    return t['magic']
def load_user(self, line):
    """
    Converts this compressed representation of the user's data into a
    dict format that mirrors the full JSON data, except with all unused
    fields omitted (e.g., posting date).
    """
    cols = line.split("\t")
    user_id_str = cols[0]
    user_id = user_id_str
    posts = []
    user_obj = {}
    user_obj['user_id'] = user_id
    user_obj['posts'] = posts
    COLS_PER_POST = 8

    should_exclude_location_data = user_id in self.excluded_users

    #print "User %d had line with %d columns (%f posts)" % (user_id, len(cols), len(cols) / 8.0)

    for post_offset in range(1, len(cols), COLS_PER_POST):
        try:
            # Grab the relevant content for this post
            text = cols[post_offset]
            tweet_id = cols[post_offset + 1]
            self_reported_loc = cols[post_offset + 2]
            geo_str = cols[post_offset + 3]
            mentions_str = cols[post_offset + 4]
            hashtags_str = cols[post_offset + 5]
            is_retweet_str = cols[post_offset + 6]
            place_json = cols[post_offset + 7]

            # Reconstruct the post as a series of nested dicts that
            # mirrors the real-world full JSON object in structure
            post = {}
            post["id_str"] = tweet_id
            post["id"] = long(tweet_id)
            post["text"] = text

            if is_retweet_str == 'True':
                # We don't have any data to put, so just fill it
                # with an empty object
                post["retweeted_status"] = {}

            entities = {}
            post["entities"] = entities

            user_mentions = []
            entities["user_mentions"] = user_mentions
            if len(mentions_str) > 0:
                mentions = mentions_str.split(" ")
                for mention in mentions:
                    mention_obj = {}
                    mention_obj["id"] = long(mention)
                    mention_obj["id_str"] = mention
                    user_mentions.append(mention_obj)

            hashtags = []
            entities["hashtags"] = hashtags
            if len(hashtags_str) > 0:
                tags = hashtags_str.split(" ")
                for tag in tags:
                    tag_obj = {}
                    tag_obj["text"] = tag
                    hashtags.append(tag_obj)

            # Only include geo information for posts that are not in
            # the set of excluded posts, which are likely being used
            # for testing data
            if len(geo_str) > 0 and not should_exclude_location_data:
                geo = {}
                post["geo"] = geo
                coordinates = []
                coords = geo_str.split(" ")
                coordinates.append(float(coords[0]))
                coordinates.append(float(coords[1]))
                geo["coordinates"] = coordinates

            # Place is a special case because the field formatting
            # is so complex, it's just saved as a raw JSON string.
            # This requires reparsing place to stuff in our object.
            # However, since place is relatively rare (1% of tweets),
            # this isn't very expensive
            if len(place_json) > 1 and not should_exclude_location_data:
                place = jsonlib.loads(place_json)
                post["place"] = place

            user = {}
            post["user"] = user
            user["id_str"] = user_id_str
            user["id"] = user_id
            user["location"] = self_reported_loc

            posts.append(post)
        except:
            logger.info("Saw malformed post when reading user; skipping")
            pass

    return user_obj
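# Illustrative input line for load_user, reconstructed from the parsing
# above (tab-separated: user_id, then 8 columns per post -- text,
# tweet_id, self-reported location, "lat lon", mention ids, hashtags,
# is_retweet flag, place JSON). The exact on-disk format and the
# lat/lon ordering are assumptions inferred from the code.
#
#   12345 \t hello @67890 \t 111 \t Boston \t 42.36 -71.06 \t 67890 \t \t False \t {}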
import jsonlib

# Write to a file
data = {
    'no': 1,
    'name': 'Runoob',
    'url': 'http://runoob.txt.com'
}
with open('data.json', 'w') as f:
    jsonlib.dump(data, f)

# Read the data back; loads() takes a string, so read the file first
with open('data.json', 'r') as f:
    data = jsonlib.loads(f.read())
def process_result_value(self, value, dialect):
    if value is None:
        return value
    else:
        return json.loads(value)
import urllib
import jsonlib

# serviceurl = 'http://maps.googleapis.com/maps/api/geocode/json?'
serviceurl = 'http://python-data.dr-chuck.net/geojson?'

while True:
    # raw_input, since this Python 2 script's input() would eval the text
    address = raw_input('Enter location: ')
    if len(address) < 1:
        break
    url = serviceurl + urllib.urlencode({'sensor': 'false', 'address': address})
    print('Retrieving ', url)
    url_handle = urllib.urlopen(url)
    data = url_handle.read()
    print('Retrieved', len(data), 'characters')
    json_data = jsonlib.loads(data)
    # print json.dumps(json_data['results'], indent=3)
    print('Place id', json_data['results'][0]['place_id'])
def set_column_widths(request, sheet):
    sheet.column_widths.update(jsonlib.loads(request.POST['column_widths']))
    sheet.save()
    return HttpResponse('OK')
def __init__(self, *args, **kwargs):
    models.Model.__init__(self, *args, **kwargs)
    self.column_widths = jsonlib.loads(self.column_widths_json)
    if not self.api_key:
        self.api_key = str(uuid4())