Example #1
    def __save_customize_page(self, themeInfo):

        # HTTP Post is done with Mime-type 'application/json'

        postData = jsonlib.dumps(themeInfo)

        postHandler = HTTPPostHandler('application/json')

        debug("Editing Theme HTML...")

        opener = self._make_opener()
        opener.add_handler(postHandler)
        opener.addheaders.append(('Referer', 'http://www.tumblr.com/customize/%s' % self.blogname))
        opener.addheaders.append(('Accept', 'application/json, text/javascript, */*; q=0.01'))
        opener.addheaders.append(('Accept-Charset', 'UTF-8,*;q=0.5'))
        opener.addheaders.append(('X-Requested-With', 'XMLHttpRequest'))
        opener.addheaders.append(('Origin', 'http://www.tumblr.com'))
        opener.addheaders.append(('Pragma', 'no-cache'))
        opener.addheaders.append(('Cache-Control', 'no-cache'))

        try:
            resp = opener.open('http://www.tumblr.com/customize_api/blog/%s' % self.blogname, data = postData)

        except Exception as e:
            debug("  !! Failed to edit HTML")
            return None

        newThemeInfo = resp.fp.read()
        newThemeInfo = jsonlib.loads(newThemeInfo)

        debug("  <3 Theme Saved.")

        return newThemeInfo
Example #2
    def more(self):

        trs = [T.tr['']]

        response = yield fetch("http://bang:8007/commands",
                               headers={"X-Foaf-Agent":[str(self.user)]})
        if not response:
            raise ValueError('-H "X-Foaf-Agent: %s" http://bang:8007/commands failed' % str(self.user))
        cmds = jsonlib.loads(response.body)

        belowZero = []

        for (cmd, score) in cmds:
            cmd = URIRef(cmd)
            if score < 0:
                belowZero.append((cmd, score))
                continue

            if len(trs[-1].children) >= 1 + columns:
                trs.append(T.tr[''])
            trs[-1].children.append(T.td["\n", self._buttonForm(cmd, score)])

        trs.append(T.tr[T.td(colspan=columns)])
        for (cmd, score) in belowZero:
            trs[-1].children[-1][self._buttonForm(cmd, score)]
        returnValue(T.table[trs])
Example #3
    def __get_customize_page(self):

        debug("Fetching Customize Page [%s]" % self.blogname)

        opener = self._make_opener()
        resp = opener.open('http://www.tumblr.com/customize/%s' % self.blogname)
        if resp.code != 200:
            debug("  !! Failed to fetch '/customize/%s': Error [%s]" % (self.blogname, resp.code))
            return None

        html = resp.fp.read()

        # find the "user_form_key"
        m = re.search(r'Tumblr\.Customize\.user_form_key\s?=\s?[\'\"]([^\'\"]+)[\'\"];', html)
        if not m:
            debug("  !! Failed to parse Theme: Could not find [user_form_key]")
            return None

        userFormKey = m.group(1)

        m = re.search(r'Tumblr\.Customize\.blog\.set\((.+)(?=\);\n)', html)
        if not m:
            debug("  !! Failed to parse Theme: Could not find JSON object in Tumblr.Customize.blog.set()")
            return None

        themeInfo = m.group(1)
        themeInfo = jsonlib.loads(themeInfo)

        themeInfo['user_form_key'] = userFormKey
        themeInfo['id'] = themeInfo['name']

        debug('  <3 Theme parsed')

        return themeInfo
Example #4
def parseJsonResults(jsonResults):
    """returns the same as parseSparqlResults. Takes json string like this:

    { 'head': { 'link': [], 'vars': ['p', 'o'] },
      'results': { 'distinct': false, 'ordered': true, 'bindings': [
        { 'p': { 'type': 'uri',
                 'value': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type' },
          'o': { 'type': 'uri',
                 'value': 'http://fantasyfamegame.com/2006/01/User' }},
        { 'p': { 'type': 'uri',
                 'value': 'http://fantasyfamegame.com/2006/01/username' },
        'o': { 'type': 'literal', 'value': 'drewp' }},
        { 'p': { 'type': 'uri', 'value': 'http://fantasyfamegame.com/2006/01/passwordSHA' }	, 'o': { 'type': 'literal', 'value': '23fa12c6b4e9e3805a5e9d5dded3e78665fc1899' }},
      ...
    """

    # for speed, don't let jsonlib poke at the string to figure out the encoding
    jsonResults = jsonResults.decode('utf8')
    
    if jsonResults in ['true', 'false']:
        # this would have been made explicit in the Content-type
        # header, but I don't have that handy.
        return jsonResults == 'true'
    ret = []
    for row in jsonlib.loads(jsonResults)['results']['bindings']:
        outRow = {}
        for k, v in row.items():
            outRow[k] = parseJsonTerm(v)
        ret.append(outRow)
    return ret
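Note: the example above calls parseJsonTerm(v), which is not shown in this listing. A minimal sketch of what it might look like, assuming the terms are mapped to rdflib nodes (an assumption for illustration, not the source's actual helper):

from rdflib import BNode, Literal, URIRef

def parseJsonTerm(d):
    # d is one SPARQL-JSON term dict such as {'type': 'uri', 'value': 'http://...'}
    t = d['type']
    if t == 'uri':
        return URIRef(d['value'])
    if t == 'literal':
        return Literal(d['value'], lang=d.get('xml:lang'))
    if t == 'typed-literal':
        return Literal(d['value'], datatype=URIRef(d['datatype']))
    if t == 'bnode':
        return BNode(d['value'])
    raise NotImplementedError('unknown term type %r' % t)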
Example #5
 def decode(data):
     """
     A proxy method for BSON.decode
     TODO: This will block if a lot of data has been received!
     """
     try:
         return jsonlib.loads(data)
     except Exception, e:
         msg = 'Invalid JSON Data, got: %s:%s' % (e.__class__.__name__, e)
         return Fault(NOT_WELLFORMED_ERROR, msg)
Example #6
    def post_iter(self):
        """
		Return an iterator over all the posts in the dataset. The ordering
		of the posts follows the order of posts in the dataset file.
		"""
        fh = gzip.open(self._posts_fname, 'r')

        for line in fh:
            post = jsonlib.loads(line)

            yield post
Example #7
def GetPage(newPage):
    url = 'http://sj.qq.com/myapp/cate/appList.htm?orgame=1&categoryId=0&pageSize=20&pageContext=' + str(newPage)
    page = urllib.urlopen(url)
    data = page.read()
    jsonData = jsonlib.loads(data)
    if jsonData['count'] == 0:
        print "do not have more"
        driver.close()
    else:
        print "hava more"
        GetJson(newPage)
Example #8
	def post_iter(self):
		"""
		Return an iterator over all the posts in the dataset. The ordering
		of the posts follows the order of posts in the dataset file.
		"""
		fh = gzip.open(self._posts_fname,'r')

		for line in fh:
			post = jsonlib.loads(line)

			yield post
Example #9
def GetJson(page):
    url = 'http://sj.qq.com/myapp/cate/appList.htm?orgame=1&categoryId=0&pageSize=20&pageContext=' + str(page)
    pageData = urllib.urlopen(url)
    data = pageData.read()
    jsonData = jsonlib.loads(data)
    for newLine in jsonData['obj']:
        print newLine[u'appName'] + '\t' + newLine['pkgName'] + '\t' + newLine['apkUrl']
        IfOfficial(newLine['pkgName'])
    page += 20
    print page
    GetPage(page)
Example #10
    def user_iter(self):
        """
		Return an iterator over all posts in the dataset grouped by user. Each
		user is represented by a list of their posts - so any metadata about the
		user must be aggregated from the posts it produced.
		"""
        fh = gzip.open(self._users_fname, 'r')

        for line in fh:
            user = jsonlib.loads(line)

            yield user
Example #11
	def user_iter(self):
		"""
		Return an iterator over all posts in the dataset grouped by user. Each
		user is represented by a list of their posts - so any metadata about the
		user must be aggregated from the posts it produced.
		"""
		fh = gzip.open(self._users_fname,'r')

		for line in fh:
			user = jsonlib.loads(line)

			yield user
Example #12
def default(request):
    """
    """
    logger = logging.getLogger("default")
    ip_client_string = get_client_ip(request)
    ip_client_value = ip_convert.ipv4_from_string(ip_client_string)
    logger.debug("from " + ip_client_string)
    if COOKIE_QUERY_HISTORY in request.COOKIES:
        new_query_history = request.COOKIES[COOKIE_QUERY_HISTORY]
        try:
            new_query_history = json.loads(new_query_history)
        except json.ReadError:
            new_query_history = []
    return render_to_response("ipinfo.html", locals())
Example #13
def default(request):
    """
    """
    logger = logging.getLogger('default')
    ip_client_string = get_client_ip(request)
    ip_client_value = ip_convert.ipv4_from_string(ip_client_string)
    logger.debug('from ' + ip_client_string)
    if COOKIE_QUERY_HISTORY in request.COOKIES:
        new_query_history = request.COOKIES[COOKIE_QUERY_HISTORY]
        try:
            new_query_history = json.loads(new_query_history)
        except json.ReadError:
            new_query_history = []
    return render_to_response('ipinfo.html', locals())
Example #14
def api_json_to_worksheet(sheet_json):
    sheet_values = jsonlib.loads(sheet_json)
    worksheet = Worksheet()

    worksheet.name = sheet_values.get('name', 'Untitled')
    for key, value in sheet_values.iteritems():
        if key == "usercode_error":
            worksheet._usercode_error = value
        elif isinstance(value, dict):
            rows = value
            col = int(key)
            for row, value in rows.iteritems():
                row = int(row)
                worksheet[col, row].value = value
    return worksheet
Example #15
def posts2mention_network(posts_fname,
                          extract_user_id,
                          extract_mentions,
                          working_dir=None):
    """
	This method builds a valid `mention_network.elist` file from the 
	`posts.json.gz` file specified. Unless indicated otherwise, the 
	directory containing the posts file will be used as the working 
	and output directory for the construction process.

	`extract_user_id` is a function that accepts a post and returns a string
	user_id.

	`extract_mentions` is a function that accepts a post and returns a list of
	string user_ids mentioned in the post.
	"""
    G = zen.DiGraph()

    # figure out the working dir
    if not working_dir:
        working_dir = os.path.dirname(posts_fname)

    # bin the user data
    logging.info('building the network')

    fh = gzip.open(posts_fname, 'r')
    for line in fh:
        post = jsonlib.loads(line)
        uid = extract_user_id(post)
        mentions = extract_mentions(post)

        for m in mentions:
            if G.has_edge(uid, m):
                G.set_weight(uid, m, G.weight(uid, m) + 1)
            else:
                G.add_edge(uid, m, weight=1)

    # save the graph
    logging.info('writing network')
    # TODO: Add compression to this...
    zen.io.edgelist.write(G,
                          os.path.join(working_dir, 'mention_network.elist'),
                          use_weights=True)

    # done
    return
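Note: a hypothetical way to call posts2mention_network, assuming Twitter-style posts where the author id and mentions live under post['user'] and post['entities'] (the field names and paths below are illustrative, not from the source):

def extract_user_id(post):
    return post['user']['id_str']

def extract_mentions(post):
    return [m['id_str'] for m in post['entities']['user_mentions']]

posts2mention_network('/data/posts.json.gz', extract_user_id, extract_mentions,
                      working_dir='/data')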
Example #16
def getAudioUrl(name):
    if (name.find("http") == -1):
        print("Search: " + name)
        res = YoutubeSearch(name, max_results=10).to_json()
        data = json.loads(res)
        videoUrl = "http://youtube.com" + data["videos"][0]["url_suffix"]
        video = pafy.new(videoUrl)
        return {
            "title": data["videos"][0]['title'],
            "url": video.audiostreams[0].url,
            "thumbnail": data["videos"][0]['thumbnails'][0]
        }
    else:
        video = pafy.new(name)
        return {
            "title": video.title,
            "url": video.getbestaudio().url,
            "thumbnail": video.thumb
        }
Example #17
def posts2mention_network(posts_fname,extract_user_id,
						  extract_mentions,working_dir=None):
	"""
	This method builds a valid `mention_network.elist` file from the 
	`posts.json.gz` file specified. Unless indicated otherwise, the 
	directory containing the posts file will be used as the working 
	and output directory for the construction process.

	`extract_user_id` is a function that accepts a post and returns a string
	user_id.

	`extract_mentions` is a function that accepts a post and returns a list of
	string user_ids mentioned in the post.
	"""
	G = zen.DiGraph()

	# figure out the working dir
	if not working_dir:
		working_dir = os.path.dirname(posts_fname)

	# bin the user data
	logging.info('building the network')

	fh = gzip.open(posts_fname,'r')
	for line in fh:
		post = jsonlib.loads(line)
		uid = extract_user_id(post)
		mentions = extract_mentions(post)

		for m in mentions:
			if G.has_edge(uid,m):
				G.set_weight(uid,m,G.weight(uid,m)+1)
			else:
				G.add_edge(uid,m,weight=1)

	# save the graph
	logging.info('writing network')
	# TODO: Add compression to this...
	zen.io.edgelist.write(G,os.path.join(working_dir,'mention_network.elist'),use_weights=True)

	# done
	return
Example #18
def query_by_ipv4_inner(request, ipv4):
    """
    """
    logger = logging.getLogger("query_by_ipv4_inner")
    ip_infos = models.Ipv4Info.objects.filter_by_ip(ipv4)[:5]
    ip_string = ip_convert.ipv4_to_string(ipv4)
    ip_value = ip_convert.ipv4_int2readable(ipv4)
    ip_client_string = get_client_ip(request)
    ip_client_value = ip_convert.ipv4_from_string(ip_client_string)
    logger.debug("from " + ip_client_string + " query " + ip_string + " return " + str(ip_infos.count()) + " results")
    new_query_history = []
    if ip_infos.count() > 0:
        new_query_history.append([ip_string, unicode(ip_infos[0])])
    if COOKIE_QUERY_HISTORY in request.COOKIES:
        old_query_history = request.COOKIES[COOKIE_QUERY_HISTORY]
        try:
            old_query_history = json.loads(old_query_history)
        except json.ReadError:
            old_query_history = []
        old_query_history = uniq(old_query_history)
        new_query_history.extend(old_query_history)
        new_query_history = uniq(new_query_history)[:MAX_QUERY_HISTORY]
    response = render_to_response("ipinfo.html", locals())
    try:
        new_query_history_str = json.dumps(new_query_history)
        response.set_cookie(
            key=COOKIE_QUERY_HISTORY,
            value=new_query_history_str,
            max_age=86400,
            expires=None,
            path="/",
            domain=None,
            secure=None,
        )
    except json.WriteError:
        response.delete_cookie(key=COOKIE_QUERY_HISTORY)
        print "write error: "
        print new_query_history
    except json.UnknownSerializerError:
        response.delete_cookie(key=COOKIE_QUERY_HISTORY)
        print "error"
    return response
Example #19
def query_by_ipv4_inner(request, ipv4):
    """
    """
    logger = logging.getLogger('query_by_ipv4_inner')
    ip_infos = models.Ipv4Info.objects.filter_by_ip(ipv4)[:5]
    ip_string = ip_convert.ipv4_to_string(ipv4)
    ip_value = ip_convert.ipv4_int2readable(ipv4)
    ip_client_string = get_client_ip(request)
    ip_client_value = ip_convert.ipv4_from_string(ip_client_string)
    logger.debug('from ' + ip_client_string + ' query ' + ip_string +
                 ' return ' + str(ip_infos.count()) + ' results')
    new_query_history = []
    if ip_infos.count() > 0:
        new_query_history.append([ip_string, unicode(ip_infos[0])])
    if COOKIE_QUERY_HISTORY in request.COOKIES:
        old_query_history = request.COOKIES[COOKIE_QUERY_HISTORY]
        try:
            old_query_history = json.loads(old_query_history)
        except json.ReadError:
            old_query_history = []
        old_query_history = uniq(old_query_history)
        new_query_history.extend(old_query_history)
        new_query_history = uniq(new_query_history)[:MAX_QUERY_HISTORY]
    response = render_to_response('ipinfo.html', locals())
    try:
        new_query_history_str = json.dumps(new_query_history)
        response.set_cookie(key=COOKIE_QUERY_HISTORY,
                            value=new_query_history_str,
                            max_age=86400,
                            expires=None,
                            path='/',
                            domain=None,
                            secure=None)
    except json.WriteError:
        response.delete_cookie(key=COOKIE_QUERY_HISTORY)
        print 'write error: '
        print new_query_history
    except json.UnknownSerializerError:
        response.delete_cookie(key=COOKIE_QUERY_HISTORY)
        print 'error'
    return response
Example #20
    def POST(self):
        data = jsonlib.loads(web.data())

        if data.get('action') == 'arrive':

            snd = ('/my/music/entrance/%s.wav' %
                   data['name'].replace(' ', '_').replace(':', '_'))
            if not os.path.exists(snd):
                snd = None

            soundOut(speech="new %s: %s" %
                     (sensorWords[data['sensor']], data['name']),
                     postSound=snd)
            return 'ok'

        if data.get('action') == 'leave':
            soundOut(preSound='/my/music/entrance/leave.wav',
                     speech="lost %s. %s" %
                     (sensorWords[data['sensor']], data['name']))
            return 'ok'

        return "nothing to do"
Example #21
    def POST(self):
        data = jsonlib.loads(web.data())

        if data.get('action') == 'arrive':
            
            snd = ('/my/music/entrance/%s.wav' %
                   data['name'].replace(' ', '_').replace(':', '_'))
            if not os.path.exists(snd):
                snd = None

            soundOut(speech="new %s: %s" % (sensorWords[data['sensor']],
                                            data['name']),
                     postSound=snd)
            return 'ok'

        if data.get('action') == 'leave':
            soundOut(preSound='/my/music/entrance/leave.wav',
                     speech="lost %s. %s" % (sensorWords[data['sensor']],
                                             data['name']))
            return 'ok'
        
        return "nothing to do"
Example #22
 def get_all_min_position_id(self):
     opener = urt.build_opener()  # build a URL opener with urllib
     # set request header parameters: User-agent, Cookie
     opener.addheaders = [('User-agent', self.user_agent)]
     opener.addheaders = [('Cookie', self.cookie)]
     count = 0
     flag = True
     while flag:
         page = opener.open(self.url, data=None, timeout=1000)
         content = page.read()  # read the response body
         # print("content:", content)
         # parse the JSON response into a Python dict
         data = jsonlib.loads(content)
         # print("data:", data)
         # read fields and values from the dict (key: value), e.g. "new_latent_count": 20
         min_position = int(data['min_position'])  # int
         mobile_id = str(min_position - 1)
         min_position = str(min_position)
         print("min_po:", min_position, ", mo_id:", mobile_id)
         # has_more_items = data['has_more_items']  # int
         new_latent_count = data['new_latent_count']  # boolean
         # print("next_cursor:", min_position)
         if not (new_latent_count > 0):
             flag = False
         else:
             # build the next URL and fetch more JSON
             next_url = "https://twitter.com/i/profiles/show/poke/timeline/with_replies?" \
                   "include_available_features=1" \
                   "&include_entities=1&max_position="+min_position+"&reset_error_state=false"
             self.url = next_url
             # write each URL to the text file as soon as it is obtained
             mobile_next_url = "https://mobile.twitter.com/i/rw/profile/timeline?" \
                               "max_id="+mobile_id+"&screen_name=poke&type=tweets"
             self.input_text(mobile_next_url)
             count += 1
             print("count:", count, ", next_cursor:", min_position)
             print("next_url:", self.url)
             print("\n")
Example #23
    def to_cells(self, start, end):
        start_col, start_row = start
        end_col, end_row = end

        strings_dict = jsonlib.loads(self.contents_json)

        for col in xrange(0, end_col - start_col + 1):
            for row in xrange(0, end_row - start_row + 1):

                clip_loc = col % self.width, row % self.height

                clip_cell = strings_dict['%s,%s' % clip_loc]
                dest_cell = Cell()
                if clip_cell['formula']:
                    column_offset, row_offset = self._get_offset(
                        col, row, start_col, start_row)
                    dest_cell.formula = rewrite_formula(
                        clip_cell['formula'], column_offset, row_offset,
                        self.is_cut, self.source_range)

                dest_cell.formatted_value = clip_cell['formatted_value']
                dest_loc = col + start_col, row + start_row
                yield (dest_loc, dest_cell)
Example #24
    def to_cells(self, start, end):
        start_col, start_row = start
        end_col, end_row = end

        strings_dict = jsonlib.loads(self.contents_json)

        for col in xrange(0, end_col - start_col + 1):
            for row in xrange(0, end_row - start_row + 1):

                clip_loc = col % self.width, row % self.height

                clip_cell = strings_dict['%s,%s' % clip_loc]
                dest_cell = Cell()
                if clip_cell['formula']:
                    column_offset, row_offset = self._get_offset(col, row, start_col, start_row)
                    dest_cell.formula = rewrite_formula(
                        clip_cell['formula'], column_offset, row_offset,
                        self.is_cut, self.source_range
                    )

                dest_cell.formatted_value = clip_cell['formatted_value']
                dest_loc = col + start_col, row + start_row
                yield (dest_loc, dest_cell)
Example #25
def main():
    updates = []
    url = "http://search.twitter.com/search.json"
    params = {"q": HASH_TAG,
              "rpp": 30,
              "page": 1}

    while True:
        req_data = urllib.urlencode(params)
        print >>sys.stderr, "Fetching %s?%s..." % (url, req_data)

        req = urllib2.Request(url, req_data)
        response = urllib2.urlopen(req)
        contents = response.read()
        info = json.loads(contents)

        updates += [update for update in info['results']]
        if not len(info['results']):
            break
        else:
            params['page'] += 1
            req_data = urllib.urlencode(params)

    all_words = []
    banned = [HASH_TAG]
    allowed_re = re.compile(ALLOWED_REGEX)
    
    for result in updates:
        words = result['text'].split(" ")
        all_words += [word.lower()
                      for word in words
                      if (word not in banned and
                          allowed_re.match(word) and
                          len(word) > MIN_LENGTH)]

    print("")
    print(" ".join(sorted(all_words)))
Example #26
# set request header parameters: User-agent, Cookie
opener.addheaders = [('User-agent', "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                      "AppleWebKit/537.36 (KHTML, "
                      "like Gecko) Chrome/55.0.2883.87 Safari/537.36")]
opener.addheaders = [(
    'Cookie',
    "m-b=\"Xb0jVMw7nHR4ALk0Bu1nFQ\075\075\"; m-s=\"be36VoAlhhlQ7bjYB5sGlg\075\075\";"
    " m-css_v=b6a9d4fb55602580; m-early_v=83471c69fad5a4ed; m-tz=-480; m-wf-loaded=q-ico"
    "ns-q_serif; _ga=GA1.2.1160086400.1486206954")]
page = opener.open(
    "https://tch170417.tch.quora.com/up/chan32-8888/updates?min_seq=4274796479&channel=ma"
    "in-w-dep3505-2944002901738499632&hash=11835125263837804143&callback=jsonp15"
    "a09a3bcc6416ca304392bd",
    data=None,
    timeout=100)
content = page.read()  # read the response body
print("content:", content)
# parse the JSON response into a Python dict
data = jsonlib.loads(content)
print("data:", data)
# read fields and values from the dict (key: value), e.g. "new_latent_count": 20
# convert str to int
min_position = int(data['min_position'])
mobile_id = min_position - 1
mobile_id = str(mobile_id)
print("mobile_id:", mobile_id, ", type:", type(mobile_id))
print("min_position:", min_position, ", type:", type(min_position))
print("has_more_items:", data['has_more_items'])
print("new_latent_count:", data['new_latent_count'])
# print("items_html:", data['items_html'])
Example #27
def jsonRowCount(jsonResults):
    """given a json string like parseJsonTerm takes, just count the rows"""
    return len(jsonlib.loads(jsonResults)['results']['bindings'])
Example #28
def posts2users(posts_fname,extract_user_id,
				working_dir=None,max_open_temp_files=256):
	""" 
	This method builds a valid `users.json.gz` file from the `posts.json.gz` file
	specified.  Unless indicated otherwise, the directory containing the posts
	file will be used as the working and output directory for the construction
	process.

	`extract_user_id` is a function that accepts a post and returns a string
	user_id.
	"""
	
	# figure out the working dir
	if not working_dir:
		working_dir = os.path.dirname(posts_fname)

	# bin the user data
	logger.info('binning user posts')

	curr_temp_file_idx = -1

	# A dict from a user-id to the file handle-id 
	user_assignments = {}
	# A dict from the file handle-id to the actual file handle
	file_handles = {}

	# Sanity check methods for ensuring we're reading and writing
	# all the data.
	posts_seen = 0
	user_posts_written = 0

	fh = gzip.open(posts_fname,'r')
	for line in fh:
		post = jsonlib.loads(line)
		uid = extract_user_id(post)
		posts_seen += 1

		if uid not in user_assignments:
			
			# Get the temp file this user should be in.
			# Assume that user-ids are randomly distributed
			# in some range such that the last three
			# digits of the id serve as a uniformly
			# distributed hash
			tmp_file_assignment = long(uid) % max_open_temp_files
			if not tmp_file_assignment in file_handles:
				# Write the temp file as gzipped files
				# because this splitting process gets
				# very expensive when processing large
				# datasets
				tmp_fname = os.path.join(working_dir,'tmp-%03d.json.gz'
							 % tmp_file_assignment)
				logger.debug('creating temp file %s' % tmp_fname)

				tmp_fh = gzip.open(tmp_fname,'w')

				file_handles[tmp_file_assignment] = tmp_fh
			user_assignments[uid] = tmp_file_assignment

		file_handles[user_assignments[uid]].write(line)


	for idx,tmp_fh in file_handles.items():
		tmp_fh.close()

	# aggregate the users
	logger.info('aggregating user data')

	user_fh = gzip.open(os.path.join(working_dir,'users.json.gz'),'w')
	for i in range(max_open_temp_files):
		logging.debug('processing file %d' % i)

		tmp_fname = os.path.join(working_dir,'tmp-%03d.json.gz' % i)
		tmp_fh = gzip.open(tmp_fname,'r')

		# aggregate data by tweets
		user_posts = {}
		for line in tmp_fh:
			post = jsonlib.loads(line)
			uid = extract_user_id(post)

			if uid not in user_posts:
				user_posts[uid] = []

			user_posts[uid].append(post)

		# write out the tweets by user
		for uid,posts in user_posts.items():
			user_fh.write('%s\n' % jsonlib.dumps({'user_id':uid,'posts':posts}))
			user_posts_written += len(posts)

		# delete the temporary file
		tmp_fh.close()
		os.remove(tmp_fname)

	# done
	user_fh.close()
	logger.debug("Read %s posts, wrote %s posts to users.json.gz" 
		    % (posts_seen, user_posts_written))
Example #29
def read():
    for line in open(sys.argv[1]):
        yield line.strip().split("\t")[-1]


def timeit(label, seq):
    t0 = time.time()
    for x in seq:
        pass
    t1 = time.time()
    print label, "%.2f sec" % (t1-t0)

if False and __name__ == "__main__":
    timeit("read", read())
    timeit("simplejson.load", (simplejson.loads(json) for json in read()))
    timeit("jsonlib.load", (jsonlib.loads(json) for json in read()))


    timeit("simplejson.load-dump", (simplejson.dumps(simplejson.loads(json)) for json in read()))
    timeit("jsonlib.load-dump", (jsonlib.dumps(jsonlib.loads(json)) for json in read()))


def bench(count, f, *args):
    times = []
    for _ in range(count):
        t0 = time.time()
        f(*args)
        times.append(time.time() - t0)
    times = sorted(times)
    return "avg %.5f med %.5f max %.5f min %.5f" % (
        sum(times) / float(len(times)),
Example #30
import jsonlib, http.client

connection = http.client.HTTPConnection('18.217.240.250', 80)
connection.connect()
connection.request(
    'GET', '/parse/classes/water_consumption', "", {
        "X-Parse-Application-Id": "0bfc45c8be2b2e93f018041ff949fe6d09233c0a",
        "X-Parse-REST-API-Key": "avbs",
        "Content-Type": "application/json"
    })
result = jsonlib.loads(connection.getresponse().read())
print(result)
Example #31
    def getTicket(self):
        t = jsonlib.loads(open(self.ticketFile).read(), use_float=True)

        if t['expires'] < time.time():
            raise ValueError("access ticket expired")
        return t['magic']
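Note: getTicket() expects self.ticketFile to contain a JSON object with an 'expires' timestamp and a 'magic' value. A minimal sketch of producing such a file (the path and token below are made up for illustration):

import time
import jsonlib

ticket = {'expires': time.time() + 3600, 'magic': 'example-token'}  # hypothetical values
with open('/tmp/ticket.json', 'w') as f:
    f.write(jsonlib.dumps(ticket))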
Example #32
def posts2users(posts_fname,
                extract_user_id,
                working_dir=None,
                max_open_temp_files=256):
    """ 
	This method builds a valid `users.json.gz` file from the `posts.json.gz` file
	specified.  Unless indicated otherwise, the directory containing the posts
	file will be used as the working and output directory for the construction
	process.

	`extract_user_id` is a function that accepts a post and returns a string
	user_id.
	"""

    # figure out the working dir
    if not working_dir:
        working_dir = os.path.dirname(posts_fname)

    # bin the user data
    logger.info('binning user posts')

    curr_temp_file_idx = -1

    # A dict from a user-id to the file handle-id
    user_assignments = {}
    # A dict from the file handle-id to the actual file handle
    file_handles = {}

    # Sanity check methods for ensuring we're reading and writing
    # all the data.
    posts_seen = 0
    user_posts_written = 0

    fh = gzip.open(posts_fname, 'r')
    for line in fh:
        post = jsonlib.loads(line)
        uid = extract_user_id(post)
        posts_seen += 1

        if uid not in user_assignments:

            # Get the temp file this user should be in.
            # Assume that user-ids are randomly distributed
            # in some range such that the last three
            # digits of the id serve as a uniformly
            # distributed hash
            tmp_file_assignment = long(uid) % max_open_temp_files
            if not tmp_file_assignment in file_handles:
                # Write the temp file as gzipped files
                # because this splitting process gets
                # very expensive when processing large
                # datasets
                tmp_fname = os.path.join(
                    working_dir, 'tmp-%03d.json.gz' % tmp_file_assignment)
                logger.debug('creating temp file %s' % tmp_fname)

                tmp_fh = gzip.open(tmp_fname, 'w')

                file_handles[tmp_file_assignment] = tmp_fh
            user_assignments[uid] = tmp_file_assignment

        file_handles[user_assignments[uid]].write(line)

    for idx, tmp_fh in file_handles.items():
        tmp_fh.close()

    # aggregate the users
    logger.info('aggregating user data')

    user_fh = gzip.open(os.path.join(working_dir, 'users.json.gz'), 'w')
    for i in range(max_open_temp_files):
        logging.debug('processing file %d' % i)

        tmp_fname = os.path.join(working_dir, 'tmp-%03d.json.gz' % i)
        tmp_fh = gzip.open(tmp_fname, 'r')

        # aggregate data by tweets
        user_posts = {}
        for line in tmp_fh:
            post = jsonlib.loads(line)
            uid = extract_user_id(post)

            if uid not in user_posts:
                user_posts[uid] = []

            user_posts[uid].append(post)

        # write out the tweets by user
        for uid, posts in user_posts.items():
            user_fh.write('%s\n' % jsonlib.dumps({
                'user_id': uid,
                'posts': posts
            }))
            user_posts_written += len(posts)

        # delete the temporary file
        tmp_fh.close()
        os.remove(tmp_fname)

    # done
    user_fh.close()
    logger.debug("Read %s posts, wrote %s posts to users.json.gz" %
                 (posts_seen, user_posts_written))
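Note: a hypothetical call to posts2users; it hashes long(uid), so extract_user_id must return a numeric string (the field path and file locations below are illustrative, not from the source):

def extract_user_id(post):
    return post['user']['id_str']

posts2users('/data/posts.json.gz', extract_user_id,
            working_dir='/data', max_open_temp_files=128)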
Example #33
        def load_user(self, line):
                """
                Converts this compressed representation of the user's data into
                a dict format that mirrors the full JSON data, except with all
                unused fields omitted (e.g., posting date).
                """
                cols = line.split("\t")
                user_id_str = cols[0]
                user_id = user_id_str
                posts = []
                user_obj = {} 
                user_obj['user_id']  = user_id
                user_obj['posts'] = posts
                COLS_PER_POST = 8

                should_exclude_location_data = user_id in self.excluded_users
                
                #print "User %d had line with %d columns (%f posts)" % (user_id, len(cols), len(cols) / 8.0)
                
                for post_offset in range(1, len(cols), COLS_PER_POST):
                        try:
                                # Grab the relevant content for this post
                                text = cols[post_offset]
                                tweet_id = cols[post_offset+1]
                                self_reported_loc = cols[post_offset+2]
                                geo_str = cols[post_offset+3]
                                mentions_str = cols[post_offset+4]
                                hashtags_str = cols[post_offset+5]
                                is_retweet_str = cols[post_offset+6]
                                place_json = cols[post_offset+7]                                                
                                
                                # Reconstruct the post as a series of nested dicts that
                                # mirrors the real-world full JSON object in structure
                                post = {}
                                post["id_str"] = tweet_id
                                post["id"] = long(tweet_id)
                                post["text"] = text

                                if is_retweet_str == 'True':
                                        # We don't have any data to put, so just fill it
                                        # with an empty object
                                        post["retweeted_status"] = {}

                                entities = {}
                                post["entities"] = entities

                                user_mentions = []
                                entities["user_mentions"] = user_mentions
                                if len(mentions_str) > 0:
                                        mentions = mentions_str.split(" ")
                                        for mention in mentions:
                                                mention_obj = {}
                                                mention_obj["id"] = long(mention)
                                                mention_obj["id_str"] = mention
                                                user_mentions.append(mention_obj)

                                hashtags = []
                                entities["hashtags"] = hashtags
                                if len(hashtags_str) > 0:
                                        tags = hashtags_str.split(" ")
                                        for tag in tags:
                                                tag_obj = {}
                                                tag_obj["text"] = tag
                                                hashtags.append(tag_obj)
                                
                        
                                # Only include geo information for posts that are not in
                                # the set of excluded posts, which are likely being used
                                # for testing data
                                if len(geo_str) > 0 and not should_exclude_location_data:
                                        geo = {}
                                        post["geo"] = geo
                                        coordinates = []
                                        coords = geo_str.split(" ")
                                        coordinates.append(float(coords[0]))
                                        coordinates.append(float(coords[1]))
                                        geo["coordinates"] = coordinates

                                # Place is a special case because the field formatting
                                # is so complex, it's just saved as a raw JSON string.
                                # This requires reparsing place to stuff in our object.
                                # However, since place is relatively rare (1% of tweets),
                                # this isn't very expensive
                                if len(place_json) > 1 and not should_exclude_location_data:
                                        place = jsonlib.loads(place_json)
                                        post["place"] = place
                                user = {}
                                post["user"] = user
                                user["id_str"] = user_id_str
                                user["id"] = user_id
                                user["location"] = self_reported_loc

                                posts.append(post)
                        except:
                                logger.info("Saw malformed post when reading user; skipping")
                                pass
                        
                return user_obj
Example #34
import jsonlib

# write data to a file
data = {
    'no': 1,
    'name': 'Runoob',
    'url': 'http://runoob.txt.com'
}

with open('data.json', 'w') as f:
    jsonlib.dump(data, f)

# read the data back
with open('data.json', 'r') as f:
    data = jsonlib.loads(f.read())
Example #35
    def getTicket(self):
        t = jsonlib.loads(open(self.ticketFile).read(), use_float=True)

        if t["expires"] < time.time():
            raise ValueError("access ticket expired")
        return t["magic"]
Example #36
 def process_result_value(self, value, dialect):
     if value is None:
         return value
     else:
         return json.loads(value)
Example #37
import urllib
import jsonlib

# serviceurl = 'http://maps.googleapis.com/maps/api/geocode/json?'
serviceurl = 'http://python-data.dr-chuck.net/geojson?'

while True:
    address = input('Enter location: ')
    if len(address) < 1: break

    url = serviceurl + urllib.urlencode({
        'sensor': 'false',
        'address': address
    })

    print('Retrieving ', url)
    url_handle = urllib.urlopen(url)
    data = url_handle.read()

    print('Retrieved', len(data), 'characters')
    json_data = jsonlib.loads(data)

    #print json.dumps(json_data['results'], indent=3)
    print('Place id', json_data['results'][0]['place_id'])
Example #38
def jsonRowCount(jsonResults):
    """given a json string like parseJsonTerm takes, just count the rows"""
    return len(jsonlib.loads(jsonResults)['results']['bindings'])
Example #39
def set_column_widths(request, sheet):
    sheet.column_widths.update(jsonlib.loads(request.POST['column_widths']))
    sheet.save()
    return HttpResponse('OK')
Example #40
 def __init__(self, *args, **kwargs):
     models.Model.__init__(self, *args, **kwargs)
     self.column_widths = jsonlib.loads(self.column_widths_json)
     if not self.api_key:
         self.api_key = str(uuid4())
Example #41
def read():
    for line in open(sys.argv[1]):
        yield line.strip().split("\t")[-1]


def timeit(label, seq):
    t0 = time.time()
    for x in seq:
        pass
    t1 = time.time()
    print label, "%.2f sec" % (t1 - t0)


if False and __name__ == "__main__":
    timeit("read", read())
    timeit("simplejson.load", (simplejson.loads(json) for json in read()))
    timeit("jsonlib.load", (jsonlib.loads(json) for json in read()))

    timeit("simplejson.load-dump",
           (simplejson.dumps(simplejson.loads(json)) for json in read()))
    timeit("jsonlib.load-dump",
           (jsonlib.dumps(jsonlib.loads(json)) for json in read()))


def bench(count, f, *args):
    times = []
    for _ in range(count):
        t0 = time.time()
        f(*args)
        times.append(time.time() - t0)
    times = sorted(times)
    return "avg %.5f med %.5f max %.5f min %.5f" % (sum(times) / float(