def size_in_bytes(data):
    '''
    Gets the size in bytes of a str.
    @return: long
    '''
    try:
        import sys
        return sys.getsizeof(data)
    except AttributeError:
        # Python 2.5 and older: sys.getsizeof is not available.
        import base64
        return len(base64.encodestring(data))  # 1 ASCII char = 1 byte
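# A minimal usage sketch (the printed value is illustrative only): sys.getsizeof
# reports the interpreter-level size of the str object, including object
# overhead, not just the character payload.
payload = '<invoice/>'
print(size_in_bytes(payload))  # on CPython 3.x typically 49 + len(payload) for ASCII text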
def send(self, xmli, signature=True):
    '''
    Sends an invoice.
    @param xmli: str Invoice XML representation.
    @return: InvoiceReport
    '''
    xmli = to_byte_string(xmli)
    if not xmli:
        raise ValueError("Invalid XMLi")

    # XMLDSig signing requires PyCrypto + lxml
    private_key, public_key = self.email.client.keys
    if signature and private_key and public_key:
        from greendizer import xmldsig
        xmli = xmldsig.sign(xmli, private_key, public_key)

    size = 0
    try:
        import sys
        size = sys.getsizeof(xmli)
    except AttributeError:
        # Python 2.5 and older: sys.getsizeof is not available.
        import base64
        size = len(base64.encodestring(xmli))  # 1 ASCII char = 1 byte

    if size > MAX_CONTENT_LENGTH:
        raise ValueError("XMLi's size is limited to %skb."
                         % (MAX_CONTENT_LENGTH / 1024))

    request = Request(self.email.client, method="POST", data=xmli,
                      uri=self._uri, content_type="application/xml")
    response = request.get_response()

    if response.status_code == 202:  # Accepted
        return InvoiceReport(self.email,
                             extract_id_from_uri(response["Location"]))
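# Standalone sketch of the size guard above; MAX_CONTENT_LENGTH here is an
# assumed value for illustration only. The division must be parenthesised:
# "%s" % x / 1024 would format first and then try to divide the resulting str.
import sys

MAX_CONTENT_LENGTH = 512 * 1024  # hypothetical limit in bytes

payload = b"<invoice/>"
if sys.getsizeof(payload) > MAX_CONTENT_LENGTH:
    raise ValueError("XMLi's size is limited to %skb." % (MAX_CONTENT_LENGTH / 1024))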
import time
import sys

tic0 = time.perf_counter()

##----------------------------------------------------------------------------------------
## Logistic Regression with SGD
##----------------------------------------------------------------------------------------
sample_size = 5000
p = 50
partition_method = "systematic"
partition_num = 20

data_pdf = simulate_logistic(sample_size, p, partition_method, partition_num)
data_sdf = spark.createDataFrame(data_pdf)
memsize = sys.getsizeof(data_pdf)

assembler = VectorAssembler(inputCols=["x" + str(x) for x in range(p)],
                            outputCol="features")

tic = time.perf_counter()
parsedData = assembler.transform(data_sdf)
time_parallelize = time.perf_counter() - tic

tic = time.perf_counter()
# Model configuration
lr = LogisticRegression(maxIter=100, regParam=0.3, elasticNetParam=0.8)
# Fit the model
lrModel = lr.fit(parsedData)
time_clusterrun = time.perf_counter() - tic
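# Side note (standalone sketch): sys.getsizeof() on a pandas DataFrame does not
# account for the contents of object-dtype columns, so memsize above can badly
# underestimate string-heavy data; DataFrame.memory_usage(deep=True) is usually
# the more meaningful number when estimating what is shipped to Spark.
import sys
import pandas as pd

df = pd.DataFrame({"x0": range(1000), "label": ["a"] * 1000})
print(sys.getsizeof(df))                 # shallow estimate
print(df.memory_usage(deep=True).sum())  # per-column estimate including object contents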
def analysis_advance(self, securities: str or [str], analyzers: [str],
                     time_serial: (datetime.datetime, datetime.datetime),
                     progress_rate: ProgressRate = None,
                     enable_calculation: bool = True,
                     enable_from_cache: bool = True,
                     enable_update_cache: bool = True,
                     debug_load_json: bool = False,
                     debug_dump_json: bool = False,
                     dump_path: str = '') -> [AnalysisResult]:
    clock = Clock()
    total_result = []

    if progress_rate is not None:
        progress_rate.reset()

    if not isinstance(securities, list):
        securities = [securities]

    if progress_rate is not None:
        for analyzer in analyzers:
            progress_rate.set_progress(analyzer, 0, len(securities))

    for analyzer in analyzers:
        result = None
        uncached = True

        if debug_load_json:
            # DEBUG: Load result from json file
            clock.reset()
            with open(path.join(dump_path, analyzer + '.json'), 'rt') as f:
                result = analysis_results_from_json(f)
            print('Analyzer %s : Load json finished, time spending: %ss' %
                  (analyzer, clock.elapsed_s()))
        else:
            if enable_from_cache:
                df = self.result_from_cache('Result.Analyzer', analyzer=analyzer,
                                            identity=securities, time_serial=time_serial)
                result = analysis_dataframe_to_list(df)

                if result is None or len(result) == 0:
                    result = None
                    print('Analyzer %s : No cache data' % analyzer)
                else:
                    uncached = False
                    if progress_rate is not None:
                        progress_rate.finish_progress(analyzer)
                    print('Analyzer %s : Load cache finished, time spending: %ss' %
                          (analyzer, clock.elapsed_s()))

            if result is None and enable_calculation:
                clock.reset()
                if progress_rate is not None:
                    result = self.run_strategy(securities, [analyzer],
                                               time_serial=time_serial,
                                               progress=progress_rate)
                else:
                    result = self.run_strategy(securities, [analyzer],
                                               time_serial=time_serial)
                print('Analyzer %s : Execute analysis, time spending: %ss' %
                      (analyzer, clock.elapsed_s()))

        if result is not None and len(result) > 0:
            total_result.extend(result)
            byte_size = sys.getsizeof(total_result) + sum(
                r.rough_size() for r in total_result)
            print('Total result size = %.2f MB' % (float(byte_size) / 1024 / 1024))

            if debug_dump_json:
                # DEBUG: Dump result to json file
                clock.reset()
                with open(path.join(dump_path, analyzer + '.json'), 'wt') as f:
                    analysis_results_to_json(result, f)
                print('Analyzer %s : Dump json, time spending: %ss' %
                      (analyzer, clock.elapsed_s()))

            if uncached and enable_update_cache:
                clock.reset()
                self.cache_analysis_result('Result.Analyzer', result)
                print('Analyzer %s : Cache result, time spending: %ss' %
                      (analyzer, clock.elapsed_s()))

    return total_result
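# The rough-size accounting above works around the fact that sys.getsizeof()
# is shallow: for a list it counts only the pointer array, not the referenced
# result objects. A minimal standalone sketch of the same idea; the class and
# attribute names here are hypothetical, not the project's AnalysisResult.
import sys

class FakeResult:
    def __init__(self, payload: str):
        self.payload = payload

    def rough_size(self) -> int:
        # Approximate per-item footprint: the instance dict plus the payload str.
        return sys.getsizeof(self.__dict__) + sys.getsizeof(self.payload)

results = [FakeResult('x' * 100) for _ in range(1000)]
byte_size = sys.getsizeof(results) + sum(r.rough_size() for r in results)
print('Total result size = %.2f MB' % (byte_size / 1024 / 1024))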
# Read or load data chunks into pandas
#-----------------------------------------------------------------------------------------
time_2sdf_sub = []
time_repartition_sub = []

loop_counter = 0
for file_no_i in range(n_files):
    tic_2sdf = time.perf_counter()

    if using_data == "simulated_pdf":
        if file_no_i == 0:
            # To test performance, we only simulate one subset of data and replicate it.
            data_pdf_i = simulate_logistic(sample_size_sub[0], p,
                                           partition_method,
                                           partition_num_sub)
            memsize_sub0 = sys.getsizeof(data_pdf_i)
        else:
            sample_size_sub.append(sample_size_sub[0])
            memsize_sub.append(memsize_sub0)
            partition_num_sub.append(partition_num_sub[0])

    elif using_data == "real_pdf":
        # Read real data
        data_pdf_i0 = clean_airlinedata(os.path.expanduser(file_path[file_no_i]),
                                        fit_intercept=fit_intercept)

        # Create a full-column empty DataFrame and resize the current subset
        edf = pd.DataFrame(
            columns=list(set(dummy_column_names) - set(data_pdf_i0.columns)))
        data_pdf_i = data_pdf_i0.append(edf, sort=True)
        del data_pdf_i0
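# Side note: DataFrame.append() is deprecated in newer pandas and removed in
# pandas 2.x; the same "pad to the full dummy-column set" step can be written
# with reindex. A minimal standalone sketch with hypothetical column names:
import pandas as pd

dummy_column_names = ['carrier_AA', 'carrier_UA', 'carrier_DL']
subset = pd.DataFrame({'carrier_AA': [1, 0]})
full = subset.reindex(columns=sorted(set(dummy_column_names) | set(subset.columns)))
print(full.columns.tolist())  # all dummy columns present, missing ones filled with NaN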
# Create lists with list comprehensions
import sys

f = [x for x in range(1, 10)]
print(f)
f = [x + y for x in 'ABCDE' for y in '12346']
print(f)

# Create a list container using list comprehension syntax
f = [x ** 2 for x in range(1, 10000)]
# Check how much memory the object occupies
print(sys.getsizeof(f))

# Create a generator object
f = (x ** 2 for x in range(1, 1000))
print(f)
print(sys.getsizeof(f))
# Iterate over the generator object
for val in f:
    print(val)
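# A small follow-up sketch: the generator's reported size stays roughly
# constant no matter how large the range is, because values are produced
# lazily rather than stored up front.
import sys

small_gen = (x ** 2 for x in range(1, 1000))
huge_gen = (x ** 2 for x in range(1, 10 ** 9))
print(sys.getsizeof(small_gen), sys.getsizeof(huge_gen))  # typically equal, a few hundred bytes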
def analysis_advance(self, securities: str or [str], analyzers: [str],
                     time_serial: (datetime.datetime, datetime.datetime),
                     progress_rate: ProgressRate = None,
                     enable_calculation: bool = True,
                     enable_from_cache: bool = True,
                     enable_update_cache: bool = True,
                     debug_load_json: bool = False,
                     debug_dump_json: bool = False,
                     dump_path: str = '') -> [AnalysisResult]:
    """
    Execute analysis, do the extra jobs specified by the parameters, and dump the offline analysis result.
    :param securities: The securities that you want to analyse
    :param analyzers: The analyzers that you want to execute
    :param time_serial: The time range, as a tuple, that you want to analyse
    :param progress_rate: The progress rate; None if you don't need progress updates
    :param enable_calculation: If False, only cached data or debug json data is used; no calculation is done
    :param enable_from_cache: If True, data is fetched from the cache first; if it does not exist, calculation is done
    :param enable_update_cache: If True, the calculation result (if calculation was executed) is cached
    :param debug_load_json: If True, data comes from the debug json file (not the offline analysis result json)
    :param debug_dump_json: If True, data is dumped to the debug json file (not the offline analysis result json)
    :param dump_path: The directory (not a file name) for the debug json and the offline analysis result json
    :return: Analysis result list
    """
    clock = Clock()
    total_result = []

    if progress_rate is not None:
        progress_rate.reset()

    if not isinstance(securities, list):
        securities = [securities]

    if progress_rate is not None:
        for analyzer in analyzers:
            progress_rate.set_progress(analyzer, 0, len(securities))
        # So the percentage of the dump progress carries enough weight
        progress_rate.set_progress('dump_result_json', 0, len(securities))

    # Remove microseconds to avoid mongodb query failures.
    # time_serial = [t.replace(microsecond=0) for t in time_serial]

    errors = []
    for analyzer in analyzers:
        result = None
        uncached = True

        if debug_load_json:
            # DEBUG: Load result from json file
            clock.reset()
            try:
                with open(path.join(dump_path, analyzer + '.json'), 'rt') as f:
                    result = analysis_results_from_json(f)
                if result is not None and len(result) > 0:
                    total_result.extend(result)
                print('Analyzer %s : Load json finished, time spending: %ss' %
                      (analyzer, clock.elapsed_s()))
            except Exception as e:
                print('Analyzer load from json failed. Continue...')
                print(e)
                print(traceback.format_exc())
        else:
            if enable_from_cache:
                df = self.result_from_cache('Result.Analyzer', analyzer=analyzer,
                                            identity=securities, time_serial=time_serial)
                result = analysis_result_dataframe_to_list(df)

                if result is None or len(result) == 0:
                    result = None
                    print('Analyzer %s : No cache data' % analyzer)
                else:
                    uncached = False
                    if progress_rate is not None:
                        progress_rate.finish_progress(analyzer)
                    print('Analyzer %s : Load cache finished, time spending: %ss' %
                          (analyzer, clock.elapsed_s()))

            if result is None and enable_calculation:
                clock.reset()
                self.__strategy_plugin.clear_error()
                if progress_rate is not None:
                    result = self.run_strategy(securities, [analyzer],
                                               time_serial=time_serial,
                                               progress=progress_rate)
                else:
                    result = self.run_strategy(securities, [analyzer],
                                               time_serial=time_serial)
                errors.append(self.__strategy_plugin.get_last_error())
                print('Analyzer %s : Execute analysis, time spending: %ss' %
                      (analyzer, clock.elapsed_s()))

            if result is not None and len(result) > 0:
                total_result.extend(result)
                byte_size = sys.getsizeof(total_result) + sum(
                    r.rough_size() for r in total_result)
                print('Total result size = %.2f MB' % (float(byte_size) / 1024 / 1024))

                if debug_dump_json:
                    # DEBUG: Dump result to json file
                    clock.reset()
                    with open(path.join(dump_path, analyzer + '.json'), 'wt') as f:
                        analysis_results_to_json(result, f)
                    print('Analyzer %s : Dump json, time spending: %ss' %
                          (analyzer, clock.elapsed_s()))

                if uncached and enable_update_cache:
                    clock.reset()
                    self.cache_analysis_result('Result.Analyzer', result)
                    print('Analyzer %s : Cache result, time spending: %ss' %
                          (analyzer, clock.elapsed_s()))

    errors = [err for err in errors if err[0] is not None]
    if len(errors) > 0:
        print('----------------------------- Analysis Errors -----------------------------')
        for err in errors:
            if err[0] is not None:
                print(err[0])
                print(err[1])
        print('---------------------------------------------------------------------------')

    name_dict_path = os.path.join(dump_path, 'analyzer_names.json')
    full_dump_path = os.path.join(dump_path, 'analysis_result.json')

    self.dump_analysis_report(total_result, full_dump_path)
    self.dump_strategy_name_dict(name_dict_path)

    if progress_rate is not None:
        progress_rate.finish_progress('dump_result_json')

    return total_result
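# The Clock helper used above is project-specific; a minimal standalone sketch
# of the same reset/elapsed pattern using only the standard library (a
# hypothetical stand-in, not the project's implementation):
import time

class Clock:
    def __init__(self):
        self._start = time.perf_counter()

    def reset(self):
        # Restart the measurement window.
        self._start = time.perf_counter()

    def elapsed_s(self) -> float:
        # Seconds elapsed since construction or the last reset().
        return time.perf_counter() - self._start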