def size_in_bytes(data):
    '''
    Gets the size in bytes of a str.
    @return: long
    '''
    try:
        import sys
        return sys.getsizeof(data)
    except AttributeError:
        # Python 2.5 and older: sys.getsizeof() is unavailable, so approximate
        # with the length of the base64 encoding (each ASCII char is 1 byte,
        # though base64 inflates the payload by roughly a third).
        import base64
        return len(base64.encodestring(data))
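
# A quick sanity check of what the helper above reports: sys.getsizeof()
# measures the whole Python object, interpreter overhead included, so for a
# plain byte string it is always larger than len(). Values below are from a
# 64-bit CPython 3 build and are illustrative only.
import sys

data = b"hello world"
print(len(data))            # 11 -> raw payload length
print(sys.getsizeof(data))  # 11 + per-object overhead (platform dependent)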
Example 2
    def send(self, xmli, signature=True):
        '''
        Sends an invoice
        @param xmli:str Invoice XML representation.
        @return: InvoiceReport
        '''
        xmli = to_byte_string(xmli)
        if not xmli:
            raise ValueError("Invalid XMLi")

        # XML-DSig: requires PyCrypto + lxml
        private_key, public_key = self.email.client.keys
        if signature and private_key and public_key:
            from greendizer import xmldsig
            xmli = xmldsig.sign(xmli, private_key, public_key)

        size = 0
        try:
            import sys
            size = sys.getsizeof(xmli)
        except AttributeError:
            # Python 2.5 and older: approximate with the base64 encoding length.
            import base64
            size = len(base64.encodestring(xmli))

        if size > MAX_CONTENT_LENGTH:
            raise ValueError("XMLi's size is limited to %skb."
                             % MAX_CONTENT_LENGTH / 1024)

        request = Request(self.email.client, method="POST", data=xmli,
                          uri=self._uri, content_type="application/xml")

        response = request.get_response()
        if response.status_code == 202: #Accepted
            return InvoiceReport(self.email,
                                 extract_id_from_uri(response["Location"]))
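
# Side note on the size check in send(): the string-formatting operator % and
# the division operator / share the same precedence and associate left to
# right, so the parentheses around (MAX_CONTENT_LENGTH / 1024) are required.
# Minimal illustration with a made-up limit (not the library's actual value):
MAX_CONTENT_LENGTH = 512 * 1024
print("XMLi's size is limited to %skb." % (MAX_CONTENT_LENGTH / 1024))  # works
# Without the parentheses, the string would be formatted first and then
# divided by 1024, raising a TypeError.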
Example 3
import time
import sys

# Assumes that a running SparkSession named `spark` and a project-level
# simulate_logistic() helper are available in this script's scope.
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression

tic0 = time.perf_counter()
##----------------------------------------------------------------------------------------
## Logistic Regression with SGD
##----------------------------------------------------------------------------------------
sample_size = 5000
p = 50
partition_method = "systematic"
partition_num = 20

data_pdf = simulate_logistic(sample_size, p, partition_method, partition_num)
data_sdf = spark.createDataFrame(data_pdf)

memsize = sys.getsizeof(data_pdf)

assembler = VectorAssembler(inputCols=["x" + str(x) for x in range(p)],
                            outputCol="features")

tic = time.perf_counter()
parsedData = assembler.transform(data_sdf)
time_parallelize = time.perf_counter() - tic

tic = time.perf_counter()
# Model configuration
lr = LogisticRegression(maxIter=100, regParam=0.3, elasticNetParam=0.8)

# Fit the model
lrModel = lr.fit(parsedData)
time_clusterrun = time.perf_counter() - tic
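
# Follow-up sketch: report the fitted model and the timings collected above.
# Uses the pyspark.ml LogisticRegressionModel attributes; the print formats
# are illustrative only.
print("Driver-side data size (bytes): %d" % memsize)
print("VectorAssembler transform time: %.2fs" % time_parallelize)
print("Model fit time: %.2fs" % time_clusterrun)
print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))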
Example 4
    def analysis_advance(self,
                         securities: str or [str],
                         analyzers: [str],
                         time_serial: (datetime.datetime, datetime.datetime),
                         progress_rate: ProgressRate = None,
                         enable_calculation: bool = True,
                         enable_from_cache: bool = True,
                         enable_update_cache: bool = True,
                         debug_load_json: bool = False,
                         debug_dump_json: bool = False,
                         dump_path: str = '') -> [AnalysisResult]:
        clock = Clock()
        total_result = []

        if progress_rate is not None:
            progress_rate.reset()

        if not isinstance(securities, list):
            securities = [securities]

        if progress_rate is not None:
            for analyzer in analyzers:
                progress_rate.set_progress(analyzer, 0, len(securities))

        for analyzer in analyzers:
            result = None
            uncached = True

            if debug_load_json:
                # DEBUG: Load result from json file
                clock.reset()
                with open(path.join(dump_path, analyzer + '.json'), 'rt') as f:
                    result = analysis_results_from_json(f)
                print('Analyzer %s : Load json finished, time spending: %ss' %
                      (analyzer, clock.elapsed_s()))
            else:
                if enable_from_cache:
                    df = self.result_from_cache('Result.Analyzer',
                                                analyzer=analyzer,
                                                identity=securities,
                                                time_serial=time_serial)
                    result = analysis_dataframe_to_list(df)

                    if result is None or len(result) == 0:
                        result = None
                        print('Analyzer %s : No cache data' % analyzer)
                    else:
                        uncached = False
                        if progress_rate is not None:
                            progress_rate.finish_progress(analyzer)
                        print(
                            'Analyzer %s : Load cache finished, time spending: %ss'
                            % (analyzer, clock.elapsed_s()))

                if result is None and enable_calculation:
                    clock.reset()
                    if progress_rate is not None:
                        result = self.run_strategy(securities, [analyzer],
                                                   time_serial=time_serial,
                                                   progress=progress_rate)
                    else:
                        result = self.run_strategy(securities, [analyzer],
                                                   time_serial=time_serial)
                    print(
                        'Analyzer %s : Execute analysis, time spending: %ss' %
                        (analyzer, clock.elapsed_s()))

                if result is not None and len(result) > 0:
                    total_result.extend(result)
                    byte_size = sys.getsizeof(total_result) + sum(
                        r.rough_size() for r in total_result)
                    print('Total result size = %.2f MB' %
                          (float(byte_size) / 1024 / 1024))
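                    # Note: sys.getsizeof() is shallow for containers; it counts
                    # only the list object and its pointer array, not the elements.
                    # That is why each element's rough_size() is added on top.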

                    if debug_dump_json:
                        # DEBUG: Dump result to json file
                        clock.reset()
                        with open(path.join(dump_path, analyzer + '.json'),
                                  'wt') as f:
                            analysis_results_to_json(result, f)
                        print('Analyzer %s : Dump json, time spending: %ss' %
                              (analyzer, clock.elapsed_s()))

                    if uncached and enable_update_cache:
                        clock.reset()
                        self.cache_analysis_result('Result.Analyzer', result)
                        print(
                            'Analyzer %s : Cache result, time spending: %ss' %
                            (analyzer, clock.elapsed_s()))
        return total_result
Example 5
# Read or load data chunks into pandas
#-----------------------------------------------------------------------------------------
time_2sdf_sub = []
time_repartition_sub = []

loop_counter = 0
for file_no_i in range(n_files):
    tic_2sdf = time.perf_counter()

    if using_data == "simulated_pdf":
        if file_no_i == 0:
            # To test performance, we simulate only one subset of data and replicate it.
            data_pdf_i = simulate_logistic(sample_size_sub[0], p,
                                           partition_method, partition_num_sub)
            memsize_sub0 = sys.getsizeof(data_pdf_i)
        else:
            sample_size_sub.append(sample_size_sub[0])
            memsize_sub.append(memsize_sub0)
            partition_num_sub.append(partition_num_sub[0])

    elif using_data == "real_pdf":  # Read real data
        data_pdf_i0 = clean_airlinedata(os.path.expanduser(
            file_path[file_no_i]),
                                        fit_intercept=fit_intercept)

        # Create an empty DataFrame with the missing columns and expand the
        # current subset to the full column set.
        edf = pd.DataFrame(
            columns=list(set(dummy_column_names) - set(data_pdf_i0.columns)))
        data_pdf_i = data_pdf_i0.append(edf, sort=True)
        del data_pdf_i0
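
# The column-alignment trick above relies on DataFrame.append(), which is
# deprecated and removed in pandas >= 2.0. A roughly equivalent sketch using
# reindex() (column names below are illustrative only):
import pandas as pd

dummy_column_names = ["a", "b", "c", "d"]
chunk = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
# Add any missing dummy columns (filled with NaN) and keep a sorted column
# order, mimicking append(..., sort=True) above.
chunk = chunk.reindex(columns=sorted(set(chunk.columns) | set(dummy_column_names)))
print(chunk.columns.tolist())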
Example 6
# Create lists with comprehensions and generators

import sys
f = [x for x in range(1, 10)]
print(f)
f = [x + y for x in 'ABCDE' for y in '12346']
print(f)
# Create a list container using list comprehension syntax
f = [x**2 for x in range(1, 10000)]
# Inspect the memory footprint of the object
print(sys.getsizeof(f))

# Create a generator object
f = (x**2 for x in range(1, 1000))
print(f)
print(sys.getsizeof(f))

# Iterate over the generator object
for val in f:
    print(val)
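
# The comparison above uses different range sizes, but the point is that a
# generator's reported size stays constant no matter how many items it can
# yield, while a list grows with its element count. Minimal sketch:
small_gen = (x**2 for x in range(10))
huge_gen = (x**2 for x in range(10**7))
print(sys.getsizeof(small_gen), sys.getsizeof(huge_gen))  # same small size for both
print(sys.getsizeof(list(range(10))), sys.getsizeof(list(range(10**4))))  # list grows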
Example 7
    def analysis_advance(self,
                         securities: str or [str],
                         analyzers: [str],
                         time_serial: (datetime.datetime, datetime.datetime),
                         progress_rate: ProgressRate = None,
                         enable_calculation: bool = True,
                         enable_from_cache: bool = True,
                         enable_update_cache: bool = True,
                         debug_load_json: bool = False,
                         debug_dump_json: bool = False,
                         dump_path: str = '') -> [AnalysisResult]:
        """
        Execute analysis, perform the extra jobs specified by the parameters,
        and dump the offline analysis result.
        :param securities: The securities that you want to analyze
        :param analyzers: The analyzers that you want to execute
        :param time_serial: The time range (as a tuple) that you want to analyze
        :param progress_rate: The progress tracker; None if you don't need progress updates
        :param enable_calculation: If False, only cached data or debug json data is used; no calculation is done
        :param enable_from_cache: If True, data is fetched from the cache first; if it is missing, the calculation is run
        :param enable_update_cache: If True, the calculated result (if a calculation was executed) is cached
        :param debug_load_json: If True, data comes from the debug json file (not the offline analysis result json)
        :param debug_dump_json: If True, data is dumped to the debug json file (not the offline analysis result json)
        :param dump_path: The directory (not file name) for the debug json and offline analysis result json
        :return: Analysis result list
        """
        clock = Clock()
        total_result = []

        if progress_rate is not None:
            progress_rate.reset()

        if not isinstance(securities, list):
            securities = [securities]

        if progress_rate is not None:
            for analyzer in analyzers:
                progress_rate.set_progress(analyzer, 0, len(securities))
            # So that the dump step carries enough weight in the overall progress percentage
            progress_rate.set_progress('dump_result_json', 0, len(securities))

        # Remove microsecond to avoid mongodb query fail.
        # time_serial = [t.replace(microsecond=0)  for t in time_serial]

        errors = []
        for analyzer in analyzers:
            result = None
            uncached = True

            if debug_load_json:
                # DEBUG: Load result from json file
                clock.reset()
                try:
                    with open(path.join(dump_path, analyzer + '.json'),
                              'rt') as f:
                        result = analysis_results_from_json(f)
                    if result is not None and len(result) > 0:
                        total_result.extend(result)
                    print(
                        'Analyzer %s : Load json finished, time spending: %ss'
                        % (analyzer, clock.elapsed_s()))
                except Exception as e:
                    print('Analyzer load from json fail. Continue...')
                    print(e)
                    print(traceback.format_exc())
                finally:
                    pass
            else:
                if enable_from_cache:
                    df = self.result_from_cache('Result.Analyzer',
                                                analyzer=analyzer,
                                                identity=securities,
                                                time_serial=time_serial)
                    result = analysis_result_dataframe_to_list(df)

                    if result is None or len(result) == 0:
                        result = None
                        print('Analyzer %s : No cache data' % analyzer)
                    else:
                        uncached = False
                        if progress_rate is not None:
                            progress_rate.finish_progress(analyzer)
                        print(
                            'Analyzer %s : Load cache finished, time spending: %ss'
                            % (analyzer, clock.elapsed_s()))

                if result is None and enable_calculation:
                    clock.reset()
                    self.__strategy_plugin.clear_error()
                    if progress_rate is not None:
                        result = self.run_strategy(securities, [analyzer],
                                                   time_serial=time_serial,
                                                   progress=progress_rate)
                    else:
                        result = self.run_strategy(securities, [analyzer],
                                                   time_serial=time_serial)
                    errors.append(self.__strategy_plugin.get_last_error())
                    print(
                        'Analyzer %s : Execute analysis, time spending: %ss' %
                        (analyzer, clock.elapsed_s()))

                if result is not None and len(result) > 0:
                    total_result.extend(result)
                    byte_size = sys.getsizeof(total_result) + sum(
                        r.rough_size() for r in total_result)
                    print('Total result size = %.2f MB' %
                          (float(byte_size) / 1024 / 1024))

                    if debug_dump_json:
                        # DEBUG: Dump result to json file
                        clock.reset()
                        with open(path.join(dump_path, analyzer + '.json'),
                                  'wt') as f:
                            analysis_results_to_json(result, f)
                        print('Analyzer %s : Dump json, time spending: %ss' %
                              (analyzer, clock.elapsed_s()))

                    if uncached and enable_update_cache:
                        clock.reset()
                        self.cache_analysis_result('Result.Analyzer', result)
                        print(
                            'Analyzer %s : Cache result, time spending: %ss' %
                            (analyzer, clock.elapsed_s()))

        errors = [err for err in errors if err[0] is not None]
        if len(errors) > 0:
            print(
                '----------------------------- Analysis Errors -----------------------------'
            )
            for err in errors:
                if err[0] is not None:
                    print(err[0])
                    print(err[1])
                    print(
                        '---------------------------------------------------------------------------'
                    )

        name_dict_path = os.path.join(dump_path, 'analyzer_names.json')
        full_dump_path = os.path.join(dump_path, 'analysis_result.json')

        self.dump_analysis_report(total_result, full_dump_path)
        self.dump_strategy_name_dict(name_dict_path)
        if progress_rate is not None:
            progress_rate.finish_progress('dump_result_json')

        return total_result
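
# The per-result size report in analysis_advance() relies on each result object
# exposing a rough_size() method, whose implementation is not shown in this
# listing. The class below is only a hypothetical sketch of how such an
# estimate could be produced by summing sys.getsizeof() over an object's
# attributes (still shallow: nested containers are undercounted).
import sys

class _RoughSized:
    def __init__(self, payload):
        self.payload = payload

    def rough_size(self):
        # Shallow estimate: the object itself plus each attribute value.
        return sys.getsizeof(self) + sum(
            sys.getsizeof(value) for value in vars(self).values())

results = [_RoughSized(b"x" * 1024) for _ in range(3)]
total_bytes = sys.getsizeof(results) + sum(r.rough_size() for r in results)
print("Total result size = %.2f KB" % (total_bytes / 1024.0))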