Example #1
 def get_memory_usage(self):
     """
     Returns the sizes (in bytes) of the four main models that the classifier keeps in memory.
     """
     with self.lock:
         return asizeof.asizeof(self.hc), asizeof.asizeof(self.htc), asizeof.asizeof(self.tc), \
             asizeof.asizeof(self.thc)
    def _get_lines(self, file1, file2, lines, file1_dest, file2_dest):
        """
        Given two files to open and a sorted list of line numbers to fetch, opens
        the files, retrieves the desired lines, and dumps them to the specified
        file locations.
        """
        lines = deque(lines)
        buf1, buf2 = [], []
        line_counter, target_line = 0, lines.popleft()

        for f1_line, f2_line in zip(open(file1, 'r'), open(file2, 'r')):
            if target_line == line_counter:
                buf1.append(f1_line.strip())
                buf2.append(f2_line.strip())

                if asizeof.asizeof(buf1) + asizeof.asizeof(buf2) > \
                    self.mem_limit:
                    self._dump_bufs_to( [file1_dest, file2_dest],
                                        [buf1, buf2])

                if len(lines) != 0:
                    target_line = lines.popleft()
                else:
                    break
            line_counter += 1

        self._dump_bufs_to( [file1_dest, file2_dest],
                            [buf1, buf2])
    def cleanse(self, src_lang_file, tar_lang_file):
        """
        Cleans the provided files by lowercasing all words and keeping only lines
        whose length is between min_len and max_len. Operates on two streams
        simultaneously in order to preserve line-to-line correspondence.
        """
        self._validate_file(src_lang_file), self._validate_file(tar_lang_file)
        src_dest_file = self.destdir + utilities.strip_filename_from_path(src_lang_file) + ".cleansed"
        tar_dest_file = self.destdir + utilities.strip_filename_from_path(tar_lang_file) + ".cleansed"

        if utilities.files_exist([src_dest_file, tar_dest_file]):
            return
        else:
            utilities.wipe_files([src_dest_file, tar_dest_file])
        self._print("""Cleaning data.  Ensuring uniformity of data...""")

        src_buf, tar_buf = [], []
        for src_line, tar_line in zip(open(src_lang_file), open(tar_lang_file)):
            src_line = src_line.lower().split()
            tar_line = tar_line.lower().split()

            if len(src_line) > self.min_len and len(src_line) < self.max_len and \
                len(tar_line) > self.min_len and len(tar_line) < self.max_len:
                src_buf.append(' '.join(src_line))
                tar_buf.append(' '.join(tar_line))

            if asizeof.asizeof(src_buf) + asizeof.asizeof(tar_buf) > self.mem_limit:
                self._dump_bufs_to( [src_dest_file, tar_dest_file],
                                    [src_buf, tar_buf])

        self._dump_bufs_to([src_dest_file, tar_dest_file], [src_buf, tar_buf])
        self._print("Done\n")
def test_copy_features_does_not_copy_entityset(es):
    agg = Sum(es['log']['value'], es['sessions'])
    agg_where = Sum(es['log']['value'], es['sessions'],
                    where=IdentityFeature(es['log']['value']) == 2)
    agg_use_previous = Sum(es['log']['value'], es['sessions'],
                           use_previous='4 days')
    agg_use_previous_where = Sum(es['log']['value'], es['sessions'],
                                 where=IdentityFeature(es['log']['value']) == 2,
                                 use_previous='4 days')
    features = [agg, agg_where, agg_use_previous, agg_use_previous_where]
    in_memory_size = asizeof(locals())
    copied = [f.copy() for f in features]
    new_in_memory_size = asizeof(locals())
    assert new_in_memory_size < 2 * in_memory_size

    for f, c in zip(features, copied):
        assert f.entityset
        assert c.entityset
        assert id(f.entityset) == id(c.entityset)
        if f.where:
            assert c.where
            assert id(f.where.entityset) == id(c.where.entityset)
        for bf, bf_c in zip(f.base_features, c.base_features):
            assert id(bf.entityset) == id(bf_c.entityset)
            if bf.where:
                assert bf_c.where
                assert id(bf.where.entityset) == id(bf_c.where.entityset)
Example #5
def test_slots_being_used():
    """
    The class is really using __slots__.
    """
    non_slot_instance = C1(x=1, y="test")
    slot_instance = C1Slots(x=1, y="test")

    assert "__dict__" not in dir(slot_instance)
    assert "__slots__" in dir(slot_instance)

    assert "__dict__" in dir(non_slot_instance)
    assert "__slots__" not in dir(non_slot_instance)

    assert set(["x", "y"]) == set(slot_instance.__slots__)

    if has_pympler:
        assert asizeof(slot_instance) < asizeof(non_slot_instance)

    non_slot_instance.t = "test"
    with pytest.raises(AttributeError):
        slot_instance.t = "test"

    assert 1 == non_slot_instance.method()
    assert 1 == slot_instance.method()

    assert attr.fields(C1Slots) == attr.fields(C1)
    assert attr.asdict(slot_instance) == attr.asdict(non_slot_instance)
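A smaller, self-contained illustration of the size difference the test above asserts; WithDict and WithSlots are stand-ins defined here, not the attrs-based C1/C1Slots classes:

from pympler.asizeof import asizeof

class WithDict:
    def __init__(self, x, y):
        self.x, self.y = x, y

class WithSlots:
    __slots__ = ("x", "y")
    def __init__(self, x, y):
        self.x, self.y = x, y

# The slotted instance carries no per-instance __dict__, so asizeof
# typically reports a smaller deep size for it.
print(asizeof(WithDict(1, "test")), asizeof(WithSlots(1, "test")))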
 def process_response(self, request, response):
         req = request.META['PATH_INFO']
         if req.find('static') == -1 and req.find('media') == -1:
                 print req
                 self.end_objects = muppy.get_objects()
                 sum_start = summary.summarize(self.start_objects)
                 sum_end = summary.summarize(self.end_objects)
                 diff = summary.get_diff(sum_start, sum_end)
                 summary.print_(diff)
                 #print '~~~~~~~~~'
                 #cb = refbrowser.ConsoleBrowser(response, maxdepth=2, \
                         #str_func=output_function)
                 #cb.print_tree()
                 print '~~~~~~~~~'
                 a = asizeof(response)
                 print 'Total size of response object in kB: %s' % \
                     str(a / 1024.0)
                 print '~~~~~~~~~'
                 a = asizeof(self.end_objects)
                 print 'Total size of end_objects in MB: %s' % \
                     str(a / 1048576.0)
                 b = asizeof(self.start_objects)
                 print 'Total size of start_objects in MB: %s' % \
                     str(b / 1048576.0)
                 print '~~~~~~~~~'
         return response
Example #7
 def push(self, msg):
     serialized_msg = pickle.dumps(msg)
     from pympler.asizeof import asizeof
     print('unpickled: {}, pickled: {}'.format(
         asizeof(msg),
         asizeof(serialized_msg)
     ))
     self.output(serialized_msg)
Example #8
 def test_adict(self):
     '''Test asizeof.adict()
     '''
     pdict = PseudoDict()
     size1 = asizeof.asizeof(pdict)
     asizeof.adict(PseudoDict)
     size2 = asizeof.asizeof(pdict)
     # TODO: come up with useful assertions
     self.assertEqual(size1, size2)
Example #9
    def test_methods(self):
        '''Test sizing methods and functions
        '''
        def foo():
            pass

        s1 = asizeof.asizeof(self.test_methods, code=True)
        s2 = asizeof.asizeof(TypesTest.test_methods, code=True)
        s3 = asizeof.asizeof(foo, code=True)
Example #10
    def test_globals(self):
        '''Test globals examples'''
        self._printf('%sasizeof(%s, limit=%s, code=%s) ... %s', os.linesep, 'globals()', 'MAX', False, '-glob[als]')
        asizeof.asizeof(globals(), limit=self.MAX, code=False, stats=1)
        self._print_functions(globals(), 'globals()', opt='-glob[als]')

        self._printf('%sasizesof(%s, limit=%s, code=%s) ... %s', os.linesep, 'globals(), locals()', 'MAX', False, '-glob[als]')
        asizeof.asizesof(globals(), locals(), limit=self.MAX, code=False, stats=1)
        asizeof.asized(globals(), align=0, detail=self.MAX, limit=self.MAX, code=False, stats=1)
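For reference, the three entry points exercised above return different things; a minimal sketch using a small dictionary instead of globals():

from pympler import asizeof

data = {"a": [1, 2, 3], "b": "spam"}

total = asizeof.asizeof(data)                      # one combined size in bytes
per_obj = list(asizeof.asizesof(data, data["a"]))  # one size per argument
tree = asizeof.asized(data, detail=2)              # Asized object with a per-reference breakdown
print(total, per_obj)
print(tree.format())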
Example #11
    def push(self, msg):

        compressed_msg = zlib.compress(msg)
        from pympler.asizeof import asizeof
        print('uncompressed: {}, compressed: {}'.format(
            asizeof(msg),
            asizeof(compressed_msg)
        ))
        self.output(compressed_msg)
Example #12
def run(dicp="~/dev/kaggle/fb5/pdic.map", datap="~/dev/kaggle/fb5/train.tab", lr=1., numbats=100, epochs=10):
    dic, revdic = loaddict(expanduser(dicp))
    print len(dic)
    traindata, golddata = loaddata(expanduser(datap), top=10000)
    print asizeof(traindata), golddata.dtype
    m = SpatialEmb(dim=len(dic))

    m.train([traindata], golddata).adagrad(lr=lr).cross_entropy()\
        .split_validate(splits=100, random=True).cross_entropy().accuracy()\
        .train(numbats, epochs)
Example #13
 def getSizeOfMgrs(self):
     """ get size of object """
     appGlobal = config['pylons.app_globals']
     
     result = {}
     result['threadmgr'] = asizeof(appGlobal.threadMgr)
     result['packagemgr'] = asizeof(appGlobal.packageMgr)
     result['monitor'] = asizeof(appGlobal.agentMonitor)
     result['all'] = asizeof(appGlobal)
     
     return doneResult(request, response, result = result, controller = self)
Example #14
 def add_results_data(self, results):
     if SIZE_CONTROL:
         if not self.MEM_LIMIT:
             mem_size = asizeof(self.current_task.results)
             add_size = asizeof(results)
             if (mem_size + add_size) < 15000000:
                 self._add_results(results)
             else:
                 self.MEM_LIMIT = True
     else:
         self._add_results(results)
Example #15
 def test_asizer(self):
     '''Test Asizer properties.
     '''
     sizer = asizeof.Asizer()
     obj = 'unladen swallow'
     mutable = [obj]
     sizer.asizeof(obj)
     self.assertEqual(sizer.total, asizeof.asizeof(obj))
     sizer.asizeof(mutable, mutable)
     self.assertEqual(sizer.duplicate, 1)
     self.assertEqual(sizer.total, asizeof.asizeof(obj, mutable))
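Asizer is the stateful counterpart used in the test above; a minimal sketch of how its accumulators behave (assumes only that pympler is installed):

from pympler import asizeof

sizer = asizeof.Asizer()
obj = 'unladen swallow'
sizer.asizeof(obj)             # totals accumulate across calls on the same Asizer
sizer.asizeof([obj], [obj])    # objects seen more than once count as duplicates
print(sizer.total, sizer.duplicate)
sizer.reset()                  # clear accumulated state before reusing the sizer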
Example #16
    def test_private_slots(self):
        class PrivateSlot(object):
            __slots__ = ('__data',)
            def __init__(self, data):
                self.__data = data

        data = [42] * 100
        container = PrivateSlot(data)
        size1 = asizeof.asizeof(container)
        size2 = asizeof.asizeof(data)
        self.assertTrue(size1 > size2, (size1, size2))
Example #17
 def dumpMonitorValues(self):
     ''' dump all monitor values as json string '''
     result = {}
     for (service, mname) in self.__monitorValues:
         key = '%s.%s' % (service, mname)
         result[key] = self.__monitorValues[(service, mname)]
     result['values'] = asizeof(self.__monitorValues)
     result['tasks'] = asizeof(self.__monitorTasks)
     result['messages'] = asizeof(self.__monitorMessages)
     result['tags'] = asizeof(self.__monitorTags)
     result['messagekeys'] = '%s' % self.__monitorMessages
     return result
Example #18
    def test_asizeof(self):
        '''Test asizeof.asizeof()
        '''
        self.assertEqual(asizeof.asizeof(), 0)

        objs = [Foo(42), ThinFoo("spam"), OldFoo(67)]
        total = asizeof.asizeof(*objs)
        sizes = list(asizeof.asizesof(*objs))
        sum = 0
        for sz in sizes:
            sum += sz
        self.assertEqual(total, sum, (total, sum))
Example #19
    def test_closure(self):
        '''Test sizing closures.
        '''
        def outer(x):
            def inner():
                return x
            return inner

        data = [1] * 1000
        closure = outer(data)
        size_closure = asizeof.asizeof(closure, code=True)
        size_data = asizeof.asizeof(data)
        self.assertTrue(size_closure >= size_data, (size_closure, size_data))
    def split_train_tune_test(self, src_file, src_piv_file, piv_tar_file, tar_file,
        train_split, test_split):
        """
        Splits the full data files into train, tune, and test sets.
        Receives four files as parameters and two decimals indicating the fraction
        of data to assign to the splits. Line correspondence is preserved: if line 1
        of the source-language files lands in the test set, line 1 of the
        target-language files does as well.
        """
        utilities.make_dir(self.traindir)
        utilities.make_dir(self.tunedir)
        utilities.make_dir(self.testdir)

        self._validate_file(src_file), self._validate_file(src_piv_file)
        self._validate_file(piv_tar_file), self._validate_file(tar_file)
        assert train_split + test_split <= 1, "Invalid size for train, tune, and test splits"

        train_files, tune_files, test_files = self._ttt_filenames(src_file, src_piv_file, piv_tar_file, tar_file)
        if utilities.ttt_files_exist(train_files, tune_files, test_files):
            return
        else:
            utilities.ttt_wipe_files(train_files, tune_files, test_files)

        self._print("""Splitting data into train, tune, and test sets...""")
        train, tune, test = [[], [], [], []], [[], [], [], []], [[], [], [], []]
        for src_line, src_piv_line, piv_tar_line, tar_line in \
            zip_longest(open(src_file), open(src_piv_file), open(piv_tar_file), open(tar_file)):

            x = numpy.random.sample()
            if x < train_split:
                self._add_line_to(train[0], src_line)
                self._add_line_to(train[1], src_piv_line)
                self._add_line_to(train[2], piv_tar_line)
                self._add_line_to(train[3], tar_line)
            elif x >= train_split and x < train_split + test_split:
                self._add_line_to(tune[0], src_line)
                self._add_line_to(tune[1], src_piv_line)
                self._add_line_to(tune[2], piv_tar_line)
                self._add_line_to(tune[3], tar_line)
            else:
                self._add_line_to(test[0], src_line)
                self._add_line_to(test[1], src_piv_line)
                self._add_line_to(test[2], piv_tar_line)
                self._add_line_to(test[3], tar_line)

            if asizeof.asizeof(train) + asizeof.asizeof(tune) + \
                asizeof.asizeof(test) > self.mem_limit:
                self._dump_ttt_bufs_to(train, tune, test, train_files, tune_files, test_files)

        self._dump_ttt_bufs_to(train, tune, test, train_files, tune_files, test_files)
        self._print("Done\n")
def print_memory_profiles(sm, tr, tr_sm, LOGFILE = None):
    '''
    Prints a report on memory profiles

    IN:
        sm - SeriesModel - SeriesModel object for this run
        tr - SummaryTracker - SummaryTracker object for the whole run
        tr_sm - ClassTracker - ClassTracker object for the SeriesModel class
        LOGFILE - file obj - Open logfile for print output
    OUT: None
    '''
    ptf( '\nSERIESMODEL profiling', LOGFILE)
    ptf( 'Look at size of seriesmodel object', LOGFILE)
    ptf( asizeof.asizeof(sm), LOGFILE)
    ptf( asizeof.asized(sm, detail=1).format(), LOGFILE)

    ptf( 'Look at how the SeriesModel class is doing', LOGFILE)
    tr_sm.create_snapshot()
    tr_sm.stats.print_summary()  # prints the per-class summary to stdout

    ptf( 'PROFILING', LOGFILE)
    ptf( 'Look at memory leaks up to this point', LOGFILE)
    tr.print_diff()  # prints the summary diff to stdout
Example #22
 def test_weakref(self):
     '''Test sizing weak references.
     '''
     alive = Foo('alive')
     aref = weakref.ref(alive)
     dead = Foo('dead')
     dref = weakref.ref(dead)
     del dead
     aref_size = asizeof.asizeof(aref)
     self.assertTrue(aref_size > asizeof.asizeof(alive), aref_size)
     refs = asizeof.named_refs(aref)
     # TODO: Should a weakref return ('ref', obj)?
     dref_size = asizeof.asizeof(dref)
     self.assertTrue(dref_size > 0, dref_size)
     self.assertNotEqual(dref_size, aref_size)
     refs = asizeof.named_refs(dref)
def test_pickle_features_with_custom_primitive(es):
    NewMean = make_agg_primitive(
        np.nanmean,
        name="NewMean",
        input_types=[Numeric],
        return_type=Numeric,
        description="Calculate means ignoring nan values")
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   agg_primitives=[Last, Mean, NewMean],
                                   trans_primitives=[],
                                   max_features=20)

    features_no_pickle = dfs_obj.build_features()
    assert any([isinstance(feat, NewMean) for feat in features_no_pickle])
    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    # pickle entityset
    save_obj_pickle(es, es_filepath)

    ft.save_features(features_no_pickle, filepath)
    features_pickle = ft.load_features(filepath)
    for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
        assert feat_1.hash() == feat_2.hash()
        assert feat_1.entityset == feat_2.entityset

    # file is smaller than entityset in memory
    assert os.path.getsize(filepath) < asizeof(es)

    # file is smaller than entityset pickled
    assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    os.remove(filepath)
    os.remove(es_filepath)
def test_pickle_features(es):
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   agg_primitives=[Last, Mean],
                                   trans_primitives=[],
                                   max_features=20)

    features_no_pickle = dfs_obj.build_features()

    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    # pickle entityset
    save_obj_pickle(es, es_filepath)

    ft.save_features(features_no_pickle, filepath)
    features_pickle = ft.load_features(filepath)
    for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
        assert feat_1.hash() == feat_2.hash()
        assert feat_1.entityset == feat_2.entityset

    # file is smaller than entityset in memory
    assert os.path.getsize(filepath) < asizeof(es)

    # file is smaller than entityset pickled
    assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    os.remove(filepath)
    os.remove(es_filepath)
Example #25
def run(p="../../../data/atis/atis.pkl", wordembdim=70, lablembdim=70, innerdim=300, lr=0.05, numbats=100, epochs=20, validinter=1, wreg=0.0003, depth=1):
    tracker = SummaryTracker()
    train, test, dics = pickle.load(open(p))
    word2idx = dics["words2idx"]
    table2idx = dics["tables2idx"]
    label2idx = dics["labels2idx"]
    label2idxrev = {v: k for k, v in label2idx.items()}
    train = zip(*train)
    test = zip(*test)
    print "%d training examples, %d test examples" % (len(train), len(test))
    #tup2text(train[0], word2idx, table2idx, label2idx)
    maxlen = 0
    for tup in train + test:
        maxlen = max(len(tup[0]), maxlen)

    numwords = max(word2idx.values()) + 2
    numlabels = max(label2idx.values()) + 2

    # get training data
    traindata = getdatamatrix(train, maxlen, 0).astype("int32")
    traingold = getdatamatrix(train, maxlen, 2).astype("int32")
    trainmask = (traindata > 0).astype("float32")

    # test data
    testdata = getdatamatrix(test, maxlen, 0).astype("int32")
    testgold = getdatamatrix(test, maxlen, 2).astype("int32")
    testmask = (testdata > 0).astype("float32")

    res = atiseval(testgold-1, testgold-1, label2idxrev); print res#; exit()

    print asizeof(traindata)

    # define model
    innerdim = [innerdim] * depth
    m = SimpleSeqTransDec(indim=numwords, inpembdim=wordembdim, outembdim=lablembdim, innerdim=innerdim, outdim=numlabels)

    # training
    m = m.train([traindata, shiftdata(traingold), trainmask], traingold).adagrad(lr=lr).grad_total_norm(5.0).seq_cross_entropy().l2(wreg)\
        .cross_validate(splits=5, random=True).seq_cross_entropy().seq_accuracy().validinter(validinter).takebest()\
        .train(numbats, epochs)

    # predict after training
    s = SeqTransDecSearch(m)
    testpred = s.decode(testdata)
    testpred = testpred * testmask

    evalres = atiseval(testpred-1, testgold-1, label2idxrev); print evalres
Example #26
    def create_snapshot(self, description='', compute_total=False):
        """
        Collects current per-instance statistics and saves the total amount of
        memory associated with the Python process.

        If `compute_total` is `True`, the total consumption of all objects
        known to *asizeof* is computed. The latter might be very slow if many
        objects are mapped into memory at the time the snapshot is taken.
        Therefore, `compute_total` is set to `False` by default.

        The overhead of the `ClassTracker` structure is also computed.

        Snapshots can be taken asynchronously. The function is protected with a
        lock to prevent race conditions.
        """

        try:
            # TODO: It is not clear what happens when memory is allocated or
            # released while this function is executed but it will likely lead
            # to inconsistencies. Either pause all other threads or don't size
            # individual objects in asynchronous mode.
            self.snapshot_lock.acquire()

            timestamp = _get_time()

            sizer = asizeof.Asizer()
            objs = [tobj.ref() for tobj in list(self.objects.values())]
            sizer.exclude_refs(*objs)

            # The objects need to be sized in a deterministic order. Sort the
            # objects by their creation date, which should at least work for
            # non-parallel execution. The "proper" fix would be to handle
            # shared data separately.
            tracked_objects = list(self.objects.values())
            tracked_objects.sort(key=lambda x: x.birth)
            for tobj in tracked_objects:
                tobj.track_size(timestamp, sizer)

            snapshot = Snapshot()

            snapshot.timestamp = timestamp
            snapshot.tracked_total = sizer.total
            if compute_total:
                snapshot.asizeof_total = asizeof.asizeof(all=True, code=True)
            snapshot.system_total = pympler.process.ProcessMemoryInfo()
            snapshot.desc = str(description)

            # Compute overhead of all structures, use sizer to exclude tracked
            # objects(!)
            snapshot.overhead = 0
            if snapshot.tracked_total:
                snapshot.overhead = sizer.asizeof(self)
                if snapshot.asizeof_total:
                    snapshot.asizeof_total -= snapshot.overhead

            self.snapshots.append(snapshot)

        finally:
            self.snapshot_lock.release()
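create_snapshot is normally driven through the public ClassTracker interface; a minimal hedged usage sketch (Document is a hypothetical class):

from pympler.classtracker import ClassTracker

class Document:
    pass

tracker = ClassTracker()
tracker.track_class(Document)          # size every Document created from now on
docs = [Document() for _ in range(100)]
tracker.create_snapshot('after allocation')
tracker.stats.print_summary()          # per-class totals for each snapshot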
Example #27
 def test_ignore_frame(self):
     '''Test whether reference cycles are created
     '''
     gc.collect()
     gc.disable()
     s = asizeof.asizeof(all=True, code=True)
     self.assertEqual(gc.collect(), 0)
     gc.enable()
Example #28
 def test_exception(self):
     '''Test sizing exceptions.
     '''
     try:
         raise Exception("Test exception-sizing.")
     except Exception:
         etype, exc, etb = sys.exc_info()
         try:
             tb_size = asizeof.asizeof(etb)
             self.assertTrue(tb_size > 0, tb_size)
             refs = asizeof.named_refs(etb)
             ref_names = set([name for name, _ in refs])
             self.assertTrue(set(['tb_frame', 'tb_next']) <= ref_names, ref_names)
             ex_size = asizeof.asizeof(etype, exc)
             self.assertTrue(ex_size > 0, ex_size)
         finally:
             del etb
Example #29
def get_size_in_mb(obj):
    """Get the size of a given object in MB.

    :param obj: Object to get memory usage of
    :return: Memory used by the given object
    :rtype: float
    """
    return asizeof(obj) / (1024.0 * 1024.0)
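A quick usage check of the helper above (the number is approximate and interpreter-dependent):

big_list = [0] * 1_000_000
print(round(get_size_in_mb(big_list), 1))   # roughly 8 MB on 64-bit CPython: one pointer per list slot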
Example #30
def get_license_index(rules=None):
    """
    Return a LicenseIndex built from a list of rules.
    """
    if not rules:
        rules = get_all_rules()

    if DEBUG_PERF:
        from pympler import asizeof  # @UnresolvedImport
        print('Memory size of rules:', asizeof.asizeof(rules))

    idx = LicenseIndex(rules)

    if DEBUG_PERF:
        print('Memory size of index:', asizeof.asizeof(idx))

    return idx
Example #31
    def write(self,data):
        print ("=================== Writing Data down to wire from Client ================\n")


        Cencap = PEEPpacket()
        calcChecksum = PEEPClient(self.loop)
        Cencap.Type = 5
        Cencap.SequenceNumber = self.update_sequence(data)
        self.prev_sequence_number = Cencap.SequenceNumber
        print ("SEQ No:" + str(Cencap.SequenceNumber))
        Cencap.Acknowledgement = self.global_number_ack
        print ("ACK No:" + str(Cencap.Acknowledgement))
        Cencap.Data = data
        print ("Data is", data)
        print ("Size of data", asizeof.asizeof(data))
        Cencap.Checksum = calcChecksum.calculateChecksum(Cencap)

        bytes = Cencap.__serialize__()
        self.transport.write(bytes)
Example #32
    async def send_websocket_message(self, message):
        """
        Handles all outgoing websocket messages so that no single message exceeds the size the websocket connection
        can handle. If required, the message is therefore split and sent in chunks. The maximum size of one message
        is set in the config via the WEBSOCKET_MAX parameter.

        :param message: The message to be send as JSON object via the websocket connection.
        :type message: dict or list
        """
        message_size = asizeof.asizeof(message)
        if message_size < WEBSOCKET_MAX:
            await self.send_json(message)
        else:
            message_str = json.dumps(message)
            self.parts_to_send = [
                message_str[i:i + WEBSOCKET_MAX]
                for i in range(0, len(message_str), WEBSOCKET_MAX)
            ]
            await self.send_part(0)
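The chunking step can be exercised in isolation; a minimal sketch in which WEBSOCKET_MAX is a stand-in value rather than the real config parameter:

import json

WEBSOCKET_MAX = 64                         # stand-in limit for illustration
message = {"rows": list(range(100))}

message_str = json.dumps(message)
parts = [message_str[i:i + WEBSOCKET_MAX]
         for i in range(0, len(message_str), WEBSOCKET_MAX)]
assert "".join(parts) == message_str       # the chunks reassemble to the original JSON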
Example #33
    def test_long(self):
        '''Test int and long examples'''
        try:
            _L5d = long(1) << 64
            _L17d = long(1) << 256
            t = '<int>/<long>'
        except NameError:
            _L5d = 1 << 64
            _L17d = 1 << 256
            t = '<int>'

        self._printf('%sasizeof(%s, align=%s, limit=%s) ... %s', os.linesep, t,
                     0, 0, '-int')
        for o in (1024, 1000000000, 1.0, 1.0e100, 1024, 1000000000, self.MAX,
                  1 << 32, _L5d, -_L5d, _L17d, -_L17d):
            self._printf(" asizeof(%s) is %s (%s + %s * %s)", _repr(o),
                         asizeof.asizeof(o, align=0, limit=0),
                         asizeof.basicsize(o), asizeof.leng(o),
                         asizeof.itemsize(o))
Example #34
    def test_asized(self):
        '''Test asizeof.asized()
        '''
        self.assertEqual(list(asizeof.asized(detail=2)), [])
        self.assertRaises(KeyError, asizeof.asized, **{'all': True})
        sized = asizeof.asized(Foo(42), detail=2)
        self.assertEqual(sized.name, 'Foo')
        refs = [ref for ref in sized.refs if ref.name == '__dict__']
        self.assertEqual(len(refs), 1)
        self.assertEqual(refs[0], sized.get('__dict__'))

        refs = [ref for ref in refs[0].refs if ref.name == '[V] data: 42']
        self.assertEqual(len(refs), 1, refs)
        i = 42
        self.assertEqual(refs[0].size, asizeof.asizeof(i), refs[0].size)
        # Size multiple objects
        sizer = asizeof.Asizer()
        sized_objs = sizer.asized(Foo(3), Foo(4), detail=2)
        self.assertEqual(len(sized_objs), 2)
Example #35
    def memory_report(self):
        """Prints a detailed memory report of the pipeline object to screen.

        To get better memory estimates make sure the pympler Python package is
        installed. Without it, sys.getsizeof is used, which can drastically
        underestimate the memory size of Python objects.
        """
        print("=== Pipeline memory report ===")
        size = asizeof(self)
        if size > 500000:  # pragma: no cover
            print("Total pipeline size in memory: {:.2f}Mb".format(size /
                                                                   1000000))
        elif size > 1000:  # pragma: no cover
            print("Total pipeline size in memory: {:.2f}Kb".format(size /
                                                                   1000))
        else:
            print("Total pipeline size in memory: {:.2f}b".format(size))
        print("Per-stage memory structure:")
        print(self._mem_str(total=size))
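The docstring's point about sys.getsizeof is easy to verify directly; a small hedged comparison:

import sys
from pympler.asizeof import asizeof

nested = [[i] * 100 for i in range(100)]
print(sys.getsizeof(nested))   # flat size of the outer list object only
print(asizeof(nested))         # deep size, including the inner lists and ints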
Example #36
def get_collection_sizes(obj, collections: Optional[Tuple]=None,
                         get_only_non_empty=False):
    """
    Iterates over the attributes of the given object that are instances of
    `collections` and reports each one's byte size and number of items.
    """
    from pympler import asizeof
    collections = collections or (list, dict, set, deque, abc.Sized)
    if not isinstance(collections, tuple):
        collections = tuple(collections)

    result = []
    for attr_name in dir(obj):
        attr = getattr(obj, attr_name)
        if isinstance(attr, collections) and (
                not get_only_non_empty or len(attr) > 0):
            result.append(
                (attr_name, len(attr), asizeof.asizeof(attr, detail=1)))
    return result
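A quick usage sketch for the helper above (Holder is hypothetical; note that the default tuple includes abc.Sized, so string attributes and __dict__ are reported as well):

from collections import deque

class Holder:
    def __init__(self):
        self.items = [1, 2, 3]
        self.index = {"a": 1}
        self.queue = deque("xy")

for name, length, size in get_collection_sizes(Holder(), get_only_non_empty=True):
    print(name, length, size)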
Example #37
    def put_data(self, key, value, ttl_duration=None):
        size_of_key = sys.getsizeof(key)
        size_of_value = asizeof.asizeof(value)

        if (os.stat(self.__database.name).st_size > 1000000000 -
            (size_of_key + size_of_value)):
            raise Exception(
                "Database size has reached the maximum size(1GB). Cannot add any more data into this database."
            )
        if (self.get_data(key) != None):
            raise Exception("Given key is already present in the database")
        if (len(key) > 32):
            raise Exception("Length of key should not exceed 32 Characters")
        if (type(value) != dict or size_of_value > 16000):
            raise Exception("Size of value(data) should not exceed 16KB")
        if (ttl_duration != None and ttl_duration < 0):
            raise Exception(
                "Time-to-live property must be greater than or equal to 0")

        d = datetime.datetime.now()
        created_time = None
        if (ttl_duration != None):
            created_time = {
                'day': d.day,
                'month': d.month,
                'year': d.year,
                'hour': d.hour,
                'minute': d.minute,
                'second': d.second,
                'microsecond': d.microsecond,
                'tzinfo': d.tzinfo
            }

        data = {
            "value": value,
            "ttl": {
                'ttl_duration': ttl_duration,
                'created_time': created_time
            }
        }
        self.__database.seek(0, io.SEEK_END)
        self.__database.write(f'"{key}"' + ':' + json.dumps(data) + '\n')
def main_process():
    t = '''
    q: When Python exits, is all of its memory released?
    ans: The answer is No. Modules with circular references to other objects, or with objects
    referenced from the global namespace, are not completely freed when Python exits.
    In addition, the portion of memory reserved by C libraries is not released either.
    '''
    print(colored('mycount=', 'red'), t)


    obj = [1, 2, (3, 4), 'text']
    print(asizeof.asizeof(obj))
    print(asizeof.asized(obj, detail=1).format())

    tr = tracker.SummaryTracker()
    a = [[random.random() for i in range(2000)] for i in range(2000)]
    tr.print_diff()

    gc.collect()
    from sys import getsizeof
    print('-'*20, getsizeof(a))
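SummaryTracker, used above, reports the objects created between successive calls; a minimal sketch:

from pympler import tracker

tr = tracker.SummaryTracker()
tr.print_diff()                                   # first call establishes the baseline
data = [list(range(100)) for _ in range(1000)]
tr.print_diff()                                   # shows the newly created lists and ints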
def python_numpy_unique(db, words):
    print('Standard Python functionality (numpy.unique())')
    # unique_words = []
    collection = db.python_numpy_unique
    collection.delete_many({})

    start = time.time()
    for word in words:
        unique_words = [elem['word'] for elem in collection.find()]
        len_unique_words = len(unique_words)
        unique_words = np.unique(np.append(unique_words, word))
        if len(unique_words) > len_unique_words:
            collection.insert_one({'word': word})
        # unique_words = np.unique(unique_words.append(word))
    end = time.time()
    
    print('[python np.unique()] Time python:: {}'.format(end - start))
    print('[python np.unique()] Number of unique words: {}'.format(collection.count_documents({})))
    print('[python np.unique()] Size of unique words: {} Mb, {} Kb\n'.format(asizeof.asizeof(unique_words)/1024/1024, asizeof.asizeof(unique_words)/1024))
    print('________________________________________________________________\n')
Example #40
    def _send_report(self, report_id):

        with open(self.report_file) as fh:
            report_json = json.loads(fh.read())

        logger.debug("Unique payload sent with size: {}".format(
            asizeof(json.dumps(report_json))
        ))

        try:
            requests.post(
                self.broadcast_address,
                json={"run_id": report_id, "report_json": report_json}
            )
        except requests.exceptions.ConnectionError:
            logger.error(colored_print(
                "ERROR: Could not establish connection with server. The server"
                " may be down or there is a problem with your internet "
                "connection.", "red_bold"))
            sys.exit(1)
Example #41
 def check_pieces_size(self):
     Logger().write(
         LogVerbosity.Important,
         "    _pieces size: " + write_size(asizeof.asizeof(self._pieces)))
     not_done_pieces = [
         piece for piece in self._pieces.values() if not piece.done
     ]
     done_pieces = [piece for piece in self._pieces.values() if piece.done]
     stream_index = [
         piece for piece in self._pieces.values()
         if piece.index < self.torrent.stream_position
     ]
     stream_index_50_mb = [
         piece for piece in self._pieces.values()
         if piece.index > self.torrent.stream_position +
         (50000000 // self.piece_length)
     ]
     Logger().write(
         LogVerbosity.Important,
         "    pieces not done: " + str(len(not_done_pieces)) + " - " +
         write_size(asizeof.asizeof(not_done_pieces)))
     Logger().write(
         LogVerbosity.Important,
         "    pieces done: " + str(len(done_pieces)) + " - " +
         write_size(asizeof.asizeof(done_pieces)))
     Logger().write(
         LogVerbosity.Important,
         "    pieces < stream index: " + str(len(stream_index)) + " - " +
         write_size(asizeof.asizeof(stream_index)))
     Logger().write(
         LogVerbosity.Important, "    pieces > stream index + 50mb: " +
         str(len(stream_index_50_mb)) + " - " +
         write_size(asizeof.asizeof(stream_index_50_mb)))
     Logger().write(
         LogVerbosity.Important,
         "    pieces with initialized blocks: " + str(
             len([
                 piece for piece in self._pieces.values()
                 if len(piece._blocks) > 0
             ])))
     if self.torrent.stream_manager.buffer is not None:
         data_ready = [
             piece
             for piece in self.torrent.stream_manager.buffer.data_ready
         ]
         Logger().write(
             LogVerbosity.Important,
             "    pieces in data_ready: " + str(len(data_ready)) + " - " +
             write_size(asizeof.asizeof(data_ready)))
Example #42
def fully_dynamic(eps, args):
    """
    Run the fully dynamic clustering algorithm with the given parameter eps.
    """
    print('Running the algorithm with eps = {} in Process #{}'.format(
        eps, os.getpid()))

    limit = args.limit
    window = args.window

    space = 0  # keep track of the space used by the data structures
    start = time()
    beta = 0

    fdc = FullyDynClus(eps, 20, window)

    for point in data_stream(limit):
        # There are some duplicate points in the dataset, for example,
        # the 71st and 86th points
        if point in fdc.points:
            continue

        fdc.insert(point)

        # Delete the least recent point in the sliding window
        if len(fdc.points) >= window + 1:
            fdc.delete(fdc.points[-window - 1])

        beta = max(beta, fdc.get_result())
        space = max(space, asizeof(fdc))

    print('Finish running the algorithm with eps = {}'.format(eps))

    return {
        str(eps): {
            'run_time': round(time() - start, 3),
            'space': space,
            'op_count': fdc.op_count,
            'beta': beta
        }
    }
def get_negative_indices(chosen_prods_ids_ls, dataset_type):

    if dataset_type == MyUtils_flags.FLAG_TRAIN:
        questions_final_filepath = F.QUESTIONS_FINAL_TRAIN
    if dataset_type == MyUtils_flags.FLAG_VALID:
        questions_final_filepath = F.QUESTIONS_FINAL_VALID
    if dataset_type == MyUtils_flags.FLAG_TEST:
        questions_final_filepath = F.QUESTIONS_FINAL_TEST

    negative_indices_dict = {}

    num_of_questions = 0
    for input_segment in pd.read_csv(questions_final_filepath,
                                     chunksize=2**10,
                                     sep="_"):
        num_of_questions = num_of_questions + len(input_segment)
    logging.info("Number of questions in the current dataset: %s",
                 num_of_questions)  # 111171

    for prod_id in chosen_prods_ids_ls:
        #logging.info("Debug: %s", prod_id)
        random_indices = np.random.choice(a=range(num_of_questions),
                                          size=NUM_NEGATIVE_CANDIDATES,
                                          replace=False,
                                          p=None)
        for rand_index in random_indices:
            if rand_index not in negative_indices_dict:
                negative_indices_dict[rand_index] = []
                negative_indices_dict[rand_index].append(prod_id)
            else:
                negative_indices_dict[rand_index].append(prod_id)

    #with open(F.PRODS_NEGATIVEINDICES, "wb") as neg_indices_dict_file: ##save the structure to check it
    #    pickle.dump(obj=negative_indices_dict, file=neg_indices_dict_file, )
    #Python dictionaries are not ordered --> need to use... a list of
    #logging.info(negative_indices_dict)
    logging.info("Size in memory of the dictionary of random indices: %s KB",
                 mem.asizeof(negative_indices_dict) // 2**10)
    negindices_lts = sorted(negative_indices_dict.items())

    return negindices_lts
Example #44
def test_100mb():
    gk_stats = []
    files = []

    #Collect all file names in collection
    for filename in glob.glob('./input_files/98mb_genome/*.fastq'):
        files.append(filename)

    #building Gk Array time and memory test
    gk_start = timer()
    gk_array = GkArray(files, 3)
    gk_end = timer()

    # Comment this out if the script is taking too long and
    # set gk_memory = 1 instead.
    # This will skip the memory measurement of the data structure
    # but allow the test script to run faster.
    gk_memory = asizeof.asizeof(gk_array)

    #query time
    q_start = timer()
    gk_array.get_reads("TTG")
    q_end = timer()

    gk_time = gk_end - gk_start
    q_time = q_end - q_start
    gk_stats.extend(((str(gk_time) + " sec"), (str(gk_memory / 1000) + " kb"),
                     (str(q_time) + " sec")))

    # Print performance stats

    titles = ['', 'Gk Array']
    names = ['Build Time', 'Memory', 'Query Time']
    data = [titles] + list(zip(names, gk_stats))

    print("100 MB Collection Results:")
    for i, d in enumerate(data):
        line = '|'.join(str(x).ljust(30) for x in d)
        print(line)
        if i == 0:
            print('-' * len(line))
def test_log(log_name, log_path):
    print("xes_certific_mem_iterab", log_name)
    dfg = micropm4py.log.xes_import_traces_file_standard.imp_dfg_file_sten(log_path)
    net, im, fm = micropm4py.conversion.dfg.dfg_mining.apply(dfg)
    it = micropm4py.log.xes_import_traces_file_standard.get_it_from_file(log_path)
    nxt = micropm4py.log.xes_import_traces_file_standard.get_nxt_trace(it)
    mp_memory = 16384
    mp4_memory = asizeof(micropm4py)
    net_memory = asizeof(net) + 2*asizeof(im) + 2*asizeof(fm)
    max_it_memory = 0
    while nxt:
        max_it_memory = max(max_it_memory, asizeof(nxt) + asizeof(it))
        nxt = micropm4py.log.xes_import_traces_file_standard.get_nxt_trace(it)
    sum_max_memory = mp_memory + mp4_memory + net_memory + max_it_memory
    Shared.results.append([log_name, max_it_memory, net_memory, mp4_memory, mp_memory, sum_max_memory])
Example #46
        def test_it_matches_the_expected_size(self):
            dummy_frame_name = None

            node = CallGraphNode(frame_name=dummy_frame_name,
                                 class_name=None,
                                 file_path=None,
                                 line_no=None)
            node.increase_runnable_count()

            full_recursive_size_of_node = asizeof.asizeof(node)

            frame_name_size = sys.getsizeof(dummy_frame_name)
            empty_children_tuple_size = sys.getsizeof(())

            assert (MemoryCounter.empty_node_size_bytes == \
                (full_recursive_size_of_node \
                    # The empty size should not include the frame name, so we subtract it

                    - frame_name_size
                    # The empty tuple is always reused by Python, so we also subtract it
                    - empty_children_tuple_size))
def python_set(db, words):
    print('Standard Python functionality (set())')
    collection = db.python_set
    collection.delete_many({})
    # unique_words = set()

    start = time.time()
    for word in words:
        unique_words = set([elem['word'] for elem in collection.find()])
        len_unique_words = len(unique_words)
        unique_words.add(word)
        if len(unique_words) > len_unique_words:
            collection.insert_one({'word': word})
        # unique_words.add(word)
    end = time.time()

    print('[python set()] Time python:: {}'.format(end - start))
    print('[python set()] Number of unique words: {}'.format(collection.count_documents({})))
    print('[python set()] Size of unique words: {} Mb, {} Kb\n'.format(asizeof.asizeof(unique_words)/1024/1024, asizeof.asizeof(unique_words)/1024))
    print('________________________________________________________________\n')
    return len(unique_words)
def run_inducing(log, input_path, args):
    im = InducingMiner(log, args.db_database, args.db_user, args.db_password, args.db_hostname, args.db_port, args.db_authentication, args.ssl, args.project_name, args.repository_url, input_path, repo_from_db=args.input is None)
    im.collect()

    log.info("memory for git: %s mb", asizeof.asizeof(im._cg) / 1024 / 1024)
    # everything with label='validated_bugfix' uses commit.fixed_issue_ids
    # szz uses commit.szz_issue_ids
    im.write_bug_inducing(label='adjustedszz_bugfix', inducing_strategy='all', java_only=False, affected_versions=False, ignore_refactorings=False, name='SZZ')  # plain szz
    im.write_bug_inducing(label='issueonly_bugfix', inducing_strategy='code_only', java_only=True, affected_versions=False, ignore_refactorings=True, name='JL+R')  # best automatic szz

    im.write_bug_inducing(label='validated_bugfix', inducing_strategy='all', java_only=False, affected_versions=False, ignore_refactorings=False, name='JLMIV')  # plain szz validated labels
    im.write_bug_inducing(label='validated_bugfix', inducing_strategy='code_only', java_only=True, affected_versions=False, ignore_refactorings=False, name='JLMIV+')  # improved szz validated labels
    im.write_bug_inducing(label='validated_bugfix', inducing_strategy='code_only', java_only=True, affected_versions=True, ignore_refactorings=False, name='JLMIV+AV')  # improved szz validated labels, affected versions

    im.write_bug_inducing(label='validated_bugfix', inducing_strategy='code_only', java_only=True, affected_versions=True, ignore_refactorings=True, name='JLMIV+RAV')  # best + AV

    im.write_bug_inducing(label='validated_bugfix', inducing_strategy='code_only', java_only=True, affected_versions=False, ignore_refactorings=True, name='JLMIV+R')  # improved szz validated labels, without refactorings

    im.write_bug_inducing(label='validated_bugfix', inducing_strategy='code_only', java_only=True, affected_versions=False, ignore_refactorings=False, name='JLMIVLV', only_validated_bugfix_lines=True)  # improved szz validated labels, only validated lines

    im.write_bug_inducing(label='issuefasttext_bugfix', inducing_strategy='code_only', java_only=True, affected_versions=False, ignore_refactorings=True, name='JLIP+R')
def pickle_features_test_helper(es_size, features_original, dir_path):
    filepath = os.path.join(dir_path, 'test_feature')

    ft.save_features(features_original, filepath)
    features_deserializedA = ft.load_features(filepath)
    assert os.path.getsize(filepath) < es_size
    os.remove(filepath)

    with open(filepath, "w") as f:
        ft.save_features(features_original, f)
    features_deserializedB = ft.load_features(open(filepath))
    assert os.path.getsize(filepath) < es_size
    os.remove(filepath)

    features = ft.save_features(features_original)
    features_deserializedC = ft.load_features(features)
    assert asizeof(features) < es_size

    features_deserialized_options = [features_deserializedA, features_deserializedB, features_deserializedC]
    for features_deserialized in features_deserialized_options:
        assert_features(features_original, features_deserialized)
Example #50
    def write(self, data):
        print ("================== Writing Data down to wire from Server ================\n")

        Sencap = PEEPpacket()
        calcChecksum = PEEPServerProtocol(self.loop)
        Sencap.Type = 5
        Sencap.SequenceNumber = self.update_sequence(data)
        self.prev_sequence_number = Sencap.SequenceNumber
        print ("SEQ No:" + str(Sencap.SequenceNumber))
        Sencap.Acknowledgement = self.global_number_ack
        print ("ACK No:" + str(Sencap.Acknowledgement))

        Sencap.Data = data

        # For debugging
        print("data is",data)
        print("size of data",asizeof.asizeof(data))

        Sencap.Checksum = calcChecksum.calculateChecksum(Sencap)
        bytes = Sencap.__serialize__()
        self.transport.write(bytes)
Example #51
    def viterbiDecoder(self, code):
        """
        Function to decode data with Viterbi Algorithm

        :param code: numpy.array (bits to decode)
        :returns: numpy.array (decoded data)
        """
        depth = 0
        viterbiTree = Node(state='A', depth=depth, HammingDistance=0)

        for word in npy.split(npy.array(code), npy.array(code).size / self.n):
            ConvolutionalCode.viterbi_iter(viterbiTree, word, depth)
            ConvolutionalCode.pruning(viterbiTree, depth)
            depth += 1

        states = ConvolutionalCode.bestPath(viterbiTree)

        return [
            npy.hstack(ConvolutionalCode.getDecoded_code(states)),
            asizeof.asizeof(viterbiTree)
        ]
Example #52
    def test_asizesof(self):
        '''Test asizeof.asizesof()
        '''
        self.assertEqual(list(asizeof.asizesof()), [])
        self.assertRaises(KeyError, asizeof.asizesof, **{'all': True})

        objs = [Foo(42), ThinFoo("spam"), OldFoo(67)]
        sizes = list(asizeof.asizesof(*objs))
        objs.reverse()
        rsizes = list(asizeof.asizesof(*objs))
        self.assertEqual(len(sizes), 3)
        rsizes.reverse()
        self.assertEqual(sizes, rsizes, (sizes, rsizes))
        objs.reverse()
        isizes = [asizeof.asizeof(obj) for obj in objs]
        self.assertEqual(sizes, isizes)
        sizer = asizeof.Asizer()
        asizer_sizes = sizer.asizesof(*objs)
        self.assertEqual(list(asizer_sizes), sizes)
        code_sizes = sizer.asizesof(*objs, **dict(code=True))
        self.failIfEqual(list(code_sizes), sizes)
Example #53
    def test_generator(self):
        '''Test generator examples'''
        self._printf('%sasizeof(%s, code=%s) ... %s', os.linesep,
                     '<generator>', True, '-gen[erator]')

        def gen(x):
            i = 0
            while i < x:
                yield i
                i += 1

        a = gen(5)
        b = gen(50)
        asizeof.asizeof(a, code=True, stats=1)
        asizeof.asizeof(b, code=True, stats=1)
        asizeof.asizeof(a, code=True, stats=1)
Example #54
    def safe_load(self):
        """ Load data while keeping an eye on memory usage."""

        if self.file_counter >= len(self.file_paths):
            print("No more files to load!")
            return None

        # For in-place appending.
        # S.O.: https://stackoverflow.com/questions/20906474/
        list_ = []  # real descriptive :)
        for i in range(self.file_counter, len(self.file_paths)):
            # lines=True means "read as json-object-per-line."
            list_.append(pd.read_json(self.file_paths[i], lines=True))

            mem_usage = float(asizeof(list_)) / 1e9
            logging.info("Data list has size %.3f GiB", mem_usage)
            logging.info("Most recent file loaded: %s", self.file_paths[i])
            print("\rLoaded file", self.file_paths[i], end="")
            sys.stdout.flush()
            if mem_usage > self.max_mem:
                print("\nPast max capacity:", mem_usage,
                      "Leaving data collection early.")
                logging.warning(
                    'Terminated data loading after '
                    'reading %d files.', i + 1)
                logging.info('Files read into df: %r', self.file_paths[:i + 1])
                break
        print()

        # If the user decides they want to continue loading later
        # (when memory frees up), we want the file_counter set so that it
        # starts on the next file.
        self.file_counter = i + 1
        self._next_file_path = self.file_paths[self.file_counter]

        df = pd.concat(list_).reset_index()
        logging.info("Number of lines in raw data file: %r", len(df.index))
        logging.info("Column names from raw data file: %r", df.columns)
        logging.info("DataHelper.safe_load: df.head() = %r", df.head())
        return df
Example #55
    def test_merge(self):
        """Test merging of reference trees.
        """
        self.tracker.track_class(FooNew, name='Foo', resolution_level=2)

        f1 = FooNew()
        f1.a = list(range(1000))
        f2 = FooNew()
        f2.a = list(range(100))
        f2.b = 'This is some stupid spam.'

        self.tracker.create_snapshot('Merge test')

        sizer = Asizer()
        sz1 = sizer.asized(f1)
        sz2 = sizer.asized(f2)

        stats = self.tracker.stats
        for fp in stats.snapshots:
            if fp.desc == 'Merge test':
                stats.annotate_snapshot(fp)
                self.assert_(hasattr(fp, 'classes'))
                classes = fp.classes
                stats.annotate_snapshot(fp)
                self.assertEqual(fp.classes, classes)
                self.assert_('Foo' in fp.classes, fp.classes)
                self.assert_('merged' in fp.classes['Foo'])
                fm = fp.classes['Foo']['merged']
                self.assertEqual(fm.size, sz1.size + sz2.size,
                                 (fm.size, str(sz1), str(sz2)))
                refs = {}
                for ref in fm.refs:
                    refs[ref.name] = ref
                self.assert_('__dict__' in refs.keys(), refs.keys())
                refs2 = {}
                for ref in refs['__dict__'].refs:
                    refs2[ref.name] = ref
                self.assert_('[V] a' in refs2.keys(), refs2.keys())
                self.assert_('[V] b' in refs2.keys(), refs2.keys())
                self.assertEqual(refs2['[V] a'].size, asizeof(f1.a, f2.a))
Example #56
    def information(self, print_level=1):
        """Print overview information about the options settings, problem
        statistics, and the solution of the computation.

        Parameters
        ----------
        print_level : int (default=1)
            Level of details.
        """
        self._check_is_fitted()

        if not isinstance(print_level, numbers.Integral) or print_level < 0:
            raise ValueError("print_level must be an integer >= 0; got {}."
                             .format(print_level))

        binning_type = self.__class__.__name__.lower()

        # Optimizer
        if self._optimizer is not None:
            solver = self._optimizer.solver_
            time_solver = self._time_solver
        else:
            solver = None
            time_solver = 0

        # Sketch memory usage
        memory_usage = asizeof.asizeof(self._bsketch) * 1e-6

        dict_user_options = self.get_params()

        print_binning_information(binning_type, print_level, self.name,
                                  self._status, self.solver, solver,
                                  self._time_total, self._time_prebinning,
                                  time_solver, self._time_postprocessing,
                                  self._n_prebins, self._n_refinements,
                                  self._bsketch.n, self._n_add,
                                  self._time_streaming_add, self._n_solve,
                                  self._time_streaming_solve, memory_usage,
                                  dict_user_options)
Example #57
def depth_file_reader(depth_file):
    """
    Function that parses a samtools depth file and creates the dictionaries
    used to build the outputs of this script, both the tabular file
    and the json file that may be imported by pATLAS.

    Parameters
    ----------
    depth_file: textIO
        the path to depth file for each sample

    Returns
    -------
    depth_dic_coverage: dict
            dictionary with the coverage per position for each plasmid
    """

    # dict to store the mean coverage for each reference
    depth_dic_coverage = {}

    for line in depth_file:
        tab_split = line.split()  # split by any white space
        reference = "_".join(tab_split[0].strip().split("_")[0:3])  # store
        # only the gi for the reference
        position = tab_split[1]
        num_reads_align = float(tab_split[2].rstrip())

        if reference not in depth_dic_coverage:
            depth_dic_coverage[reference] = {}

        depth_dic_coverage[reference][position] = num_reads_align

    logger.info("Finished parsing depth file.")
    depth_file.close()

    logger.debug("Size of dict_cov: {} kb".format(
        asizeof(depth_dic_coverage)/1024))

    return depth_dic_coverage
Example #58
def test_100mb():
    h_stats = []
    files = []

    #Collect all file names in collection
    for filename in glob.glob('./input_files/98mb_genome/*.fastq'):
        files.append(filename)

    #building hash table time and memory test
    h_start = timer()
    h_table = hash_table(files)
    h_end = timer()

    # Comment this out if the script is taking too long and
    # set h_memory = 1 instead.
    # This will skip the memory measurement of the data structure
    # but allow the test script to run faster.
    h_memory = asizeof.asizeof(h_table)

    #query time
    q_start = timer()
    h_table.find_sequence("TTG")
    q_end = timer()


    h_time = h_end - h_start
    q_time = q_end - q_start
    h_stats.extend(((str(h_time) + " sec"), (str(h_memory / 1000) + " kb"), (str(q_time) + " sec")))

    titles = ['', 'Hash Table']
    names = ['Build Time', 'Memory', 'Query Time']
    data = [titles] + list(zip(names, h_stats))

    print ("100 MB Collection Results:")
    for i, d in enumerate(data):
        line = '|'.join(str(x).ljust(30) for x in d)
        print(line)
        if i == 0:
            print('-' * len(line))
def test_50kb():

    gk_stats = []
    files = []

    #Collect all file names in collection
    for filename in glob.glob('./input_files/50kb_genome/*.fastq'):
        files.append(filename)

    #building Gk Array time and memory test

    gk_start = timer()
    gk_array = GkArray(files, 3)
    gk_end = timer()

    gk_memory = asizeof.asizeof(gk_array)

    #query time
    q_start = timer()
    gk_array.get_reads("TTG")
    q_end = timer()

    gk_time = gk_end - gk_start
    q_time = q_end - q_start
    gk_stats.extend(((str(gk_time) + " sec"), (str(gk_memory / 1000) + " kb"),
                     (str(q_time) + " sec")))

    # Print performance stats

    titles = ['', 'Gk Array']
    names = ['Build Time', 'Memory', 'Query Time']
    data = [titles] + list(zip(names, gk_stats))

    print("50 kb Collection Results:")
    for i, d in enumerate(data):
        line = '|'.join(str(x).ljust(30) for x in d)
        print(line)
        if i == 0:
            print('-' * len(line))
Example #60
def test():
    gName = "./datasets/hp.txt"
    r = 1000
    n = 3133
    c = 0.6
    t = 10
    r_index = readsd(gName, n, r, c, t)
    readsd_rs = [0.0] * n
    cmc_rs = [0] * n
    g = nx.read_edgelist(gName, create_using=nx.DiGraph(), nodetype=int)
    print(type(r_index))
    print("index size", asizeof.asizeof(r_index))
    for i in range(0, n):
        readsd_rs[i] = r_index.queryOne(0, i)
        cmc_rs[i] = truncated_MC(g, (0, i), R=r)
        print(i, readsd_rs[i], cmc_rs[i])
    print("inserting...")
    r_index.insEdge(3, 7)
    r_index.insEdge(5, 55)
    r_index.insEdge(6, 666)
    print("finish...")
    return