Example #1
    def test_handle_lifetime(self):
        refs = []
        for type in self.handle_types:
            klass = getattr(pyuv, type)
            obj = klass(self.loop)
            refs.append(weakref.ref(obj))
            del obj

        # There are no more references to the handles at this point.
        # Garbage collection should be prevented from freeing them, though.
        # Touching each of these without segfault is a best effort check.
        # The validity of the weakrefs is implementation dependent :(.
        gc.collect()
        handles = self.loop.handles
        self.assertEqual(len(handles), len(self.handle_types))
        for handle in handles:
            self.assertTrue(handle.closed)
            del handle
        del handles

        # Give the loop a chance to finish closing the handles.
        self.loop.run()

        # Ensure the weakref is gone now.
        for ref in refs:
            self.assertEqual(ref(), None)
Example #2
def kfold_cv(X_train, y_train,idx,k):

    kf = StratifiedKFold(y_train,n_folds=k)
    xx=[]
    count=0
    ypred=np.zeros(X_train.shape[0])
    for train_index, test_index in kf:
        count+=1
        X_train_cv, X_test_cv = X_train[train_index,:],X_train[test_index,:]
        gc.collect()
        y_train_cv, y_test_cv = y_train[train_index],y_train[test_index]
        y_pred=np.zeros(X_test_cv.shape[0])
        m=1
         
        for j in range(m):
            clf=xgb_classifier(eta=0.01,min_child_weight=10,col=0.7,subsample=0.68,depth=5,num_round=500,seed=j*77,gamma=0)

            y_pred+=clf.train_predict(X_train_cv,(y_train_cv),X_test_cv,y_test=(y_test_cv))
            yqq=y_pred/(1+j)
            print j,llfun(y_test_cv,yqq)
        y_pred/=m;
        #clf=RandomForestClassifier(n_jobs=-1,n_estimators=100,max_depth=100)
        #clf.fit(X_train_cv,(y_train_cv))
        #y_pred=clf.predict_proba(X_test_cv).T[1]
        print y_pred.shape
        xx.append(llfun(y_test_cv,(y_pred)))
        ypred[test_index]=y_pred
        print xx[-1]#,y_pred.shape

    print xx,'average:',np.mean(xx),'std',np.std(xx)
    return ypred
Example #3
    def scheduled_recipe_fetched(self, job):
        temp_files, fmt, arg = self.conversion_jobs.pop(job)
        fname = temp_files[0].name
        if job.failed:
            self.scheduler.recipe_download_failed(arg)
            return self.gui.job_exception(job)
        id = self.gui.library_view.model().add_news(fname, arg)

        # Arg may contain a "keep_issues" variable. If it is non-zero,
        # delete all but newest x issues.
        try:
            keep_issues = int(arg['keep_issues'])
        except:
            keep_issues = 0
        if keep_issues > 0:
            ids_with_tag = list(sorted(self.gui.library_view.model().
                db.tags_older_than(arg['title'],
                    None, must_have_tag=_('News')), reverse=True))
            ids_to_delete = ids_with_tag[keep_issues:]
            if ids_to_delete:
                self.gui.library_view.model().delete_books_by_id(ids_to_delete)

        self.gui.library_view.model().beginResetModel(), self.gui.library_view.model().endResetModel()
        sync = self.gui.news_to_be_synced
        sync.add(id)
        self.gui.news_to_be_synced = sync
        self.scheduler.recipe_downloaded(arg)
        self.gui.status_bar.show_message(arg['title'] + _(' fetched.'), 3000)
        self.gui.email_news(id)
        self.gui.sync_news()
        gc.collect()
Example #4
 def clear(self):
     """
     Clear the output_table
     """
     self.output_table = None
     self.output_table_default = None
     gc.collect()
Example #5
def lars_regression_noise_ipyparallel(pars): 
    import numpy as np
    import os
    import sys
    import gc
    import time  # needed for time.time() below

    Y_name,C_name,noise_sn,idxs_C, idxs_Y=pars
    Y=np.load(Y_name,mmap_mode='r')
    Y=np.array(Y[idxs_Y,:])
    C=np.load(C_name,mmap_mode='r')
    C=np.array(C)
    _,T=np.shape(C)
    #sys.stdout = open(str(os.getpid()) + ".out", "w")
    st=time.time()
    As=[]    
    #print "*****************:" + str(idxs_Y[0]) + ',' + str(idxs_Y[-1])
    sys.stdout.flush()    
    for y,px in zip(Y,idxs_Y):  
        #print str(time.time()-st) + ": Pixel" + str(px)
        sys.stdout.flush()    
        c=C[idxs_C[px],:]
        if np.size(c)>0:             
            sn=noise_sn[px]**2*T            
            _,_,a,_,_=lars_regression_noise(y, c.T, 1, sn)
            if not np.isscalar(a):                
                a=a.T  
                 
            As.append((px,idxs_C[px],a))
    
    del Y
    del C
    gc.collect()
    
    return As#As
Example #6
def dump_references(log, instances, exclude=[]):
    import gc
    import inspect
    gc.collect()
    frame = inspect.currentframe()
    try:
        exclude.append(instances)
        exclude.append([frame])
        for instance in instances:
            referrers = [x for x in gc.get_referrers(instance) if (x not in exclude and len([y for y in exclude if x in y])==0)]
            log.info("referrers for %s: %s", instance, len(referrers))
            for i in range(len(referrers)):
                r = referrers[i]
                log.info("[%s] in %s", i, type(r))
                if inspect.isframe(r):
                    log.info("  frame info: %s", str(inspect.getframeinfo(r))[:1024])
                elif type(r)==list:
                    listref = gc.get_referrers(r)
                    log.info("  list: %s..  %s referrers: %s", str(r[:32])[:1024], len(listref), str(listref[:32])[:1024])
                elif type(r)==dict:
                    if len(r)>64:
                        log.info("  %s items: %s", len(r), str(r)[:1024])
                        continue
                    for k,v in r.items():
                        if k is instance:
                            log.info("  key with value=%s", v)
                        elif v is instance:
                            log.info("  for key=%s", k)
                else:
                    log.info("     %s : %s", type(r), r)
    finally:
        del frame
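A minimal usage sketch, not from the original source: dump_references only assumes a logging-style object with an info(fmt, *args) method, so a standard logging.Logger works. The Widget class and registry dict below are purely illustrative.

import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("refdump")

class Widget(object):
    pass

w = Widget()
registry = {"main": w}       # an extra referrer we expect to show up in the log
dump_references(log, [w])    # logs each referrer of w (e.g. the registry dict) and its type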
Example #7
 def print_leaks():
     global before, after
     gc.collect()
     lobjs = gc.get_objects()
     for i in lobjs:
         if type(i) not in ignore:
             after[type(i)] += 1
     log.info("print_leaks:")
     leaked = {}
     for k in after:
         delta = after[k]-before[k]
         if delta>0:
             leaked[delta] = k
     before = after
     after = defaultdict(int)
     for delta in reversed(sorted(leaked.keys())):
         ltype = leaked[delta]
         matches = [x for x in lobjs if type(x)==ltype and ltype not in ignore]
         if len(matches)<32:
             minfo = [str(x)[:32] for x in matches]
         else:
             minfo = "%s matches" % len(matches)
         log.info("%8i : %s : %s", delta, ltype, minfo)
         if len(matches)<32 and ltype in detailed:
             frame = inspect.currentframe()
             exclude = [frame, matches, lobjs]
             try:
                 dump_references(log, matches, exclude=exclude)
             finally:
                 del frame
                 del exclude
         del matches
         del minfo
     del lobjs
     return True
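print_leaks relies on module-level state that is not shown here (before, after, ignore, detailed, log). A minimal sketch of compatible definitions, with assumed values:

import gc
import inspect
import logging
from collections import defaultdict

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("leaks")
before = defaultdict(int)   # per-type counts from the previous call
after = defaultdict(int)    # per-type counts accumulated during the current call
ignore = ()                 # types to skip entirely (assumed empty here)
detailed = ()               # types whose surviving instances get dump_references()

print_leaks()               # first call establishes the baseline
# ... run the code suspected of leaking ...
print_leaks()               # second call logs per-type object-count growth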
Example #8
	def cancel_clicked(self,widget,temp=False):
		
		newtree=devede_other.create_tree(self,"wcancel_job_dialog",self.gladefile,False)
		window=newtree.get_object("wcancel_job_dialog")
		window.show()
		value=window.run()
		window.hide()
		window.destroy()
		if value!=-5: # no
			return True

		self.runner.cancel()
		self.runner.wait_end()
		gobject.source_remove(self.timer)
		self.window.hide()
		self.window.destroy()
		newtree=devede_other.create_tree(self,"waborted_dialog",self.gladefile,False)
		window=newtree.get_object("waborted_dialog")
		window.show()
		window.run()
		window.hide()
		window.destroy()
		window=None
		gc.collect()
		(self.main_window_callback)() # show the main window
		return True
Example #9
    def dump_state(self):
        """Dump the state of the application to the output, this method is
        triggered by pressing :kbd:`Ctrl-Alt-D` in the GUI"""
        from camelot.view.model_thread import post
        from camelot.view.register import dump_register
        from camelot.view.proxy.collection_proxy import CollectionProxy

        import gc
        gc.collect()

            
        dump_register()
        
        def dump_session_state():
            import collections
            
            from camelot.model.authentication import Person

            print '======= begin session =============='
            type_counter = collections.defaultdict(int)
            for o in Person.query.session:
                type_counter[type(o).__name__] += 1
            for k,v in type_counter.items():
                print k,v
            print '====== end session =============='

        post( dump_session_state )

        for o in gc.get_objects():
            if isinstance(o, CollectionProxy):
                print o
                for r in gc.get_referrers(o):
                    print ' ', type(r).__name__
                    for rr in gc.get_referrers(r):
                        print  '  ', type(rr).__name__
Example #10
 def test_wiretap(self):
     attic = Location("Attic", "A dark attic.")
     player = Player("fritz", "m")
     io = ConsoleIo(None)
     io.supports_smartquotes = False
     pc = PlayerConnection(player, io)
     player.set_screen_sizes(0, 100)
     julie = NPC("julie", "f")
     julie.move(attic)
     player.move(attic)
     julie.tell("message for julie")
     attic.tell("message for room")
     self.assertEqual(["message for room\n"], player.test_get_output_paragraphs())
     with self.assertRaises(ActionRefused):
         player.create_wiretap(julie)
     player.privileges = {"wizard"}
     player.create_wiretap(julie)
     player.create_wiretap(attic)
     julie.tell("message for julie")
     attic.tell("message for room")
     pubsub.sync()
     output = pc.get_output()
     self.assertTrue("[wiretapped from 'Attic': message for room]" in output)
     self.assertTrue("[wiretapped from 'julie': message for julie]" in output)
     self.assertTrue("[wiretapped from 'julie': message for room]" in output)
     self.assertTrue("message for room " in output)
     # test removing the wiretaps
     player.clear_wiretaps()
     import gc
     gc.collect()
     julie.tell("message for julie")
     attic.tell("message for room")
     self.assertEqual(["message for room\n"], player.test_get_output_paragraphs())
Example #11
 def testSimpleCleanup(self):
     g = graph.Graph()
     op = OpSimple(graph=g)
     r = weakref.ref(op)
     del op
     gc.collect()
     assert r() is None, "cleanup failed"
Example #12
def test_main(verbose=None):
    import sys
    test_classes = (
        TestBasic,
        TestVariousIteratorArgs,
        TestSubclass,
        TestSubclassWithKwargs,
        TestSequence,
    )

    support.run_unittest(*test_classes)

    # verify reference counting
    if verbose and hasattr(sys, "gettotalrefcount"):
        import gc
        counts = [None] * 5
        for i in range(len(counts)):
            support.run_unittest(*test_classes)
            gc.collect()
            counts[i] = sys.gettotalrefcount()
        print(counts)

    # doctests
    from test import test_deque
    support.run_doctest(test_deque, verbose)
Example #13
    def cleanup_core_plugin(self):
        """Ensure that the core plugin is deallocated."""
        nm = manager.NeutronManager
        if not nm.has_instance():
            return

        # TODO(marun) Fix plugins that do not properly initialize notifiers
        agentschedulers_db.AgentSchedulerDbMixin.agent_notifiers = {}

        # Perform a check for deallocation only if explicitly
        # configured to do so since calling gc.collect() after every
        # test increases test suite execution time by ~50%.
        check_plugin_deallocation = (
            bool_from_env('OS_CHECK_PLUGIN_DEALLOCATION'))
        if check_plugin_deallocation:
            plugin = weakref.ref(nm._instance.plugin)

        nm.clear_instance()

        if check_plugin_deallocation:
            gc.collect()

            # TODO(marun) Ensure that mocks are deallocated?
            if plugin() and not isinstance(plugin(), mock.Base):
                raise AssertionError(
                    'The plugin for this test was not deallocated.')
Example #14
    def go():
        router_closed = asyncio.Future()
        dealer_closed = asyncio.Future()
        router, _ = yield from loop.create_zmq_connection(
            lambda: ZmqRouterProtocol(router_closed),
            zmq.ROUTER,
            bind='tcp://127.0.0.1:*')

        addr = next(iter(router.bindings()))
        dealer, _ = yield from loop.create_zmq_connection(
            lambda: ZmqDealerProtocol(count, dealer_closed),
            zmq.DEALER,
            connect=addr)

        msg = b'func', b'\0'*200

        gc.collect()
        t1 = time.monotonic()
        dealer.write(msg)
        yield from dealer_closed
        t2 = time.monotonic()
        gc.collect()
        router.close()
        yield from router_closed
        return t2 - t1
Example #15
def test_zmq_with_thread(count):
    """zmq with threads"""
    print('.', end='', flush=True)
    ctx = zmq.Context()
    dealer = ctx.socket(zmq.DEALER)
    dealer.bind('tcp://127.0.0.1:*')
    address = dealer.getsockopt(zmq.LAST_ENDPOINT).rstrip(b'\0')
    msg = b'func', b'\0'*200

    def router_thread():
        router = ctx.socket(zmq.ROUTER)
        router.connect(address)

        for i in range(count):
            addr, m1, m2 = router.recv_multipart()
            router.send_multipart((addr, m1, m2))

        router.close()

    th = threading.Thread(target=router_thread)
    th.start()
    gc.collect()
    t1 = time.monotonic()
    for i in range(count):
        dealer.send_multipart(msg)
        dealer.recv_multipart()
    t2 = time.monotonic()
    gc.collect()
    th.join()
    dealer.close()
    ctx.destroy()
    return t2 - t1
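A hedged sketch of how this benchmark might be driven; the message count and the reporting format are assumptions, and the zmq/threading/time/gc imports are taken to exist at module level as in the snippet above.

if __name__ == '__main__':
    count = 10000
    elapsed = test_zmq_with_thread(count)
    # report round-trip throughput for the threaded baseline
    print('%d round-trips in %.3fs (%.0f msg/s)' % (count, elapsed, count / elapsed))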
Example #16
def test_contourf_transform_path_counting():
    ax = plt.axes(projection=ccrs.Robinson())
    plt.draw()

    # Capture the size of the cache before our test.
    gc.collect()
    initial_cache_size = len(cgeoaxes._PATH_TRANSFORM_CACHE)

    path_to_geos_counter = CallCounter(cartopy.mpl.patch, 'path_to_geos')
    with path_to_geos_counter:
        x, y, z = sample_data((30, 60))
        cs = plt.contourf(x, y, z, 5, transform=ccrs.PlateCarree())
        n_geom = sum([len(c.get_paths()) for c in cs.collections])
        del cs
        if not six.PY3:
            del c
        plt.draw()

    # Before the performance enhancement, the count would have been 2 * n_geom,
    # but should now be just n_geom.
    msg = ('The given geometry was transformed too many times (expected: %s; '
           'got %s) - the caching is not working.'
           '' % (n_geom, path_to_geos_counter.count))
    assert path_to_geos_counter.count == n_geom, msg

    # Check the cache has an entry for each geometry.
    assert len(cgeoaxes._PATH_TRANSFORM_CACHE) == initial_cache_size + n_geom

    # Check that the cache is empty again once we've dropped all references
    # to the source paths.
    plt.clf()
    gc.collect()
    assert len(cgeoaxes._PATH_TRANSFORM_CACHE) == initial_cache_size

    plt.close()
Example #17
 def openFile(self, filename, weakreference=False):
     gc.collect()
     for item in self.rootItem:
         if item.file.filename == filename:
             ddict = {}
             ddict['event'] = "fileUpdated"
             ddict['filename'] = filename
             self.sigFileUpdated.emit(ddict)
             return item.file
     phynxFile = phynx.File(filename, 'r')
     if weakreference:
         def phynxFileInstanceDistroyed(weakrefObject):
             idx = self.rootItem._identifiers.index(id(weakrefObject))
             child = self.rootItem._children[idx]
             child.clearChildren()
             del self._idMap[id(child)]
             self.rootItem.deleteChild(child)
             if not self.rootItem.hasChildren:
                 self.clear()
             return
         refProxy = weakref.proxy(phynxFile, phynxFileInstanceDistroyed)
         self.rootItem.appendChild(refProxy)
     else:
         self.rootItem.appendChild(phynxFile)
     ddict = {}
     ddict['event'] = "fileAppended"
     ddict['filename'] = filename
     self.sigFileAppended.emit(ddict)
     return phynxFile
Example #18
    def test_collect_garbage(self):
        self.preclean()
        # Each of these causes four objects to become garbage: two
        # Uncollectables and their instance dicts.
        Uncollectable()
        Uncollectable()
        C1055820(666)
        gc.collect()
        for v in self.visit:
            if v[1] != "stop":
                continue
            info = v[2]
            self.assertEqual(info["collected"], 2)
            self.assertEqual(info["uncollectable"], 8)

        # We should now have the Uncollectables in gc.garbage
        self.assertEqual(len(gc.garbage), 4)
        for e in gc.garbage:
            self.assertIsInstance(e, Uncollectable)

        # Now, let our callback handle the Uncollectable instances
        self.cleanup=True
        self.visit = []
        gc.garbage[:] = []
        gc.collect()
        for v in self.visit:
            if v[1] != "stop":
                continue
            info = v[2]
            self.assertEqual(info["collected"], 0)
            self.assertEqual(info["uncollectable"], 4)

        # Uncollectables should be gone
        self.assertEqual(len(gc.garbage), 0)
Example #19
 def test_class(self):
     class A:
         pass
     A.a = A
     gc.collect()
     del A
     self.assertNotEqual(gc.collect(), 0)
Example #20
 def test_get_stats(self):
     stats = gc.get_stats()
     self.assertEqual(len(stats), 3)
     for st in stats:
         self.assertIsInstance(st, dict)
         self.assertEqual(set(st),
                          {"collected", "collections", "uncollectable"})
         self.assertGreaterEqual(st["collected"], 0)
         self.assertGreaterEqual(st["collections"], 0)
         self.assertGreaterEqual(st["uncollectable"], 0)
     # Check that collection counts are incremented correctly
     if gc.isenabled():
         self.addCleanup(gc.enable)
         gc.disable()
     old = gc.get_stats()
     gc.collect(0)
     new = gc.get_stats()
     self.assertEqual(new[0]["collections"], old[0]["collections"] + 1)
     self.assertEqual(new[1]["collections"], old[1]["collections"])
     self.assertEqual(new[2]["collections"], old[2]["collections"])
     gc.collect(2)
     new = gc.get_stats()
     self.assertEqual(new[0]["collections"], old[0]["collections"] + 1)
     self.assertEqual(new[1]["collections"], old[1]["collections"])
     self.assertEqual(new[2]["collections"], old[2]["collections"] + 1)
Example #21
    def test_collect(self):
        self.preclean()
        gc.collect()
        # Algorithmically verify the contents of self.visit
        # because it is long and tortuous.

        # Count the number of visits to each callback
        n = [v[0] for v in self.visit]
        n1 = [i for i in n if i == 1]
        n2 = [i for i in n if i == 2]
        self.assertEqual(n1, [1]*2)
        self.assertEqual(n2, [2]*2)

        # Count that we got the right number of start and stop callbacks.
        n = [v[1] for v in self.visit]
        n1 = [i for i in n if i == "start"]
        n2 = [i for i in n if i == "stop"]
        self.assertEqual(n1, ["start"]*2)
        self.assertEqual(n2, ["stop"]*2)

        # Check that we got the right info dict for all callbacks
        for v in self.visit:
            info = v[2]
            self.assertTrue("generation" in info)
            self.assertTrue("collected" in info)
            self.assertTrue("uncollectable" in info)
Example #22
    def test_boom2(self):
        class Boom2:
            def __init__(self):
                self.x = 0

            def __getattr__(self, someattribute):
                self.x += 1
                if self.x > 1:
                    del self.attr
                raise AttributeError

        a = Boom2()
        b = Boom2()
        a.attr = b
        b.attr = a

        gc.collect()
        garbagelen = len(gc.garbage)
        del a, b
        # Much like test_boom(), except that __getattr__ doesn't break the
        # cycle until the second time gc checks for __del__.  As of 2.3b1,
        # there isn't a second time, so this simply cleans up the trash cycle.
        # We expect a, b, a.__dict__ and b.__dict__ (4 objects) to get
        # reclaimed this way.
        self.assertEqual(gc.collect(), 4)
        self.assertEqual(len(gc.garbage), garbagelen)
Example #23
def growth(limit=10, peak_stats={}, shortnames=True):
    """Calculate the increase in peak object counts since last call.

    Returns a dict of {type_name: (delta, count)}.

    Limits the output to ``limit`` largest deltas.  You may set ``limit`` to
    None to see all of them.

    Uses and updates ``peak_stats``, a dictionary from type names to previously
    seen peak object counts.  Usually you don't need to pay attention to this
    argument.

    The caveats documented in :func:`typestats` apply.

    Example:

        >>> growth(limit=3)  # doctest: +SKIP
        {'wrapper_descriptor': (14, 1439), 'tuple': (10, 3461), 'dict': (7, 2532)}

    .. versionadded:: 1.8
    """
    gc.collect()
    stats = objgraph.typestats(shortnames=shortnames)
    deltas = []
    for name, count in iteritems(stats):
        delta = count - peak_stats.get(name, 0)
        if delta > 0:
            deltas.append((name, (delta, count)))
            peak_stats[name] = count
    deltas = sorted(deltas, key=operator.itemgetter(1, 0), reverse=True)

    if limit:
        deltas = deltas[:limit]
    return dict(deltas)
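A hedged usage sketch (assumed, and relying on the objgraph import used above): call growth() once to seed peak_stats, exercise the code under suspicion, then call it again to see which types grew.

growth()                             # baseline call; seeds peak_stats
leaky = [[] for _ in range(1000)]    # allocate gc-tracked objects to simulate a leak
print(growth(limit=3))               # e.g. {'list': (1001, 2437), ...} -- exact counts vary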
Example #24
    def test_incrgc_simple(self):
        import gc
        from persistent.interfaces import UPTODATE
        from persistent._compat import _b
        cache = self._makeOne()
        oids = []
        for i in range(100):
            oid = _b('oid_%04d' % i)
            oids.append(oid)
            cache[oid] = self._makePersist(oid=oid, state=UPTODATE)
        self.assertEqual(cache.cache_non_ghost_count, 100)

        cache.incrgc()
        gc.collect() # banish the ghosts who are no longer in the ring

        self.assertEqual(cache.cache_non_ghost_count, 10)
        items = cache.lru_items()
        self.assertEqual(_len(items), 10)
        self.assertEqual(items[0][0], _b('oid_0090'))
        self.assertEqual(items[1][0], _b('oid_0091'))
        self.assertEqual(items[2][0], _b('oid_0092'))
        self.assertEqual(items[3][0], _b('oid_0093'))
        self.assertEqual(items[4][0], _b('oid_0094'))
        self.assertEqual(items[5][0], _b('oid_0095'))
        self.assertEqual(items[6][0], _b('oid_0096'))
        self.assertEqual(items[7][0], _b('oid_0097'))
        self.assertEqual(items[8][0], _b('oid_0098'))
        self.assertEqual(items[9][0], _b('oid_0099'))

        for oid in oids[:90]:
            self.assertTrue(cache.get(oid) is None)

        for oid in oids[90:]:
            self.assertFalse(cache.get(oid) is None)
Example #25
    def testGCCollectNoMemoryManagement(self):
        self.gateway = JavaGateway(
            gateway_parameters=GatewayParameters(
                enable_memory_management=False))
        gc.collect()
        # Should have nothing in the finalizers
        self.assertEqual(len(ThreadSafeFinalizer.finalizers), 0)

        def internal():
            sb = self.gateway.jvm.java.lang.StringBuffer()
            sb.append("Hello World")
            sb2 = self.gateway.jvm.java.lang.StringBuffer()
            sb2.append("Hello World")
            finalizers_size_middle = len(ThreadSafeFinalizer.finalizers)
            return finalizers_size_middle
        finalizers_size_middle = internal()
        gc.collect()

        # Before collection: two objects created + two returned objects (append
        # returns a stringbuffer reference for easy chaining).
        self.assertEqual(finalizers_size_middle, 0)

        # Assert after collection
        self.assertEqual(len(ThreadSafeFinalizer.finalizers), 0)

        self.gateway.shutdown()
Example #26
 def tearDown(self):
     if self.old_env is None:
         del os.environ["CUDA_DEVICE"]
     else:
         os.environ["CUDA_DEVICE"] = self.old_env
     del self.old_env
     gc.collect()
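A hedged sketch of the setUp this tearDown presumably pairs with; the attribute and environment-variable names mirror the code above, while the "0" device value is an assumption.

 def setUp(self):
     # remember whatever CUDA_DEVICE was (None if unset) so tearDown can restore it
     self.old_env = os.environ.get("CUDA_DEVICE")
     os.environ["CUDA_DEVICE"] = "0"  # pin the test to one device (illustrative value)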
Example #27
def run_net((k,theta,T,g_inh_,spike_delay)):
    seed(int(os.getpid()*time.time()))
    print os.getpid()
    reinit()
    reinit_default_clock()
    clear(True)
    gc.collect()
    
    PKJ = PurkinjeCellGroup(1)
    PKJ.V = PKJ.El
    
    spikes = SpikeMonitor(PKJ)
    spikes.last_spike = None
    V_monitor = StateMonitor(PKJ,'V',record=0)
    ginh_monitor = StateMonitor(PKJ, 'g_inh', record=0)
    
    @network_operation(Clock(dt=defaultclock.dt))
    def random_current():
        PKJ.I = gamma(k,theta,size=len(PKJ)) * nA
        
    @network_operation(Clock(dt=defaultclock.dt))
    def trigger_spike():
        if spikes.spiketimes[0].shape[0] > 0:
            spikes.last_spike = spikes.spiketimes[0][-1]*second
        if spikes.last_spike is not None:
            if abs(defaultclock.t - (spikes.last_spike + spike_delay)) < .000001*ms:
                PKJ.g_inh = g_inh_
        
    run(T)

    V_monitor.insert_spikes(spikes)
    first_isi = diff(spikes.spiketimes[0])[0]
    
    return V_monitor.getvalues(), first_isi, spikes.spiketimes
Example #28
def find_chain(obj, predicate, edge_func, max_depth=20, extra_ignore=()):
    queue = [obj]
    depth = {id(obj): 0}
    parent = {id(obj): None}
    ignore = set(extra_ignore)
    ignore.add(id(extra_ignore))
    ignore.add(id(queue))
    ignore.add(id(depth))
    ignore.add(id(parent))
    ignore.add(id(ignore))
    ignore.add(id(sys._getframe()))  # this function
    ignore.add(id(sys._getframe(1))) # find_chain/find_backref_chain, most likely
    gc.collect()
    while queue:
        target = queue.pop(0)
        if predicate(target):
            chain = [target]
            while parent[id(target)] is not None:
                target = parent[id(target)]
                chain.append(target)
            return chain
        tdepth = depth[id(target)]
        if tdepth < max_depth:
            referrers = edge_func(target)
            ignore.add(id(referrers))
            for source in referrers:
                if id(source) in ignore:
                    continue
                if id(source) not in depth:
                    depth[id(source)] = tdepth + 1
                    parent[id(source)] = target
                    queue.append(source)
    return [obj] # not found
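A minimal usage sketch (assumed): with gc.get_referrers as the edge function, find_chain walks backwards from an object until the predicate matches, which is how a backref chain is normally built. Node and holder are illustrative names.

import gc

class Node(object):
    pass

target = Node()
holder = {"ref": target}     # something that keeps target alive

chain = find_chain(
    target,
    predicate=lambda o: isinstance(o, dict) and o.get("ref") is target,
    edge_func=gc.get_referrers)
# chain runs from the matching referrer back to the original object,
# so here it is typically [holder, target]
print(len(chain))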
Example #29
    def __init__(self,inputLayerSize,hiddenLayerSize,outputLayerSize):
        #Define Hyperparameters
        gc.collect()
        self.inputLayerSize = inputLayerSize
        self.outputLayerSize = outputLayerSize
        self.hiddenLayerSize = hiddenLayerSize

        # Weights (parameters)
        # self.W1 = np.random.randn(self.inputLayerSize,self.hiddenLayerSize)
        # self.W2 = np.random.randn(self.hiddenLayerSize,self.outputLayerSize)
        self.W1 = np.random.uniform(-0.5,0.5,(self.inputLayerSize,self.hiddenLayerSize))
        self.W2 = np.random.uniform(-0.5,0.5,(self.hiddenLayerSize,self.outputLayerSize))

        try:
            f = open('myfile','r')
            for i in range(self.inputLayerSize):
                for j in range(self.hiddenLayerSize):
                    temp = f.readline()
                    self.W1[i][j] = float(temp)
            for i in range(self.hiddenLayerSize):
                for j in range(self.outputLayerSize):
                    temp = f.readline()
                    self.W2[i][j] = float(temp)
            f.close()
        except Exception, e:
            print("File not Found")
Example #30
    def test_bug21435(self):
        # This is a poor test - its only virtue is that it happened to
        # segfault on Tim's Windows box before the patch for 21435 was
        # applied.  That's a nasty bug relying on specific pieces of cyclic
        # trash appearing in exactly the right order in finalize_garbage()'s
        # input list.
        # But there's no reliable way to force that order from Python code,
        # so over time chances are good this test won't really be testing much
        # of anything anymore.  Still, if it blows up, there's _some_
        # problem ;-)
        gc.collect()

        class A:
            pass

        class B:
            def __init__(self, x):
                self.x = x

            def __del__(self):
                self.attr = None

        def do_work():
            a = A()
            b = B(A())

            a.attr = b
            b.attr = a

        do_work()
        gc.collect() # this blows up (bad C pointer) when it fails
Example #31
    def period_over_period(self,
                           df,
                           start_date,
                           end_date,
                           period,
                           history_periods=2,
                           timestamp_col='timestamp_of_first_event'):
        try:
            # filter cols if necessary
            string = '0 {}(s) prev(current)'.format(period)

            # filter out the dates greater than today
            df_current = df.assign(period=string)
            # label the days being compared with the same label
            if len(df_current) > 0:
                df_current = self.label_dates_pop(df_current, period,
                                                  timestamp_col)

            # zero out time information
            start = datetime(start_date.year, start_date.month, start_date.day,
                             0, 0, 0)
            end = datetime(end_date.year, end_date.month, end_date.day, 0, 0,
                           0)

            cols = list(df.columns)
            logger.warning(' Line 293 %s:df %s', period, df.head(10))
            logger.warning(' Line 293 %s:df cols %s', period, cols)

            counter = 1
            if isinstance(history_periods, str):
                history_periods = int(history_periods)
            # make dataframes for request no. of periods
            start, end = self.shift_period_range(period, start, end)
            while counter < history_periods and start >= self.initial_date:
                # load data
                if period == 'quarter':
                    logger.warning('start:end %s:%s', start, end)
                if 'bcc' in self.table:
                    df_temp = self.load_df_pym(start, end, cols, timestamp_col)
                else:
                    df_temp = self.load_df(start, end, cols, timestamp_col)
                if df_temp is not None:
                    if len(df_temp) > 1:
                        string = '{} {}(s) prev'.format(counter, period)
                        # label period
                        df_temp = df_temp.assign(period=string)
                        # relabel days to get matching day of week,doy, dom, for different periods
                        df_temp = self.label_dates_pop(df_temp, period,
                                                       timestamp_col)
                        #logger.warning('df temp loaded for %s previous: %s',counter,len(df_temp))

                        df_current = concat_dfs(df_current, df_temp)
                        del df_temp
                        gc.collect()

                # shift the loading window
                counter += 1
                start, end = self.shift_period_range(period, start, end)
                if period == 'week':
                    logger.warning('LINE 327 df_current:%s',
                                   df_current.head(10))

            return df_current
        except Exception:
            logger.error('period over period', exc_info=True)
Example #32
    def store_survey(self, survey_name, R_table_name, destination_table_name, data_dir, variables=None, force_recreation=True):
        """
        Store a R data table in an HDF5 file

        Parameters
        ----------

        survey_name : string
                       the name of the survey
        R_table_name : string
                       the name of the R data table
        destination_table_name : string
                                 the name of the table in the HDFStore
        data_dir : path
                   the directory where to find the RData file

        variables : list of string, default None
                    When not None, list of the variables to keep
        """
        gc.collect()
        year = self.year
        def get_survey_year(survey_name, year):
            if survey_name == "logement":
                if year == 2003:
                    return 2003
                elif year in range(2006,2010):
                    return 2006
            if survey_name == "patrimoine":
                return 2004
            else:
                return year

        print "creating %s" %(destination_table_name)
        table_Rdata = R_table_name + ".Rdata"
        filename = os.path.join(data_dir, str(get_survey_year(survey_name, year)), table_Rdata)
        print filename
        if not os.path.isfile(filename):
            raise Exception("filename does not exist")

        rpy.r.load(filename)
        stored_table = com.load_data(R_table_name)
        store = HDFStore(self.hdf5_filename)
        store_path = str(self.year)+"/"+destination_table_name

        if store_path in store:
            if force_recreation is not True:
                print store_path + " already exists, do not re-create and exit"
                store.close()
                return

        if variables is not None:

            print store
            print store_path
            print variables
            variables_stored = list(set(variables).intersection(set(stored_table.columns)))
            print list(set(variables).difference((set(stored_table.columns))))
            store[store_path] = stored_table[variables_stored]
        else:
            store[store_path] = stored_table
        store.close()
        del stored_table
        gc.collect()
Example #33
    def fit(self, X_train=None, Y_train=None, X_test=None, Y_test=None, dataset_train=None, dataset_val=None, time_limit=None, **kwargs):
        start_time = time.time()
        params = self.params.copy()

        # TODO: kwargs can have num_cpu, num_gpu. Currently these are ignored.
        verbosity = kwargs.get('verbosity', 2)
        params = fixedvals_from_searchspaces(params)

        if verbosity <= 1:
            verbose_eval = False
        elif verbosity == 2:
            verbose_eval = 1000
        elif verbosity == 3:
            verbose_eval = 50
        else:
            verbose_eval = 1

        eval_metric, eval_metric_name = self.get_eval_metric()
        dataset_train, dataset_val = self.generate_datasets(X_train=X_train, Y_train=Y_train, params=params, X_test=X_test, Y_test=Y_test, dataset_train=dataset_train, dataset_val=dataset_val)
        gc.collect()

        num_boost_round = params.pop('num_boost_round', 1000)
        logger.log(15, f'Training Gradient Boosting Model for {num_boost_round} rounds...')
        logger.log(15, "with the following hyperparameter settings:")
        logger.log(15, params)

        num_rows_train = len(dataset_train.data)
        if 'min_data_in_leaf' in params:
            if params['min_data_in_leaf'] > num_rows_train:  # TODO: may not be necessary
                params['min_data_in_leaf'] = max(1, int(num_rows_train / 5.0))

        # TODO: Better solution: Track trend to early stop when score is far worse than best score, or score is trending worse over time
        if (dataset_val is not None) and (dataset_train is not None):
            modifier = 1 if num_rows_train <= 10000 else 10000 / num_rows_train
            early_stopping_rounds = max(round(modifier * 150), 10)
        else:
            early_stopping_rounds = 150

        callbacks = []
        valid_names = ['train_set']
        valid_sets = [dataset_train]
        if dataset_val is not None:
            reporter = kwargs.get('reporter', None)
            train_loss_name = self._get_train_loss_name() if reporter is not None else None
            if train_loss_name is not None:
                if 'metric' not in params or params['metric'] == '':
                    params['metric'] = train_loss_name
                elif train_loss_name not in params['metric']:
                    params['metric'] = f'{params["metric"]},{train_loss_name}'
            callbacks += [
                # Note: Don't use self.params_aux['max_memory_usage_ratio'] here as LightGBM handles memory per iteration optimally.  # TODO: Consider using when ratio < 1.
                early_stopping_custom(early_stopping_rounds, metrics_to_use=[('valid_set', eval_metric_name)], max_diff=None, start_time=start_time, time_limit=time_limit,
                                      ignore_dart_warning=True, verbose=False, manual_stop_file=False, reporter=reporter, train_loss_name=train_loss_name),
            ]
            valid_names = ['valid_set'] + valid_names
            valid_sets = [dataset_val] + valid_sets

        seed_val = params.pop('seed_value', 0)
        train_params = {
            'params': params,
            'train_set': dataset_train,
            'num_boost_round': num_boost_round,
            'valid_sets': valid_sets,
            'valid_names': valid_names,
            'callbacks': callbacks,
            'verbose_eval': verbose_eval,
        }
        if not isinstance(eval_metric, str):
            train_params['feval'] = eval_metric
        else:
            if 'metric' not in train_params['params'] or train_params['params']['metric'] == '':
                train_params['params']['metric'] = eval_metric
            elif eval_metric not in train_params['params']['metric']:
                train_params['params']['metric'] = f'{train_params["params"]["metric"]},{eval_metric}'
        if seed_val is not None:
            train_params['params']['seed'] = seed_val
            random.seed(seed_val)
            np.random.seed(seed_val)

        # Train LightGBM model:
        try_import_lightgbm()
        import lightgbm as lgb
        self.model = lgb.train(**train_params)
        self.params_trained['num_boost_round'] = self.model.best_iteration
Example #34
    def run_tests_sequential(self):
        if self.ns.trace:
            import trace
            self.tracer = trace.Trace(trace=False, count=True)

        save_modules = sys.modules.keys()

        print("Run tests sequentially")

        previous_test = None
        for test_index, test in enumerate(self.tests, 1):
            start_time = time.monotonic()

            text = test
            if previous_test:
                text = '%s -- %s' % (text, previous_test)
            self.display_progress(test_index, text)

            if self.tracer:
                # If we're tracing code coverage, then we don't exit with status
                # if on a false return value from main.
                cmd = ('result = runtest(self.ns, test); '
                       'self.accumulate_result(test, result)')
                ns = dict(locals())
                self.tracer.runctx(cmd, globals=globals(), locals=ns)
                result = ns['result']
            else:
                try:
                    result = runtest(self.ns, test)
                except KeyboardInterrupt:
                    self.interrupted = True
                    self.accumulate_result(test, (INTERRUPTED, None))
                    break
                else:
                    self.accumulate_result(test, result)

            previous_test = format_test_result(test, result[0])
            test_time = time.monotonic() - start_time
            if test_time >= PROGRESS_MIN_TIME:
                previous_test = "%s in %s" % (previous_test, format_duration(test_time))
            elif result[0] == PASSED:
                # be quiet: say nothing if the test passed shortly
                previous_test = None

            if self.ns.findleaks:
                gc.collect()
                if gc.garbage:
                    print("Warning: test created", len(gc.garbage), end=' ')
                    print("uncollectable object(s).")
                    # move the uncollectable objects somewhere so we don't see
                    # them again
                    self.found_garbage.extend(gc.garbage)
                    del gc.garbage[:]

            # Unload the newly imported modules (best effort finalization)
            for module in sys.modules.keys():
                if module not in save_modules and module.startswith("test."):
                    support.unload(module)

        if previous_test:
            print(previous_test)
Example #35
def jorudan(num_rows=None):
    tmp_jorudan = pd.read_csv('../input/jorudan.tsv', sep='\t', nrows=num_rows)

    # convert the date columns to datetime
    tmp_jorudan['access_date'] = pd.to_datetime(tmp_jorudan['access_date'])
    tmp_jorudan['datetime'] = pd.to_datetime(
        tmp_jorudan['departure_and_arrival_date'])

    # drop access records made on or after the travel date
    tmp_jorudan = tmp_jorudan[
        tmp_jorudan['datetime'] > tmp_jorudan['access_date']]

    # drop data on or after 2018-01-01
    tmp_jorudan = tmp_jorudan[tmp_jorudan['datetime'] < '2018-01-01']

    # one-hot encoding
    jorudan, cols = one_hot_encoder(tmp_jorudan[[
        'departure_and_arrival_type', 'departure_and_arrival_place_type',
        'departure_prefecture', 'arrival_prefecture'
    ]],
                                    nan_as_category=False)

    # add the date and park-name columns
    jorudan['park'] = tmp_jorudan['park']
    jorudan['datetime'] = tmp_jorudan['datetime']

    feats_jorudan = [
        c for c in jorudan.columns if c not in ['park', 'datetime']
    ]

    # build a dict for aggregation
    agg_jorudan = {}
    for c in feats_jorudan:
        agg_jorudan[c] = ['sum', 'mean']

    # aggregate by park and date
    jorudan = jorudan.groupby(['park', 'datetime']).agg(agg_jorudan)

    # fill NaNs with zero
    jorudan.fillna(0, inplace=True)

    # rename columns
    jorudan.columns = pd.Index(
        [e[0] + "_" + e[1].upper() for e in jorudan.columns.tolist()])

    # additional features
    jorudan['departure_and_arrival_place_mean_sum'] = jorudan[
        'departure_and_arrival_place_type_A_MEAN'] + jorudan[
            'departure_and_arrival_place_type_D_MEAN']
    jorudan['departure_and_arrival_place_sum_sum'] = jorudan[
        'departure_and_arrival_place_type_A_SUM'] + jorudan[
            'departure_and_arrival_place_type_D_SUM']
    jorudan['departure_and_arrival_type__mean_sum'] = jorudan[
        'departure_and_arrival_type_A_MEAN'] + jorudan[
            'departure_and_arrival_type_D_MEAN']
    jorudan['departure_and_arrival_type_sum_sum'] = jorudan[
        'departure_and_arrival_type_A_SUM'] + jorudan[
            'departure_and_arrival_type_D_SUM']
    jorudan['departure_and_arrival_place_mean_ratio'] = jorudan[
        'departure_and_arrival_place_type_A_MEAN'] / jorudan[
            'departure_and_arrival_place_type_D_MEAN']
    jorudan['departure_and_arrival_place_sum_ratio'] = jorudan[
        'departure_and_arrival_place_type_A_SUM'] / jorudan[
            'departure_and_arrival_place_type_D_SUM']
    jorudan['departure_and_arrival_type_mean_ratio'] = jorudan[
        'departure_and_arrival_type_A_MEAN'] / jorudan[
            'departure_and_arrival_type_D_MEAN']
    jorudan['departure_and_arrival_type_sum_ratio'] = jorudan[
        'departure_and_arrival_type_A_SUM'] / jorudan[
            'departure_and_arrival_type_D_SUM']

    # rename columns
    jorudan.columns = ['JORUDAN_' + c for c in jorudan.columns]

    del tmp_jorudan
    gc.collect()

    return jorudan
Example #36
def hotlink(num_rows=None):
    # load csv
    hotlink = pd.read_csv('../input/hotlink.tsv', sep='\t')

    # aggregate by datetime & keyword
    hotlink_all = hotlink.pivot_table(index='datetime',
                                      columns='keyword',
                                      values='count',
                                      aggfunc=[np.sum, np.max, 'mean'])
    hotlink_bbs = hotlink[hotlink.domain == 'bbs'].pivot_table(
        index='datetime',
        columns='keyword',
        values='count',
        aggfunc=[np.sum, np.max, 'mean'])
    hotlink_twitter = hotlink[hotlink.domain ==
                              'twitter_sampling'].pivot_table(
                                  index='datetime',
                                  columns='keyword',
                                  values='count',
                                  aggfunc=[np.sum, np.max, 'mean'])
    hotlink_blog = hotlink[hotlink.domain == 'blog'].pivot_table(
        index='datetime',
        columns='keyword',
        values='count',
        aggfunc=[np.sum, np.max, 'mean'])

    # fill missing values with zero
    hotlink_all.fillna(0, inplace=True)
    hotlink_bbs.fillna(0, inplace=True)
    hotlink_twitter.fillna(0, inplace=True)
    hotlink_blog.fillna(0, inplace=True)

    # convert the index to datetime
    hotlink_all.index = pd.to_datetime(hotlink_all.index)
    hotlink_bbs.index = pd.to_datetime(hotlink_bbs.index)
    hotlink_twitter.index = pd.to_datetime(hotlink_twitter.index)
    hotlink_blog.index = pd.to_datetime(hotlink_blog.index)

    # shift one day forward
    hotlink_all = hotlink_all.shift()
    hotlink_bbs = hotlink_bbs.shift()
    hotlink_twitter = hotlink_twitter.shift()
    hotlink_blog = hotlink_blog.shift()

    # rename columns
    hotlink_all.columns = pd.Index(
        [e[1] + "_" + e[0].upper() for e in hotlink_all.columns.tolist()])
    hotlink_bbs.columns = pd.Index(
        [e[1] + "_" + e[0].upper() for e in hotlink_bbs.columns.tolist()])
    hotlink_twitter.columns = pd.Index(
        [e[1] + "_" + e[0].upper() for e in hotlink_twitter.columns.tolist()])
    hotlink_blog.columns = pd.Index(
        [e[1] + "_" + e[0].upper() for e in hotlink_blog.columns.tolist()])

    hotlink_all.columns = ['HOTLINK_ALL_' + c for c in hotlink_all.columns]
    hotlink_bbs.columns = ['HOTLINK_BBS_' + c for c in hotlink_bbs.columns]
    hotlink_twitter.columns = [
        'HOTLINK_TWITTER_' + c for c in hotlink_twitter.columns
    ]
    hotlink_blog.columns = ['HOTLINK_BLOG_' + c for c in hotlink_blog.columns]

    # merge
    hotlink = pd.concat(
        [hotlink_all, hotlink_bbs, hotlink_twitter, hotlink_blog], axis=1)

    del hotlink_all, hotlink_bbs, hotlink_twitter, hotlink_blog
    gc.collect()

    return hotlink
Example #37
def train_test(num_rows=None):
    print("Loading datasets...")
    # load datasets
    train_df = pd.read_csv('../input/train.tsv', sep='\t', nrows=num_rows)
    test_df = pd.read_csv('../input/test.tsv', sep='\t', nrows=num_rows)
    print("Train samples: {}, test samples: {}".format(len(train_df),
                                                       len(test_df)))

    # set the test target to NaN
    test_df['visitors'] = np.nan

    # merge
    df = train_df.append(test_df[['datetime', 'park',
                                  'visitors']]).reset_index()

    del train_df, test_df
    gc.collect()

    # convert dates to datetime
    df['datetime'] = pd.to_datetime(df['datetime'])

    # add Japanese public holiday data
    df['japanese_holiday'] = getJapaneseHolidays(df['datetime']).replace(2, 1)

    # create a consecutive-holidays count factor
    holidays = df.groupby('datetime')['japanese_holiday'].mean().replace(2, 1)
    holidays = fillHolidays(holidays).replace(2, 1)  # treat weekdays sandwiched between holidays as holidays
    df['num_holidays'] = df['datetime'].map(getNumHolidays(holidays))

    # add seasonality features
    df['day'] = df['datetime'].dt.day.astype(object)
    df['month'] = df['datetime'].dt.month.astype(object)
    df['weekday'] = df['datetime'].dt.weekday.astype(object)
    df['weekofyear'] = df['datetime'].dt.weekofyear.astype(object)
    #    df['day_month'] = df['day'].astype(str)+'_'+df['month'].astype(str)
    #    df['day_weekday'] = df['day'].astype(str)+'_'+df['weekday'].astype(str)
    #    df['day_weekofyear'] = df['day'].astype(str)+'_'+df['weekofyear'].astype(str)
    df['month_weekday'] = df['month'].astype(str) + '_' + df['weekday'].astype(
        str)
    df['month_weekofyear'] = df['month'].astype(
        str) + '_' + df['weekofyear'].astype(str)
    #    df['weekday_weekofyear'] = df['weekday'].astype(str)+'_'+df['weekofyear'].astype(str)
    df['new_years_day'] = getNewYearsDay(df['datetime'])
    df['golden_week'] = getGoldenWeek(df['datetime'])

    #    df['park_day'] = df['park'].astype(str)+'_'+df['day'].astype(str)
    df['park_month'] = df['park'].astype(str) + '_' + df['month'].astype(str)
    df['park_weekday'] = df['park'].astype(str) + '_' + df['weekday'].astype(
        str)
    df['park_japanese_holiday'] = df['park'].astype(
        str) + '_' + df['japanese_holiday'].astype(str)
    #    df['park_weekofyear'] = df['park'].astype(str)+'_'+df['weekofyear'].astype(str)
    df['park_num_holiday'] = df['park'].astype(
        str) + '_' + df['num_holidays'].astype(str)
    df['park_new_years_day'] = df['park'].astype(
        str) + '_' + df['new_years_day'].astype(str)
    df['park_golden_week'] = df['park'].astype(
        str) + '_' + df['golden_week'].astype(str)

    # encode categorical variables
    df_res, cat_cols = one_hot_encoder(df, nan_as_category=False)

    # for stratification & merging
    df_res['park'] = df['park']
    df_res['weekofyear'] = df['weekofyear'].astype(int)
    df_res['weekday'] = df['weekday'].astype(int)
    df_res['year'] = df['datetime'].dt.year.astype(int)
    df_res['month'] = df['datetime'].dt.month.astype(int)
    df_res['park_month'], _ = pd.factorize(df['park_month'])
    df_res['park_japanese_holiday'], _ = pd.factorize(
        df['park_japanese_holiday'])
    #    df_res['ISESHIMA_summit'] = ((df['park']=='伊勢志摩国立公園')&df['japanese_holiday']&('2016-5-27'>df['datetime'])&(df['datetime']>'2015-6-5')).astype(int) # holiday flag for the period after the 2016 Ise-Shima Summit venue was announced

    return df_res
Example #38
from datetime import datetime

t1=datetime.now()
all_vector=['creativeSize','aid','advertiserId','campaignId','creativeId','adCategoryId','productId','productType','LBS','age','appIdAction','appIdInstall',
'carrier','consumptionAbility','ct','education','gender','house','interest1','interest2','interest3',
'interest4','interest5','kw1','kw2','kw3','os','marriageStatus','topic1','topic2','topic3']
#15
usecols=['uid','label']
feat_List=usecols[:-1]

train_test=pd.read_csv("../data/train_test1_2_data.csv",usecols=usecols)
train_test=train_test.fillna('-1')
train=train_test[train_test.label!=-1]
test=train_test[train_test.label==-1]
del train_test
gc.collect()

print(train.shape)
print(test.shape)

def statis_feat(df,df_val,feature):
    df=df.groupby(feature)["label"].agg(['sum','count']).reset_index()
    new_feat_name=feature+'_stas'
    df.loc[:,new_feat_name]=100*(df['sum']+1+0.0001)/(df['count']+30+0.0001)
    df.loc[:,new_feat_name] = np.round(df.loc[:,new_feat_name].values,4)
    df_stas = df[[feature,new_feat_name]]
    df_val=pd.merge(df_val,df_stas,how='left',on=feature)
    return df_val[['index',new_feat_name]]  # return index and new_feat_name

def Feature(train,predict,feat):
    train['index']=list(range(train.shape[0]))
Example #39
    def test_ret_struct_val(self):
        from rpython.translator.tool.cbuild import ExternalCompilationInfo
        from rpython.translator.platform import platform
        from rpython.tool.udir import udir

        c_file = udir.ensure("test_libffi", dir=1).join("xlib.c")
        c_file.write(py.code.Source('''
        #include "src/precommondefs.h"
        #include <stdlib.h>
        #include <stdio.h>

        struct s2h {
            short x;
            short y;
        };

        RPY_EXPORTED
        struct s2h give(short x, short y) {
            struct s2h out;
            out.x = x;
            out.y = y;
            return out;
        }

        RPY_EXPORTED
        struct s2h perturb(struct s2h inp) {
            inp.x *= 2;
            inp.y *= 3;
            return inp;
        }
        
        '''))
        eci = ExternalCompilationInfo(include_dirs=[cdir])
        lib_name = str(platform.compile([c_file], eci, 'x2', standalone=False))

        lib = CDLL(lib_name)

        size = ffi_type_sshort.c_size*2
        alignment = ffi_type_sshort.c_alignment
        tpe = make_struct_ffitype_e(size, alignment, [ffi_type_sshort]*2)

        give  = lib.getrawpointer('give', [ffi_type_sshort, ffi_type_sshort],
                                  tpe.ffistruct)
        inbuffer = lltype.malloc(rffi.SHORTP.TO, 2, flavor='raw')
        inbuffer[0] = rffi.cast(rffi.SHORT, 40)
        inbuffer[1] = rffi.cast(rffi.SHORT, 72)

        outbuffer = lltype.malloc(rffi.SHORTP.TO, 2, flavor='raw')

        give.call([rffi.cast(rffi.VOIDP, inbuffer),
                   rffi.cast(rffi.VOIDP, rffi.ptradd(inbuffer, 1))],
                   rffi.cast(rffi.VOIDP, outbuffer))

        assert outbuffer[0] == 40
        assert outbuffer[1] == 72

        perturb  = lib.getrawpointer('perturb', [tpe.ffistruct], tpe.ffistruct)

        inbuffer[0] = rffi.cast(rffi.SHORT, 7)
        inbuffer[1] = rffi.cast(rffi.SHORT, 11)

        perturb.call([rffi.cast(rffi.VOIDP, inbuffer)],
                     rffi.cast(rffi.VOIDP, outbuffer))

        assert inbuffer[0] == 7
        assert inbuffer[1] == 11

        assert outbuffer[0] == 14
        assert outbuffer[1] == 33

        lltype.free(outbuffer, flavor='raw')
        lltype.free(inbuffer, flavor='raw')
        del give
        del perturb
        lltype.free(tpe, flavor='raw')
        gc.collect()
        del lib

        assert not ALLOCATED
Example #40
def DO(num_leaves,max_depth, option):
    print('------------------------------------------------')
    print('start...')
    print('fraction:', frac)
    print('prepare predictors, categorical and target...')
    predictors = get_predictors(option)
    categorical = get_categorical(predictors)
    target = TARGET
   

    if debug==0:
        print('=======================================================================')
        print('process on server...')
        print('=======================================================================')
    if debug==1:
        print('=======================================================================')
        print('for testing only...')
        print('=======================================================================')
    if debug==2:
        print('=======================================================================')
        print('for LIGHT TEST only...')
        print('=======================================================================')
        print('reading train')

    subfilename = yearmonthdate_string + '_' + str(len(predictors)) + \
            'features_' + boosting_type + '_cv_newparam2_' + str(int(100*frac)) + \
            'percent_full_%d_%d'%(num_leaves,max_depth) + '_OPTION' + str(option) + '.csv.gz'
    modelfilename = yearmonthdate_string + '_' + str(len(predictors)) + \
            'features_' + boosting_type + '_cv_newparam2_' + str(int(100*frac)) + \
            'percent_full_%d_%d'%(num_leaves,max_depth) + '_OPTION' + str(option)

    print('----------------------------------------------------------')
    print('SUMMARY:')
    print('----------------------------------------------------------')
    print('predictors:',predictors)
    print('target', target)
    print('categorical', categorical)
    print('submission file name:', subfilename)
    print('model file name:', modelfilename)
    print('fraction:', frac)
    print('option:', option)

    print('----------------------------------------------------------')
    train_df = read_processed_h5(TRAIN_HDF5, predictors+target)
    if frac<1:
        train_df = train_df.sample(frac=frac, random_state = SEED)
    print_memory('after reading train:')
    print(train_df.head())
    print("train size: ", len(train_df))
    gc.collect()

    print('----------------------------------------------------------')
    print("Training...")
    start_time = time.time()

    params = {
        'boosting_type': boosting_type,
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.05,
        'num_leaves': num_leaves,  # we should let it be smaller than 2^(max_depth)
        'max_depth': max_depth,  # -1 means no limit
        'min_data_in_leaf': 128,  # Minimum number of data need in a child(min_data_in_leaf)
        # 'max_bin': 512,  # Number of bucketed bin for feature values
        'max_bin': 1024,  # Number of bucketed bin for feature values
        'subsample': 0.5,  # Subsample ratio of the training instance.
        'subsample_freq': 1,  # frequence of subsample, <=0 means no enable
        'feature_fraction': 0.9,  # Subsample ratio of columns when constructing each tree.
        'min_child_weight': 0,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
        'subsample_for_bin': 200000,  # Number of samples for constructing bin
        'min_split_gain': 0,  # minimum gain to perform a split (min_gain_to_split); used for regularization
        'reg_alpha': 10,  # L1 regularization term on weights
        'reg_lambda': 0,  # L2 regularization term on weights
        'nthread': 4,
        'verbose': 0,
        'scale_pos_weight': 200, # because training data is extremely unbalanced
    }
    print ('params:', params)

    print('>> cleaning train...')
    train_df_array = train_df[predictors].values
    train_df_labels = train_df[target].values.astype('int').flatten()
    del train_df; gc.collect()
    print_memory()

    print('>> prepare dataset...')
    dtrain_lgb = lgb.Dataset(train_df_array, label=train_df_labels,
                        feature_name=predictors,
                        categorical_feature=categorical)
    del train_df_array, train_df_labels; gc.collect()                        
    print_memory()                        
    
    print('>> start cv...')


    cv_results  = lgb.cv(params, 
                        dtrain_lgb, 
                        categorical_feature = categorical,
                        num_boost_round=2000,                       
                        metrics='auc',
                        seed = SEED,
                        shuffle = False,
                        stratified=True, 
                        nfold=5, 
                        show_stdv=True,
                        early_stopping_rounds=30, 
                        verbose_eval=True)                     


    print('[{}]: model training time'.format(time.time() - start_time))
    print('Total memory in use after cv training: ', process.memory_info().rss/(2**30), ' GB\n')


    # print (cv_results)
    print('--------------------------------------------------------------------') 
    num_boost_rounds_lgb = len(cv_results['auc-mean'])
    print('num_boost_rounds_lgb=' + str(num_boost_rounds_lgb))

    print('>> start training... ')
    model_lgb = lgb.train(
                        params, dtrain_lgb, 
                        num_boost_round=num_boost_rounds_lgb,
                        feature_name = predictors,
                        categorical_feature = categorical)
    del dtrain_lgb
    gc.collect()

    print('--------------------------------------------------------------------') 
    print('>> save model...')
    # save model to file
    model_lgb.save_model(modelfilename+'.txt')

    print('--------------------------------------------------------------------') 
    print('>> reading test')
    test_df = read_processed_h5(TEST_HDF5,predictors+['click_id'])
    print(test_df.info()); print(test_df.head())
    print_memory()
    print("test size : ", len(test_df))
    sub = pd.DataFrame()
    sub['click_id'] = test_df['click_id'].astype('int')

    print(">> predicting...")
    sub['is_attributed'] = model_lgb.predict(test_df[predictors])
    # if not debug:
    print("writing...")
    sub.to_csv(subfilename,index=False,compression='gzip')
    print("done...")
    return sub
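DO() relies on two helpers that are not shown in this snippet, print_memory and read_processed_h5. A minimal sketch of what they might look like, assuming psutil for the RSS readout and a pandas HDF5 table stored under the key 'data' (both assumptions, not taken from the original code):

import psutil
import pandas as pd

process = psutil.Process()

def print_memory(prefix=''):
    # resident set size in GB, matching the process.memory_info().rss print in DO()
    print(prefix, 'total memory in use:', process.memory_info().rss / (2**30), 'GB')

def read_processed_h5(path, columns):
    # hypothetical: assumes the frame was written in table format under the key 'data'
    return pd.read_hdf(path, key='data', columns=columns)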
Example #41
0
def build_useful_data():
    """
    #TODO 利用pca降维,或者LDA降维......方式构建特征文件
    构建可用的初始特征数据, 默认原始竞赛数据储存在当前文件夹中的datas文件夹中.
    :return: 可用数据(pd.DataFrame实例)
    """

    # Read the protein data
    print("Loading and merging data")
    protein_train = pd.read_csv('datas/df_protein_train.csv')

    protein_test = pd.read_csv('datas/df_protein_test.csv')

    protein_all = pd.concat([protein_train, protein_test])

    # Add the protein sequence length as a feature
    protein_all['seq_len'] = protein_all['Sequence'].apply(len)

    # Read the molecule data
    mol_train = pd.read_csv('datas/df_molecule.csv')

    aff_train = pd.read_csv('datas/df_affinity_train.csv')

    aff_test = pd.read_csv('datas/df_affinity_test_toBePredicted.csv')

    # Initialize the Ki values to be predicted to -11
    aff_test['Ki'] = -11

    aff_all = pd.concat([aff_train, aff_test])

    data = aff_all.merge(mol_train, on="Molecule_ID", how='left')
    data = data.merge(protein_all, on='Protein_ID', how='left')

    # Get the protein IDs
    PID = list(protein_all["Protein_ID"])

    # Word-count features with word_length = 1
    print("Processing wordcount1")
    _, word_counts1 = tfidf_and_wordcounts(protein_all, PID, word_length=1, stride=1)

    # Word-count features with word_length = 2
    print("Processing wordcount2")
    _, word_counts2 = tfidf_and_wordcounts(protein_all, PID, word_length=2, stride=1)

    word_counts1_2 = word_counts1.merge(word_counts2, on="Protein_ID", how="left")
    # Save the feature file for later training
    word_counts1_2.to_csv("datas/1and2_1_421_protein_std.csv", index=False)

    del word_counts1_2, word_counts1, word_counts2

    print("Processing wordcount3")
    _, word_count3 = tfidf_and_wordcounts(protein_all, PID, word_length=3, stride=1)

    word_count3_features = list(word_count3.columns)  # 8000-dimensional data; needs dimensionality reduction
    word_count3_features.remove("Protein_ID")

    # Reduce dimensionality using the standard deviation: drop features whose std is below the 0.42 threshold
    new_word_count3 = reduce_dims_with_std(word_count3, word_count3_features, std_threshold=0.42)
    # Save the feature file for later training
    new_word_count3.to_csv("datas/3_1_661_protein_std0.42.csv", index=False)
    del new_word_count3

    for i in range(len(word_count3_features) // 500):
        # Take 500 features at a time and save them to a feature file for later training
        file = word_count3[["Protein_ID"] + word_count3_features[i * 500:(i + 1) * 500]]
        file_name = "3_1_500_protein_" + str(i)
        file.to_csv("datas/" + file_name + ".csv", index=False)
    del word_count3, word_count3_features

    print("Processing wordcount4")
    gc.collect()
    _, word_count4 = tfidf_and_wordcounts(protein_all, PID, word_length=4, stride=1)

    word_count4_features = list(word_count4.columns)  # 140000+-dimensional data; needs dimensionality reduction
    word_count4_features.remove("Protein_ID")

    # Reduce dimensionality using the standard deviation: drop features whose std is below the 0.16 threshold
    new_word_count4 = reduce_dims_with_std(word_count4, word_count4_features, std_threshold=0.16)
    new_word_count4.to_csv("datas/4_1_679_protein_std0.16.csv", index=False)

    # Reduce dimensionality using the standard deviation: drop features whose std is below the 0.13 threshold
    new_word_count4 = reduce_dims_with_std(word_count4, word_count4_features, std_threshold=0.13)

    word_count4_features = list(new_word_count4.columns)
    word_count4_features.remove("Protein_ID")

    for i in range(len(word_count4_features) // 500):
        # Take 500 features at a time and save them to a feature file for later training
        file = new_word_count4[["Protein_ID"] + word_count4_features[i * 500:(i + 1) * 500]]
        file_name = "4_1_500_protein_" + str(i)
        file.to_csv("datas/" + file_name + ".csv", index=False)

    del new_word_count4, word_count4

    # The features below are protein word-embedding features from the community,
    # courtesy of "小武哥". Our final submission did not use them.
    "=====================================word-embedding features==========================================="
    #feat2 = protein_embedding(protein_all, word_length = 2)
    #data = data.merge(feat2, on="Protein_ID", how="left")
    #del feat2
    #feat3 = protein_embedding(protein_all, word_length = 3)
    #data = data.merge(feat3, on="Protein_ID", how="left")
    #del feat3
    #feat4 = protein_embedding(protein_all, word_length = 4)
    #data = data.merge(feat4, on="Protein_ID", how="left")
    #del feat4
    "================================================================================"

    # Expand the molecule fingerprints into individual columns
    mol_fingerprints = list(mol_train["Fingerprint"].apply(lambda x: list(np.array(x.split(',')).astype(int))))
    mol_fingerprints = pd.DataFrame(mol_fingerprints, columns=["Fingerprint_"+str(i) for i in range(167)])
    mol_fingerprints["Molecule_ID"] = mol_train["Molecule_ID"]

    del PID
    "=================================================================================================="
    data = data.merge(mol_fingerprints, on="Molecule_ID", how='left')
    del mol_fingerprints
    del data["Sequence"], protein_train, protein_test, mol_train

    data.reset_index(drop = True, inplace = True)

    data.to_csv("datas/original_data.csv", index=False)

    del data
    print("Useful data have builded")
Example #42
0
 def test_library_open(self):
     lib = self.get_libc()
     del lib
     gc.collect()
     assert not ALLOCATED
Example #43
0
    def main(cmd_args):
        import optparse
        global options, PSYCO
        usage = "\n%prog [options] command [input-file-patterns]\n" + cmd_doc
        oparser = optparse.OptionParser(usage)
        oparser.add_option(
            "-l", "--logfilename",
            default="",
            help="contains error messages")
        oparser.add_option(
            "-v", "--verbosity",
            type="int", default=0,
            help="level of information and diagnostics provided")
        oparser.add_option(
            "-m", "--mmap",
            type="int", default=-1,
            help="1: use mmap; 0: don't use mmap; -1: accept heuristic")
        oparser.add_option(
            "-e", "--encoding",
            default="",
            help="encoding override")
        oparser.add_option(
            "-f", "--formatting",
            type="int", default=0,
            help="0 (default): no fmt info\n"
                 "1: fmt info (all cells)\n",
        )
        oparser.add_option(
            "-g", "--gc",
            type="int", default=0,
            help="0: auto gc enabled; 1: auto gc disabled, manual collect after each file; 2: no gc")
        oparser.add_option(
            "-s", "--onesheet",
            default="",
            help="restrict output to this sheet (name or index)")
        oparser.add_option(
            "-u", "--unnumbered",
            action="store_true", default=0,
            help="omit line numbers or offsets in biff_dump")
        oparser.add_option(
            "-d", "--on-demand",
            action="store_true", default=0,
            help="load sheets on demand instead of all at once")
        oparser.add_option(
            "-t", "--suppress-timing",
            action="store_true", default=0,
            help="don't print timings (diffs are less messy)")
        oparser.add_option(
            "-r", "--ragged-rows",
            action="store_true", default=0,
            help="open_workbook(..., ragged_rows=True)")
        options, args = oparser.parse_args(cmd_args)
        if len(args) == 1 and args[0] in ("version", ):
            pass
        elif len(args) < 2:
            oparser.error("Expected at least 2 args, found %d" % len(args))
        cmd = args[0]
        xlrd_version = getattr(xlrd, "__VERSION__", "unknown; before 0.5")
        if cmd == 'biff_dump':
            xlrd.dump(args[1], unnumbered=options.unnumbered)
            sys.exit(0)
        if cmd == 'biff_count':
            xlrd.count_records(args[1])
            sys.exit(0)
        if cmd == 'version':
            print("xlrd: %s, from %s" % (xlrd_version, xlrd.__file__))
            print("Python:", sys.version)
            sys.exit(0)
        if options.logfilename:
            logfile = LogHandler(open(options.logfilename, 'w'))
        else:
            logfile = sys.stdout
        mmap_opt = options.mmap
        mmap_arg = xlrd.USE_MMAP
        if mmap_opt in (1, 0):
            mmap_arg = mmap_opt
        elif mmap_opt != -1:
            print('Unexpected value (%r) for mmap option -- assuming default' % mmap_opt)
        fmt_opt = options.formatting | (cmd in ('xfc', ))
        gc_mode = options.gc
        if gc_mode:
            gc.disable()
        for pattern in args[1:]:
            for fname in glob.glob(pattern):
                print("\n=== File: %s ===" % fname)
                if logfile != sys.stdout:
                    logfile.setfileheading("\n=== File: %s ===\n" % fname)
                if gc_mode == 1:
                    n_unreachable = gc.collect()
                    if n_unreachable:
                        print("GC before open:", n_unreachable, "unreachable objects")
                if PSYCO:
                    import psyco
                    psyco.full()
                    PSYCO = 0
                try:
                    t0 = time.time()
                    bk = xlrd.open_workbook(
                        fname,
                        verbosity=options.verbosity, logfile=logfile,
                        use_mmap=mmap_arg,
                        encoding_override=options.encoding,
                        formatting_info=fmt_opt,
                        on_demand=options.on_demand,
                        ragged_rows=options.ragged_rows,
                    )
                    t1 = time.time()
                    if not options.suppress_timing:
                        print("Open took %.2f seconds" % (t1-t0,))
                except xlrd.XLRDError as e:
                    print("*** Open failed: %s: %s" % (type(e).__name__, e))
                    continue
                except KeyboardInterrupt:
                    print("*** KeyboardInterrupt ***")
                    traceback.print_exc(file=sys.stdout)
                    sys.exit(1)
                except BaseException as e:
                    print("*** Open failed: %s: %s" % (type(e).__name__, e))
                    traceback.print_exc(file=sys.stdout)
                    continue
                t0 = time.time()
                if cmd == 'hdr':
                    bk_header(bk)
                elif cmd == 'ov': # OverView
                    show(bk, 0)
                elif cmd == 'show': # all rows
                    show(bk)
                elif cmd == '2rows': # first row and last row
                    show(bk, 2)
                elif cmd == '3rows': # first row, 2nd row and last row
                    show(bk, 3)
                elif cmd == 'bench':
                    show(bk, printit=0)
                elif cmd == 'fonts':
                    bk_header(bk)
                    show_fonts(bk)
                elif cmd == 'names': # named reference list
                    show_names(bk)
                elif cmd == 'name_dump': # named reference list
                    show_names(bk, dump=1)
                elif cmd == 'labels':
                    show_labels(bk)
                elif cmd == 'xfc':
                    count_xfs(bk)
                else:
                    print("*** Unknown command <%s>" % cmd)
                    sys.exit(1)
                del bk
                if gc_mode == 1:
                    n_unreachable = gc.collect()
                    if n_unreachable:
                        print("GC post cmd:", fname, "->", n_unreachable, "unreachable objects")
                if not options.suppress_timing:
                    t1 = time.time()
                    print("\ncommand took %.2f seconds\n" % (t1-t0,))

        return None
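The -g flag above selects one of three GC strategies (0: automatic GC left on; 1: automatic GC disabled with a manual gc.collect() after each file; 2: no collection at all). Two hypothetical invocations of main(), with illustrative file names:

main(['-g', '1', 'bench', 'tests/*.xls'])    # manual gc.collect() after each workbook
main(['-g', '2', 'ov', 'big_workbook.xls'])  # GC fully disabled while opening the file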
Example #44
0
 def collectGarbage(self):
     """
     Run a garbage collection pass.
     """
     gc.collect()
    def evaluate(self, loader, epoch):

        self.model.eval()
        user_id_list, true_y, pred_y = [], [], []
        loss_all, num_batch = 0., 0.
        with torch.no_grad():
            for index, datum_tuple in enumerate(loader):
                creative_id, ad_id, product_id, advertiser_id, industry, product_category, time, user_id, y_label = datum_tuple


                advertiser_id, product_id, product_category, industry, time = advertiser_id.to(device,non_blocking=True),\
                                                                              product_id.to(device,non_blocking=True), \
                                                                              product_category.to(device,non_blocking=True), \
                                                                              industry.to(device,non_blocking=True),\
                                                                              time.to(device,non_blocking=True)

                # Get the vectors extracted by the embedding layers
                inputlist_tensor = [
                    creative_id, ad_id, advertiser_id, product_id,
                    product_category, industry, time
                ]
                emb_layer_mat = []
                for index, input_col in enumerate(inputlist_tensor):
                    emb_layer_col_mat = {}
                    for j in range(len(self.emb_layer[index])):
                        if index in [2, 3, 4, 5, 6]:
                            self.emb_layer[index][j] = self.emb_layer[index][
                                j].to(device, non_blocking=True)
                        emb_layer_col_mat[j] = self.emb_layer[index][j](
                            input_col)
                        emb_layer_col_mat[j] = emb_layer_col_mat[j].to(
                            device, non_blocking=True)
                    emb_layer_mat.append(emb_layer_col_mat)

                output = self.model(emb_layer_mat)
                y_label = y_label.to(device, non_blocking=True)

                y_label = y_label.long()

                loss = self.loss_func(output, y_label)
                loss_all += loss.item()
                num_batch += 1

                pred_y.extend(list(output.cpu().detach().numpy()))
                true_y.extend(list(y_label.cpu().detach().numpy()))
                user_id_list.extend(list(user_id.numpy()))

                del creative_id, ad_id, product_id, advertiser_id, industry, product_category, time, y_label
                _ = gc.collect()

        pred = np.argmax(np.array(pred_y), 1)
        true = np.array(true_y).reshape((-1, ))
        acc_score = accuracy_score(true, pred)

        loss_valid = loss_all / num_batch

        output_data = DataFrame({'user_id': user_id_list, 'pred': pred_y})

        if acc_score > 0.48:

            if not os.path.isdir(
                    '../../oof/bk_oof/Multi_Head_ResNext_4seeds_all'):
                os.mkdir('../../oof/bk_oof/Multi_Head_ResNext_4seeds_all')

            pickle.dump(
                output_data,
                open(
                    '../../oof/bk_oof/Multi_Head_ResNext_4seeds_all/seed_{}_val_{}_folds_{}.pkl'
                    .format(self.seed, epoch, self.folds), 'wb'))

        del pred, true, pred_y, true_y
        _ = gc.collect()

        return acc_score, loss_valid
def buildADS1115Graph(password, myGraphSampleCount, graphNumber):
    print('buildADS1115Graph%d - The time is: %s' % (graphNumber, datetime.now()))

    # open database
    con1 = mdb.connect('localhost', 'root', password, 'DataLogger')
    # now we have to get the data, stuff it in the graph

    mycursor = con1.cursor()

    print myGraphSampleCount
    query = '(SELECT timestamp, deviceid, channel0_voltage, channel0_raw, channel1_voltage, channel1_raw, channel2_voltage, channel2_raw, channel3_voltage, channel3_raw, id FROM '+ADS1115tableName+' ORDER BY id DESC LIMIT '+ str(myGraphSampleCount) + ') ORDER BY id ASC'

    print "query=", query
    try:
        mycursor.execute(query)
        result = mycursor.fetchall()
    except:
        e = sys.exc_info()[0]
        print "Error: %s" % e

    print result[0]
    t = []   # time
    u = []   # channel 1 - Current
    averageCurrent = 0.0
    currentCount = 0
    for record in result:
        t.append(record[0])

        # adjust according to graphNumber
        if (graphNumber == 0):
            addValue = record[graphNumber*2+3]

        if (graphNumber == 1):
            # O2 Sensor
            sensorVoltage = record[graphNumber*2+2]*(5.0/6.144)
            AMP = 121
            K_O2 = 7.43
            sensorVoltage = sensorVoltage/AMP*10000.0
            Value_O2 = sensorVoltage/K_O2
            addValue = Value_O2 - 1.05

        if (graphNumber == 2):
            addValue = record[graphNumber*2+2]

        if (graphNumber == 3):
            addValue = record[graphNumber*2+2]

        u.append(addValue)

        averageCurrent = averageCurrent + addValue
        currentCount = currentCount + 1

    averageCurrent = averageCurrent/currentCount

    print ("count of t=", len(t))
    x1 = [datetime.strptime(d, '%Y-%m-%d %H:%M:%S',) for d in t]

    fds = dates.date2num(x1) # converted
    # matplotlib date format object
    hfmt = dates.DateFormatter('%H:%M:%S')
    #hfmt = dates.DateFormatter('%m/%d-%H')

    fig = pyplot.figure()
    fig.set_facecolor('white')
    ax = fig.add_subplot(111, axisbg = 'white')
    ax.vlines(fds, -200.0, 1000.0, colors='w')

    #ax.xaxis.set_major_locator(dates.MinuteLocator(interval=1))
    ax.xaxis.set_major_formatter(hfmt)
    if (graphNumber == 0):
        ax.set_ylim(bottom = 0.0)
        pyplot.xticks(rotation='45')
        pyplot.subplots_adjust(bottom=.3)
        pylab.plot(fds, u, color='r', label="Air Quality Sensor", linestyle="-", marker=".")

    if (graphNumber == 1):
        ax.set_ylim(bottom = 0.0)
        pyplot.xticks(rotation='45')
        pyplot.subplots_adjust(bottom=.3)
        pylab.plot(fds, u, color='r', label="Oxygen (O2) Sensor ", linestyle="-", marker=".")

    if (graphNumber == 2):
        ax.set_ylim(bottom = 0.0)
        pyplot.xticks(rotation='45')
        pyplot.subplots_adjust(bottom=.3)
        pylab.plot(fds, u, color='r', label="Light Sensor", linestyle="-", marker=".")

    if (graphNumber == 3):
        ax.set_ylim(bottom = -200.0)
        pyplot.xticks(rotation='45')
        pyplot.subplots_adjust(bottom=.3)
        pylab.plot(fds, u, color='r', label="Voltage Divider ", linestyle="-", marker=".")

    pylab.xlabel("Seconds")

    pylab.legend(loc='lower center')
    if (graphNumber == 0):
        pylab.axis([min(fds), max(fds), 0, max(u)+1000])
        pylab.ylabel("Raw Data")

    if (graphNumber == 1):
        pylab.axis([min(fds), max(fds), 0, max(u)+2])
        pylab.ylabel("Percent (%)")

    if (graphNumber == 2):
        pylab.axis([min(fds), max(fds), 0, max(u)+2])
        pylab.ylabel("Voltage (V)")

    if (graphNumber == 3):
        pylab.axis([min(fds), max(fds), 0, max(u)+2])
        pylab.ylabel("Voltage Divider (V)")

    if (graphNumber == 0):
        pylab.figtext(.5, .05, ("Average Air Quality %6.2f\n%s") % (averageCurrent, datetime.now()), fontsize=18, ha='center')

    if (graphNumber == 1):
        pylab.figtext(.5, .05, ("Average O2 %6.2f %%\n%s") % (averageCurrent, datetime.now()), fontsize=18, ha='center')

    if (graphNumber == 2):
        pylab.figtext(.5, .05, ("Average Light Sensor %6.2f V\n%s") % (averageCurrent, datetime.now()), fontsize=18, ha='center')

    if (graphNumber == 3):
        pylab.figtext(.5, .05, ("Average Voltage Divider %6.2f V\n%s") % (averageCurrent, datetime.now()), fontsize=18, ha='center')

    pylab.grid(True)

    pyplot.show()
    pyplot.savefig("/var/www/html/ADS1115DataLoggerGraph"+str(graphNumber)+".png", facecolor=fig.get_facecolor())

    mycursor.close()
    con1.close()

    fig.clf()
    pyplot.close()
    pylab.close()
    gc.collect()
    print "------ADS1115Graph"+str(graphNumber)+" finished now"
    # Build emb_layer (the per-column embedding layers)
    emb_layer = []
    for index, col in enumerate(inputlist):
        emb_layer_col = {}
        for indexj, matrixi in enumerate(emb_matrix_dict[col]):
            emb_layer_col[indexj] = nn.Embedding.from_pretrained(
                torch.from_numpy(matrixi))
            if col in train_able_dict:
                emb_layer_col[indexj].weight.requires_grad = False
            else:
                emb_layer_col[indexj].weight.requires_grad = True

        emb_layer.append(emb_layer_col)

    del id_list_dict, emb_matrix_dict
    _ = gc.collect()

    # Use 4 different seeds and run 5 folds for each seed
    for seed in [34, 2020, 1111, 200]:
        for folds in range(5):
            print('This is fold: ', folds)
            data = pickle.load(
                open('../../cached_data/input_data_20class.pkl', 'rb'))  # read the data
            train_idx = list(
                np.load(
                    '../../cached_data/5folds_4seeds_index/seed_{}_train_index_fold_{}.npy'
                    .format(seed, folds)))
            val_idx = list(
                np.load(
                    '../../cached_data/5folds_4seeds_index/seed_{}_val_index_fold_{}.npy'
                    .format(seed, folds)))
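For reference, a minimal standalone sketch (with made-up sizes) of the nn.Embedding.from_pretrained pattern used to build emb_layer above:

import numpy as np
import torch
import torch.nn as nn

pretrained = np.random.rand(1000, 64).astype('float32')           # vocab_size x embedding_dim, made up
emb = nn.Embedding.from_pretrained(torch.from_numpy(pretrained))  # frozen by default (freeze=True)
emb.weight.requires_grad = False   # the code above toggles this per column via train_able_dict
vectors = emb(torch.tensor([3, 7]))                               # lookup -> tensor of shape (2, 64)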
def set_model_ma10(uid, force_full_update, connection):
    """ xxx """
    ret = 0
    ########################################################################
    # (2.1) Define the names of the columns used by this model
    ########################################################################
    model_tp_column = 'price_instruments_data.ma10_tp'
    model_score_column = 'instruments.score_ma10'
    #-----------------------------------------------------------------------

    day_to_process = 370
    score = 0

    if force_full_update:
        sql_selection = "SELECT price_instruments_data.symbol, "+\
        "price_instruments_data.date, price_instruments_data.price_close, " +\
        str(model_tp_column) + " FROM price_instruments_data "+\
        "JOIN symbol_list ON symbol_list.symbol = price_instruments_data.symbol "+\
        "WHERE symbol_list.uid = "+ str(uid) +" ORDER BY date DESC LIMIT "+\
        str(day_to_process)
    else:
        sql_selection = "SELECT price_instruments_data.symbol, "+\
        "price_instruments_data.date, price_instruments_data.price_close, " +\
        str(model_tp_column) + " FROM price_instruments_data "+\
        "JOIN symbol_list ON symbol_list.symbol = price_instruments_data.symbol "+\
        "WHERE symbol_list.uid = "+ str(uid) +\
        " AND price_instruments_data.is_ta_calc = 0 ORDER BY date DESC"

    cursor = connection.cursor(pymysql.cursors.SSCursor)
    sql = sql_selection
    cursor.execute(sql)
    res = cursor.fetchall()
    symbol = ''
    for row in res:
        symbol = row[0]
        last_date = row[1].strftime('%Y%m%d')
        last_price = row[2]
        model_tp = row[3]

        cr_c = connection.cursor(pymysql.cursors.SSCursor)
        sql_c = "SELECT " + str(model_tp_column) +\
        ", price_instruments_data.price_close "+\
        "FROM price_instruments_data JOIN symbol_list "+\
        "ON symbol_list.symbol = price_instruments_data.symbol "+\
        "WHERE symbol_list.uid = "+ str(uid) +" AND date = DATE_SUB("+\
        str(last_date) +", INTERVAL 7 DAY)"
        cr_c.execute(sql_c)
        rs_c = cr_c.fetchall()
        model_prediction_tp = 0
        previous_price = 0
        for row in rs_c:
            model_prediction_tp = row[0]
            previous_price = row[1]
        cr_c.close()


        if model_prediction_tp != 0 and previous_price != 0:
            type_of_trade = ''
            if previous_price <= model_prediction_tp:
                type_of_trade = 'b'
            if previous_price > model_prediction_tp:
                type_of_trade = 's'
            if (previous_price >= last_price) and (type_of_trade == 'b'):
                if score > 0:
                    score = score - 0.01
            if (previous_price >= last_price) and (type_of_trade == 's'):
                score = score + 0.01
            if (previous_price < last_price) and (type_of_trade == 'b'):
                score = score + 0.01
            if (previous_price < last_price) and (type_of_trade == 's'):
                if score > 0:
                    score = score - 0.01
            debug("### score calc "+ str(model_score_column) +\
                  ": current score = " + str(score))

        if model_tp == 0:
            ########################################################################
            # (3) Call the function that calculates the model target price
            ########################################################################
            last_model_tp = get_model_price_ma10(uid, last_date, connection)
            cr_u = connection.cursor(pymysql.cursors.SSCursor)
            sql_u = "UPDATE price_instruments_data SET " +\
            str(model_tp_column) + " = " + str(last_model_tp) +\
            " WHERE symbol = '"+ str(symbol) +"' AND date = " + str(last_date)
            cr_u.execute(sql_u)
            connection.commit()
            ret = last_model_tp
            cr_u.close()
        gc.collect()
    model_score = 0
    if not force_full_update:
        sql = "SELECT "+ str(model_score_column) +\
        " FROM instruments WHERE symbol = '"+ str(symbol) +"'"
        cursor.execute(sql)
        res = cursor.fetchall()
        for row in res:
            model_score = row[0]
    debug("### Total score calc "+ str(model_score_column) +": " +\
          str(model_score) + " + " + str(score))
    model_score = round(model_score + score, 2)
    debug("### Total score "+ str(model_score_column) +": " + str(model_score))

    sql = "UPDATE instruments SET " + str(model_score_column) +\
    " = " + str(model_score) + " WHERE symbol = '"+ str(symbol) +"'"
    cursor.execute(sql)
    connection.commit()
    cursor.close()
    gc.collect()
    return ret
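The score bookkeeping inside the loop above can be read as a small pure function. A hedged restatement as a hypothetical helper, using the same rules as the snippet (a correct direction call adds 0.01, a wrong one subtracts 0.01 but never takes the score below zero):

def score_delta(previous_price, last_price, model_prediction_tp, score):
    # 'b' if the model target price was at or above the week-ago price, else 's'
    type_of_trade = 'b' if previous_price <= model_prediction_tp else 's'
    price_went_up = previous_price < last_price
    correct = (price_went_up and type_of_trade == 'b') or \
              (not price_went_up and type_of_trade == 's')
    if correct:
        return score + 0.01
    return score - 0.01 if score > 0 else score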
Example #49
0
 def tearDown(self):
     ImpalaE2E.tearDown(self)
     gc.collect()
    def train(self):

        iter_wrapper = lambda x: tqdm(x, total=len(self.train_data))
        start_epoch = -1
        best_valid = 0.
        min_lr = 1e-7

        if self.is_resume:
            print('Let Continue!')
            checkpoint = torch.load(PATH_CHECKPOINT)  # load the checkpoint

            self.model.load_state_dict(checkpoint['model_state_dict'])

            self.optim.load_state_dict(checkpoint['optimizer_state_dict'])
            start_epoch = checkpoint['epoch']
            best_valid = checkpoint['best_valid']

        for epoch in range(start_epoch + 1, EPOCHS):

            print('=========================')
            print('Processing Epoch {}'.format(epoch))
            print('=========================')

            loss_per_epoch, train_n_batch = 0., 0.

            for index, data in iter_wrapper(enumerate(self.train_data)):

                creative_id, ad_id, product_id, advertiser_id, industry, product_category, time, user_id, y_label = data


                advertiser_id, product_id, product_category, industry, time = advertiser_id.to(device,non_blocking=True),\
                                                                              product_id.to(device,non_blocking=True), \
                                                                              product_category.to(device,non_blocking=True), \
                                                                              industry.to(device,non_blocking=True),\
                                                                              time.to(device,non_blocking=True)

                self.model.train()
                self.optim.zero_grad()

                # Get the vectors extracted by the embedding layers
                inputlist_tensor = [
                    creative_id, ad_id, advertiser_id, product_id,
                    product_category, industry, time
                ]
                emb_layer_mat = []
                for index, input_col in enumerate(inputlist_tensor):
                    emb_layer_col_mat = {}
                    for j in range(len(self.emb_layer[index])):
                        if index in [2, 3, 4, 5, 6]:
                            self.emb_layer[index][j] = self.emb_layer[index][
                                j].to(device, non_blocking=True)
                        emb_layer_col_mat[j] = self.emb_layer[index][j](
                            input_col)
                        emb_layer_col_mat[j] = emb_layer_col_mat[j].to(
                            device, non_blocking=True)
                    emb_layer_mat.append(emb_layer_col_mat)

                output = self.model(emb_layer_mat)
                y_label = y_label.to(device, non_blocking=True)

                y_label = y_label.long()

                loss = self.loss_func(output, y_label)

                loss_per_epoch += loss.item()
                train_n_batch += 1

                loss.backward()

                nn.utils.clip_grad_norm_(self.model.parameters(), 10.)  # gradient clipping

                self.optim.step()

                del creative_id, ad_id, product_id, advertiser_id, industry, product_category, time, y_label
                _ = gc.collect()

            if self.val_data is not None:  # Do Validation

                valid_score, valid_loss = self.evaluate(self.val_data, epoch)
                print('evaluate done!')
                if valid_score > 0.48:
                    self.test(self.test_data, epoch)

                if valid_score > best_valid:
                    best_valid = valid_score

            self.scheduler_ReduceLROnPlateauLR.step(valid_score)

            if self.optim.param_groups[0]['lr'] < min_lr:
                print("stopping")
                break

            torch.cuda.empty_cache()
matrix['item_first_sale'] = matrix['date_block_num'] - matrix.groupby('item_id')['date_block_num'].transform('min')

matrix = matrix[matrix.date_block_num > 11]

def fill_na(df):
    for col in df.columns:
        if ('_lag_' in col) & (df[col].isnull().any()):
            if ('item_cnt' in col):
                df[col].fillna(0, inplace=True)         
    return df

matrix = fill_na(matrix)

matrix.to_pickle('data.pkl')

gc.collect();

data = pd.read_pickle('data.pkl')

data = data[[
    'date_block_num',
    'shop_id',
    'item_id',
    'item_cnt_month',
    'city_code',
    'item_category_id',
    'main_cate_id',
    'sub_cate_id',
    'item_cnt_month_lag_1',
    'item_cnt_month_lag_2',
    'item_cnt_month_lag_3',
 def write(i, batch):
     this_batch = helper.reshape_batch(batch, new_size, dim)
     with open(to+"/batch{}.pickle".format(str(i)), "wb") as f:
         pickle.dump(this_batch, f)
     del this_batch
     gc.collect()
Example #53
0
 def tearDown(self):
     gc.collect()
     # This will only contain uncollectable garbage, i.e. reference cycles
     # involving objects with __del__ defined.
     self.assertEmpty(gc.garbage)
     super().tearDown()
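The comment in this tearDown leans on a collector detail: since Python 3.4 (PEP 442) reference cycles whose objects define __del__ are collected like any others, so gc.garbage only fills up for genuinely uncollectable objects or when gc.set_debug(gc.DEBUG_SAVEALL) is active. A small illustration:

import gc

class Node:
    def __del__(self):
        pass

a, b = Node(), Node()
a.other, b.other = b, a      # reference cycle involving __del__
del a, b
gc.collect()
print(gc.garbage)            # [] on Python 3.4+; such cycles were uncollectable only on Python 2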
Example #54
0
def main(mod1, mod2, epochs, learning_rate, l2s, batch_size, network='MAESurv'):
    if mod2 != 'None':
        MAE_params = pickle.load(io.open(STATE_FOLDER+mod1+'+'+mod2+'.dict', 'rb'))        
    else:
        MAE_params = pickle.load(io.open(STATE_FOLDER+mod1+'.dict', 'rb'))        
    d_dims = MAE_params['D dims']
    hidden_dims = MAE_params['Hidden dims']
    x1, x2, index = read_data(DATA_FOLDER, mod1, mod2, suffix=DATA_SUFFIX)
    with open(DATA_FOLDER+'train'+INDEX_SET+'.pickle','rb') as f:
        train_index = pickle.load(f)
    with open(DATA_FOLDER+'test'+INDEX_SET+'.pickle','rb') as f:
        test_index = pickle.load(f)
    x1_train = x1.loc[train_index].to_numpy()#x1_train = torch.from_numpy(x1.loc[train_index].to_numpy()).float().to(device)
    in_dims = [x1_train.shape[1]]
    x1_test = x1.loc[test_index].to_numpy()#x1_test = torch.from_numpy(x1.loc[test_index].to_numpy()).float().to(device)
    x1_train = torch.from_numpy(x1_train).float().to(device)
    x1_test = torch.from_numpy(x1_test).float().to(device)
    
    if x2 is not None:
        x2_train = x2.loc[train_index].to_numpy()
        x2_test = x2.loc[test_index].to_numpy()
        in_dims.append(x2_train.shape[1])
  
        x2_train = torch.from_numpy(x2_train).float().to(device)
        x2_test = torch.from_numpy(x2_test).float().to(device)
    else:
        x2_train = None
        in_features_two = None
        x2_test = None
    with open(DATA_FOLDER + 'survival_TT.pickle', 'rb') as f:
        survival=pickle.load(f)
    get_target = lambda df: (df['OS.time'].values, df['OS'].values)
    y1_train, y2_train = get_target(survival.loc[train_index])
    y1_test, y2_test = get_target(survival.loc[test_index])    
    y1_train = torch.from_numpy(y1_train).to(device)
    y2_train = torch.from_numpy(y2_train).to(device)
    y1_test = torch.from_numpy(y1_test).to(device)
    y2_test = torch.from_numpy(y2_test).to(device)

    for epoch in epochs:
        for do in [0,0.1,0.2]:
            for l2 in l2s:
                for lr in learning_rate:
                    for bs in batch_size:
                        for ns in [16,32,64,100]:
                            hyperparameters = {'Epoch': epoch,
                            'Dropout': do,
                            'L2 reg': l2,
                            'State file path': STATE_FOLDER,
                            'Learning rate': lr, #tt.optim.Adam
                            'batch_size': bs,
                            'Input dimension': in_dims,
                            'Embedding dimension': d_dims,
                            'Latent size': hidden_dims,
                            'Neuron size': ns
                            }
                            
                            
                            stdOrigin = sys.stdout
                            PATH = mod1+" "+str(mod2)+"_"+str(epoch)+"_"+str(do)+"_"+str(lr)+"_"+str(l2)+"_"+str(bs)+"_"+str(ns)
                            sys.stdout = open(os.path.join(LOG_FOLDER, "MAESurv_"+PATH+".out"), "w")
                            print(hyperparameters)
                           
                            CV(mod1, mod2, x1_train, x2_train, y1_train, y2_train, 5, hyperparameters)
                            
                            p_time = time.time()
                            model, log = MAESurv_pipeline(mod1, mod2, x1_train, x2_train, y1_train, y2_train, hyperparameters, True, PATH)
                            a_time = time.time()
                            print(f'Training time: {a_time-p_time}')
                             
                            Cindex = MAESurv_evaluate(model, x1_test, x2_test, y1_test, y2_test)
                            print(f"Test C-index: {Cindex}")
                            
                            gc.collect()
                            sys.stdout.close()
                            sys.stdout = stdOrigin
Example #55
0
    def display(self):
        try:
            from bsddb3.db import DBError
        except:

            class DBError(Exception):
                """
                Dummy.
                """

        self.parent = self.top.get_toplevel()
        progress = ProgressMeter(_('Updating display...'),
                                 '',
                                 parent=self.parent,
                                 can_cancel=True)
        self.model.clear()
        self.junk = []
        gc.collect(2)
        self.junk = gc.garbage
        self.label.set_text(_('Uncollected Objects: %s') % str(len(self.junk)))
        progress.set_pass(_('Updating display...'), len(self.junk))
        for count in range(0, len(self.junk)):
            if progress.step():
                break
            try:
                refs = []
                referrers = gc.get_referrers(self.junk[count])
                for referrer in referrers:
                    try:
                        if referrer is not self.junk:
                            for indx in range(0, len(self.junk)):
                                if referrer is self.junk[indx]:
                                    refs.append(str(indx) + ' ')
                                    break
                    except:
                        print(sys.exc_info())
                if len(refs) > 3:
                    ref = ' '.join(refs[0:2]) + "..."
                else:
                    ref = ' '.join(refs)
                try:
                    self.model.append((count, ref, str(self.junk[count])))
                except DBError:
                    self.model.append(
                        (count, ref,
                         'db.DB instance at %s' % id(self.junk[count])))
                except ReferenceError:
                    self.model.append(
                        (count, ref,
                         'weakly-referenced object no longer exists %s' %
                         type(self.junk[count])))
                except TypeError:
                    self.model.append(
                        (count, ref, 'Object cannot be displayed %s' %
                         type(self.junk[count])))
                except:
                    print(sys.exc_info())
            except ReferenceError:
                InfoDialog(_('Reference Error'),
                           "Refresh to correct",
                           parent=self.parent)
        progress.close()
Example #56
0
def main():  # pylint: disable=too-many-statements,too-many-branches,too-many-locals
    """main routine"""

    if int(platform.python_version().split('.')[0]) < 3:
        LOGGER.fatal("%s needs at least python version 3, currently %s",
                     ME, platform.python_version())
        sys.exit(1)

    start_time = int(time.time())
    _parser = ArgumentParser()
    _parser.add_argument("-c", "--cfile", dest="configfile", default=ME+".cfg",
                         help="Configuration file", metavar="FILE", required=True)
    _parser.add_argument("-v", "--verbosity", action="count", default=0,
                         help="increase output verbosity overriding the default")
    _parser.add_argument("-p", "--parameter", action="store",
                         help="show parameter from configfile")
    _args = _parser.parse_args()

    set_logfile(LOGGER, _args.configfile+".log")

    _config = get_config(_args.configfile, ME)

    if _args.parameter:
        if _args.parameter == 'password':
            print('parameter {}: {}\n'.format(_args.parameter,
                                              decrypted(_config[_args.parameter+'_enc'])))
        else:
            print('parameter {}: {}\n'.format(
                _args.parameter, _config[_args.parameter]))
        sys.exit(0)

    if _args.verbosity:
        newLevel = logging.getLogger().getEffectiveLevel() - (_args.verbosity*10)

        if newLevel < 0:
            newLevel = 0
        LOGGER.warning("Changing loglevel from %d to %d", logging.getLogger().getEffectiveLevel(),
                       newLevel)
        logging.getLogger().setLevel(newLevel)
    LOGGER.debug("log level %d", logging.getLogger().getEffectiveLevel())
    LOGGER.warning("start python-%s %s-%s pid=%s Connecting ...\n",
                   platform.python_version(), ME, VERSION, os.getpid()
                   )

    if _config['password']:
        LOGGER.warning(
            "first encrypted the plaintext password and removed from config\n")
    # we need the password ....
    _config['password'] = decrypted(_config['password_enc'])

# add a few seconds extra to allow the driver timeout handling to do its job.
# for example, cx_oracle has a cancel routine that we call after a timeout. If
# there is a network problem, the cancel gets an ORA-12152: TNS:unable to send break message.
# setting this defaulttimeout should speed this up
    socket.setdefaulttimeout(_config['sqltimeout']+3)

    LOGGER.warning("%s found db_type=%s, driver %s; checking for driver\n",
                   ME,
                   _config['db_type'], _config['db_driver'])

    if not os.path.exists(
            os.path.join(_config['checks_dir'], _config['db_type'])):
        raise ValueError("db_type "+_config['db_type'] +
                         " does not exist in the "+_config['checks_dir']+" directory")
    db_driver = load_driver(_config)
    driver_errors = load_driver_errors(_config)
    db_connections = load_db_connections(_config)
    LOGGER.info(db_connections)
    LOGGER.info(driver_errors)

    LOGGER.info("hostname in zabbix: %s", _config['hostname'])
    #  hide password, hoping username != password ;-)
    LOGGER.info("connect string    : %s\n",
                db_connections.connect_string(_config).replace(_config['password'],
                                                               '******'))
    LOGGER.info('using sql_timeout : %ds\n', _config['sqltimeout'])
    LOGGER.info("out_file          : %s\n", _config['out_file'])

    if _config['site_checks']:
        LOGGER.info("site_checks       : %s\n", _config['site_checks'])

    if LOG_CONF:
        sys_files = 4
    else:
        sys_files = 3
    check_files = [{'name': __file__, 'lmod': os.path.getmtime(__file__)},
                   {'name': db_connections.__file__,
                    'lmod': os.path.getmtime(db_connections.__file__)},
                   {'name': driver_errors.__file__,
                    'lmod': os.path.getmtime(driver_errors.__file__)},
                   ]

    if LOG_CONF:
        check_files.append(
            {'name': LOG_CONF, 'lmod': os.path.getmtime(LOG_CONF)})

    for i in range(sys_files):
        to_outfile(_config,
                   "{}[checks,{},name]".format(ME, i), check_files[i]['name'])
        to_outfile(_config,
                   "{}[checks,{},lmod]".format(ME, i), int(check_files[i]['lmod']))

    conn_counter = 0
    conn_errors = 0
    query_counter = 0
    query_errors = 0

    sleep_c = 0
    sleep_s = 1
    prev_err = 0

    while True:
        try:
            for i in range(sys_files):
                if check_files[i]['lmod'] != os.stat(check_files[i]['name']).st_mtime:
                    LOGGER.warning("%s Changed, from %s to %s restarting ..\n",
                                   check_files[i]['name'],
                                   time.ctime(check_files[i]['lmod']),
                                   time.ctime(os.path.getmtime(check_files[i]['name'])))
                    os.execv(__file__, sys.argv)

            # reset list in case of a just new connection that reloads the config
            check_files = [{'name': __file__, 'lmod': os.path.getmtime(__file__)},
                           {'name': db_connections.__file__,
                            'lmod': os.path.getmtime(db_connections.__file__)},
                           {'name': driver_errors.__file__,
                            'lmod': os.path.getmtime(driver_errors.__file__)}]

            if LOG_CONF:
                check_files.append(
                    {'name': LOG_CONF, 'lmod': os.path.getmtime(LOG_CONF)})

            _config = get_config(_args.configfile, ME)
            _config['password'] = decrypted(_config['password_enc'])

            _start = timer()

            #  hide password, hoping username != password ;-)
            LOGGER.info('connecting to %s\n',
                        db_connections.connect_string(_config).replace(_config['password'],
                                                                       '******'))
            conn_has_cancel = False
            _conn = db_connections.connect(db_driver, _config)

            if "cancel" in dir(_conn):
                conn_has_cancel = True
            LOGGER.info(_conn)
            conn_counter += 1
            to_outfile(_config, ME+"[connect,status]", 0)
            _cursor = _conn.cursor()
            connect_info = db_connections.connection_info(_conn)
            LOGGER.info('connected db_url %s type %s db_role %s version %s\n'
                        '%s user %s %s sid,serial %d,%d instance %s as %s cancel:%s\n',
                        _config['db_url'], connect_info['instance_type'],
                        connect_info['db_role'],
                        connect_info['dbversion'],
                        datetime.datetime.fromtimestamp(time.time()),
                        _config['username'], connect_info['uname'],
                        connect_info['sid'],
                        connect_info['serial'],
                        connect_info['iname'],
                        _config['role'], conn_has_cancel)

            if connect_info['db_role'] in ["PHYSICAL STANDBY", "SLAVE"]:
                checks_file = os.path.join(_config['checks_dir'],
                                           _config['db_type'], "standby" +
                                           "." + connect_info['dbversion'] + ".cfg")
            else:
                checks_file = os.path.join(_config['checks_dir'],
                                           _config['db_type'],
                                           connect_info['db_role'].lower() + "." +
                                           connect_info['dbversion']+".cfg")

            _files = [checks_file]
            check_files.append({'name': checks_file, 'lmod': 0})

            if _config['site_checks']:
                for addition in _config['site_checks'].split(","):
                    addfile = os.path.join(_config['checks_dir'], _config['db_type'],
                                           addition + ".cfg")
                    check_files.append({'name': addfile, 'lmod': 0})
                    _files.extend([addfile])
            LOGGER.info('using checks from %s\n', _files)

            for checks_file in check_files:
                if not os.path.exists(checks_file['name']):
                    raise ValueError(
                        "Configfile " + checks_file['name'] + " does not exist")
            # all checkfiles exist

            sleep_c = 0
            sleep_s = 1
            prev_err = 0
            con_mins = 0
            open_time = int(time.time())

            while True:
                LOGGER.debug("%s while True\n", ME)

                if connect_info['db_role'] != db_connections.current_role(_conn, connect_info):
                    LOGGER.info("db_role changed from %s to %s",
                                connect_info['db_role'],
                                db_connections.current_role(_conn, connect_info))
                    # re connect to get the correct monitoring config again

                    break
                # keep this to compare for when to dump stats
                now_run = int(time.time())
                run_timer = timer()  # keep this to compare for when to dump stats
                # loading checks from the various checkfiles:
                need_to_load = "no"

                # pylint: disable=consider-using-enumerate

                for i in range(len(check_files)):  # indices 0 .. sys_files-1 are the script and its modules
                    try:
                        current_lmod = os.path.getmtime(check_files[i]['name'])
                    except OSError as _e:
                        LOGGER.warning("%s: %s\n",
                                       check_files[i]['name'], _e.strerror)
                        # ignore the error, maybe temporary due to an update
                        current_lmod = check_files[i]['lmod']

                    if check_files[i]['lmod'] != current_lmod:
                        if i < sys_files:  # it is the script, a module or LOG_CONF
                                           # that changed
                            LOGGER.warning("%s changed, from %s to %s restarting ...\n",
                                           check_files[i]['name'],
                                           time.ctime(check_files[i]['lmod']),
                                           time.ctime(current_lmod))
                            os.execv(__file__, sys.argv)
                        else:
                            if check_files[i]['lmod'] == 0:
                                LOGGER.info("checks loading %s\n",
                                            check_files[i]['name'])
                                need_to_load = "yes"
                            else:
                                LOGGER.warning("checks changed, reloading %s\n",
                                               check_files[i]['name'])
                                need_to_load = "yes"

                if need_to_load == "yes":
                    to_outfile(_config, ME + "[version]", VERSION)
                    to_outfile(
                        _config, ME + "[config,db_type]", _config['db_type'])
                    to_outfile(
                        _config, ME + "[config,db_driver]", _config['db_driver'])
                    to_outfile(
                        _config, ME + "[config,instance_type]", _config['instance_type'])
                    to_outfile(_config, ME + "[conn,db_role]",
                               connect_info['db_role'])
                    to_outfile(
                        _config, ME + "[conn,instance_type]", connect_info['instance_type'])
                    to_outfile(_config, ME + "[conn,dbversion]",
                               connect_info['dbversion'])
                    to_outfile(
                        _config, ME + "[connect,instance_name]", connect_info['iname'])
                    # sometimes the instance_name query follows within a second and
                    # the event would be missed, so give it some more time
                    time.sleep(3)
                    objects_list = []
                    sections_list = []
                    file_list = []
                    all_checks = []

                    for i in range(len(check_files)):
                        _e = collections.OrderedDict()
                        _e = {"{#CHECKS_FILE}": i}
                        file_list.append(_e)

                    files_json = '{\"data\":'+json.dumps(file_list)+'}'
                    to_outfile(_config, ME+".files.lld", files_json)

                    for i in range(sys_files, len(check_files)):
                        # #0 is executable that is also checked for updates
                        # #1 db_connections module
                        # #2 driver_errors module
                        # #3 LOG_CONF if it exists ...
                        # so, skip those and pick the real check_files
                        _checks = configparser.RawConfigParser()
                        try:
                            check_file = open(check_files[i]['name'], 'r')
                            to_outfile(_config, "{}[checks,{},name]".format(ME, i),
                                       check_files[i]['name'])
                            to_outfile(_config, "{}[checks,{},lmod]".format(ME, i),
                                       str(int(os.stat(check_files[i]['name']).st_mtime)))
                            try:
                                _checks.read_file(check_file)
                                check_file.close()
                                to_outfile(_config, ME + "[checks," + str(i) +
                                           ",status]", 0)
                            except configparser.Error:
                                to_outfile(_config, ME + "[checks," + str(i) +
                                           ",status]", 13)
                                LOGGER.critical("file %s has parsing errors ->(13)\n",
                                                check_files[i]['name'])
                        except IOError as io_error:
                            to_outfile(
                                _config, ME + "[checks," + str(i) + ",status]", 11)
                            LOGGER.critical("file %s IOError %s %s ->(11)\n",
                                            check_files[i]['name'],
                                            io_error.errno, io_error.strerror)

                        check_files[i]['lmod'] = os.stat(
                            check_files[i]['name']).st_mtime
                        all_checks.append(_checks)

                        for section in sorted(_checks.sections()):
                            sec_mins = int(_checks.get(section, "minutes"))

                            if sec_mins == 0:
                                LOGGER.info(
                                    "%s run at connect only\n", section)
                            else:
                                LOGGER.info("%s run every %d minutes\n",
                                            section, sec_mins)
                            # dump own discovery items of the queries per section
                            _e = collections.OrderedDict()
                            _e = {"{#SECTION}": section}
                            sections_list.append(_e)
                            _x = dict(_checks.items(section))

                            for key, sqls in sorted(_x.items()):
                                if sqls and key != "minutes":
                                    _d = collections.OrderedDict()
                                    _d = {"{#SECTION}": section, "{#KEY}": key}
                                    objects_list.append(_d)
                                    LOGGER.info("%s: %s\n",
                                                key,
                                                sqls[0: 60].
                                                replace('\n', ' ').replace('\r', ' '))
                    # checks are loaded now.
                    sections_json = '{\"data\":'+json.dumps(sections_list)+'}'
                    LOGGER.debug("lld key: %s json: %s\n",
                                 ME+".lld", sections_json)
                    to_outfile(_config, ME+".section.lld", sections_json)
                    rows_json = '{\"data\":'+json.dumps(objects_list)+'}'
                    LOGGER.debug("lld key: %s json: %s\n",
                                 ME+".lld", rows_json)
                    to_outfile(_config, ME + ".query.lld", rows_json)
                    # sqls can contain multiple statements per key. sqlparse to split them
                    # now. Otherwise use a lot of extra cycles when splitting at runtime
                    # all_sql { {section, key}: statements }
                    all_sql = {}

                    for _checks in all_checks:
                        for section in sorted(_checks.sections()):
                            _x = dict(_checks.items(section))

                            for key, sqls in sorted(_x.items()):
                                if sqls and key != "minutes":
                                    all_sql[(section, key)] = []

                                    for statement in sqlparse.split(sqls):
                                        all_sql[(section, key)].append(
                                            statement)

                # checks discovery is also printed
                #
                # assume we are still connected. If not, exception will tell real story
                to_outfile(_config, ME + "[connect,status]", 0)
                to_outfile(_config, ME + "[uptime]",
                           int(time.time() - start_time))
                to_outfile(_config, ME + "[opentime]",
                           int(time.time() - open_time))

                # the connect status is only real if executed a query ....

                for _checks in all_checks:
                    for section in sorted(_checks.sections()):
                        section_timer = timer()  # keep this to compare for when to dump stats
                        sec_mins = int(_checks.get(section, "minutes"))
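                        # sec_mins == 0: run this section only on the first pass after
                        # (re)connect; otherwise run whenever the connection-minute
                        # counter is a multiple of sec_mins (see the condition below)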

                        if ((con_mins == 0 and sec_mins == 0) or
                                (sec_mins > 0 and con_mins % sec_mins == 0)):
                            # time to run the checks again from this section
                            _x = dict(_checks.items(section))
                            _cursor = _conn.cursor()

                            for key, sqls in sorted(_x.items()):
                                if sqls and key != "minutes":
                                    LOGGER.debug("%s section: %s key: %s\n",
                                                 ME, section, key)
                                    try:
                                        query_counter += 1

                                        if conn_has_cancel:
                                            # pymysql has no cancel but does have
                                            # timeout in connect
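                                            # cancel_sql (defined elsewhere in this
                                            # script) is expected to abort the running
                                            # statement after _config['sqltimeout']
                                            # seconds, e.g. via the driver's cancel()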
                                            sqltimeout = threading.Timer(
                                                _config['sqltimeout'],
                                                cancel_sql, [_conn, section, key])
                                            sqltimeout.start()
                                        _start = timer()

                                        for statement in all_sql[(section, key)]:

                                            LOGGER.debug("%s section: %s key: %s sql: %s\n",
                                                         ME, section, key, statement)
                                            _cursor.execute(statement)
                                        startf = timer()
                                        # output from the preparing queries is ignored;
                                        # output for the last query must include the
                                        #        complete key and value
                                        rows = _cursor.fetchall()

                                        if conn_has_cancel:
                                            sqltimeout.cancel()

                                        if "discover" in section:
                                            objects_list = []

                                            for row in rows:
                                                _d = collections.OrderedDict()

                                                for col in range(len(_cursor.description)):
                                                    _d[_cursor.description[col]
                                                       [0]] = row[col]
                                                objects_list.append(_d)
                                            rows_json = '{"data":' + \
                                                json.dumps(objects_list) + '}'
                                            LOGGER.debug("lld key: %s json: %s\n", key,
                                                         rows_json)
                                            to_outfile(_config, key, rows_json)
                                            to_outfile(_config, ME +
                                                       "[query," + section + "," +
                                                       key + ",status]", 0)
                                        else:
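                                            # non-discovery queries are expected to
                                            # return (zabbix item key, value) pairs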
                                            if rows and len(rows[0]) == 2:
                                                _config['section'] = section
                                                _config['key'] = key

                                                for row in rows:
                                                    to_outfile(
                                                        _config, row[0], row[1])
                                                to_outfile(_config, ME +
                                                           "[query," + section + "," +
                                                           key + ",status]", 0)
                                            elif not rows:
                                                to_outfile(_config, ME + "[query," +
                                                           section + "," +
                                                           key + ",status]", 0)
                                            else:
                                                LOGGER.critical('key=%s.%s ZBXDB-%d: '
                                                                'SQL format error: %s\n',
                                                                section, key, 2,
                                                                "expect key,value pairs")
                                                to_outfile(_config, ME +
                                                           "[query," + section + "," +
                                                           key + ",status]", 2)
                                        _config['section'] = ""
                                        _config['key'] = ""
                                        fetchela = timer() - startf
                                        elapsed_s = timer() - _start
                                        to_outfile(_config, ME + "[query," +
                                                   section + "," +
                                                   key + ",ela]", elapsed_s)
                                        to_outfile(_config, ME + "[query," +
                                                   section + "," +
                                                   key + ",fetch]", fetchela)
                                    # except (db_driver.DatabaseError,
                                        # socket.timeout) as dberr:
                                    except Exception as dberr:
                                        if conn_has_cancel:
                                            sqltimeout.cancel()
                                        ecode, emsg = driver_errors.db_errorcode(
                                            db_driver, dberr)

                                        elapsed_s = timer() - _start
                                        query_errors += 1
                                        to_outfile(_config, ME + "[query," +
                                                   section + "," +
                                                   key + ",status]", ecode)
                                        to_outfile(_config, ME + "[query," +
                                                   section + "," +
                                                   key + ",ela]", elapsed_s)
                                        LOGGER.info('key=%s.%s ZBXDB-%s: '
                                                    'Db execution error: %s\n',
                                                    section, key, ecode, emsg.strip())

                                        if driver_errors.db_error_needs_new_session(db_driver,
                                                                                    ecode):
                                            raise

                                        LOGGER.debug("%s commit\n", ME)
                                        _conn.commit()

                                        LOGGER.debug("%s committed\n", ME)
                            # end of a section ## time to run the checks again from this section
                            to_outfile(_config, ME + "[query," + section + ",,ela]",
                                       timer() - section_timer)
                # release locks that might have been taken

                LOGGER.debug("%s commit 2\n", ME)

                _conn.commit()

                LOGGER.debug("%s committed.\n", ME)
                # dump metric for summed elapsed time of this run
                to_outfile(_config, ME + "[query,,,ela]",
                           timer() - run_timer)
                to_outfile(_config, ME + "[cpu,user]",
                           resource.getrusage(resource.RUSAGE_SELF).ru_utime)
                to_outfile(_config, ME + "[cpu,sys]",
                           resource.getrusage(resource.RUSAGE_SELF).ru_stime)
                to_outfile(_config, ME + "[mem,maxrss]",
                           resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
                # passed all sections

                if ((now_run - start_time) % 3600) == 0:
                    gc.collect()
                    # dump stats
                    LOGGER.info("connect %d times, %d fail; started %d queries, "
                                "%d fail memrss:%d user:%f sys:%f\n",
                                conn_counter, conn_errors, query_counter,
                                query_errors,
                                resource.getrusage(
                                    resource.RUSAGE_SELF).ru_maxrss,
                                resource.getrusage(
                                    resource.RUSAGE_SELF).ru_utime,
                                resource.getrusage(resource.RUSAGE_SELF).ru_stime)
                # try to keep activities on the same starting second:
                sleep_time = 60 - ((int(time.time()) - start_time) % 60)

                LOGGER.debug("Sleeping for %d seconds\n", sleep_time)
                time.sleep(sleep_time)
                con_mins = con_mins + 1  # not really mins since the checks could
                #                       have taken longer than 1 minute to complete
            # end while True
        # except (db_driver.DatabaseError, socket.timeout, ConnectionResetError) as dberr:
        except Exception as dberr:
            err_code, err_msg = driver_errors.db_errorcode(db_driver, dberr)
            elapsed_s = timer() - _start
            to_outfile(_config, ME + "[connect,status]", err_code)

            if not driver_errors.db_error_needs_new_session(db_driver, err_code):
                # from a killed session, crashed instance or similar
                conn_errors += 1

            if prev_err != err_code:
                sleep_c = 0
                sleep_s = 1
                prev_err = err_code
            sleep_c += 1
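            # simple backoff: after every 10 consecutive failures with the same
            # error code, lengthen the sleep by 10 seconds (capped near 5 minutes)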

            if sleep_c >= 10:
                if sleep_s <= 301:
                    # don't sleep longer than 5 mins after connect failures
                    sleep_s += 10
                sleep_c = 0
            LOGGER.warning('(%d.%d)connection error: [%s] %s for %s@%s\n',
                           sleep_c, sleep_s, err_code,
                           err_msg.strip().replace('\n', ' ').replace('\r', ' '),
                           _config['username'], _config['db_url'])
            # set_trace()
            time.sleep(sleep_s)
        except (KeyboardInterrupt, SystemExit):
            exit(0)
Example #57
0
def load_scripts(reload_scripts=False, refresh_scripts=False):
    """
    Load scripts and run each modules register function.

    :arg reload_scripts: Causes all scripts to have their unregister method
       called before loading.
    :type reload_scripts: bool
    :arg refresh_scripts: only load scripts which are not already loaded
       as modules.
    :type refresh_scripts: bool
    """
    use_time = use_class_register_check = _bpy.app.debug_python
    use_user = not _is_factory_startup

    if use_time:
        import time
        t_main = time.time()

    loaded_modules = set()

    if refresh_scripts:
        original_modules = _sys.modules.values()

    if reload_scripts:
        # just unload, don't change user defaults; this means we can sync
        # to reload. Note that modules will only actually reload if their
        # modification time changes. This won't work for packages, so
        # it's not perfect.
        for module_name in [ext.module for ext in _user_preferences.addons]:
            _addon_utils.disable(module_name)

        # *AFTER* unregistering all add-ons, otherwise all calls to
        # unregister_module() will silently fail (do nothing).
        _bpy_types.TypeMap.clear()

    def register_module_call(mod):
        register = getattr(mod, "register", None)
        if register:
            try:
                register()
            except:
                import traceback
                traceback.print_exc()
        else:
            print("\nWarning! '%s' has no register function, "
                  "this is now a requirement for registerable scripts" %
                  mod.__file__)

    def unregister_module_call(mod):
        unregister = getattr(mod, "unregister", None)
        if unregister:
            try:
                unregister()
            except:
                import traceback
                traceback.print_exc()

    def test_reload(mod):
        import importlib
        # Reloading this module causes internal errors because the classes it
        # defines are stored internally. We could possibly refresh the internal
        # references too, but for now it's best not to.
        if mod == _bpy_types:
            return mod

        try:
            return importlib.reload(mod)
        except:
            import traceback
            traceback.print_exc()

    def test_register(mod):

        if refresh_scripts and mod in original_modules:
            return

        if reload_scripts and mod:
            print("Reloading:", mod)
            mod = test_reload(mod)

        if mod:
            register_module_call(mod)
            _global_loaded_modules.append(mod.__name__)

    if reload_scripts:

        # module names -> modules
        _global_loaded_modules[:] = [_sys.modules[mod_name]
                                     for mod_name in _global_loaded_modules]

        # loop over and unload all scripts
        _global_loaded_modules.reverse()
        for mod in _global_loaded_modules:
            unregister_module_call(mod)

        for mod in _global_loaded_modules:
            test_reload(mod)

        del _global_loaded_modules[:]

    from bpy_restrict_state import RestrictBlend

    with RestrictBlend():
        for base_path in script_paths(use_user=use_user):
            for path_subdir in _script_module_dirs:
                path = _os.path.join(base_path, path_subdir)
                if _os.path.isdir(path):
                    _sys_path_ensure(path)

                    # only add this to sys.modules, don't run
                    if path_subdir == "modules":
                        continue

                    for mod in modules_from_path(path, loaded_modules):
                        test_register(mod)

    # load template (if set)
    if any(_bpy.utils.app_template_paths()):
        import bl_app_template_utils
        bl_app_template_utils.reset(reload_scripts=reload_scripts)
        del bl_app_template_utils

    # deal with addons separately
    _initialize = getattr(_addon_utils, "_initialize", None)
    if _initialize is not None:
        # first time, use fast-path
        _initialize()
        del _addon_utils._initialize
    else:
        _addon_utils.reset_all(reload_scripts=reload_scripts)
    del _initialize

    # run the active integration preset
    filepath = preset_find(_user_preferences.inputs.active_keyconfig,
                           "keyconfig")

    if filepath:
        keyconfig_set(filepath)

    if reload_scripts:
        import gc
        print("gc.collect() -> %d" % gc.collect())

    if use_time:
        print("Python Script Load Time %.4f" % (time.time() - t_main))

    if use_class_register_check:
        for cls in _bpy.types.bpy_struct.__subclasses__():
            if getattr(cls, "is_registered", False):
                for subcls in cls.__subclasses__():
                    if not subcls.is_registered:
                        print(
                            "Warning, unregistered class: %s(%s)" %
                            (subcls.__name__, cls.__name__)
                        )
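
# A minimal usage sketch: inside Blender this function is exposed as
# bpy.utils.load_scripts(), so a full script reload could look like
#
#   import bpy
#   bpy.utils.load_scripts(reload_scripts=True)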
Example #58
0
def kfold_lightgbm(df, debug=False):
    # Divide in training/validation and test data

    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()

    folds = KFold(n_splits=10, shuffle=True, random_state=1001)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])  # predicted valid_y
    sub_preds = np.zeros(test_df.shape[0])  # submission preds
    feature_importance_df = pd.DataFrame()  # feature importance

    fold_auc_best_df = pd.DataFrame(columns=["FOLD", "AUC", "BEST_ITER"])  # holding best iter to save model
    feats = [f for f in train_df.columns if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index',
                                                      "APP_index", "BURO_index", "PREV_index", "INSTAL_index",
                                                      "CC_index", "POS_index"]]

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]
        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            n_jobs=-1,
            n_estimators=10000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1, )

        clf.fit(train_x,
                train_y,
                eval_set=[(train_x, train_y),
                          (valid_x, valid_y)],
                eval_metric='auc',
                verbose=200,
                early_stopping_rounds=200)

        # predicted valid_y
        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]

        # submission preds: predict the test set with each fold's model and average over all folds
        sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        # fold, auc and best iteration
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))

        # best auc & iteration
        fold_auc_best_df = fold_auc_best_df.append({'FOLD': int(n_fold + 1),
                                                    'AUC': roc_auc_score(valid_y, oof_preds[valid_idx]),
                                                    "BEST_ITER": clf.best_iteration_}, ignore_index=True)

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    # OUTPUTS
    print(fold_auc_best_df)
    print(feature_importance_df)

    # save the feature importance and per-fold AUC dataframes as pickles
    feature_importance_df.to_pickle("outputs/features/feature_importance_df.pkl")
    fold_auc_best_df.to_pickle("outputs/features/fold_auc_best_df.pkl")

    # Final Model
    best_iter_1 = int(fold_auc_best_df.sort_values(by="AUC", ascending=False)[:1]["BEST_ITER"].values)

    y_train = train_df["TARGET"]
    x_train = train_df[feats]

    final_model = LGBMClassifier(
        n_jobs=-1,
        n_estimators=best_iter_1,
        learning_rate=0.02,
        num_leaves=34,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.041545473,
        reg_lambda=0.0735294,
        min_split_gain=0.0222415,
        min_child_weight=39.3259775,
        silent=-1,
        verbose=-1).fit(x_train, y_train)

    cur_dir = os.getcwd()
    os.chdir('models/reference/')
    pickle.dump(final_model, open("lightgbm_final_model.pkl", 'wb'))  # model
    os.chdir(cur_dir)

    # the valid_y values predicted per fold are out-of-fold predictions that
    # together cover different parts of the training set's targets
    cowsay.cow('Full Train(Validation) AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    # Write submission file and plot feature importance
    if not debug:
        cur_dir = os.getcwd()
        os.chdir('outputs/predictions/')
        test_df['TARGET'] = sub_preds
        test_df[['SK_ID_CURR', 'TARGET']].to_csv("reference_submission.csv", index=False)
        os.chdir(cur_dir)
    display_importances(feature_importance_df)
    del x_train, y_train

    return feature_importance_df
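
# A minimal usage sketch, assuming `df` already holds the merged feature matrix
# with a TARGET column (NaN for test rows) and that the helpers/paths used above
# (display_importances, outputs/, models/reference/) exist:
#
#   feature_importance_df = kfold_lightgbm(df, debug=False)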
Example #59
0
def get_autoencoder(input_size, latent_dim, data):
    learning_rate = 0.00001
    autoencoder = build(input_size, latent_dim)
    autoencoder.compile(loss="mse", optimizer=tf.keras.optimizers.Adam(learning_rate))
    encoder = autoencoder.get_layer("encoder")
    cell_decoders = {}
    cell_discriminators = {}
    discriminator = make_discriminator_model(input_size)
    discriminator.compile(loss="mse", optimizer=tf.keras.optimizers.Adam(learning_rate))
    reconstruction_list = np.zeros((0, 978, 1))
    count = 0
    e = 0
    if not os.path.exists("best"):
        os.makedirs("best")
    if not os.path.exists("weights"):
        os.makedirs("weights")
    while e < nb_total_epoch:
        print("Total epoch " + str(e) + " ------------------------------------------------------")
        if e > 0:
            autoencoder_saved = keras.models.load_model("./weights/main_model")
            autoencoder = build(input_size, latent_dim)
            autoencoder.set_weights(autoencoder_saved.get_weights())
            autoencoder.compile(loss="mse", optimizer=tf.keras.optimizers.Adam(learning_rate))
            del autoencoder_saved
            discriminator = make_discriminator_model(input_size)
            encoder = autoencoder.get_layer("encoder")

        if e == 0:
            print("Main autoencoder")
            # autoencoder = keras.models.load_model("default_autoencoder")
            callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
            autoencoder.fit(data.train_data, data.train_data, epochs=nb_autoencoder_epoch, batch_size=batch_size,
                            validation_split=0.1,
                            callbacks=[callback])
            autoencoder.save("default_autoencoder")
            for cell in data.cell_types:
                decoder = autoencoder.get_layer("decoder")
                cell_decoders[cell] = decoder.get_weights().copy()
                cell_discriminators[cell] = discriminator.get_weights().copy()
                pickle.dump(cell_decoders[cell], open("./weights/" + cell + "_decoder_weights", "wb"))
                del decoder
        print("Training decoders")
        decoder = autoencoder.get_layer("decoder")
        count_im = 0
        for pert in data.train_perts:
            cell = random.choice(list(data.cell_types))
            decoder.set_weights(cell_decoders[cell])
            discriminator.set_weights(cell_discriminators[cell])
            pert_profiles = np.asarray([data.train_data[i]
                                        for i, p in enumerate(data.train_meta) if p[1] == pert])
            target_profiles = [data.train_data[i]
                               for i, p in enumerate(data.train_meta) if p[1] == pert and p[0] == cell]
            while len(target_profiles) < len(pert_profiles):
                target_profiles.append(target_profiles[0])
            target_profiles = np.asarray(target_profiles)
            if count_im < 5:
                z_mean, z_log_var, z = encoder.predict(pert_profiles)
                utils1.draw_vectors(z, "vectors/" + pert + "_1.png")

            train_step(autoencoder, discriminator, pert_profiles, target_profiles, e)
            if count_im < 5:
                z_mean, z_log_var, z = encoder.predict(pert_profiles)
                utils1.draw_vectors(z, "vectors/" + pert + "_2.png")
            count_im = count_im + 1
            cell_decoders[cell] = decoder.get_weights().copy()
            cell_discriminators[cell] = discriminator.get_weights().copy()
        if e == nb_total_epoch - 1:
            print("freezing encoder")
            encoder.trainable = False
            decoder.trainable = True
            autoencoder.compile(loss="mse", optimizer=tf.keras.optimizers.Adam(0.00001))
        cl = list(data.cell_types)
        random.shuffle(cl)
        for cell in cl:
            print(cell)
            decoder.set_weights(cell_decoders[cell])
            tf.random.set_seed(1)
            cell_data = np.asarray([[data.train_data[i], data.train_meta[i]]
                                    for i, p in enumerate(data.train_meta) if p[0] == cell])
            if len(cell_data) == 0:
                continue
            input_profiles = []
            output_profiles = []
            for i in range(len(cell_data)):
                # input_profiles.append(cell_data[i][0])
                # output_profiles.append(cell_data[i][0])
                closest, profile, mean_profile, all_profiles = data.get_profile(data.train_data,
                                                                                data.meta_dictionary_pert[
                                                                                    cell_data[i][1][1]],
                                                                                cell_data[i][1], train_data=True)
                if mean_profile is not None:
                    for p in all_profiles:
                        input_profiles.append(p)
                        output_profiles.append(cell_data[i][0])

            input_profiles = np.asarray(input_profiles)
            output_profiles = np.asarray(output_profiles)
            if e == nb_total_epoch - 1:
                cell_data_val = np.asarray([[data.val_data[i], data.val_meta[i]]
                                            for i, p in enumerate(data.val_meta) if p[0] == cell])
                input_profiles_val = []
                output_profiles_val = []
                for i in range(len(cell_data_val)):
                    closest, profile, mean_profile, all_profiles = data.get_profile(data.val_data,
                                                                                    data.meta_dictionary_pert_val[
                                                                                        cell_data_val[i][1][1]],
                                                                                    cell_data_val[i][1])
                    if mean_profile is not None:
                        for p in all_profiles:
                            input_profiles_val.append(p)
                            output_profiles_val.append(cell_data_val[i][0])
                input_profiles_val = np.asarray(input_profiles_val)
                output_profiles_val = np.asarray(output_profiles_val)
                callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
                autoencoder.fit(input_profiles, output_profiles, epochs=nb_frozen_epoch, batch_size=batch_size,
                                validation_data=(input_profiles_val, output_profiles_val), callbacks=[callback])
            else:
                discriminator.set_weights(cell_discriminators[cell])
                fake_data = autoencoder.predict(input_profiles)
                if len(reconstruction_list) < 10000:
                    reconstruction_list = np.append(reconstruction_list, fake_data, axis=0)
                else:
                    start = random.randint(0, len(reconstruction_list) - 1 - len(fake_data))
                    reconstruction_list[start:start + len(fake_data)] = fake_data
                np.random.shuffle(reconstruction_list)
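                # reconstruction_list acts as a bounded pool of recent fakes;
                # the discriminator below is trained on real profiles versus
                # random samples drawn from this pool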
                for d_epochs in range(10):
                    total = int(math.ceil(float(len(input_profiles)) / batch_size))
                    for i in range(total):
                        output_data = output_profiles[i * batch_size:(i + 1) * batch_size]
                        reconstruction_data = reconstruction_list[np.random.choice(reconstruction_list.shape[0],
                                                                                   batch_size, replace=False)]
                        train_step_d(discriminator, output_data, reconstruction_data)
                    cell_discriminators[cell] = discriminator.get_weights().copy()
                    fake_data = autoencoder.predict(input_profiles)
                    r = 0
                    f_new = 0
                    a = discriminator.predict(output_profiles)
                    for v in a:
                        if v > 0.5:
                            r = r + 1

                    a = discriminator.predict(fake_data)
                    for v in a:
                        if v > 0.5:
                            f_new = f_new + 1
                    print(str(d_epochs) + " discriminator " + str(r) + " : " + str(f_new) + " - " + str(len(input_profiles)))
            #
            # tf.random.set_seed(1)
            cell_decoders[cell] = decoder.get_weights().copy()

            gc.collect()
        print("---------------------------------------------------------------\n")

        # train_cor_sum = 0.0
        # train_count = 0
        # seen_perts = []
        # for i in range(len(data.train_data)):
        #     train_meta_object = data.train_meta[i]
        #     if train_meta_object[1] in seen_perts:
        #         continue
        #     closest, closest_profile, mean_profile, all_profiles = data.get_profile(data.train_data,
        #                                                                             data.meta_dictionary_pert[
        #                                                                                 train_meta_object[1]],
        #                                                                             train_meta_object, train_data=True)
        #     if closest_profile is None:
        #         continue
        #     seen_perts.append(train_meta_object[1])
        #     train_count = train_count + 1
        #     weights = cell_decoders[train_meta_object[0]]
        #     autoencoder.get_layer("decoder").set_weights(weights)
        #     decoded1 = autoencoder.predict(closest_profile)
        #     train_cor_sum = train_cor_sum + stats.pearsonr(decoded1.flatten(), data.train_data[i].flatten())[0]
        # train_cor = train_cor_sum / train_count
        # print("Training pcc: " + str(train_cor))
        # print("Evaluated:" + str(train_count))

        val_cor_sum = 0.0
        val_count = 0
        seen_perts = []
        disc_fake = 0
        disc_real = 0
        for i in range(len(data.val_data)):
            val_meta_object = data.val_meta[i]
            if val_meta_object[1] in seen_perts:
                continue
            closest, closest_profile, mean_profile, all_profiles = data.get_profile(data.val_data,
                                                                                    data.meta_dictionary_pert_val[
                                                                                        val_meta_object[1]],
                                                                                    val_meta_object)
            if closest_profile is None:
                continue
            seen_perts.append(val_meta_object[1])
            val_count = val_count + 1
            weights = cell_decoders[val_meta_object[0]]
            autoencoder.get_layer("decoder").set_weights(weights)

            predictions = []
            for p in all_profiles:
                predictions.append(autoencoder.predict(np.asarray([p])))
            special_decoded = np.mean(np.asarray(predictions), axis=0)
            val_cor_sum = val_cor_sum + stats.pearsonr(special_decoded.flatten(), data.val_data[i].flatten())[0]
            discriminator.set_weights(cell_discriminators[val_meta_object[0]])
            if discriminator.predict(special_decoded)[0, 0] > 0.5:
                disc_fake = disc_fake + 1
            if discriminator.predict(np.asarray([data.val_data[i]]))[0, 0] > 0.5:
                disc_real = disc_real + 1

        val_cor = val_cor_sum / val_count
        print("Validation pcc: " + str(val_cor))
        print("Evaluated:" + str(val_count))
        print("Discriminator " + str(disc_fake) + " : " + str(disc_real))
        if e == 0:
            best_val_cor = val_cor
        else:
            if val_cor < best_val_cor:
                count = count + 1
            else:
                best_val_cor = val_cor
                count = 0
                autoencoder.save("best/main_model")
                for cell in data.cell_types:
                    pickle.dump(cell_decoders[cell], open("best/" + cell + "_decoder_weights", "wb"))

        if count > 40:
            e = nb_total_epoch - 2
            count = 0
            for cell in data.cell_types:
                cell_decoders[cell] = pickle.load(open("best/" + cell + "_decoder_weights", "rb"))
            shutil.rmtree('weights')
            shutil.move('best', 'weights')

        autoencoder.save("weights/main_model")
        for cell in data.cell_types:
            pickle.dump(cell_decoders[cell], open("weights/" + cell + "_decoder_weights", "wb"))

        # Needed to prevent Keras memory leak
        del autoencoder
        del encoder
        del discriminator
        gc.collect()
        K.clear_session()
        tf.compat.v1.reset_default_graph()
        print("---------------------------------------------------------------\n")
        e = e + 1

    autoencoder = keras.models.load_model("weights/main_model")
    return autoencoder, cell_decoders, val_cor
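
# A minimal usage sketch, assuming `data` provides the attributes and helpers
# used above (train_data, val_data, cell_types, get_profile, ...) and that the
# module-level constants (nb_total_epoch, batch_size, ...) are defined; the
# latent dimension 64 is an illustrative value:
#
#   autoencoder, cell_decoders, val_cor = get_autoencoder(978, 64, data)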
Example #60
0
def main(params: dict, output_dir: str):
    import mlflow
    print("start params={}".format(params))
    model_id = "all"
    logger = get_logger()
    df = pd.read_pickle(
        "../input/riiid-test-answer-prediction/train_merged.pickle")
    # df = pd.read_pickle("../input/riiid-test-answer-prediction/split10/train_0.pickle").sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(30000)
    df["prior_question_had_explanation"] = df[
        "prior_question_had_explanation"].fillna(-1)
    df["answered_correctly"] = df["answered_correctly"].replace(-1, np.nan)
    column_config = {
        ("content_id", "content_type_id"): {
            "type": "category"
        },
        "user_answer": {
            "type": "leakage_feature"
        },
        "answered_correctly": {
            "type": "leakage_feature"
        },
        "part": {
            "type": "category"
        },
        "prior_question_elapsed_time_bin300": {
            "type": "category"
        },
        "duration_previous_content_bin300": {
            "type": "category"
        },
        "prior_question_had_explanation": {
            "type": "category"
        },
        "rating_diff_content_user_id": {
            "type": "numeric"
        },
        "task_container_id_bin300": {
            "type": "category"
        },
        "previous_answer_index_question_id": {
            "type": "category"
        },
        "previous_answer_question_id": {
            "type": "category"
        },
        "timediff-elapsedtime_bin500": {
            "type": "category"
        },
        "timedelta_log10": {
            "type": "category"
        }
    }

    if not load_pickle or is_debug:
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"][
            "DurationPreviousContent"] = DurationPreviousContent(
                is_partial_fit=True)
        feature_factory_dict["user_id"][
            "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_dict["user_id"][
            "UserContentRateEncoder"] = UserContentRateEncoder(
                rate_func="elo", column="user_id")
        feature_factory_dict["user_id"]["PreviousAnswer2"] = PreviousAnswer2(
            groupby="user_id",
            column="question_id",
            is_debug=is_debug,
            model_id=model_id,
            n=300)
        feature_factory_dict["user_id"][
            "StudyTermEncoder2"] = StudyTermEncoder2(is_partial_fit=True)
        feature_factory_dict["user_id"][
            "MeanAggregatorStudyTimebyUserId"] = MeanAggregator(
                column="user_id", agg_column="study_time", remove_now=False)

        feature_factory_dict["user_id"][
            "ElapsedTimeMeanByContentIdEncoder"] = ElapsedTimeMeanByContentIdEncoder(
            )
        feature_factory_dict["post"] = {
            "DurationFeaturePostProcess": DurationFeaturePostProcess()
        }

        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id=model_id,
            load_feature=not is_debug,
            save_feature=not is_debug)
        print("all_predict")
        df = feature_factory_manager.all_predict(df)

        def f(x):
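            # clip timediff-elapsedtime (presumably in milliseconds) to whole
            # seconds in the range [-100, 400] so it can be used as a category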
            x = x // 1000
            if x < -100:
                return -100
            if x > 400:
                return 400
            return x

        df["task_container_id_bin300"] = [
            x if x < 300 else 300 for x in df["task_container_id"]
        ]
        df["timediff-elapsedtime_bin500"] = [
            f(x) for x in df["timediff-elapsedtime"].values
        ]
        df["timedelta_log10"] = np.log10(
            df["duration_previous_content"].values)
        df["timedelta_log10"] = df["timedelta_log10"].replace(
            -np.inf, -1).replace(np.inf, -1).fillna(-1).astype("int8")
        df = df[[
            "user_id", "content_id", "content_type_id", "part", "user_answer",
            "answered_correctly", "prior_question_elapsed_time_bin300",
            "duration_previous_content_bin300",
            "prior_question_had_explanation", "rating_diff_content_user_id",
            "task_container_id_bin300", "previous_answer_index_question_id",
            "previous_answer_question_id", "row_id",
            "timediff-elapsedtime_bin500", "timedelta_log10"
        ]]
        print(df.head(10))

        print("data preprocess")

    ff_for_transformer = FeatureFactoryForTransformer(
        column_config=column_config,
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    ff_for_transformer.make_dict(df=df)
    n_skill = len(ff_for_transformer.embbed_dict[("content_id",
                                                  "content_type_id")])

    if not load_pickle or is_debug:
        df_val_row = pd.read_feather(
            "../input/riiid-test-answer-prediction/train_transformer_last2500k_only_row_id.feather"
        )
        if is_debug:
            df_val_row = df_val_row.head(3000)
        df_val_row["is_val"] = 1

        df = pd.merge(df, df_val_row, how="left", on="row_id")
        df["is_val"] = df["is_val"].fillna(0)

        print(df["is_val"].value_counts())

        w_df = df[df["is_val"] == 0]
        w_df["group"] = (
            w_df.groupby("user_id")["user_id"].transform("count") -
            w_df.groupby("user_id").cumcount()) // params["max_seq"]
        w_df["user_id"] = w_df["user_id"].astype(
            str) + "_" + w_df["group"].astype(str)

        group = ff_for_transformer.all_predict(w_df)

        dataset_train = SAKTDataset(group,
                                    n_skill=n_skill,
                                    max_seq=params["max_seq"])

        del w_df
        gc.collect()

    ff_for_transformer = FeatureFactoryForTransformer(
        column_config=column_config,
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    if not load_pickle or is_debug:
        group = ff_for_transformer.all_predict(df[df["content_type_id"] == 0])
        dataset_val = SAKTDataset(group,
                                  is_test=True,
                                  n_skill=n_skill,
                                  max_seq=params["max_seq"])

    os.makedirs("../input/feature_engineering/model275_all", exist_ok=True)
    if not is_debug and not load_pickle:
        with open(f"../input/feature_engineering/model275_all/train.pickle",
                  "wb") as f:
            pickle.dump(dataset_train, f)
        with open(f"../input/feature_engineering/model275_all/val.pickle",
                  "wb") as f:
            pickle.dump(dataset_val, f)

    if not is_debug and load_pickle:
        with open(f"../input/feature_engineering/model275_all/train.pickle",
                  "rb") as f:
            dataset_train = pickle.load(f)
        with open(f"../input/feature_engineering/model275_all/val.pickle",
                  "rb") as f:
            dataset_val = pickle.load(f)
        print("loaded!")
    dataloader_train = DataLoader(dataset_train,
                                  batch_size=params["batch_size"],
                                  shuffle=True)
    dataloader_val = DataLoader(dataset_val,
                                batch_size=params["batch_size"],
                                shuffle=False)

    model = SAKTModel(n_skill,
                      embed_dim=params["embed_dim"],
                      max_seq=params["max_seq"],
                      dropout=dropout,
                      cont_emb=params["cont_emb"])

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
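    # standard transformer practice: exclude bias and LayerNorm parameters
    # from weight decay and apply decay (0.2 here) to everything else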
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.2
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=params["lr"],
        weight_decay=0.2,
    )
    num_train_optimization_steps = int(len(dataloader_train) * 25)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=params["num_warmup_steps"],
        num_training_steps=num_train_optimization_steps)
    criterion = nn.BCEWithLogitsLoss()

    model.to(device)
    criterion.to(device)
    auc_val = 0
    for epoch in range(epochs):
        loss, acc, auc, auc_val = train_epoch(model, dataloader_train,
                                              dataloader_val, optimizer,
                                              criterion, scheduler, epoch,
                                              output_dir, device)
        print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}".
              format(epoch, loss, auc, auc_val))
        torch.save(
            model.state_dict(),
            f"{output_dir}/transformers_epoch{epoch}_auc{round(auc_val, 4)}.pth"
        )

    # df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False)
    """
    df_oof2 = pd.read_csv("../output/ex_237/20201213110353/oof_train_0_lgbm.csv")
    df_oof2.columns = ["row_id", "predict_lgbm", "target"]
    df_oof2 = pd.merge(df_oof, df_oof2, how="inner")

    auc_lgbm = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values)
    print("lgbm: {:.4f}".format(auc_lgbm))

    print("ensemble")
    max_auc = 0
    max_nn_ratio = 0
    for r in np.arange(0, 1.05, 0.05):
        auc = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values*(1-r) + df_oof2["predict"].values*r)
        print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc))

        if max_auc < auc:
            max_auc = auc
            max_nn_ratio = r
    print(len(df_oof2))
    """
    if not is_debug:
        mlflow.start_run(experiment_id=10, run_name=os.path.basename(__file__))

        for key, value in params.items():
            mlflow.log_param(key, value)
        mlflow.log_metric("auc_val", auc_val)
        mlflow.end_run()
    torch.save(model.state_dict(), f"{output_dir}/transformers.pth")
    del model
    torch.cuda.empty_cache()
    with open(f"{output_dir}/transformer_param.json", "w") as f:
        json.dump(params, f)
    if is_make_feature_factory:
        # feature factory
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"][
            "DurationPreviousContent"] = DurationPreviousContent(
                is_partial_fit=True)
        feature_factory_dict["user_id"][
            "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id="all",
            load_feature=not is_debug,
            save_feature=not is_debug)

        ff_for_transformer = FeatureFactoryForTransformer(
            column_config=column_config,
            dict_path="../feature_engineering/",
            sequence_length=params["max_seq"],
            logger=logger)
        df = pd.read_pickle(
            "../input/riiid-test-answer-prediction/train_merged.pickle")
        if is_debug:
            df = df.head(10000)
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        feature_factory_manager.fit(df)
        df = feature_factory_manager.all_predict(df)
        for dicts in feature_factory_manager.feature_factory_dict.values():
            for factory in dicts.values():
                factory.logger = None
        feature_factory_manager.logger = None
        with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f:
            pickle.dump(feature_factory_manager, f)

        ff_for_transformer.fit(df)
        ff_for_transformer.logger = None
        with open(
                f"{output_dir}/feature_factory_manager_for_transformer.pickle",
                "wb") as f:
            pickle.dump(ff_for_transformer, f)
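
# A minimal usage sketch; the keys mirror those consumed above and the concrete
# values are illustrative only:
#
#   params = {"max_seq": 100, "batch_size": 128, "embed_dim": 256,
#             "cont_emb": 8, "lr": 1e-3, "num_warmup_steps": 1000}
#   main(params, output_dir="../output/model275")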