def test_handle_lifetime(self):
    refs = []
    for type in self.handle_types:
        klass = getattr(pyuv, type)
        obj = klass(self.loop)
        refs.append(weakref.ref(obj))
        del obj
    # There are no more references to the handles at this point.
    # Garbage collection should be prevented from freeing them, though.
    # Touching each of these without segfault is a best effort check.
    # The validity of the weakrefs is implementation dependent :(.
    gc.collect()
    handles = self.loop.handles
    self.assertEqual(len(handles), len(self.handle_types))
    for handle in handles:
        self.assertTrue(handle.closed)
    del handle
    del handles
    # Give the loop a chance to finish closing the handles.
    self.loop.run()
    # Ensure the weakrefs are gone now.
    for ref in refs:
        self.assertEqual(ref(), None)
def kfold_cv(X_train, y_train, idx, k):
    kf = StratifiedKFold(y_train, n_folds=k)
    xx = []
    count = 0
    ypred = np.zeros(X_train.shape[0])
    for train_index, test_index in kf:
        count += 1
        X_train_cv, X_test_cv = X_train[train_index, :], X_train[test_index, :]
        gc.collect()
        y_train_cv, y_test_cv = y_train[train_index], y_train[test_index]
        y_pred = np.zeros(X_test_cv.shape[0])
        m = 1
        for j in range(m):
            clf = xgb_classifier(eta=0.01, min_child_weight=10, col=0.7, subsample=0.68,
                                 depth=5, num_round=500, seed=j * 77, gamma=0)
            y_pred += clf.train_predict(X_train_cv, y_train_cv, X_test_cv, y_test=y_test_cv)
            yqq = y_pred / (1 + j)
            print j, llfun(y_test_cv, yqq)
        y_pred /= m
        # clf = RandomForestClassifier(n_jobs=-1, n_estimators=100, max_depth=100)
        # clf.fit(X_train_cv, y_train_cv)
        # y_pred = clf.predict_proba(X_test_cv).T[1]
        print y_pred.shape
        xx.append(llfun(y_test_cv, y_pred))
        ypred[test_index] = y_pred
        print xx[-1]  # , y_pred.shape
    print xx, 'average:', np.mean(xx), 'std', np.std(xx)
    return ypred
def scheduled_recipe_fetched(self, job):
    temp_files, fmt, arg = self.conversion_jobs.pop(job)
    fname = temp_files[0].name
    if job.failed:
        self.scheduler.recipe_download_failed(arg)
        return self.gui.job_exception(job)
    id = self.gui.library_view.model().add_news(fname, arg)
    # Arg may contain a "keep_issues" variable. If it is non-zero,
    # delete all but the newest x issues.
    try:
        keep_issues = int(arg['keep_issues'])
    except Exception:
        keep_issues = 0
    if keep_issues > 0:
        ids_with_tag = list(sorted(
            self.gui.library_view.model().db.tags_older_than(
                arg['title'], None, must_have_tag=_('News')),
            reverse=True))
        ids_to_delete = ids_with_tag[keep_issues:]
        if ids_to_delete:
            self.gui.library_view.model().delete_books_by_id(ids_to_delete)
    self.gui.library_view.model().beginResetModel()
    self.gui.library_view.model().endResetModel()
    sync = self.gui.news_to_be_synced
    sync.add(id)
    self.gui.news_to_be_synced = sync
    self.scheduler.recipe_downloaded(arg)
    self.gui.status_bar.show_message(arg['title'] + _(' fetched.'), 3000)
    self.gui.email_news(id)
    self.gui.sync_news()
    gc.collect()
def clear(self):
    """ Clear the output_table """
    self.output_table = None
    self.output_table_default = None
    gc.collect()
def lars_regression_noise_ipyparallel(pars):
    import numpy as np
    import os
    import sys
    import time  # needed for time.time() below when running on an engine
    import gc

    Y_name, C_name, noise_sn, idxs_C, idxs_Y = pars
    Y = np.load(Y_name, mmap_mode='r')
    Y = np.array(Y[idxs_Y, :])
    C = np.load(C_name, mmap_mode='r')
    C = np.array(C)
    _, T = np.shape(C)
    #sys.stdout = open(str(os.getpid()) + ".out", "w")
    st = time.time()
    As = []
    #print "*****************:" + str(idxs_Y[0]) + ',' + str(idxs_Y[-1])
    sys.stdout.flush()
    for y, px in zip(Y, idxs_Y):
        #print str(time.time()-st) + ": Pixel" + str(px)
        sys.stdout.flush()
        c = C[idxs_C[px], :]
        if np.size(c) > 0:
            sn = noise_sn[px] ** 2 * T
            _, _, a, _, _ = lars_regression_noise(y, c.T, 1, sn)
            if not np.isscalar(a):
                a = a.T
            As.append((px, idxs_C[px], a))
    del Y
    del C
    gc.collect()
    return As
def dump_references(log, instances, exclude=[]):
    import gc
    import inspect
    gc.collect()
    frame = inspect.currentframe()
    try:
        exclude.append(instances)
        exclude.append([frame])
        for instance in instances:
            referrers = [x for x in gc.get_referrers(instance)
                         if (x not in exclude and len([y for y in exclude if x in y]) == 0)]
            log.info("referrers for %s: %s", instance, len(referrers))
            for i in range(len(referrers)):
                r = referrers[i]
                log.info("[%s] in %s", i, type(r))
                if inspect.isframe(r):
                    log.info(" frame info: %s", str(inspect.getframeinfo(r))[:1024])
                elif type(r) == list:
                    listref = gc.get_referrers(r)
                    log.info(" list: %s.. %s referrers: %s",
                             str(r[:32])[:1024], len(listref), str(listref[:32])[:1024])
                elif type(r) == dict:
                    if len(r) > 64:
                        log.info(" %s items: %s", len(r), str(r)[:1024])
                        continue
                    for k, v in r.items():
                        if k is instance:
                            log.info(" key with value=%s", v)
                        elif v is instance:
                            log.info(" for key=%s", k)
                else:
                    log.info(" %s : %s", type(r), r)
    finally:
        del frame
def print_leaks():
    global before, after
    gc.collect()
    lobjs = gc.get_objects()
    for i in lobjs:
        if type(i) not in ignore:
            after[type(i)] += 1
    log.info("print_leaks:")
    leaked = {}
    for k in after:
        delta = after[k] - before[k]
        if delta > 0:
            leaked[delta] = k
    before = after
    after = defaultdict(int)
    for delta in reversed(sorted(leaked.keys())):
        ltype = leaked[delta]
        matches = [x for x in lobjs if type(x) == ltype and ltype not in ignore]
        if len(matches) < 32:
            minfo = [str(x)[:32] for x in matches]
        else:
            minfo = "%s matches" % len(matches)
        log.info("%8i : %s : %s", delta, ltype, minfo)
        if len(matches) < 32 and ltype in detailed:
            frame = inspect.currentframe()
            exclude = [frame, matches, lobjs]
            try:
                dump_references(log, matches, exclude=exclude)
            finally:
                del frame
                del exclude
        del matches
        del minfo
    del lobjs
    return True
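The two snippets above rely on module-level state (log, ignore, detailed, before, after) that is not shown. A minimal sketch of what that setup might look like, with purely illustrative names and type filters:

# Hypothetical module-level setup assumed by print_leaks()/dump_references()
# above; the real project defines its own logger and type filters.
import gc
import inspect
import logging
from collections import defaultdict

log = logging.getLogger("leak-check")
ignore = {int, float, str, bytes, tuple}   # types we never report
detailed = set()                           # types whose referrers we dump in full
before = defaultdict(int)
after = defaultdict(int)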
def cancel_clicked(self, widget, temp=False):
    newtree = devede_other.create_tree(self, "wcancel_job_dialog", self.gladefile, False)
    window = newtree.get_object("wcancel_job_dialog")
    window.show()
    value = window.run()
    window.hide()
    window.destroy()
    if value != -5:  # no
        return True
    self.runner.cancel()
    self.runner.wait_end()
    gobject.source_remove(self.timer)
    self.window.hide()
    self.window.destroy()
    newtree = devede_other.create_tree(self, "waborted_dialog", self.gladefile, False)
    window = newtree.get_object("waborted_dialog")
    window.show()
    window.run()
    window.hide()
    window.destroy()
    window = None
    gc.collect()
    self.main_window_callback()  # show the main window
    return True
def dump_state(self):
    """Dump the state of the application to the output; this method is
    triggered by pressing :kbd:`Ctrl-Alt-D` in the GUI"""
    from camelot.view.model_thread import post
    from camelot.view.register import dump_register
    from camelot.view.proxy.collection_proxy import CollectionProxy
    import gc
    gc.collect()
    dump_register()

    def dump_session_state():
        import collections
        from camelot.model.authentication import Person
        print '======= begin session =============='
        type_counter = collections.defaultdict(int)
        for o in Person.query.session:
            type_counter[type(o).__name__] += 1
        for k, v in type_counter.items():
            print k, v
        print '====== end session =============='

    post(dump_session_state)
    for o in gc.get_objects():
        if isinstance(o, CollectionProxy):
            print o
            for r in gc.get_referrers(o):
                print ' ', type(r).__name__
                for rr in gc.get_referrers(r):
                    print '  ', type(rr).__name__
def test_wiretap(self):
    attic = Location("Attic", "A dark attic.")
    player = Player("fritz", "m")
    io = ConsoleIo(None)
    io.supports_smartquotes = False
    pc = PlayerConnection(player, io)
    player.set_screen_sizes(0, 100)
    julie = NPC("julie", "f")
    julie.move(attic)
    player.move(attic)
    julie.tell("message for julie")
    attic.tell("message for room")
    self.assertEqual(["message for room\n"], player.test_get_output_paragraphs())
    with self.assertRaises(ActionRefused):
        player.create_wiretap(julie)
    player.privileges = {"wizard"}
    player.create_wiretap(julie)
    player.create_wiretap(attic)
    julie.tell("message for julie")
    attic.tell("message for room")
    pubsub.sync()
    output = pc.get_output()
    self.assertTrue("[wiretapped from 'Attic': message for room]" in output)
    self.assertTrue("[wiretapped from 'julie': message for julie]" in output)
    self.assertTrue("[wiretapped from 'julie': message for room]" in output)
    self.assertTrue("message for room " in output)
    # test removing the wiretaps
    player.clear_wiretaps()
    import gc
    gc.collect()
    julie.tell("message for julie")
    attic.tell("message for room")
    self.assertEqual(["message for room\n"], player.test_get_output_paragraphs())
def testSimpleCleanup(self):
    g = graph.Graph()
    op = OpSimple(graph=g)
    r = weakref.ref(op)
    del op
    gc.collect()
    assert r() is None, "cleanup failed"
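The weakref-plus-collect pattern in the test above generalizes beyond this test suite. A standalone sketch of the same idea, with illustrative names that are not part of the original code:

# Standalone sketch: verify that an object is actually collectable once the
# last strong reference is dropped (illustrative only).
import gc
import weakref


class Node:
    def __init__(self):
        self.parent = None


def is_collectable(factory):
    obj = factory()
    ref = weakref.ref(obj)
    del obj
    gc.collect()          # break reference cycles, run finalizers
    return ref() is None  # True means nothing kept the object alive


if __name__ == "__main__":
    assert is_collectable(Node)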
def test_main(verbose=None):
    import sys
    test_classes = (
        TestBasic,
        TestVariousIteratorArgs,
        TestSubclass,
        TestSubclassWithKwargs,
        TestSequence,
    )

    support.run_unittest(*test_classes)

    # verify reference counting
    if verbose and hasattr(sys, "gettotalrefcount"):
        import gc
        counts = [None] * 5
        for i in range(len(counts)):
            support.run_unittest(*test_classes)
            gc.collect()
            counts[i] = sys.gettotalrefcount()
        print(counts)

    # doctests
    from test import test_deque
    support.run_doctest(test_deque, verbose)
def cleanup_core_plugin(self):
    """Ensure that the core plugin is deallocated."""
    nm = manager.NeutronManager
    if not nm.has_instance():
        return

    # TODO(marun) Fix plugins that do not properly initialize notifiers
    agentschedulers_db.AgentSchedulerDbMixin.agent_notifiers = {}

    # Perform a check for deallocation only if explicitly
    # configured to do so since calling gc.collect() after every
    # test increases test suite execution time by ~50%.
    check_plugin_deallocation = (
        bool_from_env('OS_CHECK_PLUGIN_DEALLOCATION'))
    if check_plugin_deallocation:
        plugin = weakref.ref(nm._instance.plugin)

    nm.clear_instance()

    if check_plugin_deallocation:
        gc.collect()

        # TODO(marun) Ensure that mocks are deallocated?
        if plugin() and not isinstance(plugin(), mock.Base):
            raise AssertionError(
                'The plugin for this test was not deallocated.')
def go():
    router_closed = asyncio.Future()
    dealer_closed = asyncio.Future()
    router, _ = yield from loop.create_zmq_connection(
        lambda: ZmqRouterProtocol(router_closed),
        zmq.ROUTER,
        bind='tcp://127.0.0.1:*')

    addr = next(iter(router.bindings()))
    dealer, _ = yield from loop.create_zmq_connection(
        lambda: ZmqDealerProtocol(count, dealer_closed),
        zmq.DEALER,
        connect=addr)

    msg = b'func', b'\0' * 200

    gc.collect()
    t1 = time.monotonic()
    dealer.write(msg)
    yield from dealer_closed
    t2 = time.monotonic()
    gc.collect()
    router.close()
    yield from router_closed
    return t2 - t1
def test_zmq_with_thread(count):
    """zmq with threads"""
    print('.', end='', flush=True)
    ctx = zmq.Context()
    dealer = ctx.socket(zmq.DEALER)
    dealer.bind('tcp://127.0.0.1:*')
    address = dealer.getsockopt(zmq.LAST_ENDPOINT).rstrip(b'\0')
    msg = b'func', b'\0' * 200

    def router_thread():
        router = ctx.socket(zmq.ROUTER)
        router.connect(address)
        for i in range(count):
            addr, m1, m2 = router.recv_multipart()
            router.send_multipart((addr, m1, m2))
        router.close()

    th = threading.Thread(target=router_thread)
    th.start()

    gc.collect()
    t1 = time.monotonic()
    for i in range(count):
        dealer.send_multipart(msg)
        dealer.recv_multipart()
    t2 = time.monotonic()
    gc.collect()
    th.join()
    dealer.close()
    ctx.destroy()
    return t2 - t1
def test_contourf_transform_path_counting():
    ax = plt.axes(projection=ccrs.Robinson())
    plt.draw()

    # Capture the size of the cache before our test.
    gc.collect()
    initial_cache_size = len(cgeoaxes._PATH_TRANSFORM_CACHE)

    path_to_geos_counter = CallCounter(cartopy.mpl.patch, 'path_to_geos')
    with path_to_geos_counter:
        x, y, z = sample_data((30, 60))
        cs = plt.contourf(x, y, z, 5, transform=ccrs.PlateCarree())
        n_geom = sum([len(c.get_paths()) for c in cs.collections])
        del cs
        if not six.PY3:
            del c
        plt.draw()

    # Before the performance enhancement, the count would have been
    # 2 * n_geom, but should now be just n_geom.
    msg = ('The given geometry was transformed too many times (expected: %s; '
           'got %s) - the caching is not working.'
           '' % (n_geom, path_to_geos_counter.count))
    assert path_to_geos_counter.count == n_geom, msg

    # Check the cache has an entry for each geometry.
    assert len(cgeoaxes._PATH_TRANSFORM_CACHE) == initial_cache_size + n_geom

    # Check that the cache is empty again once we've dropped all references
    # to the source paths.
    plt.clf()
    gc.collect()
    assert len(cgeoaxes._PATH_TRANSFORM_CACHE) == initial_cache_size

    plt.close()
def openFile(self, filename, weakreference=False):
    gc.collect()
    for item in self.rootItem:
        if item.file.filename == filename:
            ddict = {}
            ddict['event'] = "fileUpdated"
            ddict['filename'] = filename
            self.sigFileUpdated.emit(ddict)
            return item.file
    phynxFile = phynx.File(filename, 'r')
    if weakreference:
        def phynxFileInstanceDistroyed(weakrefObject):
            idx = self.rootItem._identifiers.index(id(weakrefObject))
            child = self.rootItem._children[idx]
            child.clearChildren()
            del self._idMap[id(child)]
            self.rootItem.deleteChild(child)
            if not self.rootItem.hasChildren:
                self.clear()
            return
        refProxy = weakref.proxy(phynxFile, phynxFileInstanceDistroyed)
        self.rootItem.appendChild(refProxy)
    else:
        self.rootItem.appendChild(phynxFile)
    ddict = {}
    ddict['event'] = "fileAppended"
    ddict['filename'] = filename
    self.sigFileAppended.emit(ddict)
    return phynxFile
def test_collect_garbage(self):
    self.preclean()
    # Each of these causes four objects to be garbage: two
    # Uncollectables and their instance dicts.
    Uncollectable()
    Uncollectable()
    C1055820(666)
    gc.collect()
    for v in self.visit:
        if v[1] != "stop":
            continue
        info = v[2]
        self.assertEqual(info["collected"], 2)
        self.assertEqual(info["uncollectable"], 8)

    # We should now have the Uncollectables in gc.garbage
    self.assertEqual(len(gc.garbage), 4)
    for e in gc.garbage:
        self.assertIsInstance(e, Uncollectable)

    # Now, let our callback handle the Uncollectable instances
    self.cleanup = True
    self.visit = []
    gc.garbage[:] = []
    gc.collect()
    for v in self.visit:
        if v[1] != "stop":
            continue
        info = v[2]
        self.assertEqual(info["collected"], 0)
        self.assertEqual(info["uncollectable"], 4)

    # Uncollectables should be gone
    self.assertEqual(len(gc.garbage), 0)
def test_class(self):
    class A:
        pass
    A.a = A
    gc.collect()
    del A
    self.assertNotEqual(gc.collect(), 0)
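gc.collect() returns the number of unreachable objects it found, which is what the test above relies on: the class refers to itself through its attribute, so it can only be reclaimed by the cyclic collector. A small, purely illustrative sketch of the same idea using an instance cycle:

# Illustration (not from the original test suite): a self-referential cycle
# is reclaimed only by the cyclic collector, and collect() reports it.
import gc


def make_cycle():
    d = {}
    d["self"] = d  # the dict refers to itself


make_cycle()
unreachable = gc.collect()
print("unreachable objects found:", unreachable)  # typically >= 1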
def test_get_stats(self):
    stats = gc.get_stats()
    self.assertEqual(len(stats), 3)
    for st in stats:
        self.assertIsInstance(st, dict)
        self.assertEqual(set(st),
                         {"collected", "collections", "uncollectable"})
        self.assertGreaterEqual(st["collected"], 0)
        self.assertGreaterEqual(st["collections"], 0)
        self.assertGreaterEqual(st["uncollectable"], 0)

    # Check that collection counts are incremented correctly
    if gc.isenabled():
        self.addCleanup(gc.enable)
        gc.disable()
    old = gc.get_stats()
    gc.collect(0)
    new = gc.get_stats()
    self.assertEqual(new[0]["collections"], old[0]["collections"] + 1)
    self.assertEqual(new[1]["collections"], old[1]["collections"])
    self.assertEqual(new[2]["collections"], old[2]["collections"])
    gc.collect(2)
    new = gc.get_stats()
    self.assertEqual(new[0]["collections"], old[0]["collections"] + 1)
    self.assertEqual(new[1]["collections"], old[1]["collections"])
    self.assertEqual(new[2]["collections"], old[2]["collections"] + 1)
def test_collect(self):
    self.preclean()
    gc.collect()
    # Algorithmically verify the contents of self.visit
    # because it is long and tortuous.

    # Count the number of visits to each callback
    n = [v[0] for v in self.visit]
    n1 = [i for i in n if i == 1]
    n2 = [i for i in n if i == 2]
    self.assertEqual(n1, [1] * 2)
    self.assertEqual(n2, [2] * 2)

    # Count that we got the right number of start and stop callbacks.
    n = [v[1] for v in self.visit]
    n1 = [i for i in n if i == "start"]
    n2 = [i for i in n if i == "stop"]
    self.assertEqual(n1, ["start"] * 2)
    self.assertEqual(n2, ["stop"] * 2)

    # Check that we got the right info dict for all callbacks
    for v in self.visit:
        info = v[2]
        self.assertTrue("generation" in info)
        self.assertTrue("collected" in info)
        self.assertTrue("uncollectable" in info)
def test_boom2(self):
    class Boom2:
        def __init__(self):
            self.x = 0

        def __getattr__(self, someattribute):
            self.x += 1
            if self.x > 1:
                del self.attr
            raise AttributeError

    a = Boom2()
    b = Boom2()
    a.attr = b
    b.attr = a

    gc.collect()
    garbagelen = len(gc.garbage)
    del a, b
    # Much like test_boom(), except that __getattr__ doesn't break the
    # cycle until the second time gc checks for __del__.  As of 2.3b1,
    # there isn't a second time, so this simply cleans up the trash cycle.
    # We expect a, b, a.__dict__ and b.__dict__ (4 objects) to get
    # reclaimed this way.
    self.assertEqual(gc.collect(), 4)
    self.assertEqual(len(gc.garbage), garbagelen)
def growth(limit=10, peak_stats={}, shortnames=True):
    """Calculate the increase in peak object counts since last call.

    Returns a dict of {type_name: (delta, count)}.

    Limits the output to ``limit`` largest deltas.  You may set ``limit``
    to None to see all of them.

    Uses and updates ``peak_stats``, a dictionary from type names to
    previously seen peak object counts.  Usually you don't need to pay
    attention to this argument.

    The caveats documented in :func:`typestats` apply.

    Example:

        >>> growth(limit=3)
        {'wrapper_descriptor': 14, 'tuple': 10, 'dict': 7}

    .. versionadded:: 1.8
    """
    gc.collect()
    stats = objgraph.typestats(shortnames=shortnames)
    deltas = []
    for name, count in iteritems(stats):
        delta = count - peak_stats.get(name, 0)
        if delta > 0:
            deltas.append((name, (delta, count)))
            peak_stats[name] = count
    deltas = sorted(deltas, key=operator.itemgetter(1, 0), reverse=True)
    if limit:
        deltas = deltas[:limit]
    return dict(deltas)
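A hedged usage sketch for growth() above: call it once to establish the baseline peak counts, run the code under suspicion, then call it again and inspect the deltas. The leaky_cache/leaky_function names are placeholders, not part of the original module.

# Hypothetical usage of growth() above (illustrative names only).
leaky_cache = []


def leaky_function(n=1000):
    leaky_cache.extend(object() for _ in range(n))


growth()                 # establish the baseline peak counts
leaky_function()
print(growth(limit=5))   # report the types whose peak counts grew the most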
def test_incrgc_simple(self):
    import gc
    from persistent.interfaces import UPTODATE
    from persistent._compat import _b
    cache = self._makeOne()
    oids = []
    for i in range(100):
        oid = _b('oid_%04d' % i)
        oids.append(oid)
        cache[oid] = self._makePersist(oid=oid, state=UPTODATE)
    self.assertEqual(cache.cache_non_ghost_count, 100)

    cache.incrgc()
    gc.collect()  # banish the ghosts who are no longer in the ring

    self.assertEqual(cache.cache_non_ghost_count, 10)
    items = cache.lru_items()
    self.assertEqual(_len(items), 10)
    self.assertEqual(items[0][0], _b('oid_0090'))
    self.assertEqual(items[1][0], _b('oid_0091'))
    self.assertEqual(items[2][0], _b('oid_0092'))
    self.assertEqual(items[3][0], _b('oid_0093'))
    self.assertEqual(items[4][0], _b('oid_0094'))
    self.assertEqual(items[5][0], _b('oid_0095'))
    self.assertEqual(items[6][0], _b('oid_0096'))
    self.assertEqual(items[7][0], _b('oid_0097'))
    self.assertEqual(items[8][0], _b('oid_0098'))
    self.assertEqual(items[9][0], _b('oid_0099'))

    for oid in oids[:90]:
        self.assertTrue(cache.get(oid) is None)
    for oid in oids[90:]:
        self.assertFalse(cache.get(oid) is None)
def testGCCollectNoMemoryManagement(self):
    self.gateway = JavaGateway(
        gateway_parameters=GatewayParameters(
            enable_memory_management=False))
    gc.collect()
    # Should have nothing in the finalizers
    self.assertEqual(len(ThreadSafeFinalizer.finalizers), 0)

    def internal():
        sb = self.gateway.jvm.java.lang.StringBuffer()
        sb.append("Hello World")
        sb2 = self.gateway.jvm.java.lang.StringBuffer()
        sb2.append("Hello World")
        finalizers_size_middle = len(ThreadSafeFinalizer.finalizers)
        return finalizers_size_middle

    finalizers_size_middle = internal()
    gc.collect()

    # Before collection: two objects created + two returned objects (append
    # returns a stringbuffer reference for easy chaining).
    self.assertEqual(finalizers_size_middle, 0)

    # Assert after collection
    self.assertEqual(len(ThreadSafeFinalizer.finalizers), 0)

    self.gateway.shutdown()
def tearDown(self):
    if self.old_env is None:
        del os.environ["CUDA_DEVICE"]
    else:
        os.environ["CUDA_DEVICE"] = self.old_env
    del self.old_env
    gc.collect()
def run_net((k, theta, T, g_inh_, spike_delay)):
    seed(int(os.getpid() * time.time()))
    print os.getpid()
    reinit()
    reinit_default_clock()
    clear(True)
    gc.collect()

    PKJ = PurkinjeCellGroup(1)
    PKJ.V = PKJ.El

    spikes = SpikeMonitor(PKJ)
    spikes.last_spike = None
    V_monitor = StateMonitor(PKJ, 'V', record=0)
    ginh_monitor = StateMonitor(PKJ, 'g_inh', record=0)

    @network_operation(Clock(dt=defaultclock.dt))
    def random_current():
        PKJ.I = gamma(k, theta, size=len(PKJ)) * nA

    @network_operation(Clock(dt=defaultclock.dt))
    def trigger_spike():
        if spikes.spiketimes[0].shape[0] > 0:
            spikes.last_spike = spikes.spiketimes[0][-1] * second
        if spikes.last_spike is not None:
            if abs(defaultclock.t - (spikes.last_spike + spike_delay)) < .000001 * ms:
                PKJ.g_inh = g_inh_

    run(T)
    V_monitor.insert_spikes(spikes)
    first_isi = diff(spikes.spiketimes[0])[0]

    return V_monitor.getvalues(), first_isi, spikes.spiketimes
def find_chain(obj, predicate, edge_func, max_depth=20, extra_ignore=()):
    queue = [obj]
    depth = {id(obj): 0}
    parent = {id(obj): None}
    ignore = set(extra_ignore)
    ignore.add(id(extra_ignore))
    ignore.add(id(queue))
    ignore.add(id(depth))
    ignore.add(id(parent))
    ignore.add(id(ignore))
    ignore.add(id(sys._getframe()))   # this function
    ignore.add(id(sys._getframe(1)))  # find_chain/find_backref_chain, most likely
    gc.collect()
    while queue:
        target = queue.pop(0)
        if predicate(target):
            chain = [target]
            while parent[id(target)] is not None:
                target = parent[id(target)]
                chain.append(target)
            return chain
        tdepth = depth[id(target)]
        if tdepth < max_depth:
            referrers = edge_func(target)
            ignore.add(id(referrers))
            for source in referrers:
                if id(source) in ignore:
                    continue
                if id(source) not in depth:
                    depth[id(source)] = tdepth + 1
                    parent[id(source)] = target
                    queue.append(source)
    return [obj]  # not found
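find_chain() is generic over edge_func; a typical use is to pass gc.get_referrers so the breadth-first search walks backwards from a leaked object to whatever still holds it. A hedged sketch with illustrative names that are not part of the original code:

# Hypothetical usage of find_chain() above: walk referrer edges back from
# `leaked` to the list that keeps it alive (illustrative only).
import gc

leaked = {"payload": bytearray(1024)}
holder = [leaked]  # something that keeps the object alive

chain = find_chain(
    leaked,
    predicate=lambda x: isinstance(x, list),  # stop at the holding list
    edge_func=gc.get_referrers,
)
print(len(chain), "objects in the reference chain")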
def __init__(self, inputLayerSize, hiddenLayerSize, outputLayerSize):
    # Define hyperparameters
    gc.collect()
    self.inputLayerSize = inputLayerSize
    self.outputLayerSize = outputLayerSize
    self.hiddenLayerSize = hiddenLayerSize

    # Weights (parameters)
    # self.W1 = np.random.randn(self.inputLayerSize, self.hiddenLayerSize)
    # self.W2 = np.random.randn(self.hiddenLayerSize, self.outputLayerSize)
    self.W1 = np.random.uniform(-0.5, 0.5, (self.inputLayerSize, self.hiddenLayerSize))
    self.W2 = np.random.uniform(-0.5, 0.5, (self.hiddenLayerSize, self.outputLayerSize))

    try:
        f = open('myfile', 'r')
        for i in range(self.inputLayerSize):
            for j in range(self.hiddenLayerSize):
                temp = f.readline()
                self.W1[i][j] = float(temp)
        for i in range(self.hiddenLayerSize):
            for j in range(self.outputLayerSize):
                temp = f.readline()
                self.W2[i][j] = float(temp)
        f.close()
    except Exception as e:
        print("File not Found")
def test_bug21435(self):
    # This is a poor test - its only virtue is that it happened to
    # segfault on Tim's Windows box before the patch for 21435 was
    # applied.  That's a nasty bug relying on specific pieces of cyclic
    # trash appearing in exactly the right order in finalize_garbage()'s
    # input list.
    # But there's no reliable way to force that order from Python code,
    # so over time chances are good this test won't really be testing much
    # of anything anymore.  Still, if it blows up, there's _some_
    # problem ;-)
    gc.collect()

    class A:
        pass

    class B:
        def __init__(self, x):
            self.x = x

        def __del__(self):
            self.attr = None

    def do_work():
        a = A()
        b = B(A())

        a.attr = b
        b.attr = a

    do_work()
    gc.collect()  # this blows up (bad C pointer) when it fails
def period_over_period(self, df, start_date, end_date, period, history_periods=2, timestamp_col='timestamp_of_first_event'): try: # filter cols if necessary string = '0 {}(s) prev(current)'.format(period) # filter out the dates greater than today df_current = df.assign(period=string) # label the days being compared with the same label if len(df_current) > 0: df_current = self.label_dates_pop(df_current, period, timestamp_col) # zero out time information start = datetime(start_date.year, start_date.month, start_date.day, 0, 0, 0) end = datetime(end_date.year, end_date.month, end_date.day, 0, 0, 0) cols = list(df.columns) logger.warning(' Line 293 %s:df %s', period, df.head(10)) logger.warning(' Line 293 %s:df cols %s', period, cols) counter = 1 if isinstance(history_periods, str): history_periods = int(history_periods) # make dataframes for request no. of periods start, end = self.shift_period_range(period, start, end) while counter < history_periods and start >= self.initial_date: # load data if period == 'quarter': logger.warning('start:end %s:%s', start, end) if 'bcc' in self.table: df_temp = self.load_df_pym(start, end, cols, timestamp_col) else: df_temp = self.load_df(start, end, cols, timestamp_col) if df_temp is not None: if len(df_temp) > 1: string = '{} {}(s) prev'.format(counter, period) # label period df_temp = df_temp.assign(period=string) # relabel days to get matching day of week,doy, dom, for different periods df_temp = self.label_dates_pop(df_temp, period, timestamp_col) #logger.warning('df temp loaded for %s previous: %s',counter,len(df_temp)) df_current = concat_dfs(df_current, df_temp) del df_temp gc.collect() # shift the loading window counter += 1 start, end = self.shift_period_range(period, start, end) if period == 'week': logger.warning('LINE 327 df_current:%s', df_current.head(10)) return df_current except Exception: logger.error('period over period', exc_info=True)
def store_survey(self, survey_name, R_table_name, destination_table_name, data_dir, variables=None, force_recreation=True): """ Store a R data table in an HDF5 file Parameters ---------- survey_name : string the name of the survey R_table_name : string the name of the R data table destination_table_name : string the name of the table in the HDFStore data_dir : path the directory where to find the RData file variables : list of string, default None When not None, list of the variables to keep """ gc.collect() year = self.year def get_survey_year(survey_name, year): if survey_name == "logement": if year == 2003: return 2003 elif year in range(2006,2010): return 2006 if survey_name == "patrimoine": return 2004 else: return year print "creating %s" %(destination_table_name) table_Rdata = R_table_name + ".Rdata" filename = os.path.join(data_dir, str(get_survey_year(survey_name, year)), table_Rdata) print filename if not os.path.isfile(filename): raise Exception("filename do not exists") rpy.r.load(filename) stored_table = com.load_data(R_table_name) store = HDFStore(self.hdf5_filename) store_path = str(self.year)+"/"+destination_table_name if store_path in store: if force_recreation is not True: print store_path + "already exists, do not re-create and exit" store.close() return if variables is not None: print store print store_path print variables variables_stored = list(set(variables).intersection(set(stored_table.columns))) print list(set(variables).difference((set(stored_table.columns)))) store[store_path] = stored_table[variables_stored] else: store[store_path] = stored_table store.close() del stored_table gc.collect()
def fit(self, X_train=None, Y_train=None, X_test=None, Y_test=None, dataset_train=None, dataset_val=None, time_limit=None, **kwargs): start_time = time.time() params = self.params.copy() # TODO: kwargs can have num_cpu, num_gpu. Currently these are ignored. verbosity = kwargs.get('verbosity', 2) params = fixedvals_from_searchspaces(params) if verbosity <= 1: verbose_eval = False elif verbosity == 2: verbose_eval = 1000 elif verbosity == 3: verbose_eval = 50 else: verbose_eval = 1 eval_metric, eval_metric_name = self.get_eval_metric() dataset_train, dataset_val = self.generate_datasets(X_train=X_train, Y_train=Y_train, params=params, X_test=X_test, Y_test=Y_test, dataset_train=dataset_train, dataset_val=dataset_val) gc.collect() num_boost_round = params.pop('num_boost_round', 1000) logger.log(15, f'Training Gradient Boosting Model for {num_boost_round} rounds...') logger.log(15, "with the following hyperparameter settings:") logger.log(15, params) num_rows_train = len(dataset_train.data) if 'min_data_in_leaf' in params: if params['min_data_in_leaf'] > num_rows_train: # TODO: may not be necessary params['min_data_in_leaf'] = max(1, int(num_rows_train / 5.0)) # TODO: Better solution: Track trend to early stop when score is far worse than best score, or score is trending worse over time if (dataset_val is not None) and (dataset_train is not None): modifier = 1 if num_rows_train <= 10000 else 10000 / num_rows_train early_stopping_rounds = max(round(modifier * 150), 10) else: early_stopping_rounds = 150 callbacks = [] valid_names = ['train_set'] valid_sets = [dataset_train] if dataset_val is not None: reporter = kwargs.get('reporter', None) train_loss_name = self._get_train_loss_name() if reporter is not None else None if train_loss_name is not None: if 'metric' not in params or params['metric'] == '': params['metric'] = train_loss_name elif train_loss_name not in params['metric']: params['metric'] = f'{params["metric"]},{train_loss_name}' callbacks += [ # Note: Don't use self.params_aux['max_memory_usage_ratio'] here as LightGBM handles memory per iteration optimally. # TODO: Consider using when ratio < 1. early_stopping_custom(early_stopping_rounds, metrics_to_use=[('valid_set', eval_metric_name)], max_diff=None, start_time=start_time, time_limit=time_limit, ignore_dart_warning=True, verbose=False, manual_stop_file=False, reporter=reporter, train_loss_name=train_loss_name), ] valid_names = ['valid_set'] + valid_names valid_sets = [dataset_val] + valid_sets seed_val = params.pop('seed_value', 0) train_params = { 'params': params, 'train_set': dataset_train, 'num_boost_round': num_boost_round, 'valid_sets': valid_sets, 'valid_names': valid_names, 'callbacks': callbacks, 'verbose_eval': verbose_eval, } if not isinstance(eval_metric, str): train_params['feval'] = eval_metric else: if 'metric' not in train_params['params'] or train_params['params']['metric'] == '': train_params['params']['metric'] = eval_metric elif eval_metric not in train_params['params']['metric']: train_params['params']['metric'] = f'{train_params["params"]["metric"]},{eval_metric}' if seed_val is not None: train_params['params']['seed'] = seed_val random.seed(seed_val) np.random.seed(seed_val) # Train LightGBM model: try_import_lightgbm() import lightgbm as lgb self.model = lgb.train(**train_params) self.params_trained['num_boost_round'] = self.model.best_iteration
def run_tests_sequential(self): if self.ns.trace: import trace self.tracer = trace.Trace(trace=False, count=True) save_modules = sys.modules.keys() print("Run tests sequentially") previous_test = None for test_index, test in enumerate(self.tests, 1): start_time = time.monotonic() text = test if previous_test: text = '%s -- %s' % (text, previous_test) self.display_progress(test_index, text) if self.tracer: # If we're tracing code coverage, then we don't exit with status # if on a false return value from main. cmd = ('result = runtest(self.ns, test); ' 'self.accumulate_result(test, result)') ns = dict(locals()) self.tracer.runctx(cmd, globals=globals(), locals=ns) result = ns['result'] else: try: result = runtest(self.ns, test) except KeyboardInterrupt: self.interrupted = True self.accumulate_result(test, (INTERRUPTED, None)) break else: self.accumulate_result(test, result) previous_test = format_test_result(test, result[0]) test_time = time.monotonic() - start_time if test_time >= PROGRESS_MIN_TIME: previous_test = "%s in %s" % (previous_test, format_duration(test_time)) elif result[0] == PASSED: # be quiet: say nothing if the test passed shortly previous_test = None if self.ns.findleaks: gc.collect() if gc.garbage: print("Warning: test created", len(gc.garbage), end=' ') print("uncollectable object(s).") # move the uncollectable objects somewhere so we don't see # them again self.found_garbage.extend(gc.garbage) del gc.garbage[:] # Unload the newly imported modules (best effort finalization) for module in sys.modules.keys(): if module not in save_modules and module.startswith("test."): support.unload(module) if previous_test: print(previous_test)
def jorudan(num_rows=None): tmp_jorudan = pd.read_csv('../input/jorudan.tsv', sep='\t', nrows=num_rows) # 日付をdatetime型へ変換 tmp_jorudan['access_date'] = pd.to_datetime(tmp_jorudan['access_date']) tmp_jorudan['datetime'] = pd.to_datetime( tmp_jorudan['departure_and_arrival_date']) # 当日以降のアクセスデータを削除 tmp_jorudan = tmp_jorudan[ tmp_jorudan['datetime'] > tmp_jorudan['access_date']] # 2018/1/1以降のデータを削除 tmp_jorudan = tmp_jorudan[tmp_jorudan['datetime'] < '2018-01-01'] # one-hot encoding jorudan, cols = one_hot_encoder(tmp_jorudan[[ 'departure_and_arrival_type', 'departure_and_arrival_place_type', 'departure_prefecture', 'arrival_prefecture' ]], nan_as_category=False) # 日付と公園名のカラムを追加 jorudan['park'] = tmp_jorudan['park'] jorudan['datetime'] = tmp_jorudan['datetime'] feats_jorudan = [ c for c in jorudan.columns if c not in ['park', 'datetime'] ] # 集約用のdictを生成 agg_jorudan = {} for c in feats_jorudan: agg_jorudan[c] = ['sum', 'mean'] # 日付と公園名で集約 jorudan = jorudan.groupby(['park', 'datetime']).agg(agg_jorudan) # ゼロ埋め jorudan.fillna(0, inplace=True) # カラム名の変更 jorudan.columns = pd.Index( [e[0] + "_" + e[1].upper() for e in jorudan.columns.tolist()]) # 追加の特徴量 jorudan['departure_and_arrival_place_mean_sum'] = jorudan[ 'departure_and_arrival_place_type_A_MEAN'] + jorudan[ 'departure_and_arrival_place_type_D_MEAN'] jorudan['departure_and_arrival_place_sum_sum'] = jorudan[ 'departure_and_arrival_place_type_A_SUM'] + jorudan[ 'departure_and_arrival_place_type_D_SUM'] jorudan['departure_and_arrival_type__mean_sum'] = jorudan[ 'departure_and_arrival_type_A_MEAN'] + jorudan[ 'departure_and_arrival_type_D_MEAN'] jorudan['departure_and_arrival_type_sum_sum'] = jorudan[ 'departure_and_arrival_type_A_SUM'] + jorudan[ 'departure_and_arrival_type_D_SUM'] jorudan['departure_and_arrival_place_mean_ratio'] = jorudan[ 'departure_and_arrival_place_type_A_MEAN'] / jorudan[ 'departure_and_arrival_place_type_D_MEAN'] jorudan['departure_and_arrival_place_sum_ratio'] = jorudan[ 'departure_and_arrival_place_type_A_SUM'] / jorudan[ 'departure_and_arrival_place_type_D_SUM'] jorudan['departure_and_arrival_type_mean_ratio'] = jorudan[ 'departure_and_arrival_type_A_MEAN'] / jorudan[ 'departure_and_arrival_type_D_MEAN'] jorudan['departure_and_arrival_type_sum_ratio'] = jorudan[ 'departure_and_arrival_type_A_SUM'] / jorudan[ 'departure_and_arrival_type_D_SUM'] # カラム名を変更 jorudan.columns = ['JORUDAN_' + c for c in jorudan.columns] del tmp_jorudan gc.collect() return jorudan
def hotlink(num_rows=None): # load csv hotlink = pd.read_csv('../input/hotlink.tsv', sep='\t') # aggregate by datetime & keyword hotlink_all = hotlink.pivot_table(index='datetime', columns='keyword', values='count', aggfunc=[np.sum, np.max, 'mean']) hotlink_bbs = hotlink[hotlink.domain == 'bbs'].pivot_table( index='datetime', columns='keyword', values='count', aggfunc=[np.sum, np.max, 'mean']) hotlink_twitter = hotlink[hotlink.domain == 'twitter_sampling'].pivot_table( index='datetime', columns='keyword', values='count', aggfunc=[np.sum, np.max, 'mean']) hotlink_blog = hotlink[hotlink.domain == 'blog'].pivot_table( index='datetime', columns='keyword', values='count', aggfunc=[np.sum, np.max, 'mean']) # 欠損値をゼロ埋め hotlink_all.fillna(0, inplace=True) hotlink_bbs.fillna(0, inplace=True) hotlink_twitter.fillna(0, inplace=True) hotlink_blog.fillna(0, inplace=True) # indexをdatetime型に変換 hotlink_all.index = pd.to_datetime(hotlink_all.index) hotlink_bbs.index = pd.to_datetime(hotlink_bbs.index) hotlink_twitter.index = pd.to_datetime(hotlink_twitter.index) hotlink_blog.index = pd.to_datetime(hotlink_blog.index) # 1日先へシフト hotlink_all = hotlink_all.shift() hotlink_bbs = hotlink_bbs.shift() hotlink_twitter = hotlink_twitter.shift() hotlink_blog = hotlink_blog.shift() # カラム名を変更 hotlink_all.columns = pd.Index( [e[1] + "_" + e[0].upper() for e in hotlink_all.columns.tolist()]) hotlink_bbs.columns = pd.Index( [e[1] + "_" + e[0].upper() for e in hotlink_bbs.columns.tolist()]) hotlink_twitter.columns = pd.Index( [e[1] + "_" + e[0].upper() for e in hotlink_twitter.columns.tolist()]) hotlink_blog.columns = pd.Index( [e[1] + "_" + e[0].upper() for e in hotlink_blog.columns.tolist()]) hotlink_all.columns = ['HOTLINK_ALL_' + c for c in hotlink_all.columns] hotlink_bbs.columns = ['HOTLINK_BBS_' + c for c in hotlink_bbs.columns] hotlink_twitter.columns = [ 'HOTLINK_TWITTER_' + c for c in hotlink_twitter.columns ] hotlink_blog.columns = ['HOTLINK_BLOG_' + c for c in hotlink_blog.columns] # merge hotlink = pd.concat( [hotlink_all, hotlink_bbs, hotlink_twitter, hotlink_blog], axis=1) del hotlink_all, hotlink_bbs, hotlink_twitter, hotlink_blog gc.collect() return hotlink
def train_test(num_rows=None): print("Loading datasets...") # load datasets train_df = pd.read_csv('../input/train.tsv', sep='\t', nrows=num_rows) test_df = pd.read_csv('../input/test.tsv', sep='\t', nrows=num_rows) print("Train samples: {}, test samples: {}".format(len(train_df), len(test_df))) #testのtargetをnanにしときます test_df['visitors'] = np.nan # merge df = train_df.append(test_df[['datetime', 'park', 'visitors']]).reset_index() del train_df, test_df gc.collect() # 日付をdatetime型へ変換 df['datetime'] = pd.to_datetime(df['datetime']) # 日本の祝日データを追加 df['japanese_holiday'] = getJapaneseHolidays(df['datetime']).replace(2, 1) # 連休数のファクターを生成 holidays = df.groupby('datetime')['japanese_holiday'].mean().replace(2, 1) holidays = fillHolidays(holidays).replace(2, 1) # 休日の谷間の平日を休日にする df['num_holidays'] = df['datetime'].map(getNumHolidays(holidays)) # 季節性の特徴量を追加 df['day'] = df['datetime'].dt.day.astype(object) df['month'] = df['datetime'].dt.month.astype(object) df['weekday'] = df['datetime'].dt.weekday.astype(object) df['weekofyear'] = df['datetime'].dt.weekofyear.astype(object) # df['day_month'] = df['day'].astype(str)+'_'+df['month'].astype(str) # df['day_weekday'] = df['day'].astype(str)+'_'+df['weekday'].astype(str) # df['day_weekofyear'] = df['day'].astype(str)+'_'+df['weekofyear'].astype(str) df['month_weekday'] = df['month'].astype(str) + '_' + df['weekday'].astype( str) df['month_weekofyear'] = df['month'].astype( str) + '_' + df['weekofyear'].astype(str) # df['weekday_weekofyear'] = df['weekday'].astype(str)+'_'+df['weekofyear'].astype(str) df['new_years_day'] = getNewYearsDay(df['datetime']) df['golden_week'] = getGoldenWeek(df['datetime']) # df['park_day'] = df['park'].astype(str)+'_'+df['day'].astype(str) df['park_month'] = df['park'].astype(str) + '_' + df['month'].astype(str) df['park_weekday'] = df['park'].astype(str) + '_' + df['weekday'].astype( str) df['park_japanese_holiday'] = df['park'].astype( str) + '_' + df['japanese_holiday'].astype(str) # df['park_weekofyear'] = df['park'].astype(str)+'_'+df['weekofyear'].astype(str) df['park_num_holiday'] = df['park'].astype( str) + '_' + df['num_holidays'].astype(str) df['park_new_years_day'] = df['park'].astype( str) + '_' + df['new_years_day'].astype(str) df['park_golden_week'] = df['park'].astype( str) + '_' + df['golden_week'].astype(str) # categorical変数を変換 df_res, cat_cols = one_hot_encoder(df, nan_as_category=False) # stratify & mearge用 df_res['park'] = df['park'] df_res['weekofyear'] = df['weekofyear'].astype(int) df_res['weekday'] = df['weekday'].astype(int) df_res['year'] = df['datetime'].dt.year.astype(int) df_res['month'] = df['datetime'].dt.month.astype(int) df_res['park_month'], _ = pd.factorize(df['park_month']) df_res['park_japanese_holiday'], _ = pd.factorize( df['park_japanese_holiday']) # df_res['ISESHIMA_summit'] = ((df['park']=='伊勢志摩国立公園')&df['japanese_holiday']&('2016-5-27'>df['datetime'])&(df['datetime']>'2015-6-5')).astype(int) # 2016年伊勢島サミット開催決定後の休日フラグ return df_res
from datetime import datetime

t1 = datetime.now()

all_vector = ['creativeSize', 'aid', 'advertiserId', 'campaignId', 'creativeId', 'adCategoryId',
              'productId', 'productType', 'LBS', 'age', 'appIdAction', 'appIdInstall',
              'carrier', 'consumptionAbility', 'ct', 'education', 'gender', 'house',
              'interest1', 'interest2', 'interest3', 'interest4', 'interest5',
              'kw1', 'kw2', 'kw3', 'os', 'marriageStatus', 'topic1', 'topic2', 'topic3']  # 15

usecols = ['uid', 'label']
feat_List = usecols[:-1]

train_test = pd.read_csv("../data/train_test1_2_data.csv", usecols=usecols)
train_test = train_test.fillna('-1')
train = train_test[train_test.label != -1]
test = train_test[train_test.label == -1]
del train_test
gc.collect()
print(train.shape)
print(test.shape)


def statis_feat(df, df_val, feature):
    df = df.groupby(feature)["label"].agg(['sum', 'count']).reset_index()
    new_feat_name = feature + '_stas'
    df.loc[:, new_feat_name] = 100 * (df['sum'] + 1 + 0.0001) / (df['count'] + 30 + 0.0001)
    df.loc[:, new_feat_name] = np.round(df.loc[:, new_feat_name].values, 4)
    df_stas = df[[feature, new_feat_name]]
    df_val = pd.merge(df_val, df_stas, how='left', on=feature)
    return df_val[['index', new_feat_name]]  # return index and new_feat_name


def Feature(train, predict, feat):
    train['index'] = list(range(train.shape[0]))
def test_ret_struct_val(self): from rpython.translator.tool.cbuild import ExternalCompilationInfo from rpython.translator.platform import platform from rpython.tool.udir import udir c_file = udir.ensure("test_libffi", dir=1).join("xlib.c") c_file.write(py.code.Source(''' #include "src/precommondefs.h" #include <stdlib.h> #include <stdio.h> struct s2h { short x; short y; }; RPY_EXPORTED struct s2h give(short x, short y) { struct s2h out; out.x = x; out.y = y; return out; } RPY_EXPORTED struct s2h perturb(struct s2h inp) { inp.x *= 2; inp.y *= 3; return inp; } ''')) eci = ExternalCompilationInfo(include_dirs=[cdir]) lib_name = str(platform.compile([c_file], eci, 'x2', standalone=False)) lib = CDLL(lib_name) size = ffi_type_sshort.c_size*2 alignment = ffi_type_sshort.c_alignment tpe = make_struct_ffitype_e(size, alignment, [ffi_type_sshort]*2) give = lib.getrawpointer('give', [ffi_type_sshort, ffi_type_sshort], tpe.ffistruct) inbuffer = lltype.malloc(rffi.SHORTP.TO, 2, flavor='raw') inbuffer[0] = rffi.cast(rffi.SHORT, 40) inbuffer[1] = rffi.cast(rffi.SHORT, 72) outbuffer = lltype.malloc(rffi.SHORTP.TO, 2, flavor='raw') give.call([rffi.cast(rffi.VOIDP, inbuffer), rffi.cast(rffi.VOIDP, rffi.ptradd(inbuffer, 1))], rffi.cast(rffi.VOIDP, outbuffer)) assert outbuffer[0] == 40 assert outbuffer[1] == 72 perturb = lib.getrawpointer('perturb', [tpe.ffistruct], tpe.ffistruct) inbuffer[0] = rffi.cast(rffi.SHORT, 7) inbuffer[1] = rffi.cast(rffi.SHORT, 11) perturb.call([rffi.cast(rffi.VOIDP, inbuffer)], rffi.cast(rffi.VOIDP, outbuffer)) assert inbuffer[0] == 7 assert inbuffer[1] == 11 assert outbuffer[0] == 14 assert outbuffer[1] == 33 lltype.free(outbuffer, flavor='raw') lltype.free(inbuffer, flavor='raw') del give del perturb lltype.free(tpe, flavor='raw') gc.collect() del lib assert not ALLOCATED
def DO(num_leaves,max_depth, option): print('------------------------------------------------') print('start...') print('fraction:', frac) print('prepare predictors, categorical and target...') predictors = get_predictors(option) categorical = get_categorical(predictors) target = TARGET if debug==0: print('=======================================================================') print('process on server...') print('=======================================================================') if debug==1: print('=======================================================================') print('for testing only...') print('=======================================================================') if debug==2: print('=======================================================================') print('for LIGHT TEST only...') print('=======================================================================') print('reading train') subfilename = yearmonthdate_string + '_' + str(len(predictors)) + \ 'features_' + boosting_type + '_cv_newparam2_' + str(int(100*frac)) + \ 'percent_full_%d_%d'%(num_leaves,max_depth) + '_OPTION' + str(option) + '.csv.gz' modelfilename = yearmonthdate_string + '_' + str(len(predictors)) + \ 'features_' + boosting_type + '_cv_newparam2_' + str(int(100*frac)) + \ 'percent_full_%d_%d'%(num_leaves,max_depth) + '_OPTION' + str(option) print('----------------------------------------------------------') print('SUMMARY:') print('----------------------------------------------------------') print('predictors:',predictors) print('taget', target) print('categorical', categorical) print('submission file name:', subfilename) print('model file name:', modelfilename) print('fraction:', frac) print('option:', option) print('----------------------------------------------------------') train_df = read_processed_h5(TRAIN_HDF5, predictors+target) if frac<1: train_df = train_df.sample(frac=frac, random_state = SEED) print_memory('afer reading train:') print(train_df.head()) print("train size: ", len(train_df)) gc.collect() print('----------------------------------------------------------') print("Training...") start_time = time.time() params = { 'boosting_type': boosting_type, 'objective': 'binary', 'metric': 'auc', 'learning_rate': 0.05, 'num_leaves': num_leaves, # we should let it be smaller than 2^(max_depth) 'max_depth': max_depth, # -1 means no limit 'min_data_in_leaf': 128, # Minimum number of data need in a child(min_data_in_leaf) # 'max_bin': 512, # Number of bucketed bin for feature values 'max_bin': 1024, # Number of bucketed bin for feature values 'subsample': 0.5, # Subsample ratio of the training instance. 'subsample_freq': 1, # frequence of subsample, <=0 means no enable 'feature_fraction': 0.9, # Subsample ratio of columns when constructing each tree. 
'min_child_weight': 0, # Minimum sum of instance weight(hessian) needed in a child(leaf) 'subsample_for_bin': 200000, # Number of samples for constructing bin 'min_split_gain': 0, # lambda_l1, lambda_l2 and min_gain_to_split to regularization 'reg_alpha': 10, # L1 regularization term on weights 'reg_lambda': 0, # L2 regularization term on weights 'nthread': 4, 'verbose': 0, 'scale_pos_weight': 200, # because training data is extremely unbalanced } print ('params:', params) print('>> cleaning train...') train_df_array = train_df[predictors].values train_df_labels = train_df[target].values.astype('int').flatten() del train_df; gc.collect() print_memory() print('>> prepare dataset...') dtrain_lgb = lgb.Dataset(train_df_array, label=train_df_labels, feature_name=predictors, categorical_feature=categorical) del train_df_array, train_df_labels; gc.collect() print_memory() print('>> start cv...') cv_results = lgb.cv(params, dtrain_lgb, categorical_feature = categorical, num_boost_round=2000, metrics='auc', seed = SEED, shuffle = False, stratified=True, nfold=5, show_stdv=True, early_stopping_rounds=30, verbose_eval=True) print('[{}]: model training time'.format(time.time() - start_time)) print('Total memory in use after cv training: ', process.memory_info().rss/(2**30), ' GB\n') # print (cv_results) print('--------------------------------------------------------------------') num_boost_rounds_lgb = len(cv_results['auc-mean']) print('num_boost_rounds_lgb=' + str(num_boost_rounds_lgb)) print ('>> start trainning... ') model_lgb = lgb.train( params, dtrain_lgb, num_boost_round=num_boost_rounds_lgb, feature_name = predictors, categorical_feature = categorical) del dtrain_lgb gc.collect() print('--------------------------------------------------------------------') print('>> save model...') # save model to file model_lgb.save_model(modelfilename+'.txt') print('--------------------------------------------------------------------') print('>> reading test') test_df = read_processed_h5(TEST_HDF5,predictors+['click_id']) print(test_df.info()); print(test_df.head()) print_memory() print("test size : ", len(test_df)) sub = pd.DataFrame() sub['click_id'] = test_df['click_id'].astype('int') print(">> predicting...") sub['is_attributed'] = model_lgb.predict(test_df[predictors]) # if not debug: print("writing...") sub.to_csv(subfilename,index=False,compression='gzip') print("done...") return sub
def build_useful_data(): """ #TODO 利用pca降维,或者LDA降维......方式构建特征文件 构建可用的初始特征数据, 默认原始竞赛数据储存在当前文件夹中的datas文件夹中. :return: 可用数据(pd.DataFrame实例) """ # 读取蛋白质数据 print("Loading and merging data") protein_train = pd.read_csv('datas/df_protein_train.csv') protein_test = pd.read_csv('datas/df_protein_test.csv') protein_all = pd.concat([protein_train, protein_test]) #添加蛋白质序列长度作为特征 protein_all['seq_len'] = protein_all['Sequence'].apply(len) #读取分子数据 mol_train = pd.read_csv('datas/df_molecule.csv') aff_train = pd.read_csv('datas/df_affinity_train.csv') aff_test = pd.read_csv('datas/df_affinity_test_toBePredicted.csv') #初始化待预测的Ki值为-11 aff_test['Ki'] = -11 aff_all = pd.concat([aff_train, aff_test]) data = aff_all.merge(mol_train, on="Molecule_ID", how='left') data = data.merge(protein_all, on='Protein_ID', how='left') #获取蛋白质ID PID = list(protein_all["Protein_ID"]) #word_length = 1时的wordcount特征 print("Processing wordcount1") _, word_counts1 = tfidf_and_wordcounts(protein_all, PID, word_length=1, stride=1) #word_length = 2时的wordcount特征 print("Processing wordcount2") _, word_counts2 = tfidf_and_wordcounts(protein_all, PID, word_length=2, stride=1) word_counts1_2 = word_counts1.merge(word_counts2, on="Protein_ID", how="left") # 保存特征文件,以供后期训练 word_counts1_2.to_csv("datas/1and2_1_421_protein_std.csv", index=False) del word_counts1_2, word_counts1, word_counts2 print("Processing wordcount3") _, word_count3 = tfidf_and_wordcounts(protein_all, PID, word_length=3, stride=1) word_count3_features = list(word_count3.columns) #8000维的数据,需要降维 word_count3_features.remove("Protein_ID") #利用标准差进行降维,设置标准差阈值为0.42,去掉标准差小于0.42的特征 new_word_count3 = reduce_dims_with_std(word_count3, word_count3_features, std_threshold=0.42) #保存特征文件,以供后期训练 new_word_count3.to_csv("datas/3_1_661_protein_std0.42.csv", index=False) del new_word_count3 for i in range(len(word_count3_features) // 500): #每次划分500个特征,并保存在特征文件里,以供后期训练 file = word_count3[["Protein_ID"] + word_count3_features[i * 500:(i + 1) * 500]] file_name = "3_1_500_protein_" + str(i) file.to_csv("datas/" + file_name + ".csv", index=False) del word_count3, word_count3_features print("Processing wordcount4") gc.collect() _, word_count4 = tfidf_and_wordcounts(protein_all, PID, word_length=4, stride=1) word_count4_features = list(word_count4.columns)#140000+ 维的数据,需要降维 word_count4_features.remove("Protein_ID") # 利用标准差进行降维,设置标准差阈值为0.16,去掉标准差小于0.16的特征 new_word_count4 = reduce_dims_with_std(word_count4, word_count4_features, std_threshold=0.16) new_word_count4.to_csv("datas/4_1_679_protein_std0.16.csv", index=False) # 利用标准差进行降维,设置标准差阈值为0.13,去掉标准差小于0.13的特征 new_word_count4 = reduce_dims_with_std(word_count4, word_count4_features, std_threshold=0.13) word_count4_features = list(new_word_count4.columns) word_count4_features.remove("Protein_ID") for i in range(len(word_count4_features) // 500): #每次划分500个特征,并保存在特征文件里,以供日后训练 file = new_word_count4[["Protein_ID"] + word_count4_features[i * 500:(i + 1) * 500]] file_name = "4_1_500_protein_" + str(i) file.to_csv("datas/" + file_name + ".csv", index=False) del new_word_count4, word_count4 #以下特征是蛋白质的词向量特征, 来自技术圈, 谢谢"小武哥"同学.但我们的最终提交版本没用这些特征 "=====================================词向量特征===========================================" #feat2 = protein_embedding(protein_all, word_length = 2) #data = data.merge(feat2, on="Protein_ID", how="left") #del feat2 #feat3 = protein_embedding(protein_all, word_length = 3) #data = data.merge(feat3, on="Protein_ID", how="left") #del feat3 #feat4 = protein_embedding(protein_all, word_length = 4) #data = data.merge(feat4, 
on="Protein_ID", how="left") #del feat4 "================================================================================" #分子指纹展开 mol_fingerprints = list(mol_train["Fingerprint"].apply(lambda x: list(np.array(x.split(',')).astype(int)))) mol_fingerprints = pd.DataFrame(mol_fingerprints, columns=["Fingerprint_"+str(i) for i in range(167)]) mol_fingerprints["Molecule_ID"] = mol_train["Molecule_ID"] del PID "==================================================================================================" data = data.merge(mol_fingerprints, on="Molecule_ID", how='left') del mol_fingerprints del data["Sequence"], protein_train, protein_test, mol_train data.reset_index(drop = True, inplace = True) data.to_csv("datas/original_data.csv", index=False) del data print("Useful data have builded")
def test_library_open(self):
    lib = self.get_libc()
    del lib
    gc.collect()
    assert not ALLOCATED
def main(cmd_args): import optparse global options, PSYCO usage = "\n%prog [options] command [input-file-patterns]\n" + cmd_doc oparser = optparse.OptionParser(usage) oparser.add_option( "-l", "--logfilename", default="", help="contains error messages") oparser.add_option( "-v", "--verbosity", type="int", default=0, help="level of information and diagnostics provided") oparser.add_option( "-m", "--mmap", type="int", default=-1, help="1: use mmap; 0: don't use mmap; -1: accept heuristic") oparser.add_option( "-e", "--encoding", default="", help="encoding override") oparser.add_option( "-f", "--formatting", type="int", default=0, help="0 (default): no fmt info\n" "1: fmt info (all cells)\n", ) oparser.add_option( "-g", "--gc", type="int", default=0, help="0: auto gc enabled; 1: auto gc disabled, manual collect after each file; 2: no gc") oparser.add_option( "-s", "--onesheet", default="", help="restrict output to this sheet (name or index)") oparser.add_option( "-u", "--unnumbered", action="store_true", default=0, help="omit line numbers or offsets in biff_dump") oparser.add_option( "-d", "--on-demand", action="store_true", default=0, help="load sheets on demand instead of all at once") oparser.add_option( "-t", "--suppress-timing", action="store_true", default=0, help="don't print timings (diffs are less messy)") oparser.add_option( "-r", "--ragged-rows", action="store_true", default=0, help="open_workbook(..., ragged_rows=True)") options, args = oparser.parse_args(cmd_args) if len(args) == 1 and args[0] in ("version", ): pass elif len(args) < 2: oparser.error("Expected at least 2 args, found %d" % len(args)) cmd = args[0] xlrd_version = getattr(xlrd, "__VERSION__", "unknown; before 0.5") if cmd == 'biff_dump': xlrd.dump(args[1], unnumbered=options.unnumbered) sys.exit(0) if cmd == 'biff_count': xlrd.count_records(args[1]) sys.exit(0) if cmd == 'version': print("xlrd: %s, from %s" % (xlrd_version, xlrd.__file__)) print("Python:", sys.version) sys.exit(0) if options.logfilename: logfile = LogHandler(open(options.logfilename, 'w')) else: logfile = sys.stdout mmap_opt = options.mmap mmap_arg = xlrd.USE_MMAP if mmap_opt in (1, 0): mmap_arg = mmap_opt elif mmap_opt != -1: print('Unexpected value (%r) for mmap option -- assuming default' % mmap_opt) fmt_opt = options.formatting | (cmd in ('xfc', )) gc_mode = options.gc if gc_mode: gc.disable() for pattern in args[1:]: for fname in glob.glob(pattern): print("\n=== File: %s ===" % fname) if logfile != sys.stdout: logfile.setfileheading("\n=== File: %s ===\n" % fname) if gc_mode == 1: n_unreachable = gc.collect() if n_unreachable: print("GC before open:", n_unreachable, "unreachable objects") if PSYCO: import psyco psyco.full() PSYCO = 0 try: t0 = time.time() bk = xlrd.open_workbook( fname, verbosity=options.verbosity, logfile=logfile, use_mmap=mmap_arg, encoding_override=options.encoding, formatting_info=fmt_opt, on_demand=options.on_demand, ragged_rows=options.ragged_rows, ) t1 = time.time() if not options.suppress_timing: print("Open took %.2f seconds" % (t1-t0,)) except xlrd.XLRDError as e: print("*** Open failed: %s: %s" % (type(e).__name__, e)) continue except KeyboardInterrupt: print("*** KeyboardInterrupt ***") traceback.print_exc(file=sys.stdout) sys.exit(1) except BaseException as e: print("*** Open failed: %s: %s" % (type(e).__name__, e)) traceback.print_exc(file=sys.stdout) continue t0 = time.time() if cmd == 'hdr': bk_header(bk) elif cmd == 'ov': # OverView show(bk, 0) elif cmd == 'show': # all rows show(bk) elif cmd == '2rows': # first 
row and last row show(bk, 2) elif cmd == '3rows': # first row, 2nd row and last row show(bk, 3) elif cmd == 'bench': show(bk, printit=0) elif cmd == 'fonts': bk_header(bk) show_fonts(bk) elif cmd == 'names': # named reference list show_names(bk) elif cmd == 'name_dump': # named reference list show_names(bk, dump=1) elif cmd == 'labels': show_labels(bk) elif cmd == 'xfc': count_xfs(bk) else: print("*** Unknown command <%s>" % cmd) sys.exit(1) del bk if gc_mode == 1: n_unreachable = gc.collect() if n_unreachable: print("GC post cmd:", fname, "->", n_unreachable, "unreachable objects") if not options.suppress_timing: t1 = time.time() print("\ncommand took %.2f seconds\n" % (t1-t0,)) return None
def collectGarbage(self):
    """
    Run a garbage collection pass.
    """
    gc.collect()
def evaluate(self, loader, epoch): self.model.eval() user_id_list, true_y, pred_y = [], [], [] loss_all, num_batch = 0., 0. with torch.no_grad(): for index, datum_tuple in enumerate(loader): creative_id, ad_id, product_id, advertiser_id, industry, product_category, time, user_id, y_label = datum_tuple advertiser_id, product_id, product_category, industry, time = advertiser_id.to(device,non_blocking=True),\ product_id.to(device,non_blocking=True), \ product_category.to(device,non_blocking=True), \ industry.to(device,non_blocking=True),\ time.to(device,non_blocking=True) #获取embedding抽取的向量 inputlist_tensor = [ creative_id, ad_id, advertiser_id, product_id, product_category, industry, time ] emb_layer_mat = [] for index, input_col in enumerate(inputlist_tensor): emb_layer_col_mat = {} for j in range(len(self.emb_layer[index])): if index in [2, 3, 4, 5, 6]: self.emb_layer[index][j] = self.emb_layer[index][ j].to(device, non_blocking=True) emb_layer_col_mat[j] = self.emb_layer[index][j]( input_col) emb_layer_col_mat[j] = emb_layer_col_mat[j].to( device, non_blocking=True) emb_layer_mat.append(emb_layer_col_mat) output = self.model(emb_layer_mat) y_label = y_label.to(device, non_blocking=True) y_label = y_label.long() loss = self.loss_func(output, y_label) loss_all += loss.item() num_batch += 1 pred_y.extend(list(output.cpu().detach().numpy())) true_y.extend(list(y_label.cpu().detach().numpy())) user_id_list.extend(list(user_id.numpy())) del creative_id, ad_id, product_id, advertiser_id, industry, product_category, time, y_label _ = gc.collect() pred = np.argmax(np.array(pred_y), 1) true = np.array(true_y).reshape((-1, )) acc_score = accuracy_score(true, pred) loss_valid = loss_all / num_batch output_data = DataFrame({'user_id': user_id_list, 'pred': pred_y}) if acc_score > 0.48: if not os.path.isdir( '../../oof/bk_oof/Multi_Head_ResNext_4seeds_all'): os.mkdir('../../oof/bk_oof/Multi_Head_ResNext_4seeds_all') pickle.dump( output_data, open( '../../oof/bk_oof/Multi_Head_ResNext_4seeds_all/seed_{}_val_{}_folds_{}.pkl' .format(self.seed, epoch, self.folds), 'wb')) del pred, true, pred_y, true_y _ = gc.collect() return acc_score, loss_valid
def buildADS1115Graph(password, myGraphSampleCount, graphNumber): print('buildADS1115Graph%d - The time is: %s' % (graphNumber, datetime.now())) # open database con1 = mdb.connect('localhost', 'root', password, 'DataLogger' ) # now we have to get the data, stuff it in the graph mycursor = con1.cursor() print myGraphSampleCount query = '(SELECT timestamp, deviceid, channel0_voltage, channel0_raw, channel1_voltage, channel1_raw, channel2_voltage, channel2_raw, channel3_voltage, channel3_raw, id FROM '+ADS1115tableName+' ORDER BY id DESC LIMIT '+ str(myGraphSampleCount) + ') ORDER BY id ASC' print "query=", query try: mycursor.execute(query) result = mycursor.fetchall() except: e=sys.exc_info()[0] print "Error: %s" % e print result[0] t = [] # time u = [] # channel 1 - Current averageCurrent = 0.0 currentCount = 0 for record in result: t.append(record[0]) # adjust according to graphNumber if (graphNumber == 0): addValue = record[graphNumber*2+3] if (graphNumber == 1): # O2 Sensor sensorVoltage = record[graphNumber*2+2]*(5.0/6.144) AMP = 121 K_O2 = 7.43 sensorVoltage = sensorVoltage/AMP*10000.0 Value_O2 = sensorVoltage/K_O2 addValue = Value_O2 - 1.05 if (graphNumber == 2): addValue = record[graphNumber*2+2] if (graphNumber == 3): addValue = record[graphNumber*2+2] u.append(addValue) averageCurrent = averageCurrent+addValue currentCount=currentCount+1 averageCurrent = averageCurrent/currentCount print ("count of t=",len(t)) x1 = [datetime.strptime(d, '%Y-%m-%d %H:%M:%S',) for d in t] fds = dates.date2num(x1) # converted # matplotlib date format object hfmt = dates.DateFormatter('%H:%M:%S') #hfmt = dates.DateFormatter('%m/%d-%H') fig = pyplot.figure() fig.set_facecolor('white') ax = fig.add_subplot(111,axisbg = 'white') ax.vlines(fds, -200.0, 1000.0,colors='w') #ax.xaxis.set_major_locator(dates.MinuteLocator(interval=1)) ax.xaxis.set_major_formatter(hfmt) if (graphNumber == 0): ax.set_ylim(bottom = 0.0) pyplot.xticks(rotation='45') pyplot.subplots_adjust(bottom=.3) pylab.plot(fds, u, color='r',label="Air Quality Sensor",linestyle="-",marker=".") if (graphNumber == 1): ax.set_ylim(bottom = 0.0) pyplot.xticks(rotation='45') pyplot.subplots_adjust(bottom=.3) pylab.plot(fds, u, color='r',label="Oxygen (O2) Sensor ",linestyle="-",marker=".") if (graphNumber == 2): ax.set_ylim(bottom = 0.0) pyplot.xticks(rotation='45') pyplot.subplots_adjust(bottom=.3) pylab.plot(fds, u, color='r',label="Light Sensor",linestyle="-",marker=".") if (graphNumber == 3): ax.set_ylim(bottom = -200.0) pyplot.xticks(rotation='45') pyplot.subplots_adjust(bottom=.3) pylab.plot(fds, u, color='r',label="Voltage Divider ",linestyle="-",marker=".") pylab.xlabel("Seconds") pylab.legend(loc='lower center') if (graphNumber == 0): pylab.axis([min(fds), max(fds), 0, max(u)+1000]) pylab.ylabel("Raw Data") if (graphNumber == 1): pylab.axis([min(fds), max(fds), 0, max(u)+2]) pylab.ylabel("Percent (%)") if (graphNumber == 2): pylab.axis([min(fds), max(fds), 0, max(u)+2]) pylab.ylabel("Voltage (V)") if (graphNumber == 3): pylab.axis([min(fds), max(fds), 0, max(u)+2]) pylab.ylabel("Voltage Divider (V)") if (graphNumber == 0): pylab.figtext(.5, .05, ("Average Air Quality %6.2f\n%s") %(averageCurrent, datetime.now()),fontsize=18,ha='center') if (graphNumber == 1): pylab.figtext(.5, .05, ("Average O2 %6.2f %%\n%s") %(averageCurrent, datetime.now()),fontsize=18,ha='center') if (graphNumber == 2): pylab.figtext(.5, .05, ("Average Light Sensor %6.2f V\n%s") %(averageCurrent, datetime.now()),fontsize=18,ha='center') if (graphNumber == 3): 
pylab.figtext(.5, .05, ("Average Voltage Divider %6.2f V\n%s") %(averageCurrent, datetime.now()),fontsize=18,ha='center') pylab.grid(True) pyplot.show() pyplot.savefig("/var/www/html/ADS1115DataLoggerGraph"+str(graphNumber)+".png", facecolor=fig.get_facecolor()) mycursor.close() con1.close() fig.clf() pyplot.close() pylab.close() gc.collect() print "------ADS1115Graph"+str(graphNumber)+" finished now"
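A condensed sketch of the figure clean-up pattern used above (clf, close, collect), which keeps memory bounded when a script renders many plots in a loop; the data and output path are illustrative:

import gc
import matplotlib
matplotlib.use("Agg")              # headless backend, as on a web server
import matplotlib.pyplot as pyplot

def save_plot(xs, ys, path):
    fig = pyplot.figure()
    ax = fig.add_subplot(111)
    ax.plot(xs, ys, color="r", marker=".")
    fig.savefig(path, facecolor=fig.get_facecolor())
    fig.clf()
    pyplot.close(fig)
    gc.collect()                   # release figure-related cycles promptly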
# build the emb_layer list emb_layer = [] for index, col in enumerate(inputlist): emb_layer_col = {} for indexj, matrixi in enumerate(emb_matrix_dict[col]): emb_layer_col[indexj] = nn.Embedding.from_pretrained( torch.from_numpy(matrixi)) if col in train_able_dict: emb_layer_col[indexj].weight.requires_grad = False else: emb_layer_col[indexj].weight.requires_grad = True emb_layer.append(emb_layer_col) del id_list_dict, emb_matrix_dict _ = gc.collect() # use 4 different seeds, run 5 folds per seed for seed in [34, 2020, 1111, 200]: for folds in range(5): print('This is fold: ', folds) data = pickle.load( open('../../cached_data/input_data_20class.pkl', 'rb')) # load the data train_idx = list( np.load( '../../cached_data/5folds_4seeds_index/seed_{}_train_index_fold_{}.npy' .format(seed, folds))) val_idx = list( np.load( '../../cached_data/5folds_4seeds_index/seed_{}_val_index_fold_{}.npy' .format(seed, folds)))
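A condensed sketch of the same embedding-layer construction with placeholder data; emb_matrix_dict and frozen_cols below are toy stand-ins, and frozen_cols marks the columns whose pretrained embeddings stay fixed:

import gc
import numpy as np
import torch
import torch.nn as nn

emb_matrix_dict = {"creative_id": [np.random.rand(100, 8).astype("float32")]}
frozen_cols = {"creative_id"}

emb_layer = []
for col, matrices in emb_matrix_dict.items():
    layers = {}
    for j, matrix in enumerate(matrices):
        layers[j] = nn.Embedding.from_pretrained(torch.from_numpy(matrix))
        layers[j].weight.requires_grad = col not in frozen_cols
    emb_layer.append(layers)

del emb_matrix_dict
_ = gc.collect()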
def set_model_ma10(uid, force_full_update, connection): """ xxx """ ret = 0 ######################################################################## # (2.1) Define names of column in use by the model ######################################################################## model_tp_column = 'price_instruments_data.ma10_tp' model_score_column = 'instruments.score_ma10' #----------------------------------------------------------------------- day_to_process = 370 score = 0 if force_full_update: sql_selection = "SELECT price_instruments_data.symbol, "+\ "price_instruments_data.date, price_instruments_data.price_close, " +\ str(model_tp_column) + " FROM price_instruments_data "+\ "JOIN symbol_list ON symbol_list.symbol = price_instruments_data.symbol "+\ "WHERE symbol_list.uid = "+ str(uid) +" ORDER BY date DESC LIMIT "+\ str(day_to_process) else: sql_selection = "SELECT price_instruments_data.symbol, "+\ "price_instruments_data.date, price_instruments_data.price_close, " +\ str(model_tp_column) + " FROM price_instruments_data "+\ "JOIN symbol_list ON symbol_list.symbol = price_instruments_data.symbol "+\ "WHERE symbol_list.uid = "+ str(uid) +\ " AND price_instruments_data.is_ta_calc = 0 ORDER BY date DESC" cursor = connection.cursor(pymysql.cursors.SSCursor) sql = sql_selection cursor.execute(sql) res = cursor.fetchall() symbol = '' for row in res: symbol = row[0] last_date = row[1].strftime('%Y%m%d') last_price = row[2] model_tp = row[3] cr_c = connection.cursor(pymysql.cursors.SSCursor) sql_c = "SELECT " + str(model_tp_column) +\ ", price_instruments_data.price_close "+\ "FROM price_instruments_data JOIN symbol_list "+\ "ON symbol_list.symbol = price_instruments_data.symbol "+\ "WHERE symbol_list.uid = "+ str(uid) +" AND date = DATE_SUB("+\ str(last_date) +", INTERVAL 7 DAY)" cr_c.execute(sql_c) rs_c = cr_c.fetchall() model_prediction_tp = 0 previous_price = 0 for row in rs_c: model_prediction_tp = row[0] previous_price = row[1] cr_c.close() if model_prediction_tp != 0 and previous_price != 0: type_of_trade = '' if previous_price <= model_prediction_tp: type_of_trade = 'b' if previous_price > model_prediction_tp: type_of_trade = 's' if (previous_price >= last_price) and (type_of_trade == 'b'): if score > 0: score = score - 0.01 if (previous_price >= last_price) and (type_of_trade == 's'): score = score + 0.01 if (previous_price < last_price) and (type_of_trade == 'b'): score = score + 0.01 if (previous_price < last_price) and (type_of_trade == 's'): if score > 0: score = score - 0.01 debug("### score calc "+ str(model_score_column) +\ ": current score = " + str(score)) if model_tp == 0: ######################################################################## # (3) Define function that calc the model target price ######################################################################## last_model_tp = get_model_price_ma10(uid, last_date, connection) cr_u = connection.cursor(pymysql.cursors.SSCursor) sql_u = "UPDATE price_instruments_data SET " +\ str(model_tp_column) + " = " + str(last_model_tp) +\ " WHERE symbol = '"+ str(symbol) +"' AND date = " + str(last_date) cr_u.execute(sql_u) connection.commit() ret = last_model_tp cr_u.close() gc.collect() model_score = 0 if not force_full_update: sql = "SELECT "+ str(model_score_column) +\ " FROM instruments WHERE symbol = '"+ str(symbol) +"'" cursor.execute(sql) res = cursor.fetchall() for row in res: model_score = row[0] debug("### Total score calc "+ str(model_score_column) +": " +\ str(model_score) + " + " + str(score)) model_score = 
round(model_score + score, 2) debug("### Total score "+ str(model_score_column) +": " + str(model_score)) sql = "UPDATE instruments SET " + str(model_score_column) +\ " = " + str(model_score) + " WHERE symbol = '"+ str(symbol) +"'" cursor.execute(sql) connection.commit() cursor.close() gc.collect() return ret
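The queries above are assembled by string concatenation; here is a hedged sketch of the first SELECT written with pymysql placeholders instead (table and column names are taken from the code above, and any names that had to stay dynamic would still need whitelisting):

import pymysql

def fetch_recent_prices(connection, uid, limit=370):
    sql = ("SELECT p.symbol, p.date, p.price_close, p.ma10_tp "
           "FROM price_instruments_data p "
           "JOIN symbol_list s ON s.symbol = p.symbol "
           "WHERE s.uid = %s ORDER BY p.date DESC LIMIT %s")
    with connection.cursor(pymysql.cursors.SSCursor) as cursor:
        cursor.execute(sql, (uid, limit))
        return cursor.fetchall()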
def tearDown(self): ImpalaE2E.tearDown(self) gc.collect()
def train(self): iter_wrapper = lambda x: tqdm(x, total=len(self.train_data)) start_epoch = -1 best_valid = 0. min_lr = 1e-7 if self.is_resume: print("Let's continue!") checkpoint = torch.load(PATH_CHECKPOINT) # load the checkpoint self.model.load_state_dict(checkpoint['model_state_dict']) self.optim.load_state_dict(checkpoint['optimizer_state_dict']) start_epoch = checkpoint['epoch'] best_valid = checkpoint['best_valid'] for epoch in range(start_epoch + 1, EPOCHS): print('=========================') print('Processing Epoch {}'.format(epoch)) print('=========================') loss_per_epoch, train_n_batch = 0., 0. for index, data in iter_wrapper(enumerate(self.train_data)): creative_id, ad_id, product_id, advertiser_id, industry, product_category, time, user_id, y_label = data advertiser_id, product_id, product_category, industry, time = advertiser_id.to(device,non_blocking=True),\ product_id.to(device,non_blocking=True), \ product_category.to(device,non_blocking=True), \ industry.to(device,non_blocking=True),\ time.to(device,non_blocking=True) self.model.train() self.optim.zero_grad() # get the vectors extracted by the embedding layers inputlist_tensor = [ creative_id, ad_id, advertiser_id, product_id, product_category, industry, time ] emb_layer_mat = [] for index, input_col in enumerate(inputlist_tensor): emb_layer_col_mat = {} for j in range(len(self.emb_layer[index])): if index in [2, 3, 4, 5, 6]: self.emb_layer[index][j] = self.emb_layer[index][ j].to(device, non_blocking=True) emb_layer_col_mat[j] = self.emb_layer[index][j]( input_col) emb_layer_col_mat[j] = emb_layer_col_mat[j].to( device, non_blocking=True) emb_layer_mat.append(emb_layer_col_mat) output = self.model(emb_layer_mat) y_label = y_label.to(device, non_blocking=True) y_label = y_label.long() loss = self.loss_func(output, y_label) loss_per_epoch += loss.item() train_n_batch += 1 loss.backward() nn.utils.clip_grad_norm_(self.model.parameters(), 10.) # gradient clipping self.optim.step() del creative_id, ad_id, product_id, advertiser_id, industry, product_category, time, y_label _ = gc.collect() if self.val_data is not None: # Do Validation valid_score, valid_loss = self.evaluate(self.val_data, epoch) print('evaluate done!') if valid_score > 0.48: self.test(self.test_data, epoch) if valid_score > best_valid: best_valid = valid_score self.scheduler_ReduceLROnPlateauLR.step(valid_score) if self.optim.param_groups[0]['lr'] < min_lr: print("stopping") break torch.cuda.empty_cache()
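A minimal sketch of the checkpoint save/resume convention train() assumes (same dictionary keys); PATH_CHECKPOINT, model and optim are placeholders:

import torch

def save_checkpoint(path, model, optim, epoch, best_valid):
    torch.save({"model_state_dict": model.state_dict(),
                "optimizer_state_dict": optim.state_dict(),
                "epoch": epoch,
                "best_valid": best_valid}, path)

def load_checkpoint(path, model, optim):
    ckpt = torch.load(path, map_location="cpu")
    model.load_state_dict(ckpt["model_state_dict"])
    optim.load_state_dict(ckpt["optimizer_state_dict"])
    return ckpt["epoch"], ckpt["best_valid"]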
matrix['item_first_sale'] = matrix['date_block_num'] - matrix.groupby('item_id')['date_block_num'].transform('min') matrix = matrix[matrix.date_block_num > 11] def fill_na(df): for col in df.columns: if ('_lag_' in col) & (df[col].isnull().any()): if ('item_cnt' in col): df[col].fillna(0, inplace=True) return df matrix = fill_na(matrix) matrix.to_pickle('data.pkl') gc.collect(); data = pd.read_pickle('data.pkl') data = data[[ 'date_block_num', 'shop_id', 'item_id', 'item_cnt_month', 'city_code', 'item_category_id', 'main_cate_id', 'sub_cate_id', 'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3',
def write(i, batch): this_batch = helper.reshape_batch(batch, new_size, dim) with open(to+"/batch{}.pickle".format(str(i)), "wb") as f: pickle.dump(this_batch, f) del this_batch gc.collect()
def tearDown(self): gc.collect() # This will only contain uncollectable garbage, i.e. reference cycles # involving objects with __del__ defined. self.assertEmpty(gc.garbage) super().tearDown()
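A small self-contained illustration of how unreachable cycles can be surfaced in gc.garbage for a check like the one above; on Python 3.4+ cycles whose objects define __del__ are normally collected outright, so gc.DEBUG_SAVEALL is used here purely to make the collected objects visible:

import gc

class Node:
    def __init__(self):
        self.ref = None

a, b = Node(), Node()
a.ref, b.ref = b, a        # build a reference cycle
del a, b                   # drop the external references

gc.set_debug(gc.DEBUG_SAVEALL)
print("unreachable:", gc.collect(), "kept in gc.garbage:", len(gc.garbage))
gc.set_debug(0)
gc.garbage.clear()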
def main(mod1, mod2, epochs, learning_rate, l2s, batch_size, network='MAESurv'): if mod2 != 'None': MAE_params = pickle.load(io.open(STATE_FOLDER+mod1+'+'+mod2+'.dict', 'rb')) else: MAE_params = pickle.load(io.open(STATE_FOLDER+mod1+'.dict', 'rb')) d_dims = MAE_params['D dims'] hidden_dims = MAE_params['Hidden dims'] x1, x2, index = read_data(DATA_FOLDER, mod1, mod2, suffix=DATA_SUFFIX) with open(DATA_FOLDER+'train'+INDEX_SET+'.pickle','rb') as f: train_index = pickle.load(f) with open(DATA_FOLDER+'test'+INDEX_SET+'.pickle','rb') as f: test_index = pickle.load(f) x1_train = x1.loc[train_index].to_numpy()#x1_train = torch.from_numpy(x1.loc[train_index].to_numpy()).float().to(device) in_dims = [x1_train.shape[1]] x1_test = x1.loc[test_index].to_numpy()#x1_test = torch.from_numpy(x1.loc[test_index].to_numpy()).float().to(device) x1_train = torch.from_numpy(x1_train).float().to(device) x1_test = torch.from_numpy(x1_test).float().to(device) if x2 is not None: x2_train = x2.loc[train_index].to_numpy() x2_test = x2.loc[test_index].to_numpy() in_dims.append(x2_train.shape[1]) x2_train = torch.from_numpy(x2_train).float().to(device) x2_test = torch.from_numpy(x2_test).float().to(device) else: x2_train = None in_features_two = None x2_test = None with open(DATA_FOLDER + 'survival_TT.pickle', 'rb') as f: survival=pickle.load(f) get_target = lambda df: (df['OS.time'].values, df['OS'].values) y1_train, y2_train = get_target(survival.loc[train_index]) y1_test, y2_test = get_target(survival.loc[test_index]) y1_train = torch.from_numpy(y1_train).to(device) y2_train = torch.from_numpy(y2_train).to(device) y1_test = torch.from_numpy(y1_test).to(device) y2_test = torch.from_numpy(y2_test).to(device) for epoch in epochs: for do in [0,0.1,0.2]: for l2 in l2s: for lr in learning_rate: for bs in batch_size: for ns in [16,32,64,100]: hyperparameters = {'Epoch': epoch, 'Dropout': do, 'L2 reg': l2, 'State file path': STATE_FOLDER, 'Learning rate': lr, #tt.optim.Adam 'batch_size': bs, 'Input dimension': in_dims, 'Embedding dimension': d_dims, 'Latent size': hidden_dims, 'Neuron size': ns } stdOrigin = sys.stdout PATH = mod1+" "+str(mod2)+"_"+str(epoch)+"_"+str(do)+"_"+str(lr)+"_"+str(l2)+"_"+str(bs)+"_"+str(ns) sys.stdout = open(os.path.join(LOG_FOLDER, "MAESurv_"+PATH+".out"), "w") print(hyperparameters) CV(mod1, mod2, x1_train, x2_train, y1_train, y2_train, 5, hyperparameters) p_time = time.time() model, log = MAESurv_pipeline(mod1, mod2, x1_train, x2_train, y1_train, y2_train, hyperparameters, True, PATH) a_time = time.time() print(f'Training time: {a_time-p_time}') Cindex = MAESurv_evaluate(model, x1_test, x2_test, y1_test, y2_test) print(f"Test C-index: {Cindex}") gc.collect() sys.stdout.close() sys.stdout = stdOrigin
def display(self): try: from bsddb3.db import DBError except: class DBError(Exception): """ Dummy. """ self.parent = self.top.get_toplevel() progress = ProgressMeter(_('Updating display...'), '', parent=self.parent, can_cancel=True) self.model.clear() self.junk = [] gc.collect(2) self.junk = gc.garbage self.label.set_text(_('Uncollected Objects: %s') % str(len(self.junk))) progress.set_pass(_('Updating display...'), len(self.junk)) for count in range(0, len(self.junk)): if progress.step(): break try: refs = [] referrers = gc.get_referrers(self.junk[count]) for referrer in referrers: try: if referrer is not self.junk: for indx in range(0, len(self.junk)): if referrer is self.junk[indx]: refs.append(str(indx) + ' ') break except: print(sys.exc_info()) if len(refs) > 3: ref = ' '.join(refs[0:2]) + "..." else: ref = ' '.join(refs) try: self.model.append((count, ref, str(self.junk[count]))) except DBError: self.model.append( (count, ref, 'db.DB instance at %s' % id(self.junk[count]))) except ReferenceError: self.model.append( (count, ref, 'weakly-referenced object no longer exists %s' % type(self.junk[count]))) except TypeError: self.model.append( (count, ref, 'Object cannot be displayed %s' % type(self.junk[count]))) except: print(sys.exc_info()) except ReferenceError: InfoDialog(_('Reference Error'), "Refresh to correct", parent=self.parent) progress.close()
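A tiny sketch of the gc.get_referrers() walk that display() performs, with made-up objects in place of gc.garbage entries:

import gc

leak = {"payload": list(range(10))}
holder = [leak]                       # something that keeps `leak` alive

for referrer in gc.get_referrers(leak):
    print(type(referrer).__name__)    # e.g. the holder list and the module dict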
def main(): # pylint: disable=too-many-statements,too-many-branches,too-many-locals """main routine""" if int(platform.python_version().split('.')[0]) < 3: LOGGER.fatal("%s needs at least python version 3, currently %s", ME, platform.python_version()) sys.exit(1) start_time = int(time.time()) _parser = ArgumentParser() _parser.add_argument("-c", "--cfile", dest="configfile", default=ME+".cfg", help="Configuration file", metavar="FILE", required=True) _parser.add_argument("-v", "--verbosity", action="count", default=0, help="increase output verbosity overriding the default") _parser.add_argument("-p", "--parameter", action="store", help="show parameter from configfile") _args = _parser.parse_args() set_logfile(LOGGER, _args.configfile+".log") _config = get_config(_args.configfile, ME) if _args.parameter: if _args.parameter == 'password': print('parameter {}: {}\n'.format(_args.parameter, decrypted(_config[_args.parameter+'_enc']))) else: print('parameter {}: {}\n'.format( _args.parameter, _config[_args.parameter])) sys.exit(0) if _args.verbosity: newLevel = logging.getLogger().getEffectiveLevel() - (_args.verbosity*10) if newLevel < 0: newLevel = 0 LOGGER.warning("Changing loglevel from %d to %d", logging.getLogger().getEffectiveLevel(), newLevel) logging.getLogger().setLevel(newLevel) LOGGER.debug("log level %d", logging.getLogger().getEffectiveLevel()) LOGGER.warning("start python-%s %s-%s pid=%s Connecting ...\n", platform.python_version(), ME, VERSION, os.getpid() ) if _config['password']: LOGGER.warning( "first encrypted the plaintext password and removed from config\n") # we need the password .... _config['password'] = decrypted(_config['password_enc']) # add a few seconds extra to allow the driver timeout handling to do the it's job. # for example, cx_oracle has a cancel routine that we call after a timeout. 
If # there is a network problem, the cancel gets a ORA-12152: TNS:unable to send break message # setting this defaulttimeout should speed this up socket.setdefaulttimeout(_config['sqltimeout']+3) LOGGER.warning("%s found db_type=%s, driver %s; checking for driver\n", ME, _config['db_type'], _config['db_driver']) if not os.path.exists( os.path.join(_config['checks_dir'], _config['db_type'])): raise ValueError("db_type "+_config['db_type'] + " does not exist in the "+_config['checks_dir']+" directory") db_driver = load_driver(_config) driver_errors = load_driver_errors(_config) db_connections = load_db_connections(_config) LOGGER.info(db_connections) LOGGER.info(driver_errors) LOGGER.info("hostname in zabbix: %s", _config['hostname']) # hide password, hoping username != password ;-) LOGGER.info("connect string : %s\n", db_connections.connect_string(_config).replace(_config['password'], '******')) LOGGER.info('using sql_timeout : %ds\n', _config['sqltimeout']) LOGGER.info("out_file : %s\n", _config['out_file']) if _config['site_checks']: LOGGER.info("site_checks : %s\n", _config['site_checks']) if LOG_CONF: sys_files = 4 else: sys_files = 3 check_files = [{'name': __file__, 'lmod': os.path.getmtime(__file__)}, {'name': db_connections.__file__, 'lmod': os.path.getmtime(db_connections.__file__)}, {'name': driver_errors.__file__, 'lmod': os.path.getmtime(driver_errors.__file__)}, {'name': LOG_CONF, 'lmod': os.path.getmtime(LOG_CONF)} ] if LOG_CONF: check_files.append( {'name': LOG_CONF, 'lmod': os.path.getmtime(LOG_CONF)}) for i in range(sys_files): to_outfile(_config, "{}[checks,{},name]".format(ME, i), check_files[i]['name']) to_outfile(_config, "{}[checks,{},lmod]".format(ME, i), int(check_files[i]['lmod'])) conn_counter = 0 conn_errors = 0 query_counter = 0 query_errors = 0 sleep_c = 0 sleep_s = 1 prev_err = 0 while True: try: for i in range(sys_files): if check_files[i]['lmod'] != os.stat(check_files[i]['name']).st_mtime: LOGGER.warning("%s Changed, from %s to %s restarting ..\n", check_files[i]['name'], time.ctime(check_files[i]['lmod']), time.ctime(os.path.getmtime(check_files[i]['name']))) os.execv(__file__, sys.argv) # reset list in case of a just new connection that reloads the config check_files = [{'name': __file__, 'lmod': os.path.getmtime(__file__)}, {'name': db_connections.__file__, 'lmod': os.path.getmtime(db_connections.__file__)}, {'name': driver_errors.__file__, 'lmod': os.path.getmtime(driver_errors.__file__)}] if LOG_CONF: check_files.append( {'name': LOG_CONF, 'lmod': os.path.getmtime(LOG_CONF)}) _config = get_config(_args.configfile, ME) _config['password'] = decrypted(_config['password_enc']) _start = timer() # hide password, hoping username != password ;-) LOGGER.info('connecting to %s\n', db_connections.connect_string(_config).replace(_config['password'], '******')) conn_has_cancel = False _conn = db_connections.connect(db_driver, _config) if "cancel" in dir(_conn): conn_has_cancel = True LOGGER.info(_conn) conn_counter += 1 to_outfile(_config, ME+"[connect,status]", 0) _cursor = _conn.cursor() connect_info = db_connections.connection_info(_conn) LOGGER.info('connected db_url %s type %s db_role %s version %s\n' '%s user %s %s sid,serial %d,%d instance %s as %s cancel:%s\n', _config['db_url'], connect_info['instance_type'], connect_info['db_role'], connect_info['dbversion'], datetime.datetime.fromtimestamp(time.time()), _config['username'], connect_info['uname'], connect_info['sid'], connect_info['serial'], connect_info['iname'], _config['role'], conn_has_cancel) if 
connect_info['db_role'] in ["PHYSICAL STANDBY", "SLAVE"]: checks_file = os.path.join(_config['checks_dir'], _config['db_type'], "standby" + "." + connect_info['dbversion'] + ".cfg") else: checks_file = os.path.join(_config['checks_dir'], _config['db_type'], connect_info['db_role'].lower() + "." + connect_info['dbversion']+".cfg") _files = [checks_file] check_files.append({'name': checks_file, 'lmod': 0}) if _config['site_checks']: for addition in _config['site_checks'].split(","): addfile = os.path.join(_config['checks_dir'], _config['db_type'], addition + ".cfg") check_files.append({'name': addfile, 'lmod': 0}) _files.extend([addfile]) LOGGER.info('using checks from %s\n', _files) for checks_file in check_files: if not os.path.exists(checks_file['name']): raise ValueError( "Configfile " + checks_file['name'] + " does not exist") # all checkfiles exist sleep_c = 0 sleep_s = 1 prev_err = 0 con_mins = 0 open_time = int(time.time()) while True: LOGGER.debug("%s while True\n", ME) if connect_info['db_role'] != db_connections.current_role(_conn, connect_info): LOGGER.info("db_role changed from %s to %s", connect_info['db_role'], db_connections.current_role(_conn, connect_info)) # re connect to get the correct monitoring config again break # keep this to compare for when to dump stats now_run = int(time.time()) run_timer = timer() # keep this to compare for when to dump stats # loading checks from the various checkfiles: need_to_load = "no" # pylint: disable=consider-using-enumerate for i in range(len(check_files)): # at 0 - sys_files is the script itself try: current_lmod = os.path.getmtime(check_files[i]['name']) except OSError as _e: LOGGER.warning("%s: %s\n", check_files[i]['name'], _e.strerror) # ignore the error, maybe temporary due to an update current_lmod = check_files[i]['lmod'] if check_files[i]['lmod'] != current_lmod: if i < sys_files: # it is the script, a module or LOG_CONF # that changed LOGGER.warning("%s changed, from %s to %s restarting ...\n", check_files[i]['name'], time.ctime(check_files[i]['lmod']), time.ctime(current_lmod)) os.execv(__file__, sys.argv) else: if check_files[i]['lmod'] == 0: LOGGER.info("checks loading %s\n", check_files[i]['name']) need_to_load = "yes" else: LOGGER.warning("checks changed, reloading %s\n", check_files[i]['name']) need_to_load = "yes" if need_to_load == "yes": to_outfile(_config, ME + "[version]", VERSION) to_outfile( _config, ME + "[config,db_type]", _config['db_type']) to_outfile( _config, ME + "[config,db_driver]", _config['db_driver']) to_outfile( _config, ME + "[config,instance_type]", _config['instance_type']) to_outfile(_config, ME + "[conn,db_role]", connect_info['db_role']) to_outfile( _config, ME + "[conn,instance_type]", connect_info['instance_type']) to_outfile(_config, ME + "[conn,dbversion]", connect_info['dbversion']) to_outfile( _config, ME + "[connect,instance_name]", connect_info['iname']) # sometimes the instance_name query follows within a second # missing event so give it some more time time.sleep(3) objects_list = [] sections_list = [] file_list = [] all_checks = [] for i in range(len(check_files)): _e = collections.OrderedDict() _e = {"{#CHECKS_FILE}": i} file_list.append(_e) files_json = '{\"data\":'+json.dumps(file_list)+'}' to_outfile(_config, ME+".files.lld", files_json) for i in range(sys_files, len(check_files)): # #0 is executable that is also checked for updates # #1 db_connections module # #2 driver_errors module # #3 LOG_CONF if it exists ... 
# so, skip those and pick the real check_files _checks = configparser.RawConfigParser() try: check_file = open(check_files[i]['name'], 'r') to_outfile(_config, "{}[checks,{},name]".format(ME, i), check_files[i]['name']) to_outfile(_config, "{}[checks,{},lmod]".format(ME, i), str(int(os.stat(check_files[i]['name']).st_mtime))) try: _checks.read_file(check_file) check_file.close() to_outfile(_config, ME + "[checks," + str(i) + ",status]", 0) except configparser.Error: to_outfile(_config, ME + "[checks," + str(i) + ",status]", 13) LOGGER.critical("file %s has parsing errors ->(13)\n", check_files[i]['name']) except IOError as io_error: to_outfile( _config, ME + "[checks," + str(i) + ",status]", 11) LOGGER.critical("file %s IOError %s %s ->(11)\n", check_files[i]['name'], io_error.errno, io_error.strerror) check_files[i]['lmod'] = os.stat( check_files[i]['name']).st_mtime all_checks.append(_checks) for section in sorted(_checks.sections()): sec_mins = int(_checks.get(section, "minutes")) if sec_mins == 0: LOGGER.info( "%s run at connect only\n", section) else: LOGGER.info("%s run every %d minutes\n", section, sec_mins) # dump own discovery items of the queries per section _e = collections.OrderedDict() _e = {"{#SECTION}": section} sections_list.append(_e) _x = dict(_checks.items(section)) for key, sqls in sorted(_x.items()): if sqls and key != "minutes": _d = collections.OrderedDict() _d = {"{#SECTION}": section, "{#KEY}": key} objects_list.append(_d) LOGGER.info("%s: %s\n", key, sqls[0: 60]. replace('\n', ' ').replace('\r', ' ')) # checks are loaded now. sections_json = '{\"data\":'+json.dumps(sections_list)+'}' LOGGER.debug("lld key: %s json: %s\n", ME+".lld", sections_json) to_outfile(_config, ME+".section.lld", sections_json) rows_json = '{\"data\":'+json.dumps(objects_list)+'}' LOGGER.debug("lld key: %s json: %s\n", ME+".lld", rows_json) to_outfile(_config, ME + ".query.lld", rows_json) # sqls can contain multiple statements per key. sqlparse to split them # now. Otherwise use a lot of extra cycles when splitting at runtime # all_sql { {section, key}: statements } all_sql = {} for _checks in all_checks: for section in sorted(_checks.sections()): _x = dict(_checks.items(section)) for key, sqls in sorted(_x.items()): if sqls and key != "minutes": all_sql[(section, key)] = [] for statement in sqlparse.split(sqls): all_sql[(section, key)].append( statement) # checks discovery is also printed # # assume we are still connected. If not, exception will tell real story to_outfile(_config, ME + "[connect,status]", 0) to_outfile(_config, ME + "[uptime]", int(time.time() - start_time)) to_outfile(_config, ME + "[opentime]", int(time.time() - open_time)) # the connect status is only real if executed a query .... 
for _checks in all_checks: for section in sorted(_checks.sections()): section_timer = timer() # keep this to compare for when to dump stats sec_mins = int(_checks.get(section, "minutes")) if ((con_mins == 0 and sec_mins == 0) or (sec_mins > 0 and con_mins % sec_mins == 0)): # time to run the checks again from this section _x = dict(_checks.items(section)) _cursor = _conn.cursor() for key, sqls in sorted(_x.items()): if sqls and key != "minutes": LOGGER.debug("%s section: %s key: %s\n", ME, section, key) try: query_counter += 1 if conn_has_cancel: # pymysql has no cancel but does have # timeout in connect sqltimeout = threading.Timer( _config['sqltimeout'], cancel_sql, [_conn, section, key]) sqltimeout.start() _start = timer() for statement in all_sql[(section, key)]: LOGGER.debug("%s section: %s key: %s sql: %s\n", ME, section, key, statement) _cursor.execute(statement) startf = timer() # output for the last query must include the # output for the preparing queries is ignored # complete key and value rows = _cursor.fetchall() if conn_has_cancel: sqltimeout.cancel() if "discover" in section: objects_list = [] for row in rows: _d = collections.OrderedDict() for col in range(len(_cursor.description)): _d[_cursor.description[col] [0]] = row[col] objects_list.append(_d) rows_json = '{\"data\":' + \ json.dumps(objects_list)+'}' LOGGER.debug("DEBUG lld key: %s json: %s\n", key, rows_json) to_outfile(_config, key, rows_json) to_outfile(_config, ME + "[query," + section + "," + key + ",status]", 0) else: if rows and len(rows[0]) == 2: _config['section'] = section _config['key'] = key for row in rows: to_outfile( _config, row[0], row[1]) to_outfile(_config, ME + "[query," + section + "," + key + ",status]", 0) elif not rows: to_outfile(_config, ME + "[query," + section + "," + key + ",status]", 0) else: LOGGER.critical('key=%s.%s ZBXDB-%d: ' 'SQL format error: %s\n', section, key, 2, "expect key,value pairs") to_outfile(_config, ME + "[query," + section + "," + key + ",status]", 2) _config['section'] = "" _config['key'] = "" fetchela = timer() - startf elapsed_s = timer() - _start to_outfile(_config, ME + "[query," + section + "," + key + ",ela]", elapsed_s) to_outfile(_config, ME + "[query," + section + "," + key + ",fetch]", fetchela) # except (db_driver.DatabaseError, # socket.timeout) as dberr: except Exception as dberr: if conn_has_cancel: sqltimeout.cancel() ecode, emsg = driver_errors.db_errorcode( db_driver, dberr) elapsed_s = timer() - _start query_errors += 1 to_outfile(_config, ME + "[query," + section + "," + key + ",status]", ecode) to_outfile(_config, ME + "[query," + section + "," + key + ",ela]", elapsed_s) LOGGER.info('key=%s.%s ZBXDB-%s: ' 'Db execution error: %s\n', section, key, ecode, emsg.strip()) if driver_errors.db_error_needs_new_session(db_driver, ecode): raise LOGGER.debug("%s commit\n", ME) _conn.commit() LOGGER.debug("%s committed\n", ME) # end of a section ## time to run the checks again from this section to_outfile(_config, ME + "[query," + section + ",,ela]", timer() - section_timer) # release locks that might have been taken LOGGER.debug("%s commit 2\n", ME) _conn.commit() LOGGER.debug("%s committed.\n", ME) # dump metric for summed elapsed time of this run to_outfile(_config, ME + "[query,,,ela]", timer() - run_timer) to_outfile(_config, ME + "[cpu,user]", resource.getrusage(resource.RUSAGE_SELF).ru_utime) to_outfile(_config, ME + "[cpu,sys]", resource.getrusage(resource.RUSAGE_SELF).ru_stime) to_outfile(_config, ME + "[mem,maxrss]", 
resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) # passed all sections if ((now_run - start_time) % 3600) == 0: gc.collect() # dump stats LOGGER.info("connect %d times, %d fail; started %d queries, " "%d fail memrss:%d user:%f sys:%f\n", conn_counter, conn_errors, query_counter, query_errors, resource.getrusage( resource.RUSAGE_SELF).ru_maxrss, resource.getrusage( resource.RUSAGE_SELF).ru_utime, resource.getrusage(resource.RUSAGE_SELF).ru_stime) # try to keep activities on the same starting second: sleep_time = 60 - ((int(time.time()) - start_time) % 60) LOGGER.debug("Sleeping for %d seconds\n", sleep_time) time.sleep(sleep_time) con_mins = con_mins + 1 # not really mins since the checks could # have taken longer than 1 minute to complete # end while True # except (db_driver.DatabaseError, socket.timeout, ConnectionResetError) as dberr: except Exception as dberr: err_code, err_msg = driver_errors.db_errorcode(db_driver, dberr) elapsed_s = timer() - _start to_outfile(_config, ME + "[connect,status]", err_code) if not driver_errors.db_error_needs_new_session(db_driver, err_code): # from a killed session, crashed instance or similar conn_errors += 1 if prev_err != err_code: sleep_c = 0 sleep_s = 1 prev_err = err_code sleep_c += 1 if sleep_c >= 10: if sleep_s <= 301: # don't sleep longer than 5 mins after connect failures sleep_s += 10 sleep_c = 0 LOGGER.warning('(%d.%d)connection error: [%s] %s for %s@%s\n', sleep_c, sleep_s, err_code, err_msg.strip().replace('\n', ' ').replace('\r', ' '), _config['username'], _config['db_url']) # set_trace() time.sleep(sleep_s) except (KeyboardInterrupt, SystemExit): exit(0)
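A compact sketch of the housekeeping done at the end of each polling cycle above: collect only when an hour boundary relative to start_time is crossed, then sleep so the next cycle starts on the same second of the minute:

import gc
import time

def end_of_cycle(start_time):
    if ((int(time.time()) - start_time) % 3600) == 0:
        print("gc.collect() ->", gc.collect())
    time.sleep(60 - ((int(time.time()) - start_time) % 60))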
def load_scripts(reload_scripts=False, refresh_scripts=False): """ Load scripts and run each modules register function. :arg reload_scripts: Causes all scripts to have their unregister method called before loading. :type reload_scripts: bool :arg refresh_scripts: only load scripts which are not already loaded as modules. :type refresh_scripts: bool """ use_time = use_class_register_check = _bpy.app.debug_python use_user = not _is_factory_startup if use_time: import time t_main = time.time() loaded_modules = set() if refresh_scripts: original_modules = _sys.modules.values() if reload_scripts: # just unload, don't change user defaults, this means we can sync # to reload. note that they will only actually reload of the # modification time changes. This `won't` work for packages so... # its not perfect. for module_name in [ext.module for ext in _user_preferences.addons]: _addon_utils.disable(module_name) # *AFTER* unregistering all add-ons, otherwise all calls to # unregister_module() will silently fail (do nothing). _bpy_types.TypeMap.clear() def register_module_call(mod): register = getattr(mod, "register", None) if register: try: register() except: import traceback traceback.print_exc() else: print("\nWarning! '%s' has no register function, " "this is now a requirement for registerable scripts" % mod.__file__) def unregister_module_call(mod): unregister = getattr(mod, "unregister", None) if unregister: try: unregister() except: import traceback traceback.print_exc() def test_reload(mod): import importlib # reloading this causes internal errors # because the classes from this module are stored internally # possibly to refresh internal references too but for now, best not to. if mod == _bpy_types: return mod try: return importlib.reload(mod) except: import traceback traceback.print_exc() def test_register(mod): if refresh_scripts and mod in original_modules: return if reload_scripts and mod: print("Reloading:", mod) mod = test_reload(mod) if mod: register_module_call(mod) _global_loaded_modules.append(mod.__name__) if reload_scripts: # module names -> modules _global_loaded_modules[:] = [_sys.modules[mod_name] for mod_name in _global_loaded_modules] # loop over and unload all scripts _global_loaded_modules.reverse() for mod in _global_loaded_modules: unregister_module_call(mod) for mod in _global_loaded_modules: test_reload(mod) del _global_loaded_modules[:] from bpy_restrict_state import RestrictBlend with RestrictBlend(): for base_path in script_paths(use_user=use_user): for path_subdir in _script_module_dirs: path = _os.path.join(base_path, path_subdir) if _os.path.isdir(path): _sys_path_ensure(path) # only add this to sys.modules, don't run if path_subdir == "modules": continue for mod in modules_from_path(path, loaded_modules): test_register(mod) # load template (if set) if any(_bpy.utils.app_template_paths()): import bl_app_template_utils bl_app_template_utils.reset(reload_scripts=reload_scripts) del bl_app_template_utils # deal with addons separately _initialize = getattr(_addon_utils, "_initialize", None) if _initialize is not None: # first time, use fast-path _initialize() del _addon_utils._initialize else: _addon_utils.reset_all(reload_scripts=reload_scripts) del _initialize # run the active integration preset filepath = preset_find(_user_preferences.inputs.active_keyconfig, "keyconfig") if filepath: keyconfig_set(filepath) if reload_scripts: import gc print("gc.collect() -> %d" % gc.collect()) if use_time: print("Python Script Load Time %.4f" % (time.time() - t_main)) if 
use_class_register_check: for cls in _bpy.types.bpy_struct.__subclasses__(): if getattr(cls, "is_registered", False): for subcls in cls.__subclasses__(): if not subcls.is_registered: print( "Warning, unregistered class: %s(%s)" % (subcls.__name__, cls.__name__) )
def kfold_lightgbm(df, debug=False): # Divide into training/validation and test data train_df = df[df['TARGET'].notnull()] test_df = df[df['TARGET'].isnull()] print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape)) del df gc.collect() folds = KFold(n_splits=10, shuffle=True, random_state=1001) # Create arrays and dataframes to store results oof_preds = np.zeros(train_df.shape[0]) # predicted valid_y sub_preds = np.zeros(test_df.shape[0]) # submission preds feature_importance_df = pd.DataFrame() # feature importance fold_auc_best_df = pd.DataFrame(columns=["FOLD", "AUC", "BEST_ITER"]) # holds the best iteration so the final model can be refit feats = [f for f in train_df.columns if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index', "APP_index", "BURO_index", "PREV_index", "INSTAL_index", "CC_index", "POS_index"]] for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])): train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx] valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx] # LightGBM parameters found by Bayesian optimization clf = LGBMClassifier( n_jobs=-1, n_estimators=10000, learning_rate=0.02, num_leaves=34, colsample_bytree=0.9497036, subsample=0.8715623, max_depth=8, reg_alpha=0.041545473, reg_lambda=0.0735294, min_split_gain=0.0222415, min_child_weight=39.3259775, silent=-1, verbose=-1, ) clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric='auc', verbose=200, early_stopping_rounds=200) # predicted valid_y oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1] # submission preds: predict the test set for each fold and average over all folds sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits # fold, auc and best iteration print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx]))) # best auc & iteration fold_auc_best_df = fold_auc_best_df.append({'FOLD': int(n_fold + 1), 'AUC': roc_auc_score(valid_y, oof_preds[valid_idx]), "BEST_ITER": clf.best_iteration_}, ignore_index=True) fold_importance_df = pd.DataFrame() fold_importance_df["feature"] = feats fold_importance_df["importance"] = clf.feature_importances_ fold_importance_df["fold"] = n_fold + 1 feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0) del clf, train_x, train_y, valid_x, valid_y gc.collect() # OUTPUTS print(fold_auc_best_df) print(feature_importance_df) # save the feature importances as a DataFrame feature_importance_df.to_pickle("outputs/features/feature_importance_df.pkl") fold_auc_best_df.to_pickle("outputs/features/fold_auc_best_df.pkl") # Final Model best_iter_1 = int(fold_auc_best_df.sort_values(by="AUC", ascending=False)[:1]["BEST_ITER"].values) y_train = train_df["TARGET"] x_train = train_df[feats] final_model = LGBMClassifier( n_jobs=-1, n_estimators=best_iter_1, learning_rate=0.02, num_leaves=34, colsample_bytree=0.9497036, subsample=0.8715623, max_depth=8, reg_alpha=0.041545473, reg_lambda=0.0735294, min_split_gain=0.0222415, min_child_weight=39.3259775, silent=-1, verbose=-1).fit(x_train, y_train) cur_dir = os.getcwd() os.chdir('models/reference/') pickle.dump(final_model, open("lightgbm_final_model.pkl", 'wb')) # model os.chdir(cur_dir) # the valid_y values predicted for each fold are in fact predictions for different slices of the train set's y.
cowsay.cow('Full Train(Validation) AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds)) # Write submission file and plot feature importance if not debug: cur_dir = os.getcwd() os.chdir('outputs/predictions/') test_df['TARGET'] = sub_preds test_df[['SK_ID_CURR', 'TARGET']].to_csv("reference_submission.csv", index=False) os.chdir(cur_dir) display_importances(feature_importance_df) del x_train, y_train return feature_importance_df
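A minimal sketch of the out-of-fold / averaged-test-prediction scheme kfold_lightgbm implements, with a plain scikit-learn classifier standing in for LGBMClassifier:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

def kfold_oof(X_train, y_train, X_test, n_splits=5):
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=1001)
    oof_preds = np.zeros(X_train.shape[0])   # out-of-fold predictions
    sub_preds = np.zeros(X_test.shape[0])    # averaged test predictions
    for train_idx, valid_idx in folds.split(X_train):
        clf = LogisticRegression(max_iter=1000)
        clf.fit(X_train[train_idx], y_train[train_idx])
        oof_preds[valid_idx] = clf.predict_proba(X_train[valid_idx])[:, 1]
        sub_preds += clf.predict_proba(X_test)[:, 1] / n_splits
    return oof_preds, sub_preds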
def get_autoencoder(input_size, latent_dim, data): learning_rate = 0.00001 autoencoder = build(input_size, latent_dim) autoencoder.compile(loss="mse", optimizer=tf.keras.optimizers.Adam(learning_rate)) encoder = autoencoder.get_layer("encoder") cell_decoders = {} cell_discriminators = {} discriminator = make_discriminator_model(input_size) discriminator.compile(loss="mse", optimizer=tf.keras.optimizers.Adam(learning_rate)) reconstruction_list = np.zeros((0, 978, 1)) count = 0 e = 0 if not os.path.exists("best"): os.makedirs("best") if not os.path.exists("weights"): os.makedirs("weights") while e < nb_total_epoch: print("Total epoch " + str(e) + " ------------------------------------------------------") if e > 0: autoencoder_saved = keras.models.load_model("./weights/main_model") autoencoder = build(input_size, latent_dim) autoencoder.set_weights(autoencoder_saved.get_weights()) autoencoder.compile(loss="mse", optimizer=tf.keras.optimizers.Adam(learning_rate)) del autoencoder_saved discriminator = make_discriminator_model(input_size) encoder = autoencoder.get_layer("encoder") if e == 0: print("Main autoencoder") # autoencoder = keras.models.load_model("default_autoencoder") callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True) autoencoder.fit(data.train_data, data.train_data, epochs=nb_autoencoder_epoch, batch_size=batch_size, validation_split=0.1, callbacks=[callback]) autoencoder.save("default_autoencoder") for cell in data.cell_types: decoder = autoencoder.get_layer("decoder") cell_decoders[cell] = decoder.get_weights().copy() cell_discriminators[cell] = discriminator.get_weights().copy() pickle.dump(cell_decoders[cell], open("./weights/" + cell + "_decoder_weights", "wb")) del decoder print("Training decoders") decoder = autoencoder.get_layer("decoder") count_im = 0 for pert in data.train_perts: cell = random.choice(list(data.cell_types)) decoder.set_weights(cell_decoders[cell]) discriminator.set_weights(cell_discriminators[cell]) pert_profiles = np.asarray([data.train_data[i] for i, p in enumerate(data.train_meta) if p[1] == pert]) target_profiles = [data.train_data[i] for i, p in enumerate(data.train_meta) if p[1] == pert and p[0] == cell] while len(target_profiles) < len(pert_profiles): target_profiles.append(target_profiles[0]) target_profiles = np.asarray(target_profiles) if count_im < 5: z_mean, z_log_var, z = encoder.predict(pert_profiles) utils1.draw_vectors(z, "vectors/" + pert + "_1.png") train_step(autoencoder, discriminator, pert_profiles, target_profiles, e) if count_im < 5: z_mean, z_log_var, z = encoder.predict(pert_profiles) utils1.draw_vectors(z, "vectors/" + pert + "_2.png") count_im = count_im + 1 cell_decoders[cell] = decoder.get_weights().copy() cell_discriminators[cell] = discriminator.get_weights().copy() if e == nb_total_epoch - 1: print("freezing encoder") encoder.trainable = False decoder.trainable = True autoencoder.compile(loss="mse", optimizer=tf.keras.optimizers.Adam(0.00001)) cl = list(data.cell_types) random.shuffle(cl) for cell in cl: print(cell) decoder.set_weights(cell_decoders[cell]) tf.random.set_seed(1) cell_data = np.asarray([[data.train_data[i], data.train_meta[i]] for i, p in enumerate(data.train_meta) if p[0] == cell]) if len(cell_data) == 0: continue input_profiles = [] output_profiles = [] for i in range(len(cell_data)): # input_profiles.append(cell_data[i][0]) # output_profiles.append(cell_data[i][0]) closest, profile, mean_profile, all_profiles = data.get_profile(data.train_data, 
data.meta_dictionary_pert[ cell_data[i][1][1]], cell_data[i][1], train_data=True) if mean_profile is not None: for p in all_profiles: input_profiles.append(p) output_profiles.append(cell_data[i][0]) input_profiles = np.asarray(input_profiles) output_profiles = np.asarray(output_profiles) if e == nb_total_epoch - 1: cell_data_val = np.asarray([[data.val_data[i], data.val_meta[i]] for i, p in enumerate(data.val_meta) if p[0] == cell]) input_profiles_val = [] output_profiles_val = [] for i in range(len(cell_data_val)): closest, profile, mean_profile, all_profiles = data.get_profile(data.val_data, data.meta_dictionary_pert_val[ cell_data_val[i][1][1]], cell_data_val[i][1]) if mean_profile is not None: for p in all_profiles: input_profiles_val.append(p) output_profiles_val.append(cell_data_val[i][0]) input_profiles_val = np.asarray(input_profiles_val) output_profiles_val = np.asarray(output_profiles_val) callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True) autoencoder.fit(input_profiles, output_profiles, epochs=nb_frozen_epoch, batch_size=batch_size, validation_data=(input_profiles_val, output_profiles_val), callbacks=[callback]) else: discriminator.set_weights(cell_discriminators[cell]) fake_data = autoencoder.predict(input_profiles) if len(reconstruction_list) < 10000: reconstruction_list = np.append(reconstruction_list, fake_data, axis=0) else: start = random.randint(0, len(reconstruction_list) - 1 - len(fake_data)) reconstruction_list[start:start + len(fake_data)] = fake_data np.random.shuffle(reconstruction_list) for d_epochs in range(10): total = int(math.ceil(float(len(input_profiles)) / batch_size)) for i in range(total): output_data = output_profiles[i * batch_size:(i + 1) * batch_size] reconstruction_data = reconstruction_list[np.random.choice(reconstruction_list.shape[0], batch_size, replace=False)] train_step_d(discriminator, output_data, reconstruction_data) cell_discriminators[cell] = discriminator.get_weights().copy() fake_data = autoencoder.predict(input_profiles) r = 0 f_new = 0 a = discriminator.predict(output_profiles) for v in a: if v > 0.5: r = r + 1 a = discriminator.predict(fake_data) for v in a: if v > 0.5: f_new = f_new + 1 print(str(d_epochs) + " discriminator " + str(r) + " : " + str(f_new) + " - " + str(len(input_profiles))) # # tf.random.set_seed(1) cell_decoders[cell] = decoder.get_weights().copy() gc.collect() print("---------------------------------------------------------------\n") # train_cor_sum = 0.0 # train_count = 0 # seen_perts = [] # for i in range(len(data.train_data)): # train_meta_object = data.train_meta[i] # if train_meta_object[1] in seen_perts: # continue # closest, closest_profile, mean_profile, all_profiles = data.get_profile(data.train_data, # data.meta_dictionary_pert[ # train_meta_object[1]], # train_meta_object, train_data=True) # if closest_profile is None: # continue # seen_perts.append(train_meta_object[1]) # train_count = train_count + 1 # weights = cell_decoders[train_meta_object[0]] # autoencoder.get_layer("decoder").set_weights(weights) # decoded1 = autoencoder.predict(closest_profile) # train_cor_sum = train_cor_sum + stats.pearsonr(decoded1.flatten(), data.train_data[i].flatten())[0] # train_cor = train_cor_sum / train_count # print("Training pcc: " + str(train_cor)) # print("Evaluated:" + str(train_count)) val_cor_sum = 0.0 val_count = 0 seen_perts = [] disc_fake = 0 disc_real = 0 for i in range(len(data.val_data)): val_meta_object = data.val_meta[i] if val_meta_object[1] in 
seen_perts: continue closest, closest_profile, mean_profile, all_profiles = data.get_profile(data.val_data, data.meta_dictionary_pert_val[ val_meta_object[1]], val_meta_object) if closest_profile is None: continue seen_perts.append(val_meta_object[1]) val_count = val_count + 1 weights = cell_decoders[val_meta_object[0]] autoencoder.get_layer("decoder").set_weights(weights) predictions = [] for p in all_profiles: predictions.append(autoencoder.predict(np.asarray([p]))) special_decoded = np.mean(np.asarray(predictions), axis=0) val_cor_sum = val_cor_sum + stats.pearsonr(special_decoded.flatten(), data.val_data[i].flatten())[0] discriminator.set_weights(cell_discriminators[val_meta_object[0]]) if discriminator.predict(special_decoded)[0, 0] > 0.5: disc_fake = disc_fake + 1 if discriminator.predict(np.asarray([data.val_data[i]]))[0, 0] > 0.5: disc_real = disc_real + 1 val_cor = val_cor_sum / val_count print("Validation pcc: " + str(val_cor)) print("Evaluated:" + str(val_count)) print("Discriminator " + str(disc_fake) + " : " + str(disc_real)) if e == 0: best_val_cor = val_cor else: if val_cor < best_val_cor: count = count + 1 else: best_val_cor = val_cor count = 0 autoencoder.save("best/main_model") for cell in data.cell_types: pickle.dump(cell_decoders[cell], open("best/" + cell + "_decoder_weights", "wb")) if count > 40: e = nb_total_epoch - 2 count = 0 for cell in data.cell_types: cell_decoders[cell] = pickle.load(open("best/" + cell + "_decoder_weights", "rb")) shutil.rmtree('weights') shutil.move('best', 'weights') autoencoder.save("weights/main_model") for cell in data.cell_types: pickle.dump(cell_decoders[cell], open("weights/" + cell + "_decoder_weights", "wb")) # Needed to prevent Keras memory leak del autoencoder del encoder del discriminator gc.collect() K.clear_session() tf.compat.v1.reset_default_graph() print("---------------------------------------------------------------\n") e = e + 1 autoencoder = keras.models.load_model("weights/main_model") return autoencoder, cell_decoders, val_cor
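The end-of-epoch cleanup above ("Needed to prevent Keras memory leak") boils down to the following sequence; the tiny model is a placeholder:

import gc
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K

model = keras.Sequential([keras.layers.Dense(4, input_shape=(8,))])
# ... train / predict with the model ...
del model                            # drop the Python reference
gc.collect()                         # reclaim cyclic garbage
K.clear_session()                    # reset Keras global state
tf.compat.v1.reset_default_graph()   # clear the legacy default graph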
def main(params: dict, output_dir: str): import mlflow print("start params={}".format(params)) model_id = "all" logger = get_logger() df = pd.read_pickle( "../input/riiid-test-answer-prediction/train_merged.pickle") # df = pd.read_pickle("../input/riiid-test-answer-prediction/split10/train_0.pickle").sort_values(["user_id", "timestamp"]).reset_index(drop=True) if is_debug: df = df.head(30000) df["prior_question_had_explanation"] = df[ "prior_question_had_explanation"].fillna(-1) df["answered_correctly"] = df["answered_correctly"].replace(-1, np.nan) column_config = { ("content_id", "content_type_id"): { "type": "category" }, "user_answer": { "type": "leakage_feature" }, "answered_correctly": { "type": "leakage_feature" }, "part": { "type": "category" }, "prior_question_elapsed_time_bin300": { "type": "category" }, "duration_previous_content_bin300": { "type": "category" }, "prior_question_had_explanation": { "type": "category" }, "rating_diff_content_user_id": { "type": "numeric" }, "task_container_id_bin300": { "type": "category" }, "previous_answer_index_question_id": { "type": "category" }, "previous_answer_question_id": { "type": "category" }, "timediff-elapsedtime_bin500": { "type": "category" }, "timedelta_log10": { "type": "category" } } if not load_pickle or is_debug: feature_factory_dict = {"user_id": {}} feature_factory_dict["user_id"][ "DurationPreviousContent"] = DurationPreviousContent( is_partial_fit=True) feature_factory_dict["user_id"][ "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder() feature_factory_dict["user_id"][ "UserContentRateEncoder"] = UserContentRateEncoder( rate_func="elo", column="user_id") feature_factory_dict["user_id"]["PreviousAnswer2"] = PreviousAnswer2( groupby="user_id", column="question_id", is_debug=is_debug, model_id=model_id, n=300) feature_factory_dict["user_id"][ "StudyTermEncoder2"] = StudyTermEncoder2(is_partial_fit=True) feature_factory_dict["user_id"][ f"MeanAggregatorStudyTimebyUserId"] = MeanAggregator( column="user_id", agg_column="study_time", remove_now=False) feature_factory_dict["user_id"][ "ElapsedTimeMeanByContentIdEncoder"] = ElapsedTimeMeanByContentIdEncoder( ) feature_factory_dict["post"] = { "DurationFeaturePostProcess": DurationFeaturePostProcess() } feature_factory_manager = FeatureFactoryManager( feature_factory_dict=feature_factory_dict, logger=logger, split_num=1, model_id=model_id, load_feature=not is_debug, save_feature=not is_debug) print("all_predict") df = feature_factory_manager.all_predict(df) def f(x): x = x // 1000 if x < -100: return -100 if x > 400: return 400 return x df["task_container_id_bin300"] = [ x if x < 300 else 300 for x in df["task_container_id"] ] df["timediff-elapsedtime_bin500"] = [ f(x) for x in df["timediff-elapsedtime"].values ] df["timedelta_log10"] = np.log10( df["duration_previous_content"].values) df["timedelta_log10"] = df["timedelta_log10"].replace( -np.inf, -1).replace(np.inf, -1).fillna(-1).astype("int8") df = df[[ "user_id", "content_id", "content_type_id", "part", "user_answer", "answered_correctly", "prior_question_elapsed_time_bin300", "duration_previous_content_bin300", "prior_question_had_explanation", "rating_diff_content_user_id", "task_container_id_bin300", "previous_answer_index_question_id", "previous_answer_question_id", "row_id", "timediff-elapsedtime_bin500", "timedelta_log10" ]] print(df.head(10)) print("data preprocess") ff_for_transformer = FeatureFactoryForTransformer( column_config=column_config, dict_path="../feature_engineering/", 
sequence_length=params["max_seq"], logger=logger) ff_for_transformer.make_dict(df=df) n_skill = len(ff_for_transformer.embbed_dict[("content_id", "content_type_id")]) if not load_pickle or is_debug: df_val_row = pd.read_feather( "../input/riiid-test-answer-prediction/train_transformer_last2500k_only_row_id.feather" ) if is_debug: df_val_row = df_val_row.head(3000) df_val_row["is_val"] = 1 df = pd.merge(df, df_val_row, how="left", on="row_id") df["is_val"] = df["is_val"].fillna(0) print(df["is_val"].value_counts()) w_df = df[df["is_val"] == 0] w_df["group"] = ( w_df.groupby("user_id")["user_id"].transform("count") - w_df.groupby("user_id").cumcount()) // params["max_seq"] w_df["user_id"] = w_df["user_id"].astype( str) + "_" + w_df["group"].astype(str) group = ff_for_transformer.all_predict(w_df) dataset_train = SAKTDataset(group, n_skill=n_skill, max_seq=params["max_seq"]) del w_df gc.collect() ff_for_transformer = FeatureFactoryForTransformer( column_config=column_config, dict_path="../feature_engineering/", sequence_length=params["max_seq"], logger=logger) if not load_pickle or is_debug: group = ff_for_transformer.all_predict(df[df["content_type_id"] == 0]) dataset_val = SAKTDataset(group, is_test=True, n_skill=n_skill, max_seq=params["max_seq"]) os.makedirs("../input/feature_engineering/model275_all", exist_ok=True) if not is_debug and not load_pickle: with open(f"../input/feature_engineering/model275_all/train.pickle", "wb") as f: pickle.dump(dataset_train, f) with open(f"../input/feature_engineering/model275_all/val.pickle", "wb") as f: pickle.dump(dataset_val, f) if not is_debug and load_pickle: with open(f"../input/feature_engineering/model275_all/train.pickle", "rb") as f: dataset_train = pickle.load(f) with open(f"../input/feature_engineering/model275_all/val.pickle", "rb") as f: dataset_val = pickle.load(f) print("loaded!") dataloader_train = DataLoader(dataset_train, batch_size=params["batch_size"], shuffle=True) dataloader_val = DataLoader(dataset_val, batch_size=params["batch_size"], shuffle=False) model = SAKTModel(n_skill, embed_dim=params["embed_dim"], max_seq=params["max_seq"], dropout=dropout, cont_emb=params["cont_emb"]) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.2 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW( optimizer_grouped_parameters, lr=params["lr"], weight_decay=0.2, ) num_train_optimization_steps = int(len(dataloader_train) * 25) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=params["num_warmup_steps"], num_training_steps=num_train_optimization_steps) criterion = nn.BCEWithLogitsLoss() model.to(device) criterion.to(device) auc_val = 0 for epoch in range(epochs): loss, acc, auc, auc_val = train_epoch(model, dataloader_train, dataloader_val, optimizer, criterion, scheduler, epoch, output_dir, device) print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}". 
format(epoch, loss, auc, auc_val)) torch.save( model.state_dict(), f"{output_dir}/transformers_epoch{epoch}_auc{round(auc_val, 4)}.pth" ) # df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False) """ df_oof2 = pd.read_csv("../output/ex_237/20201213110353/oof_train_0_lgbm.csv") df_oof2.columns = ["row_id", "predict_lgbm", "target"] df_oof2 = pd.merge(df_oof, df_oof2, how="inner") auc_lgbm = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values) print("lgbm: {:.4f}".format(auc_lgbm)) print("ensemble") max_auc = 0 max_nn_ratio = 0 for r in np.arange(0, 1.05, 0.05): auc = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values*(1-r) + df_oof2["predict"].values*r) print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc)) if max_auc < auc: max_auc = auc max_nn_ratio = r print(len(df_oof2)) """ if not is_debug: mlflow.start_run(experiment_id=10, run_name=os.path.basename(__file__)) for key, value in params.items(): mlflow.log_param(key, value) mlflow.log_metric("auc_val", auc_val) mlflow.end_run() torch.save(model.state_dict(), f"{output_dir}/transformers.pth") del model torch.cuda.empty_cache() with open(f"{output_dir}/transformer_param.json", "w") as f: json.dump(params, f) if is_make_feature_factory: # feature factory feature_factory_dict = {"user_id": {}} feature_factory_dict["user_id"][ "DurationPreviousContent"] = DurationPreviousContent( is_partial_fit=True) feature_factory_dict["user_id"][ "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder() feature_factory_manager = FeatureFactoryManager( feature_factory_dict=feature_factory_dict, logger=logger, split_num=1, model_id="all", load_feature=not is_debug, save_feature=not is_debug) ff_for_transformer = FeatureFactoryForTransformer( column_config=column_config, dict_path="../feature_engineering/", sequence_length=params["max_seq"], logger=logger) df = pd.read_pickle( "../input/riiid-test-answer-prediction/train_merged.pickle") if is_debug: df = df.head(10000) df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True) feature_factory_manager.fit(df) df = feature_factory_manager.all_predict(df) for dicts in feature_factory_manager.feature_factory_dict.values(): for factory in dicts.values(): factory.logger = None feature_factory_manager.logger = None with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f: pickle.dump(feature_factory_manager, f) ff_for_transformer.fit(df) ff_for_transformer.logger = None with open( f"{output_dir}/feature_factory_manager_for_transformer.pickle", "wb") as f: pickle.dump(ff_for_transformer, f)
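A minimal sketch of the weight-decay grouping used when the optimizer is built above: bias and LayerNorm parameters are excluded from decay. TinyModel is a placeholder whose attribute names are chosen to match the filter strings:

import torch.nn as nn
from torch.optim import AdamW

class TinyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(16, 16)
        self.LayerNorm = nn.LayerNorm(16)

    def forward(self, x):
        return self.LayerNorm(self.linear(x))

model = TinyModel()
no_decay = ["bias", "LayerNorm.weight"]
grouped_parameters = [
    {"params": [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)], "weight_decay": 0.2},
    {"params": [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]
optimizer = AdamW(grouped_parameters, lr=1e-3)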