def index():
    if 'id' in session:
        res = db.select('user', id=session['id'])[0]
        user = {v: res[k] for k, v in enumerate(config.user_fields)}
        utils.writelog('user').info('"INFO: %s is visiting"' % user['username'])
        return render_template('/common/index.html', user=user, role=session['role'])
    return render_template('/common/login.html')
def delete(table, id):
    sql = 'DELETE FROM {} WHERE id = {};'.format(table, id)
    try:
        cursor.execute(sql)
        conn.commit()
    except Exception:
        utils.writelog('db').error('"Exec: %s, Error: %s"' % (sql, traceback.format_exc()))
    return cursor.rowcount
def update(table, condition, id):
    sql = 'UPDATE {} SET {} WHERE id = {};'.format(table, ','.join(condition), id)
    try:
        cursor.execute(sql)
        conn.commit()
    except Exception:
        utils.writelog('db').error('"Exec: %s, Error: %s"' % (sql, traceback.format_exc()))
    return cursor.rowcount
def add(table, data):
    sql = 'INSERT INTO {} ({}) VALUES({});'.format(
        table,
        ','.join(data.keys()),
        ','.join(['"%s"' % v for v in data.values()]))
    try:
        cursor.execute(sql)
        conn.commit()
    except Exception:
        utils.writelog('db').error('"Exec: %s, Error: %s"' % (sql, traceback.format_exc()))
    return cursor.rowcount
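# The CRUD helpers above interpolate values directly into the SQL string, which breaks on
# quotes and is injectable. A minimal parameterized sketch of add(), assuming the same
# module-level `cursor`/`conn` from the db module; add_parameterized is a hypothetical name:
def add_parameterized(table, data):
    """Hypothetical variant of add() that binds values via placeholders instead of string formatting."""
    columns = ','.join(data.keys())
    placeholders = ','.join(['%s'] * len(data))
    sql = 'INSERT INTO {} ({}) VALUES ({});'.format(table, columns, placeholders)
    cursor.execute(sql, tuple(data.values()))  # values are bound by the driver, not quoted by hand
    conn.commit()
    return cursor.rowcount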
def letssee(self, site_url):
    self.driver.get(site_url)
    time.sleep(5)
    if self.driver.find_elements_by_css_selector(page_selector):
        # Grab the QR code
        img = self.driver.find_element_by_css_selector(qr_selector)
        timestamp = time.strftime('%d-%m-%Y %H-%M-%S')
        src = img.get_attribute('src')
        qr_image_url = '/static/qr' + timestamp + '.png'
        urllib.request.urlretrieve(src, os.getcwd() + qr_image_url)
        # Wait up to 120 seconds for the QR code to disappear
        print('Preluam codul qr')
        try:
            writelog('Nu este logat contul, Scanati codul qr : Aveti la dispozitie 2 minute ' + self.modem_name, '3')
            print('Nu este logat contul, Scanati codul qr : Aveti la dispozitie 2 minute')
            no_login = WebDriverWait(self.driver, 10).until(
                EC.invisibility_of_element_located((By.CSS_SELECTOR, qr_selector))
            )
            try:
                # Wait for the loader element to disappear
                startpage = WebDriverWait(self.driver, 10).until(
                    EC.invisibility_of_element_located((By.ID, 'loader'))
                )
                # print('Am gasit elementul' + str(startpage))
                print('Pagina este gata')
            except TimeoutException:
                writelog('Pagina nu este gata: ' + self.modem_name + ' se inchide', '3')
                print('Pagina nu este gata')
                self.driver.quit()
        except TimeoutException:
            writelog('Prima pagina nu a fost incarcata: ' + self.modem_name + ' se inchide', '3')
            print('Prima pagina nu a fost incarcata')
            self.driver.quit()
    else:
        # No QR code: the account is already logged in, so just wait for the loader element to disappear
        try:
            startpage = WebDriverWait(self.driver, 10).until(
                EC.invisibility_of_element_located((By.ID, loader_selector))
            )
            # print('Am gasit elementul' + str(startpage))
            print('Pagina este gata')
        except TimeoutException:
            writelog('Pagina nu este gata: ' + self.modem_name + ' se inchide', '3')
            print('Pagina nu este gata')
    print('Am terminat initializarea')
    writelog('Modemul: ' + self.modem_name + ' a fost initializat', '3')
    modem1 = Modem.query.filter_by(name=self.modem_name).first()
    modem1.status = 'running'
    db.session.commit()
def userinfo():
    if request.args.get('id'):
        user = db.select('user', id=request.args.get('id'))[0]
        user = {v: user[k] for k, v in enumerate(config.user_fields)}
        utils.writelog('user').info("INFO: {} is changing {}'s info".format(session['username'], user['username']))
        return json.dumps({'result': user})
    else:
        role = session.get('role')
        res = db.select('user', id=session['id'])[0]
        user = {v: res[k] for k, v in enumerate(config.user_fields)}
        utils.writelog('user').info("INFO: %s is viewing his info" % user['username'])
        return render_template("/user/userinfo.html", user=user, role=role)
def collect_tweets(task, tags):
    '''
    Collect tweets for tags, indefinitely, and store them in csv files
    '''
    appKeys = kays.appKeys
    with fopen(task, newline='\n', encoding='utf-8') as f:
        keyIdx = 0
        tagIdx = 0
        # writer for csv
        writer = csv.writer(f)
        # save task to log
        writelog(task, tags)
        # collect tweets indefinitely by cycling through all keys
        while True:
            print(time.ctime(), 'Collecting tweets...')
            # get the key
            key = appKeys[keyIdx]
            # create auth and api
            auth = tweepy.OAuthHandler(key['consumerAPIKey'], key['consumerAPISecretKey'])
            auth.set_access_token(key['accessToken'], key['accessTokenSecret'])
            api = tweepy.API(auth)
            # filter out retweets
            query = tags[tagIdx] + ' -filter:retweets'
            count = 0
            # collect tweets and save
            try:
                for tweet in tweepy.Cursor(api.search, q=query).items():
                    user = tweet.user
                    # escape text
                    row = map(esc, [tweet.text, tweet.id, user.name, user.screen_name,
                                    user.location, user.description, user.followers_count,
                                    user.friends_count, user.listed_count, user.statuses_count,
                                    user.favourites_count, user.verified,
                                    user.default_profile_image, user.default_profile,
                                    user.protected, user.created_at])
                    writer.writerow(row)
                    count = count + 1
            except Exception as e:
                # Wait for 10 mins and then start using the next key
                print(time.ctime(), 'Got {} tweets'.format(count))
            # if keyIdx+1 == len(appKeys):
            tagIdx = (tagIdx + 1) % len(tags)
            keyIdx = (keyIdx + 1) % len(appKeys)
            time.sleep(10 * 60)
def select(table, *fields, **data):
    data = ' and '.join({'%s="%s"' % (k, v) for k, v in data.items()})
    fields = ','.join(fields)
    if data and fields:
        sql = 'SELECT {} FROM {} WHERE {};'.format(fields, table, data)
    elif not data and fields:
        sql = 'SELECT {} FROM {};'.format(fields, table)
    elif not fields and data:
        sql = 'SELECT * FROM {} WHERE {};'.format(table, data)
    else:
        sql = 'SELECT * FROM {};'.format(table)
    try:
        cursor.execute(sql)
        result = cursor.fetchall()
        return result
    except Exception:
        utils.writelog('db').error('"Exec: %s, Error: %s"' % (sql, traceback.format_exc()))
def api_create_mesaj(current_user):
    data = request.get_json()
    if data['name'] == '' or data['mesaj'] == '':
        return jsonify({'message': 'Aveti campuri goale'})
    elif len(data['telefon']) != 10:
        return jsonify({'message': 'Telefonul trebuie sa contina 10 caractere'})
    else:
        mesaj_lenght = len(data['mesaj'])
        print(mesaj_lenght)
        mesaj_count = math.ceil(mesaj_lenght / 160)
        new_mesaj = Mesaj(name=data['name'],
                          telefon=data['telefon'],
                          mesaj=data['mesaj'],
                          mesaj_count=mesaj_count,
                          mesaj_lenght=mesaj_lenght,
                          user_id=current_user.id,
                          is_sent=False)
        db.session.add(new_mesaj)
        db.session.commit()
        writelog('Mesaj adaugat prin api', current_user.id)
        return jsonify({'message': 'Message sent'})
def handleHMIRequest():
    waterlevel = int(request.form.get('jwttoken'))
    plaintext, sig = request.form.get('HMIquery').split(',')
    print("request = ", plaintext, sig, waterlevel)
    key = "c" * 16  # tmp
    res = {"errmsg": "", "data": "123"}
    if utils.authcheck(plaintext, sig, waterlevel) == 0:
        res['errmsg'] = "Authentication failed"
        utils.writelog("Authentication failed: %s" % (plaintext))
        return res
    else:
        modbusUtils.modbusSend(
            sock, modbusUtils.makeWriteSingleRegisterRequest(cm.memLoc, waterlevel))
        response = modbusUtils.parseWriteSingleRegisterResponse(
            modbusUtils.modbusRecv(sock))
        print("request response = ", response)
        modbusUtils.modbusSend(
            sock, modbusUtils.makeReadInputRegistersRequest(cm.revmemLoc, 1))
        response = modbusUtils.parseReadInputRegistersResponse(
            modbusUtils.modbusRecv(sock))
        print("response response = ", response)
        return jsonify(utils.AESEncrypt(str(waterlevel), key))
def api_login():
    auth = request.authorization
    add_ip = request.remote_addr
    if not auth or not auth.username or not auth.password:
        writelog(add_ip + ' Could not verify', 1)
        return make_response('Could not verify', 401,
                             {'WWW-Authenticate': 'Basic realm="Login required"'})
    user = User.query.filter_by(username=auth.username).first()
    if not user:
        writelog(add_ip + ' Could not verify', 1)
        return make_response('Could not verify', 401,
                             {'WWW-Authenticate': 'Basic realm="Login required"'})
    if check_password_hash(user.password, auth.password):
        token = jwt.encode({'public_id': user.public_id,
                            'exp': datetime.utcnow() + timedelta(minutes=360)},
                           app.config['SECRET_KEY'])
        writelog(add_ip + ' Am emis tokenul pentru ' + user.username, user.id)
        return jsonify({'token': token.decode('UTF-8')})
    writelog(add_ip + ' Could not verify', 1)
    return make_response('Could not verify', 401,
                         {'WWW-Authenticate': 'Basic realm="Login required"'})
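# A minimal client-side sketch of calling the login endpoint above. The route path
# (/api/login), the protected route, and the 'x-access-token' header are assumptions for
# illustration only; none of them appear in the snippet itself:
import requests

resp = requests.get('http://localhost:5000/api/login', auth=('alice', 'secret'))
token = resp.json()['token']
protected = requests.get('http://localhost:5000/api/mesaje',          # hypothetical protected route
                         headers={'x-access-token': token})           # hypothetical header name
print(protected.status_code)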
def kfold(n_splits, split_id, databatch):
    """
    :type n_splits: int
    :type split_id: int
    :type databatch: DataBatch
    :rtype: DataBatch, DataBatch
    """
    assert 0 <= split_id < n_splits

    kfold = KFold(n_splits=n_splits, random_state=1234, shuffle=True)
    indices_list = list(kfold.split(np.arange(len(databatch))))
    train_indices, dev_indices = indices_list[split_id]
    assert len(train_indices) + len(dev_indices) == len(databatch)

    train_databatch = utils.DataBatch(
        batch_edu_ids=databatch.batch_edu_ids[train_indices],
        batch_edus=databatch.batch_edus[train_indices],
        batch_edus_postag=databatch.batch_edus_postag[train_indices],
        batch_edus_head=databatch.batch_edus_head[train_indices],
        batch_sbnds=databatch.batch_sbnds[train_indices],
        batch_pbnds=databatch.batch_pbnds[train_indices],
        batch_nary_sexp=databatch.batch_nary_sexp[train_indices],
        batch_bin_sexp=databatch.batch_bin_sexp[train_indices],
        batch_arcs=databatch.batch_arcs[train_indices])
    dev_databatch = utils.DataBatch(
        batch_edu_ids=databatch.batch_edu_ids[dev_indices],
        batch_edus=databatch.batch_edus[dev_indices],
        batch_edus_postag=databatch.batch_edus_postag[dev_indices],
        batch_edus_head=databatch.batch_edus_head[dev_indices],
        batch_sbnds=databatch.batch_sbnds[dev_indices],
        batch_pbnds=databatch.batch_pbnds[dev_indices],
        batch_nary_sexp=databatch.batch_nary_sexp[dev_indices],
        batch_bin_sexp=databatch.batch_bin_sexp[dev_indices],
        batch_arcs=databatch.batch_arcs[dev_indices])

    utils.writelog("n_splits=%d" % n_splits)
    utils.writelog("split_id=%d" % split_id)
    utils.writelog("# of training instances=%d" % len(train_databatch))
    utils.writelog("# of development instances=%d" % len(dev_databatch))
    return train_databatch, dev_databatch
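# A minimal sketch of driving kfold() for full cross-validation. load_databatch is a
# hypothetical stand-in for whatever loader builds the utils.DataBatch used above:
n_splits = 5
databatch = load_databatch()  # hypothetical loader returning a utils.DataBatch
for split_id in range(n_splits):
    train_databatch, dev_databatch = kfold(n_splits, split_id, databatch)
    # ... train on train_databatch, evaluate on dev_databatch ...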
def login():
    if request.method == 'GET':
        return render_template('/common/login.html')
    user_dict = {k: v[0] for k, v in dict(request.form).items()}
    if user_dict['username'] and user_dict['password']:
        res = db.select('user', **user_dict)
        if res:
            user = {v: res[0][k] for k, v in enumerate(config.user_fields)}
            session['role'] = user['role']
            session['id'] = user['id']
            session['username'] = user['username']
            utils.writelog('user').info('"INFO: %s login success"' % user['username'])
            return json.dumps({'code': '1', 'result': 'login success'})
        else:
            utils.writelog('user').error('"INFO: %s login Failed"' % user_dict['username'])
            return json.dumps({'code': '0', 'result': 'name or password wrong'})
    else:
        utils.writelog('user').error('"INFO: %s login Failed"' % user_dict['username'])
        return json.dumps({'code': '0', 'result': 'username or password must not be null'})
def randomsplit(n_dev, databatch):
    """
    :type n_dev: int
    :type databatch: DataBatch
    :rtype: DataBatch, DataBatch
    """
    n_total = len(databatch)
    assert 0 < n_dev < n_total

    # Use a permutation (rather than randint, which samples with replacement) so that the
    # training and development indices are disjoint and together cover the whole batch.
    indices = np.random.RandomState(1234).permutation(n_total)
    dev_indices = indices[:n_dev]
    train_indices = indices[n_dev:]
    assert len(train_indices) + len(dev_indices) == len(databatch)

    train_databatch = utils.DataBatch(
        batch_edu_ids=databatch.batch_edu_ids[train_indices],
        batch_edus=databatch.batch_edus[train_indices],
        batch_edus_postag=databatch.batch_edus_postag[train_indices],
        batch_edus_head=databatch.batch_edus_head[train_indices],
        batch_sbnds=databatch.batch_sbnds[train_indices],
        batch_pbnds=databatch.batch_pbnds[train_indices],
        batch_nary_sexp=databatch.batch_nary_sexp[train_indices],
        batch_bin_sexp=databatch.batch_bin_sexp[train_indices],
        batch_arcs=databatch.batch_arcs[train_indices])
    dev_databatch = utils.DataBatch(
        batch_edu_ids=databatch.batch_edu_ids[dev_indices],
        batch_edus=databatch.batch_edus[dev_indices],
        batch_edus_postag=databatch.batch_edus_postag[dev_indices],
        batch_edus_head=databatch.batch_edus_head[dev_indices],
        batch_sbnds=databatch.batch_sbnds[dev_indices],
        batch_pbnds=databatch.batch_pbnds[dev_indices],
        batch_nary_sexp=databatch.batch_nary_sexp[dev_indices],
        batch_bin_sexp=databatch.batch_bin_sexp[dev_indices],
        batch_arcs=databatch.batch_arcs[dev_indices])

    utils.writelog("n_dev=%d" % n_dev)
    utils.writelog("# of training instances=%d" % len(train_databatch))
    utils.writelog("# of development instances=%d" % len(dev_databatch))
    return train_databatch, dev_databatch
def send_sms(self, nume, nr_telefon, mesaj):
    modem1 = Modem.query.filter_by(name=self.modem_name).first()
    modem1.status = 'busy'
    db.session.commit()
    time.sleep(10)
    try:
        start_chat = WebDriverWait(self.driver, 30).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, chat_button_selector))
        )
        print('dam clik pe startchat')
        start_chat.click()
    except TimeoutException:
        print('nu pot sa dau click')
        writelog(self.modem_name + ' nu pot sa dau click', '3')
        writelog(self.modem_name + ' se inchide', '3')
        self.letsclose()
    nr_telefon = str(nr_telefon)
    nr_telefon = nr_telefon[0:10] + '\n'
    try:
        input_number = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#mat-chip-list-%d > div > input' % counter))
        )
        print(nume + ' cu numarul: ' + nr_telefon + ' primeste mesajul: ' + mesaj)
        input_number.send_keys(nr_telefon)
    except TimeoutException:
        writelog(self.modem_name + ' Ceva nu a mers bine: Nu pot sa introduc numarul de telefon', '3')
        print('Ceva nu a mers bine: Nu pot sa introduc numarul de telefon')
        self.letsclose()
    counter_numar()
    mesaj = str(mesaj)
    input_text = WebDriverWait(self.driver, 5).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, input_message_selector))
    )
    input_text.send_keys(mesaj + '\n')
    print('am terminat de scris')
    time.sleep(10)
def main(net_opt=None):
    start_time = time.time()
    # get the parameters
    opt = net_opt or Option()

    # create dataloader
    data_loader = DataLoader(opt.data_path, opt.batch_size, opt.n_threads)
    train_loader, test_loader = data_loader.getloader()
    print("==>Finish loading data\n")

    # define checkpoint and load the model
    check_point = CheckPoint(opt)
    if opt.retrain:
        check_point_params = check_point.retrainmodel()       # model
    elif opt.resume:
        check_point_params = check_point.resumemodel()        # model, epoch, optimizer
    else:
        check_point_params = check_point.check_point_params   # none

    # load optimizer
    optimizer = check_point_params['opts']
    # load model
    model = check_point_params['model'] or Net()
    model = utils.dataparallel(model=model, ngpus=opt.ngpus, gpu0=opt.gpu0)
    print(model)
    print("==>Finish loading model\n")

    start_epoch = check_point_params['resume_epoch'] or 0
    if check_point_params['resume_epoch'] is not None:
        start_epoch += 1
    if start_epoch >= opt.nepoch:
        start_epoch = 1

    # create trainer
    trainer = Trainer(model=model, opt=opt, optimizer=optimizer)

    # training and testing process
    best_loss = 100
    best_acc = 0
    result_train = np.zeros(opt.nepoch)
    result_test = np.zeros(opt.nepoch)
    result_trainl = np.zeros(opt.nepoch)
    result_testl = np.zeros(opt.nepoch)
    for epoch in range(start_epoch, opt.nepoch):
        train_loss, train_acc = trainer.train(train_loader=train_loader, epoch=epoch)
        test_loss, test_acc = trainer.test(test_loader=test_loader, epoch=epoch)

        # write and print result
        log_str = "%d\t%.4f\t%.4f\t%.4f\t%.4f\t" % (epoch, train_loss, test_loss, train_acc, test_acc)
        utils.writelog(log_str)
        result_train[epoch] = train_acc
        result_test[epoch] = test_acc
        result_trainl[epoch] = train_loss
        result_testl[epoch] = test_loss

        best_flag = False
        if test_acc >= best_acc:
            best_loss = test_loss
            best_acc = test_acc
            best_flag = True
            print("==>Best Result is: Error: %f, Accuracy: %f\n" % (test_loss, test_acc))
        check_point.save_model(epoch=epoch, model=trainer.model,
                               opts=trainer.optimizer, best_flag=best_flag)

    print("==>Best Result is: Error: %f, Accuracy: %f\n" % (best_loss, best_acc))
    utils.draw_result(result_train=result_train, result_test=result_test,
                      result_trainl=result_trainl, result_testl=result_testl)
    end_time = time.time()
    time_interval = end_time - start_time
    print("==>Time is: %f\n" % (time_interval))
def letsclose(self):
    self.driver.quit()
    writelog('Modemul: ' + self.modem_name + ' a fost inchis', '3')
    modem1 = Modem.query.filter_by(name=self.modem_name).first()
    modem1.status = 'stopped'
    db.session.commit()
# Author: tailorYang
import mysql.connector as mysql
import config, traceback, utils

try:
    conn = mysql.connect(**config.config)
    cursor = conn.cursor()
except Exception:
    utils.writelog('db').error('"Error: %s"' % (traceback.format_exc()))


def select(table, *fields, **data):
    data = ' and '.join({'%s="%s"' % (k, v) for k, v in data.items()})
    fields = ','.join(fields)
    if data and fields:
        sql = 'SELECT {} FROM {} WHERE {};'.format(fields, table, data)
    elif not data and fields:
        sql = 'SELECT {} FROM {};'.format(fields, table)
    elif not fields and data:
        sql = 'SELECT * FROM {} WHERE {};'.format(table, data)
    else:
        sql = 'SELECT * FROM {};'.format(table)
    try:
        cursor.execute(sql)
        result = cursor.fetchall()
        return result
    except Exception:
        utils.writelog('db').error('"Exec: %s, Error: %s"' % (sql, traceback.format_exc()))
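# A minimal usage sketch of the select/add/update/delete helpers in this collection,
# assuming a `user` table whose columns match config.user_fields; the table and column
# names below are illustrative only:
rows = select('user', 'id', 'username', role='admin')   # SELECT id,username FROM user WHERE role="admin";
add('user', {'username': 'alice', 'password': 'x', 'role': 'user'})
update('user', ['role="admin"'], 3)                     # UPDATE user SET role="admin" WHERE id = 3;
delete('user', 3)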
def read_rstdt(split, relation_level, with_root=False):
    """
    :type split: str
    :type relation_level: str
    :type with_root: bool
    :rtype: numpy.ndarray(shape=(dataset_size), dtype="O")
    """
    if not relation_level in ["coarse-grained", "fine-grained"]:
        raise ValueError("relation_level must be 'coarse-grained' or 'fine-grained'")

    config = utils.Config()

    path_root = os.path.join(config.getpath("data"), "rstdt", "wsj", split)

    if relation_level == "coarse-grained":
        relation_mapper = treetk.rstdt.RelationMapper()

    # Reading
    dataset = []

    filenames = os.listdir(path_root)
    filenames = [n for n in filenames if n.endswith(".edus.tokens")]
    filenames.sort()

    for filename in filenames:
        # Path
        path_edus = os.path.join(path_root, filename + ".preprocessed")
        path_edus_postag = os.path.join(path_root, filename.replace(".edus.tokens", ".edus.postags"))
        path_edus_head = os.path.join(path_root, filename.replace(".edus.tokens", ".edus.heads"))
        path_sbnds = os.path.join(path_root, filename.replace(".edus.tokens", ".sbnds"))
        path_pbnds = os.path.join(path_root, filename.replace(".edus.tokens", ".pbnds"))
        path_nary_sexp = os.path.join(path_root, filename.replace(".edus.tokens", ".labeled.nary.ctree"))
        path_bin_sexp = os.path.join(path_root, filename.replace(".edus.tokens", ".labeled.bin.ctree"))
        path_arcs = os.path.join(path_root, filename.replace(".edus.tokens", ".arcs"))

        kargs = OrderedDict()

        # EDUs
        edus = utils.read_lines(path_edus, process=lambda line: line.split())
        if with_root:
            edus = [["<root>"]] + edus
        kargs["edus"] = edus

        # EDU IDs
        edu_ids = np.arange(len(edus)).tolist()
        kargs["edu_ids"] = edu_ids

        # EDUs (POS tags)
        edus_postag = utils.read_lines(path_edus_postag, process=lambda line: line.split())
        if with_root:
            edus_postag = [["<root>"]] + edus_postag
        kargs["edus_postag"] = edus_postag

        # EDUs (head)
        edus_head = utils.read_lines(path_edus_head, process=lambda line: tuple(line.split()))
        if with_root:
            edus_head = [("<root>", "<root>", "<root>")] + edus_head
        kargs["edus_head"] = edus_head

        # Sentence boundaries
        sbnds = utils.read_lines(path_sbnds, process=lambda line: tuple([int(x) for x in line.split()]))
        kargs["sbnds"] = sbnds

        # Paragraph boundaries
        pbnds = utils.read_lines(path_pbnds, process=lambda line: tuple([int(x) for x in line.split()]))
        kargs["pbnds"] = pbnds

        # Constituent tree
        nary_sexp = utils.read_lines(path_nary_sexp, process=lambda line: line.split())[0]
        bin_sexp = utils.read_lines(path_bin_sexp, process=lambda line: line.split())[0]
        if relation_level == "coarse-grained":
            nary_tree = treetk.rstdt.postprocess(
                treetk.sexp2tree(nary_sexp, with_nonterminal_labels=True, with_terminal_labels=False))
            bin_tree = treetk.rstdt.postprocess(
                treetk.sexp2tree(bin_sexp, with_nonterminal_labels=True, with_terminal_labels=False))
            nary_tree = treetk.rstdt.map_relations(nary_tree, mode="f2c")
            bin_tree = treetk.rstdt.map_relations(bin_tree, mode="f2c")
            nary_sexp = treetk.tree2sexp(nary_tree)
            bin_sexp = treetk.tree2sexp(bin_tree)
        kargs["nary_sexp"] = nary_sexp
        kargs["bin_sexp"] = bin_sexp

        # Dependency tree
        hyphens = utils.read_lines(path_arcs, process=lambda line: line.split())
        assert len(hyphens) == 1
        hyphens = hyphens[0]  # list of str
        arcs = treetk.hyphens2arcs(hyphens)  # list of (int, int, str)
        if relation_level == "coarse-grained":
            arcs = [(h, d, relation_mapper.f2c(l)) for h, d, l in arcs]
        kargs["arcs"] = arcs

        # DataInstance
        data = utils.DataInstance(**kargs)
        dataset.append(data)

    # NOTE that sentence/paragraph boundaries do NOT consider ROOTs even if with_root=True.

    dataset = np.asarray(dataset, dtype="O")

    n_docs = len(dataset)
    n_paras = 0
    for data in dataset:
        n_paras += len(data.pbnds)
    n_sents = 0
    for data in dataset:
        n_sents += len(data.sbnds)
    n_edus = 0
    for data in dataset:
        if with_root:
            n_edus += len(data.edus[1:])  # Exclude the ROOT
        else:
            n_edus += len(data.edus)
    utils.writelog("split=%s" % split)
    utils.writelog("# of documents=%d" % n_docs)
    utils.writelog("# of paragraphs=%d" % n_paras)
    utils.writelog("# of sentences=%d" % n_sents)
    utils.writelog("# of EDUs (w/o ROOTs)=%d" % n_edus)
    return dataset
def read_ptbwsj_wo_rstdt(with_root):
    """
    :type with_root: bool
    :rtype: DataBatch
    """
    config = utils.Config()

    path_root = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt", "preprocessed")

    # Reading
    batch_edu_ids = []
    batch_edus = []
    batch_edus_postag = []
    batch_edus_head = []
    batch_sbnds = []
    batch_pbnds = []

    filenames = os.listdir(os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt", "preprocessed"))
    filenames = [n for n in filenames if n.endswith(".paragraph.boundaries")]
    filenames = [n.replace(".paragraph.boundaries", ".edus") for n in filenames]
    filenames.sort()

    for filename in filenames:
        # Path
        path_edus = os.path.join(path_root, filename + ".preprocessed")
        path_edus_postag = os.path.join(path_root, filename + ".postags")
        path_edus_head = os.path.join(path_root, filename + ".heads")
        path_sbnds = os.path.join(path_root, filename.replace(".edus", ".sentence.proj.boundaries"))
        path_pbnds = os.path.join(path_root, filename.replace(".edus", ".paragraph.boundaries"))

        # EDUs
        edus = utils.read_lines(path_edus, process=lambda line: line.split())
        if with_root:
            edus = [["<root>"]] + edus
        batch_edus.append(edus)

        # EDU IDs
        edu_ids = np.arange(len(edus)).tolist()
        batch_edu_ids.append(edu_ids)

        # EDUs (Syntactic features; POSTAG)
        edus_postag = utils.read_lines(path_edus_postag, process=lambda line: line.split())
        if with_root:
            edus_postag = [["<root>"]] + edus_postag
        batch_edus_postag.append(edus_postag)

        # EDUs (Syntactic features; HEAD)
        edus_head = utils.read_lines(path_edus_head, process=lambda line: tuple(line.split()))
        if with_root:
            edus_head = [("<root>", "<root>", "<root>")] + edus_head
        batch_edus_head.append(edus_head)

        # Sentence boundaries
        sbnds = utils.read_lines(path_sbnds, process=lambda line: tuple([int(x) for x in line.split()]))
        batch_sbnds.append(sbnds)

        # Paragraph boundaries
        pbnds = utils.read_lines(path_pbnds, process=lambda line: tuple([int(x) for x in line.split()]))
        batch_pbnds.append(pbnds)

    assert len(batch_edu_ids) \
        == len(batch_edus) \
        == len(batch_edus_postag) \
        == len(batch_edus_head) \
        == len(batch_sbnds) \
        == len(batch_pbnds)

    # Conversion to numpy.ndarray
    batch_edu_ids = np.asarray(batch_edu_ids, dtype="O")
    batch_edus = np.asarray(batch_edus, dtype="O")
    batch_edus_postag = np.asarray(batch_edus_postag, dtype="O")
    batch_edus_head = np.asarray(batch_edus_head, dtype="O")
    batch_sbnds = np.asarray(batch_sbnds, dtype="O")
    batch_pbnds = np.asarray(batch_pbnds, dtype="O")

    # Conversion to DataBatch
    databatch = utils.DataBatch(batch_edu_ids=batch_edu_ids,
                                batch_edus=batch_edus,
                                batch_edus_postag=batch_edus_postag,
                                batch_edus_head=batch_edus_head,
                                batch_sbnds=batch_sbnds,
                                batch_pbnds=batch_pbnds)

    total_edus = 0
    for edus in batch_edus:
        if with_root:
            total_edus += len(edus[1:])  # Exclude the ROOT
        else:
            total_edus += len(edus)
    utils.writelog("# of instances=%d" % len(databatch))
    utils.writelog("# of EDUs (w/o ROOTs)=%d" % total_edus)
    return databatch
def train(model, decoder, sampler, max_epoch, n_init_epochs, negative_size,
          batch_size, weight_decay, gradient_clipping, optimizer_name,
          train_dataset, dev_dataset,
          path_train, path_valid, path_snapshot, path_pred, path_gold):
    """
    :type model: SpanBasedModel
    :type decoder: IncrementalCKYDecoder
    :type sampler: TreeSampler
    :type max_epoch: int
    :type n_init_epochs: int
    :type negative_size: int
    :type batch_size: int
    :type weight_decay: float
    :type gradient_clipping: float
    :type optimizer_name: str
    :type train_dataset: numpy.ndarray
    :type dev_dataset: numpy.ndarray
    :type path_train: str
    :type path_valid: str
    :type path_snapshot: str
    :type path_pred: str
    :type path_gold: str
    :rtype: None
    """
    writer_train = jsonlines.Writer(open(path_train, "w"), flush=True)
    if dev_dataset is not None:
        writer_valid = jsonlines.Writer(open(path_valid, "w"), flush=True)

    boundary_flags = [(True, False)]
    assert negative_size >= len(boundary_flags)
    negative_tree_sampler = treesamplers.NegativeTreeSampler()

    # Optimizer preparation
    if optimizer_name == "adam":
        opt = optimizers.Adam()
    else:
        raise ValueError("Invalid optimizer_name=%s" % optimizer_name)

    opt.setup(model)

    if weight_decay > 0.0:
        opt.add_hook(chainer.optimizer.WeightDecay(weight_decay))
    if gradient_clipping:
        opt.add_hook(chainer.optimizer.GradientClipping(gradient_clipping))

    n_train = len(train_dataset)
    it = 0
    bestscore_holder = utils.BestScoreHolder(scale=100.0)
    bestscore_holder.init()

    if dev_dataset is not None:
        # Initial validation
        with chainer.using_config("train", False), chainer.no_backprop_mode():
            parse(model=model, decoder=decoder, dataset=dev_dataset, path_pred=path_pred)
            scores = metrics.rst_parseval(pred_path=path_pred, gold_path=path_gold)
            old_scores = metrics.old_rst_parseval(pred_path=path_pred, gold_path=path_gold)
            out = {
                "epoch": 0,
                "Morey2018": {
                    "Unlabeled Precision": scores["S"]["Precision"] * 100.0,
                    "Precision_info": scores["S"]["Precision_info"],
                    "Unlabeled Recall": scores["S"]["Recall"] * 100.0,
                    "Recall_info": scores["S"]["Recall_info"],
                    "Micro F1": scores["S"]["Micro F1"] * 100.0
                },
                "Marcu2000": {
                    "Unlabeled Precision": old_scores["S"]["Precision"] * 100.0,
                    "Precision_info": old_scores["S"]["Precision_info"],
                    "Unlabeled Recall": old_scores["S"]["Recall"] * 100.0,
                    "Recall_info": old_scores["S"]["Recall_info"],
                    "Micro F1": old_scores["S"]["Micro F1"] * 100.0
                }
            }
            writer_valid.write(out)
            utils.writelog(utils.pretty_format_dict(out))
        # Saving
        bestscore_holder.compare_scores(scores["S"]["Micro F1"], step=0)
        serializers.save_npz(path_snapshot, model)
        utils.writelog("Saved the model to %s" % path_snapshot)
    else:
        # Saving
        serializers.save_npz(path_snapshot, model)
        utils.writelog("Saved the model to %s" % path_snapshot)

    for epoch in range(1, max_epoch + 1):

        perm = np.random.permutation(n_train)

        ########## E-Step (BEGIN) ##########
        utils.writelog("E step ===>")

        prog_bar = pyprind.ProgBar(n_train)

        for inst_i in range(0, n_train, batch_size):
            ### Mini batch
            for data in train_dataset[inst_i:inst_i + batch_size]:
                ### One data instance
                edu_ids = data.edu_ids
                edus = data.edus
                edus_postag = data.edus_postag
                edus_head = data.edus_head
                sbnds = data.sbnds
                pbnds = data.pbnds

                with chainer.using_config("train", False), chainer.no_backprop_mode():
                    # Feature extraction
                    edu_vectors = model.forward_edus(edus, edus_postag, edus_head)  # (n_edus, bilstm_dim)
                    padded_edu_vectors = model.pad_edu_vectors(edu_vectors)  # (n_edus+2, bilstm_dim)
                    mask_bwd, mask_fwd = model.make_masks()  # (1, bilstm_dim), (1, bilstm_dim)

                    # Positive tree
                    if epoch <= n_init_epochs:
                        pos_sexp = sampler.sample(inputs=edu_ids,
                                                  edus=edus,
                                                  edus_head=edus_head,
                                                  sbnds=sbnds,
                                                  pbnds=pbnds)
                    else:
                        span_scores = precompute_all_span_scores(model=model,
                                                                 edus=edus,
                                                                 edus_postag=edus_postag,
                                                                 sbnds=sbnds,
                                                                 pbnds=pbnds,
                                                                 padded_edu_vectors=padded_edu_vectors,
                                                                 mask_bwd=mask_bwd,
                                                                 mask_fwd=mask_fwd)
                        pos_sexp = decoder.decode(span_scores=span_scores,
                                                  inputs=edu_ids,
                                                  sbnds=sbnds,
                                                  pbnds=pbnds,
                                                  use_sbnds=True,
                                                  use_pbnds=True)
                    pos_tree = treetk.sexp2tree(pos_sexp,
                                                with_nonterminal_labels=False,
                                                with_terminal_labels=False)
                    pos_tree.calc_spans()
                    pos_spans = treetk.aggregate_spans(pos_tree,
                                                       include_terminal=False,
                                                       order="post-order")  # list of (int, int)
                    data.pos_spans = pos_spans  # NOTE

                prog_bar.update()
        ########## E-Step (END) ##########

        ########## M-Step (BEGIN) ##########
        utils.writelog("M step ===>")

        for inst_i in range(0, n_train, batch_size):
            # Processing one mini-batch

            # Init
            loss_bracketing, acc_bracketing = 0.0, 0.0
            actual_batchsize = 0

            for data in train_dataset[perm[inst_i:inst_i + batch_size]]:
                # Processing one instance
                edu_ids = data.edu_ids
                edus = data.edus
                edus_postag = data.edus_postag
                edus_head = data.edus_head
                sbnds = data.sbnds
                pbnds = data.pbnds
                pos_spans = data.pos_spans  # NOTE

                # Feature extraction
                edu_vectors = model.forward_edus(edus, edus_postag, edus_head)  # (n_edus, bilstm_dim)
                padded_edu_vectors = model.pad_edu_vectors(edu_vectors)  # (n_edus+2, bilstm_dim)
                mask_bwd, mask_fwd = model.make_masks()  # (1, bilstm_dim), (1, bilstm_dim)

                # Negative trees
                pos_neg_spans = []
                margins = []
                pos_neg_spans.append(pos_spans)
                with chainer.using_config("train", False), chainer.no_backprop_mode():
                    for use_sbnds, use_pbnds in boundary_flags:
                        span_scores = precompute_all_span_scores(model=model,
                                                                 edus=edus,
                                                                 edus_postag=edus_postag,
                                                                 sbnds=sbnds,
                                                                 pbnds=pbnds,
                                                                 padded_edu_vectors=padded_edu_vectors,
                                                                 mask_bwd=mask_bwd,
                                                                 mask_fwd=mask_fwd)
                        neg_bin_sexp = decoder.decode(span_scores=span_scores,
                                                      inputs=edu_ids,
                                                      sbnds=sbnds,
                                                      pbnds=pbnds,
                                                      use_sbnds=use_sbnds,
                                                      use_pbnds=use_pbnds,
                                                      gold_spans=pos_spans)  # list of str
                        neg_tree = treetk.sexp2tree(neg_bin_sexp,
                                                    with_nonterminal_labels=False,
                                                    with_terminal_labels=False)
                        neg_tree.calc_spans()
                        neg_spans = treetk.aggregate_spans(neg_tree,
                                                           include_terminal=False,
                                                           order="pre-order")  # list of (int, int)
                        margin = compute_tree_distance(pos_spans, neg_spans, coef=1.0)
                        pos_neg_spans.append(neg_spans)
                        margins.append(margin)
                for _ in range(negative_size - len(boundary_flags)):
                    neg_bin_sexp = negative_tree_sampler.sample(inputs=edu_ids, sbnds=sbnds, pbnds=pbnds)
                    neg_tree = treetk.sexp2tree(neg_bin_sexp,
                                                with_nonterminal_labels=False,
                                                with_terminal_labels=False)
                    neg_tree.calc_spans()
                    neg_spans = treetk.aggregate_spans(neg_tree,
                                                       include_terminal=False,
                                                       order="pre-order")  # list of (int, int)
                    margin = compute_tree_distance(pos_spans, neg_spans, coef=1.0)
                    pos_neg_spans.append(neg_spans)
                    margins.append(margin)

                # Scoring
                pred_scores = model.forward_spans_for_bracketing(edus=edus,
                                                                 edus_postag=edus_postag,
                                                                 sbnds=sbnds,
                                                                 pbnds=pbnds,
                                                                 padded_edu_vectors=padded_edu_vectors,
                                                                 mask_bwd=mask_bwd,
                                                                 mask_fwd=mask_fwd,
                                                                 batch_spans=pos_neg_spans,
                                                                 aggregate=True)  # (1+negative_size, 1)

                # Bracketing Loss
                for neg_i in range(negative_size):
                    loss_bracketing += F.clip(pred_scores[1 + neg_i] + margins[neg_i] - pred_scores[0],
                                              0.0, 10000000.0)

                # Ranking Accuracy
                pred_scores = F.reshape(pred_scores, (1, 1 + negative_size))  # (1, 1+negative_size)
                gold_scores = np.zeros((1,), dtype=np.int32)  # (1,)
                gold_scores = utils.convert_ndarray_to_variable(gold_scores, seq=False)  # (1,)
                acc_bracketing += F.accuracy(pred_scores, gold_scores)

                actual_batchsize += 1

            # Backward & Update
            actual_batchsize = float(actual_batchsize)
            loss_bracketing = loss_bracketing / actual_batchsize
            acc_bracketing = acc_bracketing / actual_batchsize
            loss = loss_bracketing
            model.zerograds()
            loss.backward()
            opt.update()
            it += 1

            # Write log
            loss_bracketing_data = float(cuda.to_cpu(loss_bracketing.data))
            acc_bracketing_data = float(cuda.to_cpu(acc_bracketing.data))
            out = {"iter": it,
                   "epoch": epoch,
                   "progress": "%d/%d" % (inst_i + actual_batchsize, n_train),
                   "progress_ratio": float(inst_i + actual_batchsize) / n_train * 100.0,
                   "Bracketing Loss": loss_bracketing_data,
                   "Ranking Accuracy": acc_bracketing_data * 100.0}
            writer_train.write(out)
            utils.writelog(utils.pretty_format_dict(out))
        ########## M-Step (END) ##########

        if dev_dataset is not None:
            # Validation
            with chainer.using_config("train", False), chainer.no_backprop_mode():
                parse(model=model, decoder=decoder, dataset=dev_dataset, path_pred=path_pred)
                scores = metrics.rst_parseval(pred_path=path_pred, gold_path=path_gold)
                old_scores = metrics.old_rst_parseval(pred_path=path_pred, gold_path=path_gold)
                out = {
                    "epoch": epoch,
                    "Morey2018": {
                        "Unlabeled Precision": scores["S"]["Precision"] * 100.0,
                        "Precision_info": scores["S"]["Precision_info"],
                        "Unlabeled Recall": scores["S"]["Recall"] * 100.0,
                        "Recall_info": scores["S"]["Recall_info"],
                        "Micro F1": scores["S"]["Micro F1"] * 100.0
                    },
                    "Marcu2000": {
                        "Unlabeled Precision": old_scores["S"]["Precision"] * 100.0,
                        "Precision_info": old_scores["S"]["Precision_info"],
                        "Unlabeled Recall": old_scores["S"]["Recall"] * 100.0,
                        "Recall_info": old_scores["S"]["Recall_info"],
                        "Micro F1": old_scores["S"]["Micro F1"] * 100.0
                    }
                }
                writer_valid.write(out)
                utils.writelog(utils.pretty_format_dict(out))

            # Saving
            did_update = bestscore_holder.compare_scores(scores["S"]["Micro F1"], epoch)
            if did_update:
                serializers.save_npz(path_snapshot, model)
                utils.writelog("Saved the model to %s" % path_snapshot)

            # Finished?
            if bestscore_holder.ask_finishing(max_patience=10):
                utils.writelog("Patience %d is over. Training finished successfully."
                               % bestscore_holder.patience)
                writer_train.close()
                if dev_dataset is not None:
                    writer_valid.close()
                return
        else:
            # No validation
            # Saving
            serializers.save_npz(path_snapshot, model)
from torch._C import Size
import utils, os, torch, time, engine, model
from data_helper import train_data_set
from torch.utils.data import DataLoader
import torch.optim as optim

os.environ["CUDA_VISIBLE_DEVICES"] = "2"
device = torch.device("cuda")

if __name__ == "__main__":
    log_file = 'log.txt'
    if os.path.exists(log_file):
        os.remove(log_file)

    utils.writelog(file=log_file, log_info='=' * 10 + 'begin to load data' + '=' * 10)
    since = time.time()
    train_set = train_data_set(image_dir='train_data/', anno_path='reconstruct-anno.json', size=512)
    train_set_load = DataLoader(train_set, batch_size=32, shuffle=True)
    utils.writelog(file=log_file,
                   log_info='=' * 10 + 'finished load data' + '=' * 10 + ', ' + str(time.time() - since))

    # for images, targets in train_set_load:
    #     print(targets, '\n')
    #     break

    utils.writelog(file=log_file, log_info='=' * 10 + 'begin to set model' + '=' * 10)
def main(args):
    ####################
    # Arguments
    gpu = args.gpu
    model_name = args.model
    initial_tree_sampling = args.initial_tree_sampling
    path_config = args.config
    data_augmentation = args.data_augmentation
    trial_name = args.name
    actiontype = args.actiontype
    max_epoch = args.max_epoch
    dev_size = args.dev_size

    # Check
    assert actiontype in ["train", "evaluate"]
    if actiontype == "train":
        assert max_epoch > 0
    assert len(initial_tree_sampling.split("_")) == 3
    for type_ in initial_tree_sampling.split("_"):
        assert type_ in ["X", "BU", "TD", "RB", "LB", "RB2"]
    assert initial_tree_sampling.split("_")[2] != "X"
    assert initial_tree_sampling.split("_")[1] != "RB2"
    assert initial_tree_sampling.split("_")[2] != "RB2"

    if trial_name is None or trial_name == "None":
        trial_name = utils.get_current_time()

    ####################
    # Path setting
    config = utils.Config(path_config)

    basename = "%s.%s.%s.aug_%s.%s" \
        % (model_name,
           initial_tree_sampling,
           utils.get_basename_without_ext(path_config),
           data_augmentation,
           trial_name)

    if actiontype == "train":
        path_log = os.path.join(config.getpath("results"), basename + ".training.log")
    elif actiontype == "evaluate":
        path_log = os.path.join(config.getpath("results"), basename + ".evaluation.log")
    path_train = os.path.join(config.getpath("results"), basename + ".training.jsonl")
    path_valid = os.path.join(config.getpath("results"), basename + ".validation.jsonl")
    path_snapshot = os.path.join(config.getpath("results"), basename + ".model")
    path_pred = os.path.join(config.getpath("results"), basename + ".evaluation.ctrees")
    path_eval = os.path.join(config.getpath("results"), basename + ".evaluation.json")

    utils.set_logger(path_log)

    ####################
    # Random seed
    random_seed = trial_name
    random_seed = utils.hash_string(random_seed)
    random.seed(random_seed)
    np.random.seed(random_seed)
    cuda.cupy.random.seed(random_seed)

    ####################
    # Log so far
    utils.writelog("gpu=%d" % gpu)
    utils.writelog("model_name=%s" % model_name)
    utils.writelog("initial_tree_sampling=%s" % initial_tree_sampling)
    utils.writelog("path_config=%s" % path_config)
    utils.writelog("data_augmentation=%s" % data_augmentation)
    utils.writelog("trial_name=%s" % trial_name)
    utils.writelog("actiontype=%s" % actiontype)
    utils.writelog("max_epoch=%s" % max_epoch)
    utils.writelog("dev_size=%s" % dev_size)
    utils.writelog("path_log=%s" % path_log)
    utils.writelog("path_train=%s" % path_train)
    utils.writelog("path_valid=%s" % path_valid)
    utils.writelog("path_snapshot=%s" % path_snapshot)
    utils.writelog("path_pred=%s" % path_pred)
    utils.writelog("path_eval=%s" % path_eval)
    utils.writelog("random_seed=%d" % random_seed)

    ####################
    # Data preparation
    begin_time = time.time()

    train_dataset = dataloader.read_rstdt("train", relation_level="coarse-grained", with_root=False)
    test_dataset = dataloader.read_rstdt("test", relation_level="coarse-grained", with_root=False)
    vocab_word = utils.read_vocab(os.path.join(config.getpath("data"), "rstdt-vocab", "words.vocab.txt"))
    vocab_postag = utils.read_vocab(os.path.join(config.getpath("data"), "rstdt-vocab", "postags.vocab.txt"))
    vocab_deprel = utils.read_vocab(os.path.join(config.getpath("data"), "rstdt-vocab", "deprels.vocab.txt"))

    if data_augmentation:
        external_train_dataset = dataloader.read_ptbwsj_wo_rstdt(with_root=False)
        # Remove documents with only one leaf node
        external_train_dataset = utils.filter_dataset(external_train_dataset,
                                                      condition=lambda data: len(data.edu_ids) > 1)

    end_time = time.time()
    utils.writelog("Loaded the corpus. %f [sec.]" % (end_time - begin_time))

    ####################
    # Hyper parameters
    word_dim = config.getint("word_dim")
    postag_dim = config.getint("postag_dim")
    deprel_dim = config.getint("deprel_dim")
    lstm_dim = config.getint("lstm_dim")
    mlp_dim = config.getint("mlp_dim")
    n_init_epochs = config.getint("n_init_epochs")
    negative_size = config.getint("negative_size")
    batch_size = config.getint("batch_size")
    weight_decay = config.getfloat("weight_decay")
    gradient_clipping = config.getfloat("gradient_clipping")
    optimizer_name = config.getstr("optimizer_name")

    utils.writelog("word_dim=%d" % word_dim)
    utils.writelog("postag_dim=%d" % postag_dim)
    utils.writelog("deprel_dim=%d" % deprel_dim)
    utils.writelog("lstm_dim=%d" % lstm_dim)
    utils.writelog("mlp_dim=%d" % mlp_dim)
    utils.writelog("n_init_epochs=%d" % n_init_epochs)
    utils.writelog("negative_size=%d" % negative_size)
    utils.writelog("batch_size=%d" % batch_size)
    utils.writelog("weight_decay=%f" % weight_decay)
    utils.writelog("gradient_clipping=%f" % gradient_clipping)
    utils.writelog("optimizer_name=%s" % optimizer_name)

    ####################
    # Model preparation
    cuda.get_device(gpu).use()

    # Initialize a model
    utils.mkdir(os.path.join(config.getpath("data"), "caches"))
    path_embed = config.getpath("pretrained_word_embeddings")
    path_caches = os.path.join(config.getpath("data"), "caches",
                               "cached." + os.path.basename(path_embed) + ".npy")
    if os.path.exists(path_caches):
        utils.writelog("Loading cached word embeddings ...")
        initialW = np.load(path_caches)
    else:
        initialW = utils.read_word_embedding_matrix(path=path_embed,
                                                    dim=word_dim,
                                                    vocab=vocab_word,
                                                    scale=0.0)
        np.save(path_caches, initialW)

    if model_name == "spanbasedmodel":
        # Span-based model w/ template features
        template_feature_extractor = models.TemplateFeatureExtractor(dataset=train_dataset)
        utils.writelog("Template feature size=%d" % template_feature_extractor.feature_size)
        if actiontype == "train":
            for template in template_feature_extractor.templates:
                dim = template_feature_extractor.template2dim[template]
                utils.writelog("Template feature #%s %s" % (dim, template))
        model = models.SpanBasedModel(
            vocab_word=vocab_word,
            vocab_postag=vocab_postag,
            vocab_deprel=vocab_deprel,
            word_dim=word_dim,
            postag_dim=postag_dim,
            deprel_dim=deprel_dim,
            lstm_dim=lstm_dim,
            mlp_dim=mlp_dim,
            initialW=initialW,
            template_feature_extractor=template_feature_extractor)
    elif model_name == "spanbasedmodel2":
        # Span-based model w/o template features
        model = models.SpanBasedModel2(
            vocab_word=vocab_word,
            vocab_postag=vocab_postag,
            vocab_deprel=vocab_deprel,
            word_dim=word_dim,
            postag_dim=postag_dim,
            deprel_dim=deprel_dim,
            lstm_dim=lstm_dim,
            mlp_dim=mlp_dim,
            initialW=initialW)
    else:
        raise ValueError("Invalid model_name=%s" % model_name)
    utils.writelog("Initialized the model ``%s''" % model_name)

    # Load pre-trained parameters
    if actiontype != "train":
        serializers.load_npz(path_snapshot, model)
        utils.writelog("Loaded trained parameters from %s" % path_snapshot)

    model.to_gpu(gpu)

    ####################
    # Decoder preparation
    decoder = decoders.IncrementalCKYDecoder()

    ####################
    # Initializer preparation
    sampler = treesamplers.TreeSampler(initial_tree_sampling.split("_"))

    ####################
    # Training / evaluation
    if actiontype == "train":
        with chainer.using_config("train", True):
            if dev_size > 0:
                # Training with cross validation
                train_dataset, dev_dataset = utils.split_dataset(dataset=train_dataset,
                                                                 n_dev=dev_size,
                                                                 seed=None)
                with open(os.path.join(config.getpath("results"),
                                       basename + ".valid_gold.ctrees"), "w") as f:
                    for data in dev_dataset:
                        f.write("%s\n" % " ".join(data.nary_sexp))
            else:
                # Training with the full training set
                dev_dataset = None

            if data_augmentation:
                train_dataset = np.concatenate([train_dataset, external_train_dataset], axis=0)

            train(model=model,
                  decoder=decoder,
                  sampler=sampler,
                  max_epoch=max_epoch,
                  n_init_epochs=n_init_epochs,
                  negative_size=negative_size,
                  batch_size=batch_size,
                  weight_decay=weight_decay,
                  gradient_clipping=gradient_clipping,
                  optimizer_name=optimizer_name,
                  train_dataset=train_dataset,
                  dev_dataset=dev_dataset,
                  path_train=path_train,
                  path_valid=path_valid,
                  path_snapshot=path_snapshot,
                  path_pred=os.path.join(config.getpath("results"), basename + ".valid_pred.ctrees"),
                  path_gold=os.path.join(config.getpath("results"), basename + ".valid_gold.ctrees"))

    elif actiontype == "evaluate":
        with chainer.using_config("train", False), chainer.no_backprop_mode():
            # Test
            parse(model=model, decoder=decoder, dataset=test_dataset, path_pred=path_pred)
            scores = metrics.rst_parseval(
                pred_path=path_pred,
                gold_path=os.path.join(config.getpath("data"), "rstdt", "wsj", "test",
                                       "gold.labeled.nary.ctrees"))
            old_scores = metrics.old_rst_parseval(
                pred_path=path_pred,
                gold_path=os.path.join(config.getpath("data"), "rstdt", "wsj", "test",
                                       "gold.labeled.nary.ctrees"))
            out = {
                "Morey2018": {
                    "Unlabeled Precision": scores["S"]["Precision"] * 100.0,
                    "Precision_info": scores["S"]["Precision_info"],
                    "Unlabeled Recall": scores["S"]["Recall"] * 100.0,
                    "Recall_info": scores["S"]["Recall_info"],
                    "Micro F1": scores["S"]["Micro F1"] * 100.0
                },
                "Marcu2000": {
                    "Unlabeled Precision": old_scores["S"]["Precision"] * 100.0,
                    "Precision_info": old_scores["S"]["Precision_info"],
                    "Unlabeled Recall": old_scores["S"]["Recall"] * 100.0,
                    "Recall_info": old_scores["S"]["Recall_info"],
                    "Micro F1": old_scores["S"]["Micro F1"] * 100.0
                }
            }
            utils.write_json(path_eval, out)
            utils.writelog(utils.pretty_format_dict(out))

    utils.writelog("Done: %s" % basename)
def loginout():
    utils.writelog('user').info('"INFO: %s logout"' % session['username'])
    if session:
        session.clear()
    return redirect('/login/')
def read_ptbwsj_wo_rstdt(with_root=False):
    """
    :type with_root: bool
    :rtype: numpy.ndarray(shape=(dataset_size,), dtype="O")
    """
    config = utils.Config()

    path_root = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt")

    # Reading
    dataset = []

    filenames = os.listdir(path_root)
    filenames = [n for n in filenames if n.endswith(".edus.tokens")]
    filenames.sort()

    for filename in filenames:
        # Path
        path_edus = os.path.join(path_root, filename + ".preprocessed")
        path_edus_postag = os.path.join(path_root, filename.replace(".edus.tokens", ".edus.postags"))
        path_edus_head = os.path.join(path_root, filename.replace(".edus.tokens", ".edus.heads"))
        path_sbnds = os.path.join(path_root, filename.replace(".edus.tokens", ".sbnds"))
        path_pbnds = os.path.join(path_root, filename.replace(".edus.tokens", ".pbnds"))

        kargs = {}

        # EDUs
        edus = utils.read_lines(path_edus, process=lambda line: line.split())
        if with_root:
            edus = [["<root>"]] + edus
        kargs["edus"] = edus

        # EDU IDs
        edu_ids = np.arange(len(edus)).tolist()
        kargs["edu_ids"] = edu_ids

        # EDUs (POS tags)
        edus_postag = utils.read_lines(path_edus_postag, process=lambda line: line.split())
        if with_root:
            edus_postag = [["<root>"]] + edus_postag
        kargs["edus_postag"] = edus_postag

        # EDUs (head)
        edus_head = utils.read_lines(path_edus_head, process=lambda line: tuple(line.split()))
        if with_root:
            edus_head = [("<root>", "<root>", "<root>")] + edus_head
        kargs["edus_head"] = edus_head

        # Sentence boundaries
        sbnds = utils.read_lines(path_sbnds, process=lambda line: tuple([int(x) for x in line.split()]))
        kargs["sbnds"] = sbnds

        # Paragraph boundaries
        pbnds = utils.read_lines(path_pbnds, process=lambda line: tuple([int(x) for x in line.split()]))
        kargs["pbnds"] = pbnds

        data = utils.DataInstance(**kargs)
        dataset.append(data)

    dataset = np.asarray(dataset, dtype="O")

    n_docs = len(dataset)
    n_paras = 0
    for data in dataset:
        n_paras += len(data.pbnds)
    n_sents = 0
    for data in dataset:
        n_sents += len(data.sbnds)
    n_edus = 0
    for data in dataset:
        if with_root:
            n_edus += len(data.edus[1:])  # Exclude the ROOT
        else:
            n_edus += len(data.edus)
    utils.writelog("# of documents=%d" % n_docs)
    utils.writelog("# of paragraphs=%d" % n_paras)
    utils.writelog("# of sentences=%d" % n_sents)
    utils.writelog("# of EDUs (w/o ROOTs)=%d" % n_edus)
    return dataset
def main(args):
    ####################
    # Arguments
    tree_sampling = args.tree_sampling  # NOTE
    trial_name = args.name

    # Check
    assert len(tree_sampling.split("_")) == 3
    for type_ in tree_sampling.split("_"):
        assert type_ in ["X", "BU", "TD", "RB", "LB", "RB2"]
    assert tree_sampling.split("_")[2] != "X"
    assert tree_sampling.split("_")[1] != "RB2"
    assert tree_sampling.split("_")[2] != "RB2"

    if trial_name is None or trial_name == "None":
        trial_name = utils.get_current_time()

    ####################
    # Path setting
    config = utils.Config()

    basename = "%s.%s" % (tree_sampling, trial_name)

    utils.mkdir(os.path.join(config.getpath("results"), "baselines"))

    path_log = os.path.join(config.getpath("results"), "baselines", basename + ".evaluation.log")
    path_pred = os.path.join(config.getpath("results"), "baselines", basename + ".evaluation.ctrees")
    path_eval = os.path.join(config.getpath("results"), "baselines", basename + ".evaluation.json")

    utils.set_logger(path_log)

    ####################
    # Random seed
    random_seed = trial_name
    random_seed = utils.hash_string(random_seed)
    np.random.seed(random_seed)
    cuda.cupy.random.seed(random_seed)

    ####################
    # Log so far
    utils.writelog("tree_sampling=%s" % tree_sampling)
    utils.writelog("trial_name=%s" % trial_name)
    utils.writelog("path_log=%s" % path_log)
    utils.writelog("path_pred=%s" % path_pred)
    utils.writelog("path_eval=%s" % path_eval)
    utils.writelog("random_seed=%d" % random_seed)

    ####################
    # Data preparation
    begin_time = time.time()

    test_databatch = dataloader.read_rstdt("test", relation_level="coarse-grained", with_root=False)

    end_time = time.time()
    utils.writelog("Loaded the corpus. %f [sec.]" % (end_time - begin_time))

    ####################
    # Tree-sampler preparation
    sampler = treesamplers.TreeSampler(tree_sampling.split("_"))  # NOTE

    with chainer.using_config("train", False), chainer.no_backprop_mode():
        parse(sampler=sampler, databatch=test_databatch, path_pred=path_pred)
        scores = rst_parseval.evaluate(
            pred_path=path_pred,
            gold_path=os.path.join(config.getpath("data"), "rstdt", "renamed",
                                   "test.labeled.nary.ctrees"))
        old_scores = old_rst_parseval.evaluate(
            pred_path=path_pred,
            gold_path=os.path.join(config.getpath("data"), "rstdt", "renamed",
                                   "test.labeled.nary.ctrees"))
        out = {
            "Morey2018": {
                "Unlabeled Precision": scores["S"]["Precision"] * 100.0,
                "Precision_info": scores["S"]["Precision_info"],
                "Unlabeled Recall": scores["S"]["Recall"] * 100.0,
                "Recall_info": scores["S"]["Recall_info"],
                "Micro F1": scores["S"]["Micro F1"] * 100.0
            },
            "Marcu2000": {
                "Unlabeled Precision": old_scores["S"]["Precision"] * 100.0,
                "Precision_info": old_scores["S"]["Precision_info"],
                "Unlabeled Recall": old_scores["S"]["Recall"] * 100.0,
                "Recall_info": old_scores["S"]["Recall_info"],
                "Micro F1": old_scores["S"]["Micro F1"] * 100.0
            }
        }
        utils.write_json(path_eval, out)
        utils.writelog(utils.pretty_format_dict(out))

    utils.writelog("Done.")