Example #1
def index():
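    # Look up the session user's row, map it onto config.user_fields, log the visit,
    # and render the dashboard; unauthenticated visitors get the login page.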
    if 'id' in session:
        res = db.select('user', id=session['id'])[0]
        user = {v: res[k] for k, v in enumerate(config.user_fields)}
        utils.writelog('user').info('"INFO: %s is visiting"' % user['username'])
        return render_template('/common/index.html',user=user,role=session['role'])
    return render_template('/common/login.html')
Example #2
def delete(table, id):
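    # Build and run a DELETE for the given table/id and return the affected row count;
    # both values are interpolated directly into the SQL string.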
    sql = 'DELETE FROM {} WHERE id = {};'.format(table, id)
    try:
        cursor.execute(sql)
        conn.commit()
    except Exception:
        utils.writelog('db').error('"Exec: %s", "Error: %s"' %
                                   (sql, traceback.format_exc()))
    return cursor.rowcount
Example #3
def update(table, condition, id):
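    # condition is an iterable of "column = value" fragments that are joined into
    # the SET clause of the UPDATE statement.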
    sql = 'UPDATE {} SET {} WHERE id = {};'.format(table, ','.join(condition),
                                                   id)
    try:
        cursor.execute(sql)
        conn.commit()
    except Exception:
        utils.writelog('db').error('"Exec: %s", "Error: %s"' %
                                   (sql, traceback.format_exc()))
    return cursor.rowcount
Example #4
def add(table, data):
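    # data maps column names to values; the keys form the column list and the values
    # are double-quoted into the VALUES clause.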
    sql = 'INSERT INTO {} ({}) VALUES({});'.format(
        table, ','.join(data.keys()),
        ','.join(['"%s"' % k for k in data.values()]))
    try:
        cursor.execute(sql)
        conn.commit()
    except Exception:
        utils.writelog('db').error('"Exec: %s", "Error: %s"' %
                                   (sql, traceback.format_exc()))
    return cursor.rowcount
Example #5
    def letssee(self, site_url):
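        # Open the site; if a QR login prompt appears, save the QR image, wait for it
        # to be scanned, then wait for the page loader and mark the modem as running.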

        self.driver.get(site_url)
        time.sleep(5)
        if (self.driver.find_elements_by_css_selector(page_selector)):
            # grab the QR code
            img = self.driver.find_element_by_css_selector(qr_selector)
            timestamp = time.strftime('%d-%m-%Y %H-%M-%S')
            src = img.get_attribute('src')
            qr_image_url = '/static/qr'+timestamp+'.png'
            urllib.request.urlretrieve(src, os.getcwd()+qr_image_url)
            # Wait up to 120 seconds for the QR code to disappear
            print('Preluam codul qr')
            try:
                writelog('Nu este logat contul, Scanati codul qr : Aveti la dispozitie 2 minute ' + self.modem_name, '3')
                print('Nu este logat contul, Scanati codul qr : Aveti la dispozitie 2 minute')
                no_login = WebDriverWait(self.driver, 120).until(
                    EC.invisibility_of_element_located((By.CSS_SELECTOR, qr_selector))
                )
                try:
                    # Wait for the loader element to disappear
                    startpage = WebDriverWait(self.driver, 10).until(
                        EC.invisibility_of_element_located((By.ID, 'loader'))
                    )
                    # print('Am gasit elementul'+ str(startpage))
                    print('Pagina este gata')
                except TimeoutException:
                    writelog('Pagina nu este gata: ' + self.modem_name +' se inchide', '3')
                    print('Pagina nu este gata')
                    self.driver.quit()


            except TimeoutException:
                writelog('Prima pagina nu a fost incarcata: ' + self.modem_name + ' se inchide', '3')
                print('Prima pagina nu a fost incarcata')
                self.driver.quit()
            else:
                # no QR code - the account is already logged in, so just wait for the loader element to close
                try:
                    # Wait for the loader element to disappear
                    startpage = WebDriverWait(self.driver, 10).until(
                        EC.invisibility_of_element_located((By.ID, loader_selector))
                    )
                    # print('Am gasit elementul'+ str(startpage))
                    print('Pagina este gata')
                except TimeoutException:
                    writelog('Pagina nu este gata: ' + self.modem_name +' se inchide', '3')
                    print('Pagina nu este gata')
        print('Am terminat initializarea')
        writelog('Modemul: ' + self.modem_name +' a fost initializat', '3')
        modem1 = Modem.query.filter_by(name=self.modem_name).first()
        modem1.status = 'running'
        db.session.commit()
Example #6
def userinfo():
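    # With an ?id= query argument, return that user's record as JSON; otherwise
    # render the info page for the user stored in the session.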
    if request.args.get('id'):
        user = db.select('user',id=request.args.get('id'))[0]
        user = {v: user[k] for k, v in enumerate(config.user_fields)}
        utils.writelog('user').info("INFO: {} is changing {}'s info".format(session['username'], user['username']))
        return json.dumps({'result':user})
    else:
        role = session.get('role')
        res = db.select('user',id = session['id'])[0]
        user = {v: res[k] for k, v in enumerate(config.user_fields)}
        utils.writelog('user').info("INFO: %s is viewing his info" % user['username'])
        return render_template("/user/userinfo.html", user = user, role=role)
Example #7
def collect_tweets(task, tags):
  '''
  Collect tweets for each tag indefinitely and store them in CSV files.
  '''

  appKeys = kays.appKeys

  with fopen(task, newline='\n', encoding='utf-8') as f:
    keyIdx = 0
    tagIdx = 0

    # writer for csv
    writer = csv.writer(f)

    # save task to log
    writelog(task, tags)

    # collect tweets indefinitely by using all keys
    while True:
      print(time.ctime(), 'Collecting tweets...')
      # get the key
      key = appKeys[keyIdx]

      # create auth and api
      auth = tweepy.OAuthHandler(key['consumerAPIKey'], key['consumerAPISecretKey'])
      auth.set_access_token(key['accessToken'], key['accessTokenSecret'])
      api = tweepy.API(auth)

      # filter out retweets
      query = tags[tagIdx] + ' -filter:retweets'
      count = 0

      # collect tweets and save
      try:
        for tweet in tweepy.Cursor(api.search, q=query).items():
          user = tweet.user

          # escape text
          row = map(esc, [
              tweet.text, tweet.id, user.name, user.screen_name, user.location,
              user.description, user.followers_count, user.friends_count,
              user.listed_count, user.statuses_count, user.favourites_count,
              user.verified, user.default_profile_image, user.default_profile,
              user.protected, user.created_at])

          writer.writerow(row)
          count = count+1
      except Exception as e:
        # Wait for 10 mins and then start using next key
        print(time.ctime(), 'Got {} tweets'.format(count))
        # if keyIdx+1 == len(appKeys):
        tagIdx = (tagIdx+1) % len(tags)
        keyIdx = (keyIdx+1) % len(appKeys)
        time.sleep(10 * 60)
Example #8
def select(table, *fields, **data):
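    # Build a SELECT from optional field names and keyword filters (joined with AND)
    # and return every matching row.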
    data = ' and '.join({'%s="%s"' % (k, v) for k, v in data.items()})
    fields = ','.join(fields)
    if data and fields:
        sql = 'SELECT {} FROM {} WHERE {};'.format(fields, table, data)
    elif not data and fields:
        sql = 'SELECT {} FROM {};'.format(fields, table)
    elif not fields and data:
        sql = 'SELECT * FROM {} WHERE {};'.format(table, data)
    else:
        sql = 'SELECT * FROM {};'.format(table)
    try:
        cursor.execute(sql)
        result = cursor.fetchall()
        return result
    except Exception:
        utils.writelog('db').error('"Exec: %s", "Error: %s"' %
                                   (sql, traceback.format_exc()))
Example #9
def api_create_mesaj(current_user):
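    # Validate the JSON payload, compute the SMS segment count (160 characters per
    # segment), store the message for the current user, and log the API call.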
    data = request.get_json()
    if data['name'] == '' or data['mesaj'] == '':
        return jsonify({'message' : 'Aveti campuri goale'})
    elif len(data['telefon']) != 10:
        return jsonify({'message' : 'Telefonul trebuie sa contina 10 caractere'})
    else:
        mesaj_lenght = len(data['mesaj'])
        print(mesaj_lenght)
        mesaj_count = math.ceil(mesaj_lenght / 160)
        new_mesaj = Mesaj(name=data['name'],
                          telefon=data['telefon'],
                          mesaj=data['mesaj'],
                          mesaj_count=mesaj_count,
                          mesaj_lenght=mesaj_lenght,
                          user_id=current_user.id,
                          is_sent=False)
        db.session.add(new_mesaj)
        db.session.commit()
        writelog('Mesaj adaugat prin api', current_user.id)
        return jsonify({'message' : 'Message sent'})
Example #10
def handleHMIRequest():
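    # Verify the signed HMI query; on success, write the requested water level
    # register over Modbus, read back an input register, and return the value
    # AES-encrypted.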
    waterlevel = int(request.form.get('jwttoken'))
    plaintext, sig = request.form.get('HMIquery').split(',')
    print("request = ", plaintext, sig, waterlevel)
    key = "c" * 16  # tmp
    res = {"errmsg": "", "data": "123"}
    if utils.authcheck(plaintext, sig, waterlevel) == 0:
        res['errmsg'] = "Authentication failed"
        utils.writelog("Authentication failed: %s" % (plaintext))
        return res
    else:
        modbusUtils.modbusSend(
            sock,
            modbusUtils.makeWriteSingleRegisterRequest(cm.memLoc, waterlevel))
        response = modbusUtils.parseWriteSingleRegisterResponse(
            modbusUtils.modbusRecv(sock))
        print("request response = ", response)
        modbusUtils.modbusSend(
            sock, modbusUtils.makeReadInputRegistersRequest(cm.revmemLoc, 1))
        response = modbusUtils.parseReadInputRegistersResponse(
            modbusUtils.modbusRecv(sock))
        print("response response = ", response)
    return jsonify(utils.AESEncrypt(str(waterlevel), key))
Example #11
def api_login():
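    # Check HTTP Basic credentials against the User table and, on success, issue a
    # JWT that expires after 360 minutes; every attempt is logged with the caller's IP.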
    auth = request.authorization
    add_ip = request.remote_addr

    if not auth or not auth.username or not auth.password:
        writelog(add_ip + ' Could not verify', 1)
        return make_response('Could not verify', 401, {'WWW-Authenticate': 'Basic realm="Login required"'})
    user = User.query.filter_by(username=auth.username).first()

    if not user:
        writelog(add_ip + ' Could not verify', 1)
        return make_response('Could not verify', 401, {'WWW-Authenticate': 'Basic realm="Login required"'})

    if check_password_hash(user.password, auth.password):
        token = jwt.encode({'public_id' : user.public_id, 'exp' : datetime.utcnow() + timedelta(minutes=360)}, app.config['SECRET_KEY'])
        writelog(add_ip + ' Am emis tokenul pentru ' + user.username, user.id)
        return jsonify({'token' : token.decode('UTF-8')})
    writelog(add_ip + ' Could not verify', 1)
    return make_response('Could not verify', 401, {'WWW-Authenticate': 'Basic realm="Login required"'})
Example #12
def kfold(n_splits, split_id, databatch):
    """
    :type n_splits: int
    :type split_id: int
    :type databatch: DataBatch
    :rtype: DataBatch, DataBatch
    """
    assert 0 <= split_id < n_splits

    kfold = KFold(n_splits=n_splits, random_state=1234, shuffle=True)

    indices_list = list(kfold.split(np.arange(len(databatch))))
    train_indices, dev_indices = indices_list[split_id]
    assert len(train_indices) + len(dev_indices) == len(databatch)

    train_databatch = utils.DataBatch(
        batch_edu_ids=databatch.batch_edu_ids[train_indices],
        batch_edus=databatch.batch_edus[train_indices],
        batch_edus_postag=databatch.batch_edus_postag[train_indices],
        batch_edus_head=databatch.batch_edus_head[train_indices],
        batch_sbnds=databatch.batch_sbnds[train_indices],
        batch_pbnds=databatch.batch_pbnds[train_indices],
        batch_nary_sexp=databatch.batch_nary_sexp[train_indices],
        batch_bin_sexp=databatch.batch_bin_sexp[train_indices],
        batch_arcs=databatch.batch_arcs[train_indices])
    dev_databatch = utils.DataBatch(
        batch_edu_ids=databatch.batch_edu_ids[dev_indices],
        batch_edus=databatch.batch_edus[dev_indices],
        batch_edus_postag=databatch.batch_edus_postag[dev_indices],
        batch_edus_head=databatch.batch_edus_head[dev_indices],
        batch_sbnds=databatch.batch_sbnds[dev_indices],
        batch_pbnds=databatch.batch_pbnds[dev_indices],
        batch_nary_sexp=databatch.batch_nary_sexp[dev_indices],
        batch_bin_sexp=databatch.batch_bin_sexp[dev_indices],
        batch_arcs=databatch.batch_arcs[dev_indices])

    utils.writelog("n_splits=%d" % n_splits)
    utils.writelog("split_id=%d" % split_id)
    utils.writelog("# of training instances=%d" % len(train_databatch))
    utils.writelog("# of development instances=%d" % len(dev_databatch))

    return train_databatch, dev_databatch
Example #13
def login():
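    # GET renders the login form; POST validates the submitted credentials against
    # the user table and stores role, id and username in the session.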
    if request.method == 'GET':
        return render_template('/common/login.html')
    user_dict = {k: v[0] for k, v in dict(request.form).items()}
    if user_dict['username'] and user_dict['password']:
        res = db.select('user', **user_dict)
        if res:
            user = {v: res[0][k] for k, v in enumerate(config.user_fields)}
            session['role'] = user['role']
            session['id'] = user['id']
            session['username'] = user['username']
            utils.writelog('user').info('"INFO: %s is login success"' % user['username'])
            return json.dumps({'code': '1', 'result': 'login success'})
        else:
            utils.writelog('user').error('"INFO: %s login Failed"' % user_dict['username'])
            return json.dumps({'code': '0', 'result': 'name or password wrong'})
    else:
        utils.writelog('user').error('"INFO: %s login Failed"' % user_dict['username'])
        return json.dumps({'code': '0', 'result': 'username or password must not be null'})
Example #14
def randomsplit(n_dev, databatch):
    """
    :type n_dev: int
    :type databatch: DataBatch
    :rtype: DataBatch, DataBatch
    """
    n_total = len(databatch)
    assert 0 < n_dev < n_total

    indices = np.random.RandomState(1234).randint(0, n_total, n_total)
    dev_indices = indices[:n_dev]
    train_indices = indices[n_dev:]
    assert len(train_indices) + len(dev_indices) == len(databatch)

    train_databatch = utils.DataBatch(
        batch_edu_ids=databatch.batch_edu_ids[train_indices],
        batch_edus=databatch.batch_edus[train_indices],
        batch_edus_postag=databatch.batch_edus_postag[train_indices],
        batch_edus_head=databatch.batch_edus_head[train_indices],
        batch_sbnds=databatch.batch_sbnds[train_indices],
        batch_pbnds=databatch.batch_pbnds[train_indices],
        batch_nary_sexp=databatch.batch_nary_sexp[train_indices],
        batch_bin_sexp=databatch.batch_bin_sexp[train_indices],
        batch_arcs=databatch.batch_arcs[train_indices])
    dev_databatch = utils.DataBatch(
        batch_edu_ids=databatch.batch_edu_ids[dev_indices],
        batch_edus=databatch.batch_edus[dev_indices],
        batch_edus_postag=databatch.batch_edus_postag[dev_indices],
        batch_edus_head=databatch.batch_edus_head[dev_indices],
        batch_sbnds=databatch.batch_sbnds[dev_indices],
        batch_pbnds=databatch.batch_pbnds[dev_indices],
        batch_nary_sexp=databatch.batch_nary_sexp[dev_indices],
        batch_bin_sexp=databatch.batch_bin_sexp[dev_indices],
        batch_arcs=databatch.batch_arcs[dev_indices])

    utils.writelog("n_dev=%d" % n_dev)
    utils.writelog("# of training instances=%d" % len(train_databatch))
    utils.writelog("# of development instances=%d" % len(dev_databatch))

    return train_databatch, dev_databatch
Example #15
    def send_sms(self, nume, nr_telefon, mesaj):
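        # Mark the modem as busy, open a new chat in the web UI, then type the phone
        # number and the message (the trailing newline submits each field).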
        modem1 = Modem.query.filter_by(name=self.modem_name).first()
        modem1.status = 'busy'
        db.session.commit()
        time.sleep(10)
        try:
            start_chat = WebDriverWait(self.driver, 30).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, chat_button_selector))
            )
            print('dam clik pe startchat')
            start_chat.click()
        except TimeoutException:
            print('nu pot sa dau click')
            writelog(self.modem_name +' nu pot sa dau click', '3')
            writelog(self.modem_name +' se inchide', '3')
            self.letsclose()

        nr_telefon = str(nr_telefon)
        nr_telefon = nr_telefon[0:10]+'\n'
        try:
            input_number = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#mat-chip-list-%d > div > input' % counter))
            )
            print(nume + ' cu numarul: ' + nr_telefon + ' primeste mesajul: ' + mesaj)
            input_number.send_keys(nr_telefon)

        except TimeoutException:
            writelog(self.modem_name +' Ceva nu a mers bine: Nu pot sa introduc numarul de telefon', '3')
            print('Ceva nu a mers bine: Nu pot sa introduc numarul de telefon')
            self.letsclose()
        counter_numar()
        mesaj = str(mesaj)
        input_text = WebDriverWait(self.driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, input_message_selector))
        )
        input_text.send_keys(mesaj+'\n')
        print('am terminat de scris')
        time.sleep(10)
Example #16
def main(net_opt = None):
	start_time = time.time()
	
	#get the parameters
	opt = net_opt or Option()
	
	#create dataloader
	data_loader = DataLoader(opt.data_path, opt.batch_size, opt.n_threads)
	train_loader, test_loader = data_loader.getloader()
	print("==>Finish loading data\n")
	
	#define checkpoint and load the model
	check_point = CheckPoint(opt)
	if opt.retrain:
		#model
		check_point_params = check_point.retrainmodel()
		#model,epoch,optimizer
	elif opt.resume:
		check_point_params = check_point.resumemodel()
		#none
	else:
		check_point_params = check_point.check_point_params
	
	#load optimizer
	optimizer = check_point_params['opts']
	
	#load model
	model = check_point_params['model'] or Net()
	model = utils.dataparallel(model=model,ngpus=opt.ngpus,gpu0=opt.gpu0)
	print(model)
	print("==>Finish loading model\n")
	
	start_epoch = check_point_params['resume_epoch'] or 0
	if check_point_params['resume_epoch'] is not None:
		start_epoch += 1
	if start_epoch >= opt.nepoch:
		start_epoch = 1
	
	
	#create trainer
	trainer = Trainer(model=model, opt=opt, optimizer=optimizer)
	
	#training and testing process
	best_loss = 100
	best_acc = 0
	result_train = np.zeros(opt.nepoch)
	result_test = np.zeros(opt.nepoch)
	result_trainl = np.zeros(opt.nepoch)
	result_testl = np.zeros(opt.nepoch)
	for epoch in range(start_epoch,opt.nepoch):
	
		train_loss, train_acc = trainer.train(train_loader=train_loader, epoch=epoch)
		test_loss, test_acc = trainer.test(test_loader=test_loader, epoch=epoch)
		# write and print result
		log_str = "%d\t%.4f\t%.4f\t%.4f\t%.4f\t" % (epoch, train_loss,test_loss, train_acc, test_acc)
		utils.writelog(log_str)
		
		result_train[epoch] = train_acc
		result_test[epoch] = test_acc
		
		result_trainl[epoch] = train_loss
		result_testl[epoch] = test_loss
	
		best_flag = False
		if test_acc>= best_acc:
			best_loss = test_loss
			best_acc = test_acc
			
			best_flag = True
			print("==>Best Result is: Error: %f, Accuracy: %f\n" 
					% (test_loss, test_acc))
		
		check_point.save_model(epoch=epoch, model=trainer.model,
					opts=trainer.optimizer, best_flag=best_flag)
	print("==>Best Result is: Error: %f, Accuracy: %f\n" 
			  % (best_loss, best_acc))
	utils.draw_result(result_train = result_train,result_test =result_test,result_trainl = result_trainl,result_testl =result_testl)
	end_time = time.time()
	time_interval = end_time-start_time
	print("==>Time is: %f\n" 
			  % (time_interval))
Example #17
    def letsclose(self):
        self.driver.quit()
        writelog('Modemul: ' + self.modem_name + ' a fost inchis', '3')
        modem1 = Modem.query.filter_by(name=self.modem_name).first()
        modem1.status = 'stopped'
        db.session.commit()
Example #18
# Author: tailorYang
import mysql.connector as mysql
import config, traceback, utils

try:
    conn = mysql.connect(**config.config)
    cursor = conn.cursor()
except Exception:
    utils.writelog('db').error('"Error: %s"' % (traceback.format_exc()))


def select(table, *fields, **data):
    data = ' and '.join({'%s="%s"' % (k, v) for k, v in data.items()})
    fields = ','.join(fields)
    if data and fields:
        sql = 'SELECT {} FROM {} WHERE {};'.format(fields, table, data)
    elif not data and fields:
        sql = 'SELECT {} FROM {};'.format(fields, table)
    elif not fields and data:
        sql = 'SELECT * FROM {} WHERE {};'.format(table, data)
    else:
        sql = 'SELECT * FROM {};'.format(table)
    try:
        cursor.execute(sql)
        result = cursor.fetchall()
        return result
    except Exception:
        utils.writelog('db').error('"Exec: %s", "Error: %s"' %
                                   (sql, traceback.format_exc()))

Example #19
def read_rstdt(split, relation_level, with_root=False):
    """
    :type split: str
    :type relation_level: str
    :type with_root: bool
    :rtype: numpy.ndarray(shape=(dataset_size), dtype="O")
    """
    if relation_level not in ["coarse-grained", "fine-grained"]:
        raise ValueError(
            "relation_level must be 'coarse-grained' or 'fine-grained'")

    config = utils.Config()

    path_root = os.path.join(config.getpath("data"), "rstdt", "wsj", split)

    if relation_level == "coarse-grained":
        relation_mapper = treetk.rstdt.RelationMapper()

    # Reading
    dataset = []

    filenames = os.listdir(path_root)
    filenames = [n for n in filenames if n.endswith(".edus.tokens")]
    filenames.sort()

    for filename in filenames:
        # Path
        path_edus = os.path.join(path_root, filename + ".preprocessed")
        path_edus_postag = os.path.join(
            path_root, filename.replace(".edus.tokens", ".edus.postags"))
        path_edus_head = os.path.join(
            path_root, filename.replace(".edus.tokens", ".edus.heads"))
        path_sbnds = os.path.join(path_root,
                                  filename.replace(".edus.tokens", ".sbnds"))
        path_pbnds = os.path.join(path_root,
                                  filename.replace(".edus.tokens", ".pbnds"))
        path_nary_sexp = os.path.join(
            path_root, filename.replace(".edus.tokens", ".labeled.nary.ctree"))
        path_bin_sexp = os.path.join(
            path_root, filename.replace(".edus.tokens", ".labeled.bin.ctree"))
        path_arcs = os.path.join(path_root,
                                 filename.replace(".edus.tokens", ".arcs"))

        kargs = OrderedDict()

        # EDUs
        edus = utils.read_lines(path_edus, process=lambda line: line.split())
        if with_root:
            edus = [["<root>"]] + edus
        kargs["edus"] = edus

        # EDU IDs
        edu_ids = np.arange(len(edus)).tolist()
        kargs["edu_ids"] = edu_ids

        # EDUs (POS tags)
        edus_postag = utils.read_lines(path_edus_postag,
                                       process=lambda line: line.split())
        if with_root:
            edus_postag = [["<root>"]] + edus_postag
        kargs["edus_postag"] = edus_postag

        # EDUs (head)
        edus_head = utils.read_lines(path_edus_head,
                                     process=lambda line: tuple(line.split()))
        if with_root:
            edus_head = [("<root>", "<root>", "<root>")] + edus_head
        kargs["edus_head"] = edus_head

        # Sentence boundaries
        sbnds = utils.read_lines(
            path_sbnds,
            process=lambda line: tuple([int(x) for x in line.split()]))
        kargs["sbnds"] = sbnds

        # Paragraph boundaries
        pbnds = utils.read_lines(
            path_pbnds,
            process=lambda line: tuple([int(x) for x in line.split()]))
        kargs["pbnds"] = pbnds

        # Constituent tree
        nary_sexp = utils.read_lines(path_nary_sexp,
                                     process=lambda line: line.split())[0]
        bin_sexp = utils.read_lines(path_bin_sexp,
                                    process=lambda line: line.split())[0]
        if relation_level == "coarse-grained":
            nary_tree = treetk.rstdt.postprocess(
                treetk.sexp2tree(nary_sexp,
                                 with_nonterminal_labels=True,
                                 with_terminal_labels=False))
            bin_tree = treetk.rstdt.postprocess(
                treetk.sexp2tree(bin_sexp,
                                 with_nonterminal_labels=True,
                                 with_terminal_labels=False))
            nary_tree = treetk.rstdt.map_relations(nary_tree, mode="f2c")
            bin_tree = treetk.rstdt.map_relations(bin_tree, mode="f2c")
            nary_sexp = treetk.tree2sexp(nary_tree)
            bin_sexp = treetk.tree2sexp(bin_tree)
        kargs["nary_sexp"] = nary_sexp
        kargs["bin_sexp"] = bin_sexp

        # Dependency tree
        hyphens = utils.read_lines(path_arcs,
                                   process=lambda line: line.split())
        assert len(hyphens) == 1
        hyphens = hyphens[0]  # list of str
        arcs = treetk.hyphens2arcs(hyphens)  # list of (int, int, str)
        if relation_level == "coarse-grained":
            arcs = [(h, d, relation_mapper.f2c(l)) for h, d, l in arcs]
        kargs["arcs"] = arcs

        # DataInstance
        # data = utils.DataInstance(
        #                 edus=edus,
        #                 edu_ids=edu_ids,
        #                 edus_postag=edus_postag,
        #                 edus_head=edus_head,
        #                 sbnds=sbnds,
        #                 pbnds=pbnds,
        #                 nary_sexp=nary_sexp,
        #                 bin_sexp=bin_sexp,
        #                 arcs=arcs)
        data = utils.DataInstance(**kargs)
        dataset.append(data)

    # NOTE that sentence/paragraph boundaries do NOT consider ROOTs even if with_root=True.

    dataset = np.asarray(dataset, dtype="O")

    n_docs = len(dataset)

    n_paras = 0
    for data in dataset:
        n_paras += len(data.pbnds)

    n_sents = 0
    for data in dataset:
        n_sents += len(data.sbnds)

    n_edus = 0
    for data in dataset:
        if with_root:
            n_edus += len(data.edus[1:])  # Exclude the ROOT
        else:
            n_edus += len(data.edus)

    utils.writelog("split=%s" % split)
    utils.writelog("# of documents=%d" % n_docs)
    utils.writelog("# of paragraphs=%d" % n_paras)
    utils.writelog("# of sentences=%d" % n_sents)
    utils.writelog("# of EDUs (w/o ROOTs)=%d" % n_edus)
    return dataset
Example #20
def read_ptbwsj_wo_rstdt(with_root):
    """
    :type with_root: bool
    :rtype: DataBatch
    """
    config = utils.Config()

    path_root = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                             "preprocessed")

    # Reading
    batch_edu_ids = []
    batch_edus = []
    batch_edus_postag = []
    batch_edus_head = []
    batch_sbnds = []
    batch_pbnds = []
    filenames = os.listdir(
        os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt",
                     "preprocessed"))
    filenames = [n for n in filenames if n.endswith(".paragraph.boundaries")]
    filenames = [
        n.replace(".paragraph.boundaries", ".edus") for n in filenames
    ]
    filenames.sort()
    for filename in filenames:
        # Path
        path_edus = os.path.join(path_root, filename + ".preprocessed")
        path_edus_postag = os.path.join(path_root, filename + ".postags")
        path_edus_head = os.path.join(path_root, filename + ".heads")
        path_sbnds = os.path.join(
            path_root, filename.replace(".edus", ".sentence.proj.boundaries"))
        path_pbnds = os.path.join(
            path_root, filename.replace(".edus", ".paragraph.boundaries"))
        # EDUs
        edus = utils.read_lines(path_edus, process=lambda line: line.split())
        if with_root:
            edus = [["<root>"]] + edus
        batch_edus.append(edus)
        # EDU IDs
        edu_ids = np.arange(len(edus)).tolist()
        batch_edu_ids.append(edu_ids)
        # EDUs (Syntactic features; POSTAG)
        edus_postag = utils.read_lines(path_edus_postag,
                                       process=lambda line: line.split())
        if with_root:
            edus_postag = [["<root>"]] + edus_postag
        batch_edus_postag.append(edus_postag)
        # EDUs (Syntactic features; HEAD)
        edus_head = utils.read_lines(path_edus_head,
                                     process=lambda line: tuple(line.split()))
        if with_root:
            edus_head = [("<root>", "<root>", "<root>")] + edus_head
        batch_edus_head.append(edus_head)
        # Sentence boundaries
        sbnds = utils.read_lines(
            path_sbnds,
            process=lambda line: tuple([int(x) for x in line.split()]))
        batch_sbnds.append(sbnds)
        # Paragraph boundaries
        pbnds = utils.read_lines(
            path_pbnds,
            process=lambda line: tuple([int(x) for x in line.split()]))
        batch_pbnds.append(pbnds)
    assert len(batch_edu_ids) \
            == len(batch_edus) \
            == len(batch_edus_postag) \
            == len(batch_edus_head) \
            == len(batch_sbnds) \
            == len(batch_pbnds)

    # Conversion to numpy.ndarray
    batch_edu_ids = np.asarray(batch_edu_ids, dtype="O")
    batch_edus = np.asarray(batch_edus, dtype="O")
    batch_edus_postag = np.asarray(batch_edus_postag, dtype="O")
    batch_edus_head = np.asarray(batch_edus_head, dtype="O")
    batch_sbnds = np.asarray(batch_sbnds, dtype="O")
    batch_pbnds = np.asarray(batch_pbnds, dtype="O")

    # Conversion to DataBatch
    databatch = utils.DataBatch(batch_edu_ids=batch_edu_ids,
                                batch_edus=batch_edus,
                                batch_edus_postag=batch_edus_postag,
                                batch_edus_head=batch_edus_head,
                                batch_sbnds=batch_sbnds,
                                batch_pbnds=batch_pbnds)

    total_edus = 0
    for edus in batch_edus:
        if with_root:
            total_edus += len(edus[1:])  # Exclude the ROOT
        else:
            total_edus += len(edus)
    utils.writelog("# of instances=%d" % len(databatch))
    utils.writelog("# of EDUs (w/o ROOTs)=%d" % total_edus)
    return databatch
Example #21
def train(model, decoder, sampler, max_epoch, n_init_epochs, negative_size,
          batch_size, weight_decay, gradient_clipping, optimizer_name,
          train_dataset, dev_dataset, path_train, path_valid, path_snapshot,
          path_pred, path_gold):
    """
    :type model: SpanBasedModel
    :type decoder: IncrementalCKYDecoder
    :type sampler: TreeSampler
    :type max_epoch: int
    :type n_init_epochs: int
    :type negative_size: int
    :type batch_size: int
    :type weight_decay: float
    :type gradient_clipping: float
    :type optimizer_name: str
    :type train_dataset: numpy.ndarray
    :type dev_dataset: numpy.ndarray
    :type path_train: str
    :type path_valid: str
    :type path_snapshot: str
    :type path_pred: str
    :type path_gold: str
    :rtype: None
    """
    writer_train = jsonlines.Writer(open(path_train, "w"), flush=True)
    if dev_dataset is not None:
        writer_valid = jsonlines.Writer(open(path_valid, "w"), flush=True)

    boundary_flags = [(True, False)]
    assert negative_size >= len(boundary_flags)
    negative_tree_sampler = treesamplers.NegativeTreeSampler()

    # Optimizer preparation
    if optimizer_name == "adam":
        opt = optimizers.Adam()
    else:
        raise ValueError("Invalid optimizer_name=%s" % optimizer_name)

    opt.setup(model)

    if weight_decay > 0.0:
        opt.add_hook(chainer.optimizer.WeightDecay(weight_decay))
    if gradient_clipping:
        opt.add_hook(chainer.optimizer.GradientClipping(gradient_clipping))

    n_train = len(train_dataset)
    it = 0
    bestscore_holder = utils.BestScoreHolder(scale=100.0)
    bestscore_holder.init()

    if dev_dataset is not None:
        # Initial validation
        with chainer.using_config("train", False), chainer.no_backprop_mode():
            parse(model=model,
                  decoder=decoder,
                  dataset=dev_dataset,
                  path_pred=path_pred)
            scores = metrics.rst_parseval(pred_path=path_pred,
                                          gold_path=path_gold)
            old_scores = metrics.old_rst_parseval(pred_path=path_pred,
                                                  gold_path=path_gold)
            out = {
                "epoch": 0,
                "Morey2018": {
                    "Unlabeled Precision": scores["S"]["Precision"] * 100.0,
                    "Precision_info": scores["S"]["Precision_info"],
                    "Unlabeled Recall": scores["S"]["Recall"] * 100.0,
                    "Recall_info": scores["S"]["Recall_info"],
                    "Micro F1": scores["S"]["Micro F1"] * 100.0
                },
                "Marcu2000": {
                    "Unlabeled Precision":
                    old_scores["S"]["Precision"] * 100.0,
                    "Precision_info": old_scores["S"]["Precision_info"],
                    "Unlabeled Recall": old_scores["S"]["Recall"] * 100.0,
                    "Recall_info": old_scores["S"]["Recall_info"],
                    "Micro F1": old_scores["S"]["Micro F1"] * 100.0
                }
            }
            writer_valid.write(out)
            utils.writelog(utils.pretty_format_dict(out))
        # Saving
        bestscore_holder.compare_scores(scores["S"]["Micro F1"], step=0)
        serializers.save_npz(path_snapshot, model)
        utils.writelog("Saved the model to %s" % path_snapshot)
    else:
        # Saving
        serializers.save_npz(path_snapshot, model)
        utils.writelog("Saved the model to %s" % path_snapshot)

    for epoch in range(1, max_epoch + 1):

        perm = np.random.permutation(n_train)

        ########## E-Step (BEGIN) ##########
        utils.writelog("E step ===>")

        prog_bar = pyprind.ProgBar(n_train)

        for inst_i in range(0, n_train, batch_size):

            ### Mini batch

            for data in train_dataset[inst_i:inst_i + batch_size]:

                ### One data instance

                edu_ids = data.edu_ids
                edus = data.edus
                edus_postag = data.edus_postag
                edus_head = data.edus_head
                sbnds = data.sbnds
                pbnds = data.pbnds

                with chainer.using_config("train",
                                          False), chainer.no_backprop_mode():

                    # Feature extraction
                    edu_vectors = model.forward_edus(
                        edus, edus_postag, edus_head)  # (n_edus, bilstm_dim)
                    padded_edu_vectors = model.pad_edu_vectors(
                        edu_vectors)  # (n_edus+2, bilstm_dim)
                    mask_bwd, mask_fwd = model.make_masks(
                    )  # (1, bilstm_dim), (1, bilstm_dim)

                    # Positive tree
                    if epoch <= n_init_epochs:
                        pos_sexp = sampler.sample(inputs=edu_ids,
                                                  edus=edus,
                                                  edus_head=edus_head,
                                                  sbnds=sbnds,
                                                  pbnds=pbnds)
                    else:
                        span_scores = precompute_all_span_scores(
                            model=model,
                            edus=edus,
                            edus_postag=edus_postag,
                            sbnds=sbnds,
                            pbnds=pbnds,
                            padded_edu_vectors=padded_edu_vectors,
                            mask_bwd=mask_bwd,
                            mask_fwd=mask_fwd)
                        pos_sexp = decoder.decode(span_scores=span_scores,
                                                  inputs=edu_ids,
                                                  sbnds=sbnds,
                                                  pbnds=pbnds,
                                                  use_sbnds=True,
                                                  use_pbnds=True)
                    pos_tree = treetk.sexp2tree(pos_sexp,
                                                with_nonterminal_labels=False,
                                                with_terminal_labels=False)
                    pos_tree.calc_spans()
                    pos_spans = treetk.aggregate_spans(
                        pos_tree, include_terminal=False,
                        order="post-order")  # list of (int, int)
                    data.pos_spans = pos_spans  #NOTE
                    prog_bar.update()
        ########## E-Step (END) ##########

        ########## M-Step (BEGIN) ##########
        utils.writelog("M step ===>")

        for inst_i in range(0, n_train, batch_size):

            # Processing one mini-batch

            # Init
            loss_bracketing, acc_bracketing = 0.0, 0.0
            actual_batchsize = 0

            for data in train_dataset[perm[inst_i:inst_i + batch_size]]:

                # Processing one instance

                edu_ids = data.edu_ids
                edus = data.edus
                edus_postag = data.edus_postag
                edus_head = data.edus_head
                sbnds = data.sbnds
                pbnds = data.pbnds
                pos_spans = data.pos_spans  # NOTE

                # Feature extraction
                edu_vectors = model.forward_edus(
                    edus, edus_postag, edus_head)  # (n_edus, bilstm_dim)
                padded_edu_vectors = model.pad_edu_vectors(
                    edu_vectors)  # (n_edus+2, bilstm_dim)
                mask_bwd, mask_fwd = model.make_masks(
                )  # (1, bilstm_dim), (1, bilstm_dim)

                # Negative trees
                pos_neg_spans = []
                margins = []
                pos_neg_spans.append(pos_spans)
                with chainer.using_config("train",
                                          False), chainer.no_backprop_mode():
                    for use_sbnds, use_pbnds in boundary_flags:
                        span_scores = precompute_all_span_scores(
                            model=model,
                            edus=edus,
                            edus_postag=edus_postag,
                            sbnds=sbnds,
                            pbnds=pbnds,
                            padded_edu_vectors=padded_edu_vectors,
                            mask_bwd=mask_bwd,
                            mask_fwd=mask_fwd)
                        neg_bin_sexp = decoder.decode(
                            span_scores=span_scores,
                            inputs=edu_ids,
                            sbnds=sbnds,
                            pbnds=pbnds,
                            use_sbnds=use_sbnds,
                            use_pbnds=use_pbnds,
                            gold_spans=pos_spans)  # list of str
                        neg_tree = treetk.sexp2tree(
                            neg_bin_sexp,
                            with_nonterminal_labels=False,
                            with_terminal_labels=False)
                        neg_tree.calc_spans()
                        neg_spans = treetk.aggregate_spans(
                            neg_tree,
                            include_terminal=False,
                            order="pre-order")  # list of (int, int)
                        margin = compute_tree_distance(pos_spans,
                                                       neg_spans,
                                                       coef=1.0)
                        pos_neg_spans.append(neg_spans)
                        margins.append(margin)
                for _ in range(negative_size - len(boundary_flags)):
                    neg_bin_sexp = negative_tree_sampler.sample(inputs=edu_ids,
                                                                sbnds=sbnds,
                                                                pbnds=pbnds)
                    neg_tree = treetk.sexp2tree(neg_bin_sexp,
                                                with_nonterminal_labels=False,
                                                with_terminal_labels=False)
                    neg_tree.calc_spans()
                    neg_spans = treetk.aggregate_spans(
                        neg_tree, include_terminal=False,
                        order="pre-order")  # list of (int, int)
                    margin = compute_tree_distance(pos_spans,
                                                   neg_spans,
                                                   coef=1.0)
                    pos_neg_spans.append(neg_spans)
                    margins.append(margin)

                # Scoring
                pred_scores = model.forward_spans_for_bracketing(
                    edus=edus,
                    edus_postag=edus_postag,
                    sbnds=sbnds,
                    pbnds=pbnds,
                    padded_edu_vectors=padded_edu_vectors,
                    mask_bwd=mask_bwd,
                    mask_fwd=mask_fwd,
                    batch_spans=pos_neg_spans,
                    aggregate=True)  # (1+negative_size, 1)

                # Bracketing Loss
                for neg_i in range(negative_size):
                    loss_bracketing += F.clip(
                        pred_scores[1 + neg_i] + margins[neg_i] -
                        pred_scores[0], 0.0, 10000000.0)

                # Ranking Accuracy
                pred_scores = F.reshape(
                    pred_scores,
                    (1, 1 + negative_size))  # (1, 1+negative_size)
                gold_scores = np.zeros((1, ), dtype=np.int32)  # (1,)
                gold_scores = utils.convert_ndarray_to_variable(
                    gold_scores, seq=False)  # (1,)
                acc_bracketing += F.accuracy(pred_scores, gold_scores)

                actual_batchsize += 1

            # Backward & Update
            actual_batchsize = float(actual_batchsize)
            loss_bracketing = loss_bracketing / actual_batchsize
            acc_bracketing = acc_bracketing / actual_batchsize
            loss = loss_bracketing
            model.zerograds()
            loss.backward()
            opt.update()
            it += 1

            # Write log
            loss_bracketing_data = float(cuda.to_cpu(loss_bracketing.data))
            acc_bracketing_data = float(cuda.to_cpu(acc_bracketing.data))
            out = {
                "iter": it,
                "epoch": epoch,
                "progress": "%d/%d" % (inst_i + actual_batchsize, n_train),
                "progress_ratio":
                float(inst_i + actual_batchsize) / n_train * 100.0,
                "Bracketing Loss": loss_bracketing_data,
                "Ranking Accuracy": acc_bracketing_data * 100.0
            }
            writer_train.write(out)
            utils.writelog(utils.pretty_format_dict(out))
        ########## M-Step (END) ##########

        if dev_dataset is not None:
            # Validation
            with chainer.using_config("train",
                                      False), chainer.no_backprop_mode():
                parse(model=model,
                      decoder=decoder,
                      dataset=dev_dataset,
                      path_pred=path_pred)
                scores = metrics.rst_parseval(pred_path=path_pred,
                                              gold_path=path_gold)
                old_scores = metrics.old_rst_parseval(pred_path=path_pred,
                                                      gold_path=path_gold)
                out = {
                    "epoch": epoch,
                    "Morey2018": {
                        "Unlabeled Precision":
                        scores["S"]["Precision"] * 100.0,
                        "Precision_info": scores["S"]["Precision_info"],
                        "Unlabeled Recall": scores["S"]["Recall"] * 100.0,
                        "Recall_info": scores["S"]["Recall_info"],
                        "Micro F1": scores["S"]["Micro F1"] * 100.0
                    },
                    "Marcu2000": {
                        "Unlabeled Precision":
                        old_scores["S"]["Precision"] * 100.0,
                        "Precision_info": old_scores["S"]["Precision_info"],
                        "Unlabeled Recall": old_scores["S"]["Recall"] * 100.0,
                        "Recall_info": old_scores["S"]["Recall_info"],
                        "Micro F1": old_scores["S"]["Micro F1"] * 100.0
                    }
                }
                writer_valid.write(out)
                utils.writelog(utils.pretty_format_dict(out))
            # Saving
            did_update = bestscore_holder.compare_scores(
                scores["S"]["Micro F1"], epoch)
            if did_update:
                serializers.save_npz(path_snapshot, model)
                utils.writelog("Saved the model to %s" % path_snapshot)
            # Finished?
            if bestscore_holder.ask_finishing(max_patience=10):
                utils.writelog(
                    "Patience %d is over. Training finished successfully." %
                    bestscore_holder.patience)
                writer_train.close()
                if dev_dataset is not None:
                    writer_valid.close()
                return
        else:
            # No validation
            # Saving
            serializers.save_npz(path_snapshot, model)
Example #22
from torch._C import Size
import utils, os, torch, time, engine, model
from data_helper import train_data_set
from torch.utils.data import DataLoader
import torch.optim as optim

os.environ["CUDA_VISIBLE_DEVICES"] = "2"
device = torch.device("cuda")

if __name__ == "__main__":

    log_file = 'log.txt'
    if os.path.exists(log_file):
        os.remove(log_file)
    utils.writelog(file=log_file,
                   log_info='=' * 10 + 'begin to load data' + '=' * 10)
    since = time.time()
    train_set = train_data_set(image_dir='train_data/',
                               anno_path='reconstruct-anno.json',
                               size=512)
    train_set_load = DataLoader(train_set, batch_size=32, shuffle=True)
    utils.writelog(file=log_file,
                   log_info='=' * 10 + 'finished load data' + '=' * 10 +
                   ',  ' + str(time.time() - since))

    # for images, targets in train_set_load:
    #     print(targets, '\n')
    #     break

    utils.writelog(file=log_file,
                   log_info='=' * 10 + 'begin to set model' + '=' * 10)
Example #23
def main(args):

    ####################
    # Arguments
    gpu = args.gpu
    model_name = args.model
    initial_tree_sampling = args.initial_tree_sampling
    path_config = args.config
    data_augmentation = args.data_augmentation
    trial_name = args.name
    actiontype = args.actiontype
    max_epoch = args.max_epoch
    dev_size = args.dev_size

    # Check
    assert actiontype in ["train", "evaluate"]
    if actiontype == "train":
        assert max_epoch > 0
    assert len(initial_tree_sampling.split("_")) == 3
    for type_ in initial_tree_sampling.split("_"):
        assert type_ in ["X", "BU", "TD", "RB", "LB", "RB2"]
    assert initial_tree_sampling.split("_")[2] != "X"
    assert initial_tree_sampling.split("_")[1] != "RB2"
    assert initial_tree_sampling.split("_")[2] != "RB2"

    if trial_name is None or trial_name == "None":
        trial_name = utils.get_current_time()

    ####################
    # Path setting
    config = utils.Config(path_config)

    basename = "%s.%s.%s.aug_%s.%s" \
            % (model_name,
               initial_tree_sampling,
               utils.get_basename_without_ext(path_config),
               data_augmentation,
               trial_name)

    if actiontype == "train":
        path_log = os.path.join(config.getpath("results"),
                                basename + ".training.log")
    elif actiontype == "evaluate":
        path_log = os.path.join(config.getpath("results"),
                                basename + ".evaluation.log")
    path_train = os.path.join(config.getpath("results"),
                              basename + ".training.jsonl")
    path_valid = os.path.join(config.getpath("results"),
                              basename + ".validation.jsonl")
    path_snapshot = os.path.join(config.getpath("results"),
                                 basename + ".model")
    path_pred = os.path.join(config.getpath("results"),
                             basename + ".evaluation.ctrees")
    path_eval = os.path.join(config.getpath("results"),
                             basename + ".evaluation.json")

    utils.set_logger(path_log)

    ####################
    # Random seed
    random_seed = trial_name
    random_seed = utils.hash_string(random_seed)
    random.seed(random_seed)
    np.random.seed(random_seed)
    cuda.cupy.random.seed(random_seed)

    ####################
    # Log so far
    utils.writelog("gpu=%d" % gpu)
    utils.writelog("model_name=%s" % model_name)
    utils.writelog("initial_tree_sampling=%s" % initial_tree_sampling)
    utils.writelog("path_config=%s" % path_config)
    utils.writelog("data_augmentation=%s" % data_augmentation)
    utils.writelog("trial_name=%s" % trial_name)
    utils.writelog("actiontype=%s" % actiontype)
    utils.writelog("max_epoch=%s" % max_epoch)
    utils.writelog("dev_size=%s" % dev_size)

    utils.writelog("path_log=%s" % path_log)
    utils.writelog("path_train=%s" % path_train)
    utils.writelog("path_valid=%s" % path_valid)
    utils.writelog("path_snapshot=%s" % path_snapshot)
    utils.writelog("path_pred=%s" % path_pred)
    utils.writelog("path_eval=%s" % path_eval)

    utils.writelog("random_seed=%d" % random_seed)

    ####################
    # Data preparation
    begin_time = time.time()

    train_dataset = dataloader.read_rstdt("train",
                                          relation_level="coarse-grained",
                                          with_root=False)
    test_dataset = dataloader.read_rstdt("test",
                                         relation_level="coarse-grained",
                                         with_root=False)
    vocab_word = utils.read_vocab(
        os.path.join(config.getpath("data"), "rstdt-vocab", "words.vocab.txt"))
    vocab_postag = utils.read_vocab(
        os.path.join(config.getpath("data"), "rstdt-vocab",
                     "postags.vocab.txt"))
    vocab_deprel = utils.read_vocab(
        os.path.join(config.getpath("data"), "rstdt-vocab",
                     "deprels.vocab.txt"))

    if data_augmentation:
        external_train_dataset = dataloader.read_ptbwsj_wo_rstdt(
            with_root=False)
        # Remove documents with only one leaf node
        external_train_dataset = utils.filter_dataset(
            external_train_dataset,
            condition=lambda data: len(data.edu_ids) > 1)

    end_time = time.time()
    utils.writelog("Loaded the corpus. %f [sec.]" % (end_time - begin_time))

    ####################
    # Hyper parameters
    word_dim = config.getint("word_dim")
    postag_dim = config.getint("postag_dim")
    deprel_dim = config.getint("deprel_dim")
    lstm_dim = config.getint("lstm_dim")
    mlp_dim = config.getint("mlp_dim")
    n_init_epochs = config.getint("n_init_epochs")
    negative_size = config.getint("negative_size")
    batch_size = config.getint("batch_size")
    weight_decay = config.getfloat("weight_decay")
    gradient_clipping = config.getfloat("gradient_clipping")
    optimizer_name = config.getstr("optimizer_name")

    utils.writelog("word_dim=%d" % word_dim)
    utils.writelog("postag_dim=%d" % postag_dim)
    utils.writelog("deprel_dim=%d" % deprel_dim)
    utils.writelog("lstm_dim=%d" % lstm_dim)
    utils.writelog("mlp_dim=%d" % mlp_dim)
    utils.writelog("n_init_epochs=%d" % n_init_epochs)
    utils.writelog("negative_size=%d" % negative_size)
    utils.writelog("batch_size=%d" % batch_size)
    utils.writelog("weight_decay=%f" % weight_decay)
    utils.writelog("gradient_clipping=%f" % gradient_clipping)
    utils.writelog("optimizer_name=%s" % optimizer_name)

    ####################
    # Model preparation
    cuda.get_device(gpu).use()

    # Initialize a model
    utils.mkdir(os.path.join(config.getpath("data"), "caches"))
    path_embed = config.getpath("pretrained_word_embeddings")
    path_caches = os.path.join(
        config.getpath("data"), "caches",
        "cached." + os.path.basename(path_embed) + ".npy")
    if os.path.exists(path_caches):
        utils.writelog("Loading cached word embeddings ...")
        initialW = np.load(path_caches)
    else:
        initialW = utils.read_word_embedding_matrix(path=path_embed,
                                                    dim=word_dim,
                                                    vocab=vocab_word,
                                                    scale=0.0)
        np.save(path_caches, initialW)

    if model_name == "spanbasedmodel":
        # Span-based model w/ template features
        template_feature_extractor = models.TemplateFeatureExtractor(
            dataset=train_dataset)
        utils.writelog("Template feature size=%d" %
                       template_feature_extractor.feature_size)
        if actiontype == "train":
            for template in template_feature_extractor.templates:
                dim = template_feature_extractor.template2dim[template]
                utils.writelog("Template feature #%s %s" % (dim, template))
        model = models.SpanBasedModel(
            vocab_word=vocab_word,
            vocab_postag=vocab_postag,
            vocab_deprel=vocab_deprel,
            word_dim=word_dim,
            postag_dim=postag_dim,
            deprel_dim=deprel_dim,
            lstm_dim=lstm_dim,
            mlp_dim=mlp_dim,
            initialW=initialW,
            template_feature_extractor=template_feature_extractor)
    elif model_name == "spanbasedmodel2":
        # Span-based model w/o template features
        model = models.SpanBasedModel2(vocab_word=vocab_word,
                                       vocab_postag=vocab_postag,
                                       vocab_deprel=vocab_deprel,
                                       word_dim=word_dim,
                                       postag_dim=postag_dim,
                                       deprel_dim=deprel_dim,
                                       lstm_dim=lstm_dim,
                                       mlp_dim=mlp_dim,
                                       initialW=initialW)
    else:
        raise ValueError("Invalid model_name=%s" % model_name)
    utils.writelog("Initialized the model ``%s''" % model_name)

    # Load pre-trained parameters
    if actiontype != "train":
        serializers.load_npz(path_snapshot, model)
        utils.writelog("Loaded trained parameters from %s" % path_snapshot)

    model.to_gpu(gpu)

    ####################
    # Decoder preparation
    decoder = decoders.IncrementalCKYDecoder()

    ####################
    # Initializer preparation
    sampler = treesamplers.TreeSampler(initial_tree_sampling.split("_"))

    ####################
    # Training / evaluation
    if actiontype == "train":
        with chainer.using_config("train", True):
            if dev_size > 0:
                # Training with a held-out development set
                train_dataset, dev_dataset = utils.split_dataset(
                    dataset=train_dataset, n_dev=dev_size, seed=None)
                with open(
                        os.path.join(config.getpath("results"),
                                     basename + ".valid_gold.ctrees"),
                        "w") as f:
                    for data in dev_dataset:
                        f.write("%s\n" % " ".join(data.nary_sexp))
            else:
                # Training with the full training set
                dev_dataset = None

            if data_augmentation:
                train_dataset = np.concatenate(
                    [train_dataset, external_train_dataset], axis=0)

            train(model=model,
                  decoder=decoder,
                  sampler=sampler,
                  max_epoch=max_epoch,
                  n_init_epochs=n_init_epochs,
                  negative_size=negative_size,
                  batch_size=batch_size,
                  weight_decay=weight_decay,
                  gradient_clipping=gradient_clipping,
                  optimizer_name=optimizer_name,
                  train_dataset=train_dataset,
                  dev_dataset=dev_dataset,
                  path_train=path_train,
                  path_valid=path_valid,
                  path_snapshot=path_snapshot,
                  path_pred=os.path.join(config.getpath("results"),
                                         basename + ".valid_pred.ctrees"),
                  path_gold=os.path.join(config.getpath("results"),
                                         basename + ".valid_gold.ctrees"))

    elif actiontype == "evaluate":
        with chainer.using_config("train", False), chainer.no_backprop_mode():
            # Test
            parse(model=model,
                  decoder=decoder,
                  dataset=test_dataset,
                  path_pred=path_pred)
            scores = metrics.rst_parseval(
                pred_path=path_pred,
                gold_path=os.path.join(config.getpath("data"), "rstdt", "wsj",
                                       "test", "gold.labeled.nary.ctrees"))
            old_scores = metrics.old_rst_parseval(
                pred_path=path_pred,
                gold_path=os.path.join(config.getpath("data"), "rstdt", "wsj",
                                       "test", "gold.labeled.nary.ctrees"))
            out = {
                "Morey2018": {
                    "Unlabeled Precision": scores["S"]["Precision"] * 100.0,
                    "Precision_info": scores["S"]["Precision_info"],
                    "Unlabeled Recall": scores["S"]["Recall"] * 100.0,
                    "Recall_info": scores["S"]["Recall_info"],
                    "Micro F1": scores["S"]["Micro F1"] * 100.0
                },
                "Marcu2000": {
                    "Unlabeled Precision":
                    old_scores["S"]["Precision"] * 100.0,
                    "Precision_info": old_scores["S"]["Precision_info"],
                    "Unlabeled Recall": old_scores["S"]["Recall"] * 100.0,
                    "Recall_info": old_scores["S"]["Recall_info"],
                    "Micro F1": old_scores["S"]["Micro F1"] * 100.0
                }
            }
            utils.write_json(path_eval, out)
            utils.writelog(utils.pretty_format_dict(out))

    utils.writelog("Done: %s" % basename)
Example No. 24
0
def loginout():
    if session:
        utils.writelog('user').info('"INFO: %s logout"' % session.get('username'))
        session.clear()
    return redirect('/login/')
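A minimal wiring sketch for the view above (a hypothetical Flask app setup: the route path and secret key are placeholders, and utils is the project-local logging helper the view already relies on):

from flask import Flask, redirect, session

import utils  # project-local logging helper assumed by the view above

app = Flask(__name__)
app.secret_key = 'change-me'  # sessions require a secret key (placeholder)

# Register the logout view above under a hypothetical URL
app.add_url_rule('/loginout/', 'loginout', loginout)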
Example No. 25
0
def read_ptbwsj_wo_rstdt(with_root=False):
    """
    :type with_root: bool
    :rtype: numpy.ndarray(shape=(dataset_size,), dtype="O")
    """
    config = utils.Config()

    path_root = os.path.join(config.getpath("data"), "ptbwsj_wo_rstdt")

    # Reading
    dataset = []

    filenames = os.listdir(path_root)
    filenames = [n for n in filenames if n.endswith(".edus.tokens")]
    filenames.sort()

    for filename in filenames:
        # Path
        path_edus = os.path.join(path_root, filename + ".preprocessed")
        path_edus_postag = os.path.join(
            path_root, filename.replace(".edus.tokens", ".edus.postags"))
        path_edus_head = os.path.join(
            path_root, filename.replace(".edus.tokens", ".edus.heads"))
        path_sbnds = os.path.join(path_root,
                                  filename.replace(".edus.tokens", ".sbnds"))
        path_pbnds = os.path.join(path_root,
                                  filename.replace(".edus.tokens", ".pbnds"))

        kargs = {}

        # EDUs
        edus = utils.read_lines(path_edus, process=lambda line: line.split())
        if with_root:
            edus = [["<root>"]] + edus
        kargs["edus"] = edus

        # EDU IDs
        edu_ids = np.arange(len(edus)).tolist()
        kargs["edu_ids"] = edu_ids

        # EDUs (POS tags)
        edus_postag = utils.read_lines(path_edus_postag,
                                       process=lambda line: line.split())
        if with_root:
            edus_postag = [["<root>"]] + edus_postag
        kargs["edus_postag"] = edus_postag

        # EDUs (head)
        edus_head = utils.read_lines(path_edus_head,
                                     process=lambda line: tuple(line.split()))
        if with_root:
            edus_head = [("<root>", "<root>", "<root>")] + edus_head
        kargs["edus_head"] = edus_head

        # Sentence boundaries
        sbnds = utils.read_lines(
            path_sbnds,
            process=lambda line: tuple([int(x) for x in line.split()]))
        kargs["sbnds"] = sbnds

        # Paragraph boundaries
        pbnds = utils.read_lines(
            path_pbnds,
            process=lambda line: tuple([int(x) for x in line.split()]))
        kargs["pbnds"] = pbnds

        data = utils.DataInstance(**kargs)
        dataset.append(data)

    dataset = np.asarray(dataset, dtype="O")

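    # Corpus statistics (logged below)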
    n_docs = len(dataset)

    n_paras = sum(len(data.pbnds) for data in dataset)
    n_sents = sum(len(data.sbnds) for data in dataset)

    if with_root:
        n_edus = sum(len(data.edus) - 1 for data in dataset)  # Exclude the ROOT
    else:
        n_edus = sum(len(data.edus) for data in dataset)

    utils.writelog("# of documents=%d" % n_docs)
    utils.writelog("# of paragraphs=%d" % n_paras)
    utils.writelog("# of sentences=%d" % n_sents)
    utils.writelog("# of EDUs (w/o ROOTs)=%d" % n_edus)
    return dataset
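A minimal usage sketch, purely illustrative; it assumes the data/ptbwsj_wo_rstdt directory layout and the project-local utils module expected by the function above:

dataset = read_ptbwsj_wo_rstdt(with_root=True)
for data in dataset[:3]:
    # edus / sbnds / pbnds are the per-document fields populated above
    print(len(data.edus), len(data.sbnds), len(data.pbnds))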
Example No. 26
0
def main(args):

    ####################
    # Arguments
    tree_sampling = args.tree_sampling  # NOTE
    trial_name = args.name

    # Check
    assert len(tree_sampling.split("_")) == 3
    for type_ in tree_sampling.split("_"):
        assert type_ in ["X", "BU", "TD", "RB", "LB", "RB2"]
    assert tree_sampling.split("_")[2] != "X"
    assert tree_sampling.split("_")[1] != "RB2"
    assert tree_sampling.split("_")[2] != "RB2"

    if trial_name is None or trial_name == "None":
        trial_name = utils.get_current_time()

    ####################
    # Path setting
    config = utils.Config()

    basename = "%s.%s" \
            % (tree_sampling,
               trial_name)

    utils.mkdir(os.path.join(config.getpath("results"), "baselines"))
    path_log = os.path.join(config.getpath("results"), "baselines",
                            basename + ".evaluation.log")
    path_pred = os.path.join(config.getpath("results"), "baselines",
                             basename + ".evaluation.ctrees")
    path_eval = os.path.join(config.getpath("results"), "baselines",
                             basename + ".evaluation.json")

    utils.set_logger(path_log)

    ####################
    # Random seed
    random_seed = utils.hash_string(trial_name)
    np.random.seed(random_seed)
    cuda.cupy.random.seed(random_seed)

    ####################
    # Log so far
    utils.writelog("tree_sampling=%s" % tree_sampling)
    utils.writelog("trial_name=%s" % trial_name)

    utils.writelog("path_log=%s" % path_log)
    utils.writelog("path_pred=%s" % path_pred)
    utils.writelog("path_eval=%s" % path_eval)

    utils.writelog("random_seed=%d" % random_seed)

    ####################
    # Data preparation
    begin_time = time.time()

    test_databatch = dataloader.read_rstdt("test",
                                           relation_level="coarse-grained",
                                           with_root=False)

    end_time = time.time()
    utils.writelog("Loaded the corpus. %f [sec.]" % (end_time - begin_time))

    ####################
    # Tree-sampler preparation
    sampler = treesamplers.TreeSampler(tree_sampling.split("_"))  # NOTE

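    ####################
    # Evaluation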
    with chainer.using_config("train", False), chainer.no_backprop_mode():
        parse(sampler=sampler, databatch=test_databatch, path_pred=path_pred)
        scores = rst_parseval.evaluate(
            pred_path=path_pred,
            gold_path=os.path.join(config.getpath("data"), "rstdt", "renamed",
                                   "test.labeled.nary.ctrees"))
        old_scores = old_rst_parseval.evaluate(
            pred_path=path_pred,
            gold_path=os.path.join(config.getpath("data"), "rstdt", "renamed",
                                   "test.labeled.nary.ctrees"))
        out = {
            "Morey2018": {
                "Unlabeled Precision": scores["S"]["Precision"] * 100.0,
                "Precision_info": scores["S"]["Precision_info"],
                "Unlabeled Recall": scores["S"]["Recall"] * 100.0,
                "Recall_info": scores["S"]["Recall_info"],
                "Micro F1": scores["S"]["Micro F1"] * 100.0
            },
            "Marcu2000": {
                "Unlabeled Precision": old_scores["S"]["Precision"] * 100.0,
                "Precision_info": old_scores["S"]["Precision_info"],
                "Unlabeled Recall": old_scores["S"]["Recall"] * 100.0,
                "Recall_info": old_scores["S"]["Recall_info"],
                "Micro F1": old_scores["S"]["Micro F1"] * 100.0
            }
        }
        utils.write_json(path_eval, out)
        utils.writelog(utils.pretty_format_dict(out))

    utils.writelog("Done.")