def main(_):
    """Entry point: train or evaluate depending on FLAGS.train.

    In training mode the on-disk config is optional (a fresh one is built
    from FLAGS when absent); in test mode it is mandatory and its absence
    raises FileNotFoundError.
    """
    logger = get_logger()
    if FLAGS.train:
        check_env()
        if FLAGS.clean:
            clean()
        # Config file is optional for training: fall back to get_config().
        train(_load_config(required=False), logger)
    else:
        # Evaluation cannot proceed without a saved config.
        test(_load_config(required=True), logger)


def _load_config(required):
    """Return the run configuration dict.

    Loads JSON from 'config_file' when it exists on disk. Otherwise:
    raises FileNotFoundError when `required` is True (test mode), or
    builds a fresh config via get_config() (train mode).
    """
    if os.path.exists('config_file'):
        with open('config_file', 'r', encoding='utf-8') as r:
            return json.load(r)
    if required:
        raise FileNotFoundError('can not find config_file')
    return get_config()
#! /usr/bin/env python # -*- coding: utf-8 -*- # __author__ = "Sponge_sy" # Date: 2020/2/21 from embeddings import sent_emb_sif, word_emb_elmo from model.method import SIFRank, SIFRank_plus import thulac import time import os import csv from model import util import logging import multiprocessing as mp from multiprocessing import Process,Lock,Queue,Value logger = util.get_logger(__name__, debug=1) #user_dict_file=r'./auxiliary_data/keyword_vocab_final' ##user_dict_file=r'./auxiliary_data/user_dict.txt' #model_file = r'./auxiliary_data/zhs.model/' #ELMO = word_emb_elmo.WordEmbeddings(model_file, cuda_device=5) #SIF = sent_emb_sif.SentEmbeddings(ELMO, lamda=1.0) #zh_model = thulac.thulac(model_path=r'./auxiliary_data/thulac.models/',user_dict=user_dict_file) #elmo_layers_weight = [0.0, 1.0, 0.0] def load_cut_dict(user_dict_file): trie_dict = dict() with open(user_dict_file, "r", encoding="utf-8") as fp: for line in fp: cut_parts = line.strip().split(' ') num = len(cut_parts)
# --- Command-line flags (TF 1.x tf.app.flags) -------------------------------
flags.DEFINE_integer("first_loop_epoch", 10, 'First loop epoch')
flags.DEFINE_integer("epoch", 50, 'Epoch')
flags.DEFINE_float("clip", 5, "Gradient clip")
flags.DEFINE_float("dropout", 0.5, "Dropout rate")
# NOTE(review): batch size is declared as a *float* flag — presumably this
# should be DEFINE_integer; confirm before changing, downstream code may
# rely on the float value or cast it itself.
flags.DEFINE_float("batchsize", 160, "batch size")
flags.DEFINE_float("lr", 0.001, "Initial learning rate")
flags.DEFINE_string("optimizer", "adam", "Optimizer for training")
# NOTE(review): "Wither" in the help strings below is a typo for "Whether";
# left byte-identical here since help text is runtime-visible.
flags.DEFINE_boolean("zero", True, "Wither replace digits with zero")
flags.DEFINE_boolean("lower", True, "Wither lower case")
flags.DEFINE_boolean("redistribution", True, "Wither redistribution")
flags.DEFINE_boolean("attention_regularization", True, "Wither attention regularization")
flags.DEFINE_boolean("bootstrap", True, "Wither bootstrap")

FLAGS = tf.app.flags.FLAGS
logger = get_logger()


def get_config():
    """Collect model hyperparameters from FLAGS into an OrderedDict.

    Keys mirror the flag names (with 'lstm_dim' stored as 'hidden_dim').
    """
    config = OrderedDict()
    config['batchsize'] = FLAGS.batchsize
    config['word_dim'] = FLAGS.word_dim
    config['position_dim'] = FLAGS.position_dim
    config['type_dim'] = FLAGS.type_dim
    config['hidden_dim'] = FLAGS.lstm_dim
    config['pos_max'] = FLAGS.pos_max
    config['redistribution'] = FLAGS.redistribution
    config['attention_regularization'] = FLAGS.attention_regularization
    config['bootstrap'] = FLAGS.bootstrap
    config['zero'] = FLAGS.zero
    config['lower'] = FLAGS.lower
#-*- encoding:utf-8 -*-
import networkx as nx
import numpy as np
from model import util

logger = util.get_logger(__name__)


class AttrDict(dict):
    """Dictionary whose keys are also readable/writable as attributes."""

    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        # Point the instance __dict__ at the mapping itself, so attribute
        # access and item access share one storage.
        self.__dict__ = self


def combine(word_list, window=2):
    """Yield word pairs co-occurring within `window`, used as graph edges.

    Args:
        word_list: list of str — the token sequence.
        window: int — co-occurrence window size; values below 2 are
            treated as 2.

    Yields:
        (str, str) tuples: each word paired with the word `offset`
        positions ahead of it, for every offset in [1, window).
    """
    span = 2 if window < 2 else window
    for offset in range(1, span):
        # No pair exists once the shift exceeds the sequence length.
        if offset >= len(word_list):
            break
        for pair in zip(word_list, word_list[offset:]):
            yield pair