Example no. 1
import json
import os


def main(_):
    logger = get_logger()
    if FLAGS.train:
        check_env()
        if FLAGS.clean:
            clean()
        # Reuse a saved configuration if one exists; otherwise build one from FLAGS.
        if os.path.exists('config_file'):
            with open('config_file', 'r', encoding='utf-8') as r:
                config = json.load(r)
        else:
            config = get_config()

        train(config, logger)
    else:
        # Testing requires the configuration saved during training.
        if os.path.exists('config_file'):
            with open('config_file', 'r', encoding='utf-8') as r:
                config = json.load(r)
        else:
            raise FileNotFoundError('cannot find config_file')

        test(config, logger)
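
# A minimal sketch of the entry point that typically pairs with a
# FLAGS-driven main(_) in TensorFlow 1.x (assumed from the tf.app.flags
# usage in Example no. 3; the original entry point is not shown here):
import tensorflow as tf

if __name__ == '__main__':
    tf.app.run()  # parses command-line flags into FLAGS, then calls main(_)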
Example no. 2
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# __author__ = "Sponge_sy"
# Date: 2020/2/21

from embeddings import sent_emb_sif, word_emb_elmo
from model.method import SIFRank, SIFRank_plus
import thulac
import time
import os
import csv
from model import util
import logging
import multiprocessing as mp
from multiprocessing import Process, Lock, Queue, Value
logger = util.get_logger(__name__, debug=1)

#user_dict_file=r'./auxiliary_data/keyword_vocab_final'
##user_dict_file=r'./auxiliary_data/user_dict.txt'
#model_file = r'./auxiliary_data/zhs.model/'
#ELMO = word_emb_elmo.WordEmbeddings(model_file, cuda_device=5)
#SIF = sent_emb_sif.SentEmbeddings(ELMO, lamda=1.0)
#zh_model = thulac.thulac(model_path=r'./auxiliary_data/thulac.models/',user_dict=user_dict_file)
#elmo_layers_weight = [0.0, 1.0, 0.0]

def load_cut_dict(user_dict_file):
    trie_dict = dict()
    with open(user_dict_file, "r", encoding="utf-8") as fp:
        for line in fp:
            # each line holds the space-separated segments of one dictionary entry
            cut_parts = line.strip().split(' ')
            num = len(cut_parts)
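
# load_cut_dict() is cut off in the source above. A minimal sketch of one
# way such a loader could finish, assuming a character trie whose terminal
# marker stores the segmentation (the trie layout is hypothetical, not the
# original):
def load_cut_dict_sketch(user_dict_file):
    trie_dict = dict()
    with open(user_dict_file, "r", encoding="utf-8") as fp:
        for line in fp:
            cut_parts = line.strip().split(' ')
            node = trie_dict
            for ch in ''.join(cut_parts):
                node = node.setdefault(ch, {})  # walk or extend the trie
            node['#'] = cut_parts               # terminal node keeps the cut
    return trie_dict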
Example no. 3
flags.DEFINE_integer("first_loop_epoch", 10, 'First loop epoch')
flags.DEFINE_integer("epoch", 50, 'Epoch')
flags.DEFINE_float("clip", 5, "Gradient clip")
flags.DEFINE_float("dropout", 0.5, "Dropout rate")
flags.DEFINE_float("batchsize", 160, "batch size")
flags.DEFINE_float("lr", 0.001, "Initial learning rate")
flags.DEFINE_string("optimizer", "adam", "Optimizer for training")
flags.DEFINE_boolean("zero", True, "Wither replace digits with zero")
flags.DEFINE_boolean("lower", True, "Wither lower case")
flags.DEFINE_boolean("redistribution", True, "Wither redistribution")
flags.DEFINE_boolean("attention_regularization", True,
                     "Wither attention regularization")
flags.DEFINE_boolean("bootstrap", True, "Wither bootstrap")
FLAGS = tf.app.flags.FLAGS
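
# The flags above can then be overridden on the command line, e.g. (script
# name hypothetical; boolean flags negate absl-style with a "no" prefix):
#   python train.py --batchsize=64 --lr=0.0005 --nobootstrap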

logger = get_logger()


def get_config():
    """Collect the command-line FLAGS into an ordered config dict."""
    config = OrderedDict()
    config['batchsize'] = FLAGS.batchsize
    config['word_dim'] = FLAGS.word_dim
    config['position_dim'] = FLAGS.position_dim
    config['type_dim'] = FLAGS.type_dim
    config['hidden_dim'] = FLAGS.lstm_dim
    config['pos_max'] = FLAGS.pos_max
    config['redistribution'] = FLAGS.redistribution
    config['attention_regularization'] = FLAGS.attention_regularization
    config['bootstrap'] = FLAGS.bootstrap
    config['zero'] = FLAGS.zero
    config['lower'] = FLAGS.lower
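
# get_config() is cut off in the source above. A sketch of how the finished
# OrderedDict could be written to 'config_file' so that main() in Example
# no. 1 can reload it (the helper name save_config is hypothetical; the
# file name comes from Example no. 1):
import json

def save_config(config, path='config_file'):
    with open(path, 'w', encoding='utf-8') as w:
        json.dump(config, w, ensure_ascii=False, indent=4)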
Example no. 4
# -*- coding: utf-8 -*-
import networkx as nx
import numpy as np
from model import util
logger = util.get_logger(__name__)


class AttrDict(dict):
    """Dict that can get attribute by dot"""
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self
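
# Usage sketch: aliasing __dict__ to the dict itself makes attribute access
# and item access interchangeable (values here are illustrative only):
#   cfg = AttrDict(window=2, damping=0.85)
#   assert cfg.window == cfg['window']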


def combine(word_list, window=2):
    """Construct word pairs within the window; the pairs are used as the
    edges between words.

    Keyword arguments:
    word_list  --  list of str, the list of words.
    window     --  int, the window size.
    """
    if window < 2:
        window = 2
    for x in range(1, window):
        if x >= len(word_list):
            break
        # pair each word with the word x positions ahead of it
        word_list2 = word_list[x:]
        res = zip(word_list, word_list2)
        for r in res:
            yield r
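
# Usage sketch: the pairs from combine() can serve as the edges of a
# TextRank-style word graph via the networkx import above (the word list
# is illustrative only):
if __name__ == '__main__':
    words = ['graph', 'based', 'keyword', 'extraction', 'graph']
    g = nx.Graph()
    g.add_edges_from(combine(words, window=2))
    scores = nx.pagerank(g)  # rank words by graph centrality
    logger.info(sorted(scores, key=scores.get, reverse=True))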