Example #1
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by lljzhiwang on 2018/11/23
import os, json, time, sys
import util_path as path
import util_common as util
import logging

from Logginger import init_logger
logger = init_logger('Preprocess', logging_path=path.logpath)

datapath = './data/data_raw'
goodpath = './data/good'
highqpath = './data/highq'
alllog_b09 = datapath + r'/log_b_09.txt'
alllog_b18 = datapath + r'/log_b_18.txt'
alllog_d09 = datapath + r'/log_d_09.json'
alllog_d18 = datapath + r'/log_d_18.json'
user_typeinter_18 = datapath + r'/userdb_intersec_18.txt'
user_typeinter_09 = datapath + r'/userdb_intersec_09.txt'
ulog_typeinter09_d = datapath + r'/ulog_typeinter09_d.json'
ulog_typeinter09_b = datapath + r'/ulog_typeinter09_b.json'
ulog_typeinter09_dbdiff = datapath + r'/ulog_typeinter09_dbdiff.json'
ulog_typeinter18_d = datapath + r'/ulog_typeinter18_d.json'
ulog_typeinter18_b = datapath + r'/ulog_typeinter18_b.json'
ulog_typeinter18_dbdiff = datapath + r'/ulog_typeinter18_dbdiff.json'

user_timeinter_b = datapath + r'/userb_intersec_0918.txt'
user_timeinter_d = datapath + r'/userd_intersec_0918.txt'

ulog_sample_18_highq_posi = highqpath + '/log18_highq_posi.txt'
Example #2
# -*- coding: utf-8 -*-
# @project:wholee_keyword
# @author:caojinlei
# @file: data_load.py
# @time: 2021/05/07
from transformers import BertTokenizer, BertModel, BertConfig
from Logginger import init_logger
import json
from utils import sim_matrix
import numpy as np

logger = init_logger('wholee_keyword', logging_path='output')


def load_bert_embedding(model_name):
    """
    载入Bert模型
    :param model_name:bert模型名
    :return:
    """
    model_name = model_name
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model_config = BertConfig.from_pretrained(model_name)
    model_config.output_hidden_states = True
    model_config.output_attentions = True
    bert_model = BertModel.from_pretrained(model_name, config=model_config)
    logger.info('Model loaded successfully')
    return tokenizer, bert_model
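
# Usage sketch (hypothetical checkpoint name; any local or Hugging Face hub BERT model should work):
# tokenizer, bert_model = load_bert_embedding('bert-base-chinese')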


def load_word_dict(label):
Example #3
import argparse
from model import NER_NET
from data_loader import create_batch_iter
import torch
import common
import time
from util import time_since
from transformers import AdamW, get_linear_schedule_with_warmup
from Logginger import init_logger
from score import eval_result, eval_rel_by_condition
import sys

logger = init_logger("torch", logging_path=common.log_path)


def train_model(model, optimizer, scheduler, train_iter, test_iter, opt,
                len_dataset):
    print('======================  Start Training  =========================')
    best_f1 = 0
    global_step = 0
    patience = opt.patience
    for e in range(opt.num_epoch):
        if patience <= 0:
            break
        total_loss = 0
        epoch_start = time.time()
        temp_start = epoch_start
        model.train()
        for step, batch in enumerate(train_iter):
            words, pieces, batch = batch
            batch = tuple(t.to(opt.device) for t in batch)
Example #4
import lightgbm as lgb
import pandas as pd
import numpy as np
import util_common as uc
from sklearn.metrics import mean_squared_error
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron, LogisticRegression, SGDClassifier
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import util_path as path
from Logginger import init_logger
logger = init_logger('RMODEL', logging_path=path.logpath)

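# LightGBM GBDT parameters for a binary-classification objective
# (evaluated with binary_logloss, L2 and AUC)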
params_gbdt = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss', 'l2', 'auc'},
    'num_leaves': 30,
    'max_depth': 5,
    'min_data_in_leaf': 450,
    'num_trees': 100,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
Example #5
import os
import torch
import numpy as np
from tqdm import tqdm

import shutil
from net import Net
from utils import f1_score, get_tags, format_result, convert_tf_checkpoint_to_pytorch
import args
from model_util import save_model
from data_loader import create_batch_iter
from torch.optim.adamw import AdamW

from flyai.utils import remote_helper
from flyai.dataset import Dataset
from Logginger import init_logger

logger = init_logger("bert_ner", logging_path=args.log_path)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class Instructor(object):
    """
    Key point: use flyai's get_all_data to load the full dataset | next_batch splitting is done manually
    """

    def __init__(self, args):
        self.args = args
        self.tag_map = {label: i for i, label in enumerate(self.args.labels)}

    def train(self, train_source, train_target, dev_source, dev_target):
        if os.path.exists(self.args.output_dir):
            shutil.rmtree(self.args.output_dir)
Example #6
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by lljzhiwang on 2018/12/6

import psutil
import time, sys, os, codecs, logging
import util_path as path
import util_common as util
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

BIANMA = 'utf8'
datapath = r'./data'
# bianma='gb18030'
from Logginger import init_logger
logger = init_logger('EmbD2V', logging_path=path.logpath)
testwl = [
    'zgjq200914010', '1011143537.nh', 'DYPJ200924005', '2010261634.nh',
    '1014310786.nh', '1012347178.nh', '1012258129.nh', 'ddyi201218094',
    'fxsy201508029'
]


class MyDocuments(object):
    '''
    Generate a stream of sentences from word-segmented files, for word2vec training.
    dirname: path to the segmented files; either a single file or a directory, with files ending in .txt
    start: index of the first element in each line to treat as a word. In some files the first element
           of each line is a user id, so start=1 skips that id.
    '''
    def __init__(self, dirname, start=0, subfix='.txt'):
        self.dirname = dirname
        self.start = start
Example #7
import logging
import sys
# import IOTools
from tc_conversion.langconv import *
from tc_conversion.full_half_conversion import *
import time
import util_path as path
import util_common as util
import util_segment

bianma = 'utf8'
basepath = r'./data'
# bianma='gb18030'
ss = util_segment.SentenceSegmentation()
ws = util_segment.WordSegmentation()
from Logginger import init_logger
logger = init_logger('EmbSeg', logging_path=path.logpath)

def segword4oneline(line, minwc=3, minwlen=0, sseg=False, convert=False):
    '''
    Word-segment one input line; the result is a list of words for that line.
    :param line: one line / one sentence
    :type line: str
    :param minwc: minimum number of words in the result; if the segmentation yields fewer than minwc words, return empty
    :type minwc: int
    :param minwlen: minimum length of a single word, e.g. '我' has length 1 and '我们' has length 2;
                    words shorter than this are dropped from the result
    :type minwlen: int
    :param convert: whether to apply full-width-to-half-width and traditional-to-simplified conversion
    :type convert: bool
    :return:
    :rtype:
    '''
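    # Usage sketch (hypothetical input; the function body is truncated in this excerpt):
    # words = segword4oneline('这是一个测试句子。', minwc=2, minwlen=1)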
Example #8
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by liliangjie on 2018/11/10
# Email llj : [email protected]
import codecs, pickle
import os, json, re
import logging
import numpy as np
import util_path as path

from Logginger import init_logger
logger = init_logger('UtilCom', logging_path=path.logpath)
bianma = 'utf8'
'''
Common utility functions.
'''


def get_code_field(code, dic_codefield):
    '''
    Get the Chinese description corresponding to a subject sub-category code.
    :param code: subject sub-category code; may contain several codes separated by semicolons
    :type code: str
    :param dic_codefield: dictionary mapping sub-category codes to their descriptions
    :type dic_codefield: dict
    :return:
    :rtype:
    '''
    codes = code.strip(';').split(';')
    l0, l1 = [], []
    for c in codes:
Example #9
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by lljzhiwang on 2018/12/7
from gensim.models.doc2vec import Doc2Vec
from gensim.models import keyedvectors
import util_common as util
import numpy as np
import os, sys, pickle
import util_path as path

from Logginger import init_logger
logger = init_logger('MLPrepare', logging_path=path.logpath)


def get_samplevec_gensimmodel(vecpath1,
                              vecpath2,
                              samplefile,
                              prefix,
                              respath='./',
                              stopcnt=100,
                              progress_per=10000):
    # Build the vector representation for each sample in the sample file: uid+fn ==> [uvec+fnvec]
    data, labels, realexamp = [], [], []
    logger.info('loading vecfile : %s' % vecpath1)
    # muser=Doc2Vec.load(usermodel)
    v_user = load_vec(vecpath1)
    logger.info('loading vecfile : %s' % vecpath2)
    v_file = load_vec(vecpath2)
    samples = util.load2list(samplefile)
    for cnt, exam in enumerate(samples):
        if cnt % progress_per == 0:
Example #10
import util_common as uc
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import re, os
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, MultiLabelBinarizer

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.neighbors import NearestNeighbors
import util_path as path
from Logginger import init_logger
logger = init_logger('FeaProcess', logging_path=path.logpath)


def data2csv():
    fnfeatpath = './data/highq_5w/fn18_5w_features.txt'
    fnfeas = uc.load2list(fnfeatpath)
    fns, cites, cites_w, authcodes, fundcodes, jigoucodes, productcodes, dates, pages, downs, citeds, ifs = [], [], [], [], [], [], [], [], [], [], [], []
    for i in fnfeas:
        if type(i) is str:
            iss = i.split()
            if len(iss) == 14:
                fns.append(iss[0])
                cites.append(iss[1])
                cites_w.append(iss[2])
                authcodes.append(iss[3])
                fundcodes.append(iss[4])
Example #11
import os
import argparse
import warnings
import time
import torch
from flyai.dataset import Dataset
from flyai.utils import remote_helper

from Logginger import init_logger
from data_loader import create_batch_iter
from optimization import BertAdam
import args as arguments
from net import Net
from model_util import save_model

logger = init_logger("torch", logging_path=arguments.log_path)

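# Fix the random seeds (CPU and all GPUs) so that training runs are reproducible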
torch.manual_seed(arguments.seed)
torch.cuda.manual_seed(arguments.seed)
torch.cuda.manual_seed_all(arguments.seed)
warnings.filterwarnings('ignore')

remote_helper.get_remote_date("https://www.flyai.com/m/chinese_base.zip")

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


def main():
    """
    Project hyperparameters.
    """
Example #12
def convert_examples_to_features(examples, max_seq_length, tokenizer):
    logger = init_logger("bert_ner", logging_path=args.log_path)

    # Map label strings to integer ids
    label_map = {label: i for i, label in enumerate(args.labels)}

    # load sub_vocab
    sub_vocab = {}
    with open(args.VOCAB_FILE, 'r') as fr:
        for line in fr:
            _line = line.strip('\n')
            if "##" in _line and sub_vocab.get(_line) is None:
                sub_vocab[_line] = 1

    features = []
    labels = None
    for ex_index, example in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)
        if example.label is not None:
            labels = example.label.split()

        if len(tokens_a) == 0:
            continue

        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[:(max_seq_length - 2)]
            if labels is not None:
                labels = labels[:(max_seq_length - 2)]

        # ---------------- Process the source ----------------
        # Add special marker tokens at the start and end of the sentence
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        # Convert tokens to ids
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        padding = [0] * (max_seq_length - len(input_ids))

        input_ids += padding
        input_mask += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length

        # ---------------- Process the target ----------------
        if labels is not None:
            # Note: label_id does not include [CLS] and [SEP]
            label_id = [label_map[l] for l in labels]
            label_padding = [-1] * (max_seq_length - len(label_id))
            label_id += label_padding
        else:
            label_id = [-1] * max_seq_length

        # output_mask filters out sub-word pieces from the BERT output, keeping only the first piece
        # of each word (as recommended by Jacob Devlin in the BERT paper); it also keeps the output
        # compatible with the CRF layer
        output_mask = [
            0 if sub_vocab.get(t) is not None else 1 for t in tokens_a
        ]
        output_mask = [0] + output_mask + [0]
        output_mask += padding

        # ---------------- Result after processing -------------------------
        # for example, in the case of max_seq_length=10:
        # raw_data:          春 秋 忽 代 谢le
        # token:       [CLS] 春 秋 忽 代 谢 #le [SEP]
        # input_ids:     101 2  12 13 16 14 15   102   0 0 0
        # input_mask:      1 1  1  1  1  1   1     1   0 0 0
        # label_id:          T  T  O  O  O
        # output_mask:     0 1  1  1  1  1   0     0   0 0 0
        # -------------- Check whether the result looks reasonable ------------------------

        # if ex_index < 1:
        #    logger.info("-----------------Example-----------------")
        #    logger.info("guid: %s" % (example.guid))
        #    logger.info("text_a: %s" % example.text_a)
        #    logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
        #    logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        #    logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        #    logger.info("label: %s " % " ".join([str(x) for x in label_id]))
        #    logger.info("output_mask: %s " % " ".join([str(x) for x in output_mask]))
        # ----------------------------------------------------

        feature = InputFeature(input_ids=input_ids,
                               input_mask=input_mask,
                               label_id=label_id,
                               output_mask=output_mask)
        features.append(feature)

    return features
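
# Usage sketch (examples, tokenizer and InputFeature come from the surrounding project;
# the variable names below are illustrative only):
# features = convert_examples_to_features(train_examples, args.max_seq_length, tokenizer)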