def create_file_input(self, message):
    """Write BIO-tagged CRF test data for *message* to 'crf.test.data'.

    Each character of *message* becomes one line of the form
    "<char> <group> <tag>" where <group> comes from self.get_group and
    the tag is 'B' for the first character and 'I' for every other one.
    """
    file_util = FileUtil()
    lines = []
    for j, ch in enumerate(message):
        tag = 'B' if j == 0 else 'I'
        lines.append(ch + ' ' + self.get_group(ch) + ' ' + tag + '\n')
    # Join once instead of the original quadratic string concatenation.
    file_util.write_newfile('crf.test.data', ''.join(lines))
def crfpp(self, msg):
    """Segment *msg* with the external crf_test binary (model 'model1').

    Writes the CRF input file, shells out to crf_test, then splits the
    concatenated characters at every 'B' tag into word segments.
    Returns the list of segments ([] when no 'B' tag was produced).
    """
    crf = CRF()
    file_util = FileUtil()
    crf.create_file_input(msg)
    # NOTE(review): fixed relative paths; assumes crf_test is on PATH.
    os.system('crf_test -m model1 crf.test.data > crf.result')
    lst = file_util.read_file('crf.result')
    lst_col3, str_ans = self.process_ans(lst)
    starts = [n for n, e in enumerate(lst_col3) if e == 'B']
    # Pair each 'B' start with the next start (or the end of the string).
    # This also fixes a NameError the old code raised when fewer than two
    # 'B' tags were present ('b' was only bound inside the loop).
    bounds = starts + [len(str_ans)]
    return [str_ans[a:b] for a, b in zip(bounds, bounds[1:])]
def crfpp(self, msg):
    """Segment *msg* with the external crf_test binary (model '../model1').

    NOTE(review): near-duplicate of another crfpp definition differing
    only in the model path — consider parameterizing the path instead.

    Writes the CRF input file, shells out to crf_test, then splits the
    concatenated characters at every 'B' tag into word segments.
    Returns the list of segments ([] when no 'B' tag was produced).
    """
    crf = CRF()
    file_util = FileUtil()
    crf.create_file_input(msg)
    # NOTE(review): fixed relative paths; assumes crf_test is on PATH.
    os.system('crf_test -m ../model1 crf.test.data > crf.result')
    lst = file_util.read_file('crf.result')
    lst_col3, str_ans = self.process_ans(lst)
    starts = [n for n, e in enumerate(lst_col3) if e == 'B']
    # Pair each 'B' start with the next start (or the end of the string).
    # This also fixes a NameError the old code raised when fewer than two
    # 'B' tags were present ('b' was only bound inside the loop).
    bounds = starts + [len(str_ans)]
    return [str_ans[a:b] for a, b in zip(bounds, bounds[1:])]
def create_file_output(self, message):
    # NOTE(review): this body appears truncated — it only initializes two
    # locals and implicitly returns None; confirm against the full source.
    result = []
    fileUtil = FileUtil()
import codecs
import json
import logging
import pickle
import random
import sys
import time

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

from data_bean import NewDataMapping
from nlp import CRFWordSegment
from utilfile import FileUtil

# Dictionary words, one per line, with trailing newlines stripped.
dict_list = {x.replace('\n', '') for x in FileUtil.read_file('data/resource/dict.txt')}

# Log to both stdout and cos_main.log with a shared format.
log = logging.getLogger('cos_main')
log.setLevel(logging.INFO)
# Renamed from `format` to avoid shadowing the builtin of the same name.
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
ch = logging.StreamHandler(sys.stdout)
ch.setFormatter(formatter)
log.addHandler(ch)
fh = logging.FileHandler("cos_main.log")
fh.setFormatter(formatter)
log.addHandler(fh)